| /* |
| * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Authors: Anthony Gutierrez |
| */ |
| |
| #include "arch/gcn3/insts/instructions.hh" |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <cstdlib> |
| |
| #include "arch/gcn3/insts/inst_util.hh" |
| #include "debug/GCN3.hh" |
| #include "debug/GPUSync.hh" |
| #include "gpu-compute/shader.hh" |
| |
| namespace Gcn3ISA |
| { |
| |
| Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_add_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ADD_U32 |
| |
| Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() |
| { |
| } // ~Inst_SOP2__S_ADD_U32 |
| |
| // D.u = S0.u + S1.u; |
| // SCC = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an unsigned |
| // overflow/carry-out. |
| void |
| Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() + src1.rawData(); |
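| // Evaluate the sum in 64 bits so a carry out of bit 31 is visible. |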
| scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) |
| >= 0x100000000ULL ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_sub_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_SUB_U32 |
| |
| Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() |
| { |
| } // ~Inst_SOP2__S_SUB_U32 |
| |
| // D.u = S0.u - S1.u; |
| // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out. |
| void |
| Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() - src1.rawData(); |
| scc = (src1.rawData() > src0.rawData()) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_add_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ADD_I32 |
| |
| Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() |
| { |
| } // ~Inst_SOP2__S_ADD_I32 |
| |
| // D.i = S0.i + S1.i; |
| // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed |
| // overflow. |
| void |
| Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() + src1.rawData(); |
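| // Signed overflow: operands share a sign, but the result's sign differs. |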
| scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) |
| && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) |
| ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_sub_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_SUB_I32 |
| |
| Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() |
| { |
| } // ~Inst_SOP2__S_SUB_I32 |
| |
| // D.i = S0.i - S1.i; |
| // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed |
| // overflow. |
| void |
| Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() - src1.rawData(); |
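| // Signed overflow: the operand signs differ, and the result's |
| // sign differs from S0's. |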
| scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) |
| && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_addc_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ADDC_U32 |
| |
| Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() |
| { |
| } // ~Inst_SOP2__S_ADDC_U32 |
| |
| // D.u = S0.u + S1.u + SCC; |
| // SCC = (S0.u + S1.u + SCC >= 0x100000000ULL ? 1 : 0) is an unsigned |
| // overflow. |
| void |
| Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| scc.read(); |
| |
| sdst = src0.rawData() + src1.rawData() + scc.rawData(); |
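| // Form the full sum (with carry in) in 64 bits to get the carry out. |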
| scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() |
| + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_subb_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_SUBB_U32 |
| |
| Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() |
| { |
| } // ~Inst_SOP2__S_SUBB_U32 |
| |
| // D.u = S0.u - S1.u - SCC; |
| // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. |
| void |
| Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| scc.read(); |
| |
| sdst = src0.rawData() - src1.rawData() - scc.rawData(); |
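| // Borrow out if the subtrahend plus the incoming borrow exceeds S0. |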
| scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_min_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_MIN_I32 |
| |
| Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() |
| { |
| } // ~Inst_SOP2__S_MIN_I32 |
| |
| // D.i = (S0.i < S1.i) ? S0.i : S1.i; |
| // SCC = 1 if S0 is chosen as the minimum value. |
| void |
| Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = std::min(src0.rawData(), src1.rawData()); |
| scc = (src0.rawData() < src1.rawData()) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_min_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_MIN_U32 |
| |
| Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() |
| { |
| } // ~Inst_SOP2__S_MIN_U32 |
| |
| // D.u = (S0.u < S1.u) ? S0.u : S1.u; |
| // SCC = 1 if S0 is chosen as the minimum value. |
| void |
| Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = std::min(src0.rawData(), src1.rawData()); |
| scc = (src0.rawData() < src1.rawData()) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_max_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_MAX_I32 |
| |
| Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() |
| { |
| } // ~Inst_SOP2__S_MAX_I32 |
| |
| // D.i = (S0.i > S1.i) ? S0.i : S1.i; |
| // SCC = 1 if S0 is chosen as the maximum value. |
| void |
| Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = std::max(src0.rawData(), src1.rawData()); |
| scc = (src0.rawData() > src1.rawData()) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_max_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_MAX_U32 |
| |
| Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() |
| { |
| } // ~Inst_SOP2__S_MAX_U32 |
| |
| // D.u = (S0.u > S1.u) ? S0.u : S1.u; |
| // SCC = 1 if S0 is chosen as the maximum value. |
| void |
| Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = std::max(src0.rawData(), src1.rawData()); |
| scc = (src0.rawData() > src1.rawData()) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_cselect_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_CSELECT_B32 |
| |
| Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() |
| { |
| } // ~Inst_SOP2__S_CSELECT_B32 |
| |
| // D.u = SCC ? S0.u : S1.u (conditional select). |
| void |
| Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| scc.read(); |
| |
| sdst = scc.rawData() ? src0.rawData() : src1.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_cselect_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_CSELECT_B64 |
| |
| Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() |
| { |
| } // ~Inst_SOP2__S_CSELECT_B64 |
| |
| // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). |
| void |
| Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| scc.read(); |
| |
| sdst = scc.rawData() ? src0.rawData() : src1.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_and_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_AND_B32 |
| |
| Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() |
| { |
| } // ~Inst_SOP2__S_AND_B32 |
| |
| // D.u = S0.u & S1.u; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() & src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_and_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_AND_B64 |
| |
| Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() |
| { |
| } // ~Inst_SOP2__S_AND_B64 |
| |
| // D.u64 = S0.u64 & S1.u64; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() & src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_or_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_OR_B32 |
| |
| Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() |
| { |
| } // ~Inst_SOP2__S_OR_B32 |
| |
| // D.u = S0.u | S1.u; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() | src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_or_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_OR_B64 |
| |
| Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() |
| { |
| } // ~Inst_SOP2__S_OR_B64 |
| |
| // D.u64 = S0.u64 | S1.u64; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() | src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_xor_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_XOR_B32 |
| |
| Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() |
| { |
| } // ~Inst_SOP2__S_XOR_B32 |
| |
| // D.u = S0.u ^ S1.u; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() ^ src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_xor_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_XOR_B64 |
| |
| Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() |
| { |
| } // ~Inst_SOP2__S_XOR_B64 |
| |
| // D.u64 = S0.u64 ^ S1.u64; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() ^ src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_andn2_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ANDN2_B32 |
| |
| Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() |
| { |
| } // ~Inst_SOP2__S_ANDN2_B32 |
| |
| // D.u = S0.u & ~S1.u; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() & ~src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_andn2_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ANDN2_B64 |
| |
| Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() |
| { |
| } // ~Inst_SOP2__S_ANDN2_B64 |
| |
| // D.u64 = S0.u64 & ~S1.u64; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() & ~src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_orn2_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ORN2_B32 |
| |
| Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() |
| { |
| } // ~Inst_SOP2__S_ORN2_B32 |
| |
| // D.u = S0.u | ~S1.u; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() | ~src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_orn2_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ORN2_B64 |
| |
| Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() |
| { |
| } // ~Inst_SOP2__S_ORN2_B64 |
| |
| // D.u64 = S0.u64 | ~S1.u64; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() | ~src1.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_nand_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_NAND_B32 |
| |
| Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() |
| { |
| } // ~Inst_SOP2__S_NAND_B32 |
| |
| // D.u = ~(S0.u & S1.u); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() & src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_nand_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_NAND_B64 |
| |
| Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() |
| { |
| } // ~Inst_SOP2__S_NAND_B64 |
| |
| // D.u64 = ~(S0.u64 & S1.u64); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() & src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_nor_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_NOR_B32 |
| |
| Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() |
| { |
| } // ~Inst_SOP2__S_NOR_B32 |
| |
| // D.u = ~(S0.u | S1.u); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() | src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_nor_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_NOR_B64 |
| |
| Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() |
| { |
| } // ~Inst_SOP2__S_NOR_B64 |
| |
| // D.u64 = ~(S0.u64 | S1.u64); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() | src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_xnor_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_XNOR_B32 |
| |
| Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() |
| { |
| } // ~Inst_SOP2__S_XNOR_B32 |
| |
| // D.u = ~(S0.u ^ S1.u); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() ^ src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_xnor_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_XNOR_B64 |
| |
| Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() |
| { |
| } // ~Inst_SOP2__S_XNOR_B64 |
| |
| // D.u64 = ~(S0.u64 ^ S1.u64); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ~(src0.rawData() ^ src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_lshl_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_LSHL_B32 |
| |
| Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() |
| { |
| } // ~Inst_SOP2__S_LSHL_B32 |
| |
| // D.u = S0.u << S1.u[4:0]; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_lshl_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_LSHL_B64 |
| |
| Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() |
| { |
| } // ~Inst_SOP2__S_LSHL_B64 |
| |
| // D.u64 = S0.u64 << S1.u[5:0]; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_lshr_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_LSHR_B32 |
| |
| Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() |
| { |
| } // ~Inst_SOP2__S_LSHR_B32 |
| |
| // D.u = S0.u >> S1.u[4:0]; |
| // SCC = 1 if result is non-zero. |
| // The vacated bits are set to zero. |
| void |
| Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_lshr_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_LSHR_B64 |
| |
| Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() |
| { |
| } // ~Inst_SOP2__S_LSHR_B64 |
| |
| // D.u64 = S0.u64 >> S1.u[5:0]; |
| // SCC = 1 if result is non-zero. |
| // The vacated bits are set to zero. |
| void |
| Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_ashr_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ASHR_I32 |
| |
| Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() |
| { |
| } // ~Inst_SOP2__S_ASHR_I32 |
| |
| // D.i = signext(S0.i) >> S1.u[4:0]; |
| // SCC = 1 if result is non-zero. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_ashr_i64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ASHR_I64 |
| |
| Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() |
| { |
| } // ~Inst_SOP2__S_ASHR_I64 |
| |
| // D.i64 = signext(S0.i64) >> S1.u[5:0]; |
| // SCC = 1 if result is non-zero. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfm_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFM_B32 |
| |
| Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() |
| { |
| } // ~Inst_SOP2__S_BFM_B32 |
| |
| // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). |
| void |
| Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) |
| << bits(src1.rawData(), 4, 0); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfm_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFM_B64 |
| |
| Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() |
| { |
| } // ~Inst_SOP2__S_BFM_B64 |
| |
| // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). |
| void |
| Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1) |
| << bits(src1.rawData(), 5, 0); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_mul_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_MUL_I32 |
| |
| Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32() |
| { |
| } // ~Inst_SOP2__S_MUL_I32 |
| |
| // D.i = S0.i * S1.i. |
| void |
| Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = src0.rawData() * src1.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfe_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFE_U32 |
| |
| Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32() |
| { |
| } // ~Inst_SOP2__S_BFE_U32 |
| |
| // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is |
| // field width. |
| // D.u = (S0.u >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_BFE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)) |
| & ((1 << bits(src1.rawData(), 22, 16)) - 1); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfe_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFE_I32 |
| |
| Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() |
| { |
| } // ~Inst_SOP2__S_BFE_I32 |
| |
| // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is |
| // field width. |
| // D.i = (S0.i >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1); |
| // Sign-extend the result; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_BFE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)) |
| & ((1 << bits(src1.rawData(), 22, 16)) - 1); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfe_u64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFE_U64 |
| |
| Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() |
| { |
| } // ~Inst_SOP2__S_BFE_U64 |
| |
| // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is |
| // field width. |
| // D.u64 = (S0.u64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_BFE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)) |
| & ((1ULL << bits(src1.rawData(), 22, 16)) - 1); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_bfe_i64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_BFE_I64 |
| |
| Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() |
| { |
| } // ~Inst_SOP2__S_BFE_I64 |
| |
| // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is |
| // field width. |
| // D.i64 = (S0.i64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1); |
| // Sign-extend result; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP2__S_BFE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)) |
| & ((1ULL << bits(src1.rawData(), 22, 16)) - 1); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_cbranch_g_fork") |
| { |
| setFlag(Branch); |
| } // Inst_SOP2__S_CBRANCH_G_FORK |
| |
| Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() |
| { |
| } // ~Inst_SOP2__S_CBRANCH_G_FORK |
| |
| // Conditional branch using branch-stack. |
| // S0 = compare mask (vcc or any sgpr) and |
| // S1 = 64-bit byte address of target instruction. |
| void |
| Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_absdiff_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP2__S_ABSDIFF_I32 |
| |
| Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() |
| { |
| } // ~Inst_SOP2__S_ABSDIFF_I32 |
| |
| // D.i = S0.i - S1.i; |
| // if (D.i < 0) then D.i = -D.i; |
| // SCC = 1 if result is non-zero. |
| // Compute the absolute value of the difference between two values. |
| void |
| Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| sdst = std::abs(src0.rawData() - src1.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( |
| InFmt_SOP2 *iFmt) |
| : Inst_SOP2(iFmt, "s_rfe_restore_b64") |
| { |
| } // Inst_SOP2__S_RFE_RESTORE_B64 |
| |
| Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() |
| { |
| } // ~Inst_SOP2__S_RFE_RESTORE_B64 |
| |
| // Return from exception handler and continue. |
| void |
| Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_movk_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_MOVK_I32 |
| |
| Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() |
| { |
| } // ~Inst_SOPK__S_MOVK_I32 |
| |
| // D.i = signext(SIMM16) (sign extension). |
| void |
| Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| sdst = simm16; |
| |
| sdst.write(); |
| } |
| |
| Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmovk_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMOVK_I32 |
| |
| Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() |
| { |
| } // ~Inst_SOPK__S_CMOVK_I32 |
| |
| // if (SCC) then D.i = signext(SIMM16); |
| // else NOP. |
| // Conditional move with sign extension. |
| void |
| Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| scc.read(); |
| |
| if (scc.rawData()) { |
| sdst = simm16; |
| sdst.write(); |
| } |
| } |
| |
| Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_eq_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_EQ_I32 |
| |
| Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_EQ_I32 |
| |
| // SCC = (S0.i == signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() == simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_lg_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LG_I32 |
| |
| Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LG_I32 |
| |
| // SCC = (S0.i != signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() != simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_gt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_GT_I32 |
| |
| Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_GT_I32 |
| |
| // SCC = (S0.i > signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() > simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_ge_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_GE_I32 |
| |
| Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_GE_I32 |
| |
| // SCC = (S0.i >= signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() >= simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_lt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LT_I32 |
| |
| Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LT_I32 |
| |
| // SCC = (S0.i < signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() < simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_le_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LE_I32 |
| |
| Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LE_I32 |
| |
| // SCC = (S0.i <= signext(SIMM16)). |
| void |
| Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 simm16 = (ScalarRegI16)instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() <= simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_eq_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_EQ_U32 |
| |
| Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_EQ_U32 |
| |
| // SCC = (S0.u == SIMM16). |
| void |
| Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() == simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_lg_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LG_U32 |
| |
| Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LG_U32 |
| |
| // SCC = (S0.u != SIMM16). |
| void |
| Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() != simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_gt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_GT_U32 |
| |
| Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_GT_U32 |
| |
| // SCC = (S0.u > SIMM16). |
| void |
| Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() > simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_ge_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_GE_U32 |
| |
| Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_GE_U32 |
| |
| // SCC = (S0.u >= SIMM16). |
| void |
| Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() >= simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_lt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LT_U32 |
| |
| Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LT_U32 |
| |
| // SCC = (S0.u < SIMM16). |
| void |
| Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() < simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cmpk_le_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_CMPK_LE_U32 |
| |
| Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() |
| { |
| } // ~Inst_SOPK__S_CMPK_LE_U32 |
| |
| // SCC = (S0.u <= SIMM16). |
| void |
| Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; |
| ConstScalarOperandU32 src(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| scc = (src.rawData() <= simm16) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_addk_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_ADDK_I32 |
| |
| Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() |
| { |
| } // ~Inst_SOPK__S_ADDK_I32 |
| |
| // D.i = D.i + signext(SIMM16); |
| // SCC = overflow. |
| void |
| Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI16 simm16 = instData.SIMM16; |
| ConstScalarOperandI32 src(gpuDynInst, instData.SDST); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = src.rawData() + (ScalarRegI32)simm16; |
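| // Signed overflow: the immediate's sign bit matches the source's, |
| // but the result's sign differs. |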
| scc = (bits(src.rawData(), 31) == bits(simm16, 15) |
| && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_mulk_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_MULK_I32 |
| |
| Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() |
| { |
| } // ~Inst_SOPK__S_MULK_I32 |
| |
| // D.i = D.i * signext(SIMM16). |
| void |
| Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI16 simm16 = instData.SIMM16; |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| sdst.read(); |
| |
| sdst = sdst.rawData() * (ScalarRegI32)simm16; |
| |
| sdst.write(); |
| } |
| |
| Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_cbranch_i_fork") |
| { |
| setFlag(Branch); |
| } // Inst_SOPK__S_CBRANCH_I_FORK |
| |
| Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() |
| { |
| } // ~Inst_SOPK__S_CBRANCH_I_FORK |
| |
| // Conditional branch using branch-stack. |
| // S0 = compare mask (vcc or any sgpr), and |
| // SIMM16 = signed DWORD branch offset relative to next instruction. |
| void |
| Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_getreg_b32") |
| { |
| } // Inst_SOPK__S_GETREG_B32 |
| |
| Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() |
| { |
| } // ~Inst_SOPK__S_GETREG_B32 |
| |
| // D.u = hardware-reg. Read some or all of a hardware register into the |
| // LSBs of D. |
| // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size |
| // is 1..32. |
| void |
| Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_setreg_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPK__S_SETREG_B32 |
| |
| Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() |
| { |
| } // ~Inst_SOPK__S_SETREG_B32 |
| |
| // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware |
| // register. |
| // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size |
| // is 1..32. |
| void |
| Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI16 simm16 = instData.SIMM16; |
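| // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; the size field |
| // stores (size - 1), so the decoded size ranges from 1 to 32. |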
| ScalarRegU32 hwregId = simm16 & 0x3f; |
| ScalarRegU32 offset = (simm16 >> 6) & 31; |
| ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; |
| |
| ScalarOperandU32 hwreg(gpuDynInst, hwregId); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| hwreg.read(); |
| sdst.read(); |
| |
| // Store value from SDST to part of the hardware register. |
| ScalarRegU32 mask = (((1U << size) - 1U) << offset); |
| hwreg = ((hwreg.rawData() & ~mask) |
| | ((sdst.rawData() << offset) & mask)); |
| hwreg.write(); |
| |
| // Writes to hardware register 1 (MODE) at these offsets update the |
| // single-precision floating-point denormal or rounding mode. |
| if (hwregId == 1 && size == 2 |
| && (offset == 4 || offset == 0)) { |
| warn_once("Note that s_setreg_b32 has no real effect " |
| "on FP modes: %s\n", gpuDynInst->disassemble()); |
| return; |
| } |
| |
| // Panic if this is not a write to the floating-point MODE register. |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( |
| InFmt_SOPK *iFmt) |
| : Inst_SOPK(iFmt, "s_setreg_imm32_b32") |
| { |
| } // Inst_SOPK__S_SETREG_IMM32_B32 |
| |
| Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() |
| { |
| } // ~Inst_SOPK__S_SETREG_IMM32_B32 |
| |
| // Write some or all of the LSBs of IMM32 into a hardware register; this |
| // instruction requires a 32-bit literal constant. |
| // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size |
| // is 1..32. |
| void |
| Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_mov_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOV_B32 |
| |
| Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() |
| { |
| } // ~Inst_SOP1__S_MOV_B32 |
| |
| // D.u = S0.u. |
| void |
| Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_mov_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOV_B64 |
| |
| Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() |
| { |
| } // ~Inst_SOP1__S_MOV_B64 |
| |
| // D.u64 = S0.u64. |
| void |
| Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_cmov_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_CMOV_B32 |
| |
| Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() |
| { |
| } // ~Inst_SOP1__S_CMOV_B32 |
| |
| // if (SCC) then D.u = S0.u; |
| // else NOP. |
| // Conditional move. |
| void |
| Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| scc.read(); |
| |
| if (scc.rawData()) { |
| sdst = src.rawData(); |
| sdst.write(); |
| } |
| } |
| |
| Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_cmov_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_CMOV_B64 |
| |
| Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() |
| { |
| } // ~Inst_SOP1__S_CMOV_B64 |
| |
| // if (SCC) then D.u64 = S0.u64; |
| // else NOP. |
| // Conditional move. |
| void |
| Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| scc.read(); |
| |
| if (scc.rawData()) { |
| sdst = src.rawData(); |
| sdst.write(); |
| } |
| } |
| |
| Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_not_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_NOT_B32 |
| |
| Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() |
| { |
| } // ~Inst_SOP1__S_NOT_B32 |
| |
| // D.u = ~S0.u; |
| // SCC = 1 if result is non-zero. |
| // Bitwise negation. |
| void |
| Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = ~src.rawData(); |
| |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_not_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_NOT_B64 |
| |
| Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() |
| { |
| } // ~Inst_SOP1__S_NOT_B64 |
| |
| // D.u64 = ~S0.u64; |
| // SCC = 1 if result is non-zero. |
| // Bitwise negation. |
| void |
| Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = ~src.rawData(); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_wqm_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_WQM_B32 |
| |
| Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() |
| { |
| } // ~Inst_SOP1__S_WQM_B32 |
| |
| // Computes whole quad mode for an active/valid mask. |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
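| // wholeQuadMode() sets every bit of each 4-bit group in which any |
| // source bit is set. |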
| sdst = wholeQuadMode(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_wqm_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_WQM_B64 |
| |
| Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() |
| { |
| } // ~Inst_SOP1__S_WQM_B64 |
| |
| // Computes whole quad mode for an active/valid mask. |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wholeQuadMode(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_brev_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BREV_B32 |
| |
| Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() |
| { |
| } // ~Inst_SOP1__S_BREV_B32 |
| |
| // D.u[31:0] = S0.u[0:31] (reverse bits). |
| void |
| Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = reverseBits(src.rawData()); |
| |
| sdst.write(); |
| } |
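| |
| // Illustrative examples for the bit reversal above (bit i of the |
| // source maps to bit 31 - i of the destination): |
| //     reverseBits(0x00000001) == 0x80000000 |
| //     reverseBits(0x0000FF00) == 0x00FF0000 |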
| |
| Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_brev_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BREV_B64 |
| |
| Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() |
| { |
| } // ~Inst_SOP1__S_BREV_B64 |
| |
| // D.u64[63:0] = S0.u64[0:63] (reverse bits). |
| void |
| Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = reverseBits(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BCNT0_I32_B32 |
| |
| Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() |
| { |
| } // ~Inst_SOP1__S_BCNT0_I32_B32 |
| |
| // D.i = CountZeroBits(S0.u); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = countZeroBits(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BCNT0_I32_B64 |
| |
| Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() |
| { |
| } // ~Inst_SOP1__S_BCNT0_I32_B64 |
| |
| // D.i = CountZeroBits(S0.u64); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = countZeroBits(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BCNT1_I32_B32 |
| |
| Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() |
| { |
| } // ~Inst_SOP1__S_BCNT1_I32_B32 |
| |
| // D.i = CountOneBits(S0.u); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = popCount(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
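| |
| // Worked example for the population count above: |
| //     popCount(0xF000000F) == 8, so SCC is set to 1; |
| //     popCount(0x00000000) == 0, so SCC is set to 0. |
| // The s_bcnt0 variants earlier count zero bits instead, i.e. 32 (or |
| // 64) minus the population count. |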
| |
| Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BCNT1_I32_B64 |
| |
| Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() |
| { |
| } // ~Inst_SOP1__S_BCNT1_I32_B64 |
| |
| // D.i = CountOneBits(S0.u64); |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = popCount(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_ff0_i32_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FF0_I32_B32 |
| |
| Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() |
| { |
| } // ~Inst_SOP1__S_FF0_I32_B32 |
| |
| // D.i = FindFirstZero(S0.u); |
| // If no zeros are found, return -1. |
| // Returns the bit position of the first zero from the LSB. |
| void |
| Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = findFirstZero(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_ff0_i32_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FF0_I32_B64 |
| |
| Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() |
| { |
| } // ~Inst_SOP1__S_FF0_I32_B64 |
| |
| // D.i = FindFirstZero(S0.u64); |
| // If no zeros are found, return -1. |
| // Returns the bit position of the first zero from the LSB. |
| void |
| Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = findFirstZero(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_ff1_i32_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FF1_I32_B32 |
| |
| Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() |
| { |
| } // ~Inst_SOP1__S_FF1_I32_B32 |
| |
| // D.i = FindFirstOne(S0.u); |
| // If no ones are found, return -1. |
| // Returns the bit position of the first one from the LSB. |
| void |
| Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = findFirstOne(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_ff1_i32_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FF1_I32_B64 |
| |
| Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() |
| { |
| } // ~Inst_SOP1__S_FF1_I32_B64 |
| |
| // D.i = FindFirstOne(S0.u64); |
| // If no ones are found, return -1. |
| // Returns the bit position of the first one from the LSB. |
| void |
| Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = findFirstOne(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_flbit_i32_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FLBIT_I32_B32 |
| |
| Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() |
| { |
| } // ~Inst_SOP1__S_FLBIT_I32_B32 |
| |
| // D.i = FindFirstOne(S0.u); |
| // If no ones are found, return -1. |
| // Counts how many zeros before the first one starting from the MSB. |
| void |
| Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = countZeroBitsMsb(src.rawData()); |
| |
| sdst.write(); |
| } |
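| |
| // Worked example for the count-leading-zeros behavior documented |
| // above: with S0.u == 0x00010000 the first one from the MSB is bit 16, |
| // so 15 zero bits precede it and D.i == 15; with S0.u == 0 no one is |
| // found and D.i == -1. |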
| |
| Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_flbit_i32_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FLBIT_I32_B64 |
| |
| Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() |
| { |
| } // ~Inst_SOP1__S_FLBIT_I32_B64 |
| |
| // D.i = FindFirstOne(S0.u64); |
| // If no ones are found, return -1. |
| // Counts how many zeros before the first one starting from the MSB. |
| void |
| Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = countZeroBitsMsb(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_flbit_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FLBIT_I32 |
| |
| Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() |
| { |
| } // ~Inst_SOP1__S_FLBIT_I32 |
| |
| // D.i = FirstOppositeSignBit(S0.i); |
| // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. |
| // Counts how many bits in a row (from MSB to LSB) are the same as the |
| // sign bit. |
| void |
| Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = firstOppositeSignBit(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_flbit_i32_i64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_FLBIT_I32_I64 |
| |
| Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() |
| { |
| } // ~Inst_SOP1__S_FLBIT_I32_I64 |
| |
| // D.i = FirstOppositeSignBit(S0.i64); |
| // If S0.i64 == 0 or S0.i64 == -1 (all bits are the same), return -1. |
| // Counts how many bits in a row (from MSB to LSB) are the same as the |
| // sign bit. |
| void |
| Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = firstOppositeSignBit(src.rawData()); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_sext_i32_i8") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_SEXT_I32_I8 |
| |
| Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() |
| { |
| } // ~Inst_SOP1__S_SEXT_I32_I8 |
| |
| // D.i = signext(S0.i[7:0]) (sign extension). |
| void |
| Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = sext<std::numeric_limits<ScalarRegI8>::digits>( |
| bits(src.rawData(), 7, 0)); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_sext_i32_i16") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_SEXT_I32_I16 |
| |
| Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() |
| { |
| } // ~Inst_SOP1__S_SEXT_I32_I16 |
| |
| // D.i = signext(S0.i[15:0]) (sign extension). |
| void |
| Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = sext<std::numeric_limits<ScalarRegI16>::digits>( |
| bits(src.rawData(), 15, 0)); |
| |
| sdst.write(); |
| } |
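| |
| // Worked examples for the sign extension above: |
| //     S0 == 0x00001234 -> D.i == 0x00001234 |
| //     S0 == 0x0000ffff -> D.i == 0xffffffff (-1) |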
| |
| Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bitset0_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BITSET0_B32 |
| |
| Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() |
| { |
| } // ~Inst_SOP1__S_BITSET0_B32 |
| |
| // D.u[S0.u[4:0]] = 0. |
| void |
| Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst.setBit(bits(src.rawData(), 4, 0), 0); |
| |
| sdst.write(); |
| } |
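| |
| // Worked example for the bit clear above: only the low five bits of |
| // S0 select the bit, so S0.u == 37 (0x25) clears bit 37 & 0x1f == 5 |
| // of the destination. |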
| |
| Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bitset0_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BITSET0_B64 |
| |
| Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() |
| { |
| } // ~Inst_SOP1__S_BITSET0_B64 |
| |
| // D.u64[S0.u[5:0]] = 0. |
| void |
| Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst.setBit(bits(src.rawData(), 5, 0), 0); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bitset1_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BITSET1_B32 |
| |
| Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() |
| { |
| } // ~Inst_SOP1__S_BITSET1_B32 |
| |
| // D.u[S0.u[4:0]] = 1. |
| void |
| Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst.setBit(bits(src.rawData(), 4, 0), 1); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_bitset1_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_BITSET1_B64 |
| |
| Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() |
| { |
| } // ~Inst_SOP1__S_BITSET1_B64 |
| |
| // D.u64[S0.u[5:0]] = 1. |
| void |
| Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst.setBit(bits(src.rawData(), 5, 0), 1); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_getpc_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_GETPC_B64 |
| |
| Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() |
| { |
| } // ~Inst_SOP1__S_GETPC_B64 |
| |
| // D.u64 = PC + 4. |
| // Destination receives the byte address of the next instruction. |
| void |
| Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| Addr pc = wf->pc(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| sdst = pc + 4; |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_setpc_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_SETPC_B64 |
| |
| Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() |
| { |
| } // ~Inst_SOP1__S_SETPC_B64 |
| |
| // PC = S0.u64. |
| // S0.u64 is a byte address of the instruction to jump to. |
| void |
| Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| |
| src.read(); |
| |
| wf->pc(src.rawData()); |
| } |
| |
| Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_swappc_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_SWAPPC_B64 |
| |
| Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() |
| { |
| } // ~Inst_SOP1__S_SWAPPC_B64 |
| |
| // D.u64 = PC + 4; PC = S0.u64. |
| // S0.u64 is a byte address of the instruction to jump to. |
| void |
| Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| Addr pc = wf->pc(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = pc + 4; |
| |
| wf->pc(src.rawData()); |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_rfe_b64") |
| { |
| } // Inst_SOP1__S_RFE_B64 |
| |
| Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() |
| { |
| } // ~Inst_SOP1__S_RFE_B64 |
| |
| // Return from exception handler and continue. |
| void |
| Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_and_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_AND_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = S0.u64 & EXEC; |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = src.rawData() & wf->execMask().to_ullong(); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
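| |
| // Illustrative usage sketch (a common code-generation pattern, not |
| // taken from this file): s_and_saveexec_b64 is typically used to |
| // enter a divergent if-block and later restore the saved mask, e.g.: |
| //     s_and_saveexec_b64 s[0:1], vcc   // save EXEC, then EXEC &= VCC |
| //     s_cbranch_execz    skip          // skip if no lanes remain |
| //     ...                              // then-block for active lanes |
| //   skip: |
| //     s_mov_b64 exec, s[0:1]           // restore the saved mask |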
| |
| Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_or_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_OR_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = S0.u64 | EXEC; |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = src.rawData() | wf->execMask().to_ullong(); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_xor_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_XOR_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = S0.u64 ^ EXEC; |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = S0.u64 & ~EXEC; |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = src.rawData() & ~wf->execMask().to_ullong(); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = S0.u64 | ~EXEC; |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = src.rawData() | ~wf->execMask().to_ullong(); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_nand_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_NAND_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = ~(S0.u64 & EXEC); |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_nor_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_NOR_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = ~(S0.u64 | EXEC); |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 |
| |
| Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() |
| { |
| } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 |
| |
| // D.u64 = EXEC; |
| // EXEC = ~(S0.u64 ^ EXEC); |
| // SCC = 1 if the new value of EXEC is non-zero. |
| void |
| Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = wf->execMask().to_ullong(); |
| wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); |
| scc = wf->execMask().any() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_quadmask_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_QUADMASK_B32 |
| |
| Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() |
| { |
| } // ~Inst_SOP1__S_QUADMASK_B32 |
| |
| // D.u = QuadMask(S0.u): |
| // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = quadMask(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
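| |
| // Worked example for the quad mask above: each destination bit i is |
| // the OR of source bits [4*i+3 : 4*i], so |
| //     quadMask(0x000000F0) == 0x00000002 |
| //     quadMask(0xF000000F) == 0x00000081 |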
| |
| Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_quadmask_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_QUADMASK_B64 |
| |
| Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() |
| { |
| } // ~Inst_SOP1__S_QUADMASK_B64 |
| |
| // D.u64 = QuadMask(S0.u64): |
| // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; |
| // SCC = 1 if result is non-zero. |
| void |
| Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = quadMask(src.rawData()); |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_movrels_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOVRELS_B32 |
| |
| Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() |
| { |
| } // ~Inst_SOP1__S_MOVRELS_B32 |
| |
| // D.u = SGPR[S0.u + M0.u].u (move from relative source). |
| void |
| Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 m0(gpuDynInst, REG_M0); |
| m0.read(); |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
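| |
| // Worked example for the relative read above: if the instruction |
| // encodes s2 as SSRC0 and M0 holds 8, the value actually read comes |
| // from s10. |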
| |
| Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_movrels_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOVRELS_B64 |
| |
| Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() |
| { |
| } // ~Inst_SOP1__S_MOVRELS_B64 |
| |
| // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). |
| // The index in M0.u must be even for this operation. |
| void |
| Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 m0(gpuDynInst, REG_M0); |
| m0.read(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_movreld_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOVRELD_B32 |
| |
| Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() |
| { |
| } // ~Inst_SOP1__S_MOVRELD_B32 |
| |
| // SGPR[D.u + M0.u].u = S0.u (move to relative destination). |
| void |
| Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 m0(gpuDynInst, REG_M0); |
| m0.read(); |
| ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_movreld_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOVRELD_B64 |
| |
| Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() |
| { |
| } // ~Inst_SOP1__S_MOVRELD_B64 |
| |
| // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). |
| // The index in M0.u must be even for this operation. |
| void |
| Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 m0(gpuDynInst, REG_M0); |
| m0.read(); |
| ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); |
| |
| src.read(); |
| |
| sdst = src.rawData(); |
| |
| sdst.write(); |
| } |
| |
| Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_cbranch_join") |
| { |
| setFlag(Branch); |
| } // Inst_SOP1__S_CBRANCH_JOIN |
| |
| Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() |
| { |
| } // ~Inst_SOP1__S_CBRANCH_JOIN |
| |
| // Conditional branch join point (end of conditional branch block). |
| void |
| Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_abs_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_ABS_I32 |
| |
| Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() |
| { |
| } // ~Inst_SOP1__S_ABS_I32 |
| |
| // if (S0.i < 0) then D.i = -S0.i; |
| // else D.i = S0.i; |
| // SCC = 1 if result is non-zero. |
| // Integer absolute value. |
| void |
| Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); |
| ScalarOperandI32 sdst(gpuDynInst, instData.SDST); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src.read(); |
| |
| sdst = std::abs(src.rawData()); |
| |
| scc = sdst.rawData() ? 1 : 0; |
| |
| sdst.write(); |
| scc.write(); |
| } |
| |
| Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_mov_fed_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOP1__S_MOV_FED_B32 |
| |
| Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() |
| { |
| } // ~Inst_SOP1__S_MOV_FED_B32 |
| |
| // D.u = S0.u. |
| void |
| Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( |
| InFmt_SOP1 *iFmt) |
| : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") |
| { |
| } // Inst_SOP1__S_SET_GPR_IDX_IDX |
| |
| Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() |
| { |
| } // ~Inst_SOP1__S_SET_GPR_IDX_IDX |
| |
| // M0[7:0] = S0.u[7:0]. |
| // Modify the index used in vector GPR indexing. |
| void |
| Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_eq_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_EQ_I32 |
| |
| Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_EQ_I32 |
| |
| // SCC = (S0.i == S1.i). |
| void |
| Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() == src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_lg_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LG_I32 |
| |
| Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_LG_I32 |
| |
| // SCC = (S0.i != S1.i). |
| void |
| Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() != src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_gt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_GT_I32 |
| |
| Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_GT_I32 |
| |
| // SCC = (S0.i > S1.i). |
| void |
| Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() > src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_ge_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_GE_I32 |
| |
| Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_GE_I32 |
| |
| // SCC = (S0.i >= S1.i). |
| void |
| Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_lt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LT_I32 |
| |
| Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_LT_I32 |
| |
| // SCC = (S0.i < S1.i). |
| void |
| Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() < src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_le_i32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LE_I32 |
| |
| Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() |
| { |
| } // ~Inst_SOPC__S_CMP_LE_I32 |
| |
| // SCC = (S0.i <= S1.i). |
| void |
| Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_eq_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_EQ_U32 |
| |
| Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_EQ_U32 |
| |
| // SCC = (S0.u == S1.u). |
| void |
| Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() == src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_lg_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LG_U32 |
| |
| Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_LG_U32 |
| |
| // SCC = (S0.u != S1.u). |
| void |
| Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() != src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_gt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_GT_U32 |
| |
| Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_GT_U32 |
| |
| // SCC = (S0.u > S1.u). |
| void |
| Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() > src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_ge_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_GE_U32 |
| |
| Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_GE_U32 |
| |
| // SCC = (S0.u >= S1.u). |
| void |
| Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_lt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LT_U32 |
| |
| Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_LT_U32 |
| |
| // SCC = (S0.u < S1.u). |
| void |
| Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() < src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_le_u32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LE_U32 |
| |
| Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() |
| { |
| } // ~Inst_SOPC__S_CMP_LE_U32 |
| |
| // SCC = (S0.u <= S1.u). |
| void |
| Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_bitcmp0_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_BITCMP0_B32 |
| |
| Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() |
| { |
| } // ~Inst_SOPC__S_BITCMP0_B32 |
| |
| // SCC = (S0.u[S1.u[4:0]] == 0). |
| void |
| Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_bitcmp1_b32") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_BITCMP1_B32 |
| |
| Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() |
| { |
| } // ~Inst_SOPC__S_BITCMP1_B32 |
| |
| // SCC = (S0.u[S1.u[4:0]] == 1). |
| void |
| Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_bitcmp0_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_BITCMP0_B64 |
| |
| Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() |
| { |
| } // ~Inst_SOPC__S_BITCMP0_B64 |
| |
| // SCC = (S0.u64[S1.u[5:0]] == 0). |
| void |
| Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_bitcmp1_b64") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_BITCMP1_B64 |
| |
| Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() |
| { |
| } // ~Inst_SOPC__S_BITCMP1_B64 |
| |
| // SCC = (S0.u64[S1.u[5:0]] == 1). |
| void |
| Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_setvskip") |
| { |
| setFlag(UnconditionalJump); |
| } // Inst_SOPC__S_SETVSKIP |
| |
| Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() |
| { |
| } // ~Inst_SOPC__S_SETVSKIP |
| |
| // VSKIP = S0.u[S1.u[4:0]]. |
| // Enables and disables VSKIP mode. |
| // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instructions are |
| // issued. |
| void |
| Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_set_gpr_idx_on") |
| { |
| } // Inst_SOPC__S_SET_GPR_IDX_ON |
| |
| Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() |
| { |
| } // ~Inst_SOPC__S_SET_GPR_IDX_ON |
| |
| // MODE.gpr_idx_en = 1; |
| // M0[7:0] = S0.u[7:0]; |
| // M0[15:12] = SIMM4 (direct contents of S1 field); |
| // Remaining bits of M0 are unmodified. |
| // Enable GPR indexing mode. Vector operations after this will perform |
| // relative GPR addressing based on the contents of M0. |
| // The raw contents of the S1 field are read and used to set the enable |
| // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and |
| // S1[3] = VDST_REL. |
| void |
| Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_eq_u64") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_EQ_U64 |
| |
| Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() |
| { |
| } // ~Inst_SOPC__S_CMP_EQ_U64 |
| |
| // SCC = (S0.u64 == S1.u64). |
| void |
| Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() == src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) |
| : Inst_SOPC(iFmt, "s_cmp_lg_u64") |
| { |
| setFlag(ALU); |
| } // Inst_SOPC__S_CMP_LG_U64 |
| |
| Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() |
| { |
| } // ~Inst_SOPC__S_CMP_LG_U64 |
| |
| // SCC = (S0.u64 != S1.u64). |
| void |
| Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); |
| ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); |
| ScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| src0.read(); |
| src1.read(); |
| |
| scc = (src0.rawData() != src1.rawData()) ? 1 : 0; |
| |
| scc.write(); |
| } |
| |
| Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_nop") |
| { |
| setFlag(Nop); |
| } // Inst_SOPP__S_NOP |
| |
| Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() |
| { |
| } // ~Inst_SOPP__S_NOP |
| |
| // Do nothing. |
| void |
| Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_endpgm") |
| { |
| setFlag(EndOfKernel); |
| } // Inst_SOPP__S_ENDPGM |
| |
| Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() |
| { |
| } // ~Inst_SOPP__S_ENDPGM |
| |
| // End of program; terminate wavefront. |
| void |
| Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ComputeUnit *cu = gpuDynInst->computeUnit(); |
| |
| // delete extra instructions fetched for completed work-items |
| wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, |
| wf->instructionBuffer.end()); |
| |
| if (wf->pendingFetch) { |
| wf->dropFetch = true; |
| } |
| |
| wf->computeUnit->fetchStage.fetchUnit(wf->simdId) |
| .flushBuf(wf->wfSlotId); |
| wf->setStatus(Wavefront::S_STOPPED); |
| |
| int refCount = wf->computeUnit->getLds() |
| .decreaseRefCounter(wf->dispatchId, wf->wgId); |
| |
| /** |
| * The parent WF of this instruction is exiting, therefore |
| * it should not participate in this barrier any longer. This |
| * prevents possible deadlock issues if WFs exit early. |
| */ |
| int bar_id = WFBarrier::InvalidID; |
| if (wf->hasBarrier()) { |
| assert(wf->getStatus() != Wavefront::S_BARRIER); |
| bar_id = wf->barrierId(); |
| assert(bar_id != WFBarrier::InvalidID); |
| wf->releaseBarrier(); |
| cu->decMaxBarrierCnt(bar_id); |
| DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " |
| "program and decrementing max barrier count for " |
| "barrier Id%d. New max count: %d.\n", cu->cu_id, |
| wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, |
| cu->maxBarrierCnt(bar_id)); |
| } |
| |
| DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", |
| wf->computeUnit->cu_id, wf->wgId, refCount); |
| |
| wf->computeUnit->registerManager->freeRegisters(wf); |
| wf->computeUnit->completedWfs++; |
| wf->computeUnit->activeWaves--; |
| |
| panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " |
| "than zero\n", wf->computeUnit->cu_id); |
| |
| DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", |
| wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId); |
| |
| for (int i = 0; i < wf->vecReads.size(); i++) { |
| if (wf->rawDist.find(i) != wf->rawDist.end()) { |
| wf->readsPerWrite.sample(wf->vecReads.at(i)); |
| } |
| } |
| wf->vecReads.clear(); |
| wf->rawDist.clear(); |
| wf->lastInstExec = 0; |
| |
| if (!refCount) { |
| /** |
| * If all WFs have finished, and hence the WG has finished, |
| * then we can free up the barrier belonging to the parent |
| * WG, but only if we actually used a barrier (i.e., more |
| * than one WF in the WG). |
| */ |
| if (bar_id != WFBarrier::InvalidID) { |
| DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " |
| "now complete. Releasing barrier Id%d.\n", cu->cu_id, |
| wf->simdId, wf->wfSlotId, wf->wfDynId, |
| wf->barrierId()); |
| cu->releaseBarrier(bar_id); |
| } |
| |
| /** |
| * Last wavefront of the workgroup has executed return. If the |
| * workgroup is not the final one in the kernel, then simply |
| * retire it; however, if it is the final one (i.e., indicating |
| * the kernel end) then a release operation is needed. |
| */ |
| |
| // check whether the workgroup is indicating the kernel end (i.e., |
| // the last workgroup in the kernel). |
| bool kernelEnd = |
| wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); |
| // further check whether 'release @ kernel end' is needed |
| bool relNeeded = |
| wf->computeUnit->shader->impl_kern_end_rel; |
| |
| // if not a kernel end or no release needed, retire the workgroup |
| // directly |
| if (!kernelEnd || !relNeeded) { |
| wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); |
| wf->setStatus(Wavefront::S_STOPPED); |
| wf->computeUnit->completedWGs++; |
| |
| return; |
| } |
| |
| /** |
| * If this is a kernel end and a release is needed, inject a memory |
| * sync and retire the workgroup after receiving all acks. |
| */ |
| setFlag(MemSync); |
| setFlag(GlobalSegment); |
| // Notify Memory System of Kernel Completion |
| wf->setStatus(Wavefront::S_RETURNING); |
| gpuDynInst->simdId = wf->simdId; |
| gpuDynInst->wfSlotId = wf->wfSlotId; |
| gpuDynInst->wfDynId = wf->wfDynId; |
| |
| DPRINTF(GPUExec, "inject global memory fence for CU%d: " |
| "WF[%d][%d][%d]\n", wf->computeUnit->cu_id, |
| wf->simdId, wf->wfSlotId, wf->wfDynId); |
| |
| // call shader to prepare the flush operations |
| wf->computeUnit->shader->prepareFlush(gpuDynInst); |
| |
| wf->computeUnit->completedWGs++; |
| } else { |
| wf->computeUnit->shader->dispatcher().scheduleDispatch(); |
| } |
| } |
| |
| Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_branch") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_BRANCH |
| |
| Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH() |
| { |
| } // ~Inst_SOPP__S_BRANCH |
| |
| // PC = PC + signext(SIMM16 * 4) + 4 (short jump). |
| void |
| Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| |
| wf->pc(pc); |
| } |
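| |
| // Worked example of the target computation above: SIMM16 is a signed |
| // count of 4-byte words relative to the instruction that follows the |
| // branch, so SIMM16 == 0 falls through to PC + 4, SIMM16 == 3 jumps |
| // to PC + 16, and SIMM16 == -4 (0xfffc) jumps to PC - 12. |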
| |
| Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_wakeup") |
| { |
| } // Inst_SOPP__S_WAKEUP |
| |
| Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() |
| { |
| } // ~Inst_SOPP__S_WAKEUP |
| |
| // Allow a wave to wake up all the other waves in its workgroup to force |
| // them to wake up immediately from an S_SLEEP instruction. The wakeup is |
| // ignored if the waves are not sleeping. |
| void |
| Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_scc0") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_SCC0 |
| |
| Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_SCC0 |
| |
| // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| scc.read(); |
| |
| if (!scc.rawData()) { |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| } |
| |
| wf->pc(pc); |
| } |
| |
| Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_scc1") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_SCC1 |
| |
| Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_SCC1 |
| |
| // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); |
| |
| scc.read(); |
| |
| if (scc.rawData()) { |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| } |
| |
| wf->pc(pc); |
| } |
| |
| Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_vccz") |
| { |
| setFlag(Branch); |
| setFlag(ReadsVCC); |
| } // Inst_SOPP__S_CBRANCH_VCCZ |
| |
| Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_VCCZ |
| |
| // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| |
| vcc.read(); |
| |
| if (!vcc.rawData()) { |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| } |
| |
| wf->pc(pc); |
| } |
| |
| Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_vccnz") |
| { |
| setFlag(Branch); |
| setFlag(ReadsVCC); |
| } // Inst_SOPP__S_CBRANCH_VCCNZ |
| |
| Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_VCCNZ |
| |
| // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| vcc.read(); |
| |
| if (vcc.rawData()) { |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| wf->pc(pc); |
| } |
| } |
| |
| Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_execz") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_EXECZ |
| |
| Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_EXECZ |
| |
| // if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| wf->pc(pc); |
| } |
| } |
| |
| Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_execnz") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_EXECNZ |
| |
| Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_EXECNZ |
| |
| // if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; |
| // else NOP. |
| void |
| Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().any()) { |
| Addr pc = wf->pc(); |
| ScalarRegI16 simm16 = instData.SIMM16; |
| pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; |
| wf->pc(pc); |
| } |
| } |
| |
| Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_barrier") |
| { |
| setFlag(MemBarrier); |
| } // Inst_SOPP__S_BARRIER |
| |
| Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() |
| { |
| } // ~Inst_SOPP__S_BARRIER |
| |
| /** |
| * Synchronize waves within a workgroup. If not all waves of the workgroup |
| * have been created yet, wait for the entire group before proceeding. If |
| * some waves in the workgroup have already terminated, this waits on only the |
| * surviving waves. |
| */ |
| void |
| Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ComputeUnit *cu = gpuDynInst->computeUnit(); |
| |
| if (wf->hasBarrier()) { |
| int bar_id = wf->barrierId(); |
| assert(wf->getStatus() != Wavefront::S_BARRIER); |
| wf->setStatus(Wavefront::S_BARRIER); |
| cu->incNumAtBarrier(bar_id); |
| DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " |
| "barrier Id%d. %d waves now at barrier, %d waves " |
| "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, |
| wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), |
| cu->numYetToReachBarrier(bar_id)); |
| } |
| } // execute |
| // --- Inst_SOPP__S_SETKILL class methods --- |
| |
| Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_setkill") |
| { |
| } // Inst_SOPP__S_SETKILL |
| |
| Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() |
| { |
| } // ~Inst_SOPP__S_SETKILL |
| |
| void |
| Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_waitcnt") |
| { |
| setFlag(ALU); |
| setFlag(Waitcnt); |
| } // Inst_SOPP__S_WAITCNT |
| |
| Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() |
| { |
| } // ~Inst_SOPP__S_WAITCNT |
| |
| // Wait for the counts of outstanding lds, vector-memory and |
| // export/vmem-write-data to be at or below the specified levels. |
| // SIMM16[3:0] = vmcount (vector memory operations), |
| // SIMM16[6:4] = export/mem-write-data count, |
| // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). |
| void |
| Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarRegI32 vm_cnt = 0; |
| ScalarRegI32 exp_cnt = 0; |
| ScalarRegI32 lgkm_cnt = 0; |
| vm_cnt = bits<ScalarRegI16>(instData.SIMM16, 3, 0); |
| exp_cnt = bits<ScalarRegI16>(instData.SIMM16, 6, 4); |
| lgkm_cnt = bits<ScalarRegI16>(instData.SIMM16, 12, 8); |
| gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); |
| } |
| |
| Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_sethalt") |
| { |
| } // Inst_SOPP__S_SETHALT |
| |
| Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() |
| { |
| } // ~Inst_SOPP__S_SETHALT |
| |
| void |
| Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_sleep") |
| { |
| } // Inst_SOPP__S_SLEEP |
| |
| Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() |
| { |
| } // ~Inst_SOPP__S_SLEEP |
| |
| // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. |
| void |
| Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_setprio") |
| { |
| } // Inst_SOPP__S_SETPRIO |
| |
| Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() |
| { |
| } // ~Inst_SOPP__S_SETPRIO |
| |
| // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, |
| // 3 = highest. |
| void |
| Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_sendmsg") |
| { |
| } // Inst_SOPP__S_SENDMSG |
| |
| Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() |
| { |
| } // ~Inst_SOPP__S_SENDMSG |
| |
| void |
| Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_sendmsghalt") |
| { |
| } // Inst_SOPP__S_SENDMSGHALT |
| |
| Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() |
| { |
| } // ~Inst_SOPP__S_SENDMSGHALT |
| |
| void |
| Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_trap") |
| { |
| } // Inst_SOPP__S_TRAP |
| |
| Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() |
| { |
| } // ~Inst_SOPP__S_TRAP |
| |
| // Enter the trap handler. |
| void |
| Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_icache_inv") |
| { |
| } // Inst_SOPP__S_ICACHE_INV |
| |
| Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() |
| { |
| } // ~Inst_SOPP__S_ICACHE_INV |
| |
| // Invalidate entire L1 instruction cache. |
| void |
| Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_incperflevel") |
| { |
| } // Inst_SOPP__S_INCPERFLEVEL |
| |
| Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() |
| { |
| } // ~Inst_SOPP__S_INCPERFLEVEL |
| |
| void |
| Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_decperflevel") |
| { |
| } // Inst_SOPP__S_DECPERFLEVEL |
| |
| Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() |
| { |
| } // ~Inst_SOPP__S_DECPERFLEVEL |
| |
| void |
| Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_ttracedata") |
| { |
| } // Inst_SOPP__S_TTRACEDATA |
| |
| Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() |
| { |
| } // ~Inst_SOPP__S_TTRACEDATA |
| |
| void |
| Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( |
| InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_CDBGSYS |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_CDBGSYS |
| |
| void |
| Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( |
| InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_cdbguser") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_CDBGUSER |
| |
| Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_CDBGUSER |
| |
| void |
| Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( |
| InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: |
| ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER |
| |
| void |
| Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: |
| Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") |
| { |
| setFlag(Branch); |
| } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER |
| |
| Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: |
| ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() |
| { |
| } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER |
| |
| void |
| Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_endpgm_saved") |
| { |
| } // Inst_SOPP__S_ENDPGM_SAVED |
| |
| Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() |
| { |
| } // ~Inst_SOPP__S_ENDPGM_SAVED |
| |
| // End of program. |
| void |
| Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( |
| InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_set_gpr_idx_off") |
| { |
| } // Inst_SOPP__S_SET_GPR_IDX_OFF |
| |
| Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() |
| { |
| } // ~Inst_SOPP__S_SET_GPR_IDX_OFF |
| |
| // MODE.gpr_idx_en = 0. |
| // Clear GPR indexing mode. Vector operations after this will not perform |
| // relative GPR addressing regardless of the contents of M0. |
| void |
| Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( |
| InFmt_SOPP *iFmt) |
| : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") |
| { |
| } // Inst_SOPP__S_SET_GPR_IDX_MODE |
| |
| Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() |
| { |
| } // ~Inst_SOPP__S_SET_GPR_IDX_MODE |
| |
| // M0[15:12] = SIMM4. |
| // Modify the mode used for vector GPR indexing. |
| // The raw contents of the source field are read and used to set the enable |
| // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL |
| // and SIMM4[3] = VDST_REL. |
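|     // For example, SIMM4 = 0b0011 would enable relative indexing for |
|     // VSRC0 and VSRC1 only, leaving VSRC2 and VDST absolute. GPR-indexing |
|     // mode is not modeled here, so the instruction panics if encountered. |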
| void |
| Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_load_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_LOAD_DWORD |
| |
| Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() |
| { |
| } // ~Inst_SMEM__S_LOAD_DWORD |
| |
| /** |
| * Read 1 dword from scalar data cache. If the offset is specified as an |
| * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are |
| * ignored). If the offset is specified as an immediate 20-bit constant, |
| * the constant is an unsigned byte offset. |
| */ |
| void |
| Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
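| |
|     // Note on the addressing above: SBASE names an aligned SGPR pair, so |
|     // it is shifted left by one to get the first SGPR of the 64-bit base |
|     // address, while OFFSET supplies a byte offset (either the 20-bit |
|     // immediate or an SGPR, depending on the IMM bit) that calcAddr() |
|     // combines with that base. |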
| |
| void |
| Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<1>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
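| |
|     // The scalar loads follow a three-step pattern: execute() computes the |
|     // address and pushes the request into the scalar memory pipeline's |
|     // request FIFO, initiateAcc() issues the read (initMemRead<N> for N |
|     // dwords), and completeAcc() writes the returned data to the SDATA |
|     // destination. The wider variants below differ only in N and in the |
|     // width of the destination operand. |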
| |
| Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_load_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_LOAD_DWORDX2 |
| |
| Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() |
| { |
| } // ~Inst_SMEM__S_LOAD_DWORDX2 |
| |
| /** |
| * Read 2 dwords from scalar data cache. See s_load_dword for details on |
| * the offset input. |
| */ |
| void |
| Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<2>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_load_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_LOAD_DWORDX4 |
| |
| Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() |
| { |
| } // ~Inst_SMEM__S_LOAD_DWORDX4 |
| |
| // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_load_dwordx8") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_LOAD_DWORDX8 |
| |
| Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() |
| { |
| } // ~Inst_SMEM__S_LOAD_DWORDX8 |
| |
| // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_load_dwordx16") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_LOAD_DWORDX16 |
| |
| Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() |
| { |
| } // ~Inst_SMEM__S_LOAD_DWORDX16 |
| |
| // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_load_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_BUFFER_LOAD_DWORD |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() |
| { |
| } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD |
| |
| // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the |
| // offset input. |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); |
| |
| rsrcDesc.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, rsrcDesc, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
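| |
|     // Unlike s_load_dword, the buffer variants read a 128-bit buffer |
|     // resource descriptor from four consecutive SGPRs (note that SBASE is |
|     // not shifted here) and hand it to calcAddr(), which is expected to |
|     // extract the base address from the descriptor before applying the |
|     // byte offset. |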
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<1>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| // 1 request, size 32 |
| ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() |
| { |
| } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 |
| |
| // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); |
| |
| rsrcDesc.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, rsrcDesc, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<2>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| // use U64 because 2 requests, each size 32 |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() |
| { |
| } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 |
| |
| // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); |
| |
| rsrcDesc.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, rsrcDesc, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| // 4 requests, each size 32 |
| ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() |
| { |
| } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 |
| |
| // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); |
| |
| rsrcDesc.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, rsrcDesc, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| // 8 requests, each size 32 |
| ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 |
| |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() |
| { |
| } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 |
| |
| // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); |
| |
| rsrcDesc.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, rsrcDesc, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe |
| .getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarRdGmReqsInPipe--; |
| wf->scalarOutstandingReqsRdGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| // 16 requests, each size 32 |
| ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); |
| sdst.write(); |
| } // completeAcc |
| |
| Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_store_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_STORE_DWORD |
| |
| Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() |
| { |
| } // ~Inst_SMEM__S_STORE_DWORD |
| |
| // Write 1 dword to scalar data cache. |
| // If the offset is specified as an SGPR, the SGPR contains an unsigned |
| // BYTE offset (the 2 LSBs are ignored). |
| // If the offset is specified as an immediate 20-bit constant, the |
| // constant is an unsigned BYTE offset. |
| void |
| Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarWrGmReqsInPipe--; |
| wf->scalarOutstandingReqsWrGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); |
| sdata.read(); |
| std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), |
| sizeof(ScalarRegU32)); |
| initMemWrite<1>(gpuDynInst); |
| } // initiateAcc |
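| |
|     // The store data is staged in gpuDynInst->scalar_data before |
|     // initMemWrite<1>() issues the write; the wider store variants below |
|     // copy 2 or 4 dwords into the same staging buffer. |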
| |
| void |
| Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_store_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_STORE_DWORDX2 |
| |
| Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() |
| { |
| } // ~Inst_SMEM__S_STORE_DWORDX2 |
| |
| // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarWrGmReqsInPipe--; |
| wf->scalarOutstandingReqsWrGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); |
| sdata.read(); |
| std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), |
| sizeof(ScalarRegU64)); |
| initMemWrite<2>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_store_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_STORE_DWORDX4 |
| |
| Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() |
| { |
| } // ~Inst_SMEM__S_STORE_DWORDX4 |
| |
| // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| ScalarRegU32 offset(0); |
| ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); |
| |
| addr.read(); |
| |
| if (instData.IMM) { |
| offset = extData.OFFSET; |
| } else { |
| ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); |
| off_sgpr.read(); |
| offset = off_sgpr.rawData(); |
| } |
| |
| calcAddr(gpuDynInst, addr, offset); |
| |
| gpuDynInst->computeUnit()->scalarMemoryPipe. |
| getGMReqFIFO().push(gpuDynInst); |
| |
| wf->scalarWrGmReqsInPipe--; |
| wf->scalarOutstandingReqsWrGm++; |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); |
| sdata.read(); |
| std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), |
| 4 * sizeof(ScalarRegU32)); |
| initMemWrite<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_store_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_BUFFER_STORE_DWORD |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() |
| { |
| } // ~Inst_SMEM__S_BUFFER_STORE_DWORD |
| |
| // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the |
| // offset input. |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() |
| { |
| } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 |
| |
| // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 |
| |
| Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() |
| { |
| } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 |
| |
| // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on |
| // the offset input. |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_dcache_inv") |
| { |
| } // Inst_SMEM__S_DCACHE_INV |
| |
| Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() |
| { |
| } // ~Inst_SMEM__S_DCACHE_INV |
| |
| // Invalidate the scalar data cache. |
| void |
| Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_dcache_wb") |
| { |
| } // Inst_SMEM__S_DCACHE_WB |
| |
| Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() |
| { |
| } // ~Inst_SMEM__S_DCACHE_WB |
| |
| // Write back dirty data in the scalar data cache. |
| void |
| Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_dcache_inv_vol") |
| { |
| } // Inst_SMEM__S_DCACHE_INV_VOL |
| |
| Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() |
| { |
| } // ~Inst_SMEM__S_DCACHE_INV_VOL |
| |
| // Invalidate the scalar data cache volatile lines. |
| void |
| Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_dcache_wb_vol") |
| { |
| } // Inst_SMEM__S_DCACHE_WB_VOL |
| |
| Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() |
| { |
| } // ~Inst_SMEM__S_DCACHE_WB_VOL |
| |
| // Write back dirty data in the scalar data cache volatile lines. |
| void |
| Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_memtime") |
| { |
| } // Inst_SMEM__S_MEMTIME |
| |
| Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() |
| { |
| } // ~Inst_SMEM__S_MEMTIME |
| |
| // Return current 64-bit timestamp. |
| void |
| Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_memrealtime") |
| { |
| } // Inst_SMEM__S_MEMREALTIME |
| |
| Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() |
| { |
| } // ~Inst_SMEM__S_MEMREALTIME |
| |
| // Return current 64-bit RTC. |
| void |
| Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_atc_probe") |
| { |
| } // Inst_SMEM__S_ATC_PROBE |
| |
| Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() |
| { |
| } // ~Inst_SMEM__S_ATC_PROBE |
| |
| void |
| Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( |
| InFmt_SMEM *iFmt) |
| : Inst_SMEM(iFmt, "s_atc_probe_buffer") |
| { |
| } // Inst_SMEM__S_ATC_PROBE_BUFFER |
| |
| Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() |
| { |
| } // ~Inst_SMEM__S_ATC_PROBE_BUFFER |
| |
| void |
| Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_cndmask_b32") |
| { |
| setFlag(ALU); |
| setFlag(ReadsVCC); |
| } // Inst_VOP2__V_CNDMASK_B32 |
| |
| Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32() |
| { |
| } // ~Inst_VOP2__V_CNDMASK_B32 |
| |
| // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC |
| // as a scalar GPR in S2. |
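|     // For example, VCC = 0x00000000000000ff selects S1 for lanes 0-7 and |
|     // S0 for lanes 8-63; only lanes enabled in EXEC are written. |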
| void |
| Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| vcc.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] |
| = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_add_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_ADD_F32 |
| |
| Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32() |
| { |
| } // ~Inst_VOP2__V_ADD_F32 |
| |
| // D.f = S0.f + S1.f. |
| void |
| Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| VecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| if (isDPPInst()) { |
| VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); |
| src0_dpp.read(); |
| |
| DPRINTF(GCN3, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], " |
|                     "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, " |
| "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " |
| "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, |
| extData.iFmt_VOP_DPP.DPP_CTRL, |
| extData.iFmt_VOP_DPP.SRC0_ABS, |
| extData.iFmt_VOP_DPP.SRC0_NEG, |
| extData.iFmt_VOP_DPP.SRC1_ABS, |
| extData.iFmt_VOP_DPP.SRC1_NEG, |
| extData.iFmt_VOP_DPP.BOUND_CTRL, |
| extData.iFmt_VOP_DPP.BANK_MASK, |
| extData.iFmt_VOP_DPP.ROW_MASK); |
| |
| processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0_dpp[lane] + src1[lane]; |
| } |
| } |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
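| |
|     // DPP ("data parallel primitives") encodings permute SRC0 across lanes |
|     // (row shifts, rotates, broadcasts, etc.) before the add; processDPP() |
|     // applies that cross-lane selection according to the DPP_CTRL, |
|     // BOUND_CTRL, ROW_MASK and BANK_MASK fields of the DPP control word. |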
| |
| Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_sub_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_SUB_F32 |
| |
| Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32() |
| { |
| } // ~Inst_VOP2__V_SUB_F32 |
| |
| // D.f = S0.f - S1.f. |
| void |
| Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subrev_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_SUBREV_F32 |
| |
| Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() |
| { |
| } // ~Inst_VOP2__V_SUBREV_F32 |
| |
| // D.f = S1.f - S0.f. |
| void |
| Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_legacy_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_MUL_LEGACY_F32 |
| |
| Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() |
| { |
| } // ~Inst_VOP2__V_MUL_LEGACY_F32 |
| |
|     // D.f = S0.f * S1.f. |
|     // (On hardware this is the "legacy" DX9-rules multiply where |
|     // 0.0 * x = 0.0; that special case is not modeled here, so this is a |
|     // plain multiply.) |
| void |
| Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_MUL_F32 |
| |
| Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32() |
| { |
| } // ~Inst_VOP2__V_MUL_F32 |
| |
| // D.f = S0.f * S1.f. |
| void |
| Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || |
| std::isnan(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| !std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if (std::isinf(src0[lane]) && |
| !std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else if (std::isinf(src0[lane]) && |
| std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
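| |
|     // The ladder above spells out the corner cases: any NaN input gives |
|     // NaN, a zero/denormal times an infinity gives NaN, and a zero/denormal |
|     // or infinite src0 otherwise yields a zero or infinity whose sign is |
|     // the XOR of the operand signs (denormals are treated as zeros); all |
|     // other inputs fall through to the ordinary multiply. |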
| |
| Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_i32_i24") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MUL_I32_I24 |
| |
| Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24() |
| { |
| } // ~Inst_VOP2__V_MUL_I32_I24 |
| |
| // D.i = S0.i[23:0] * S1.i[23:0]. |
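|     // Only the low 24 bits of each source participate, and they are |
|     // sign-extended first: e.g. 0x00ffffff is treated as -1, so |
|     // 0x00ffffff * 0x00000002 yields 0xfffffffe (-2). |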
| void |
| Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) |
| * sext<24>(bits(src1[lane], 23, 0)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_hi_i32_i24") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MUL_HI_I32_I24 |
| |
| Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24() |
| { |
| } // ~Inst_VOP2__V_MUL_HI_I32_I24 |
| |
| // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32. |
| void |
| Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI64 tmp_src0 |
| = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); |
| VecElemI64 tmp_src1 |
| = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); |
| |
| vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_u32_u24") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MUL_U32_U24 |
| |
| Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() |
| { |
| } // ~Inst_VOP2__V_MUL_U32_U24 |
| |
| // D.u = S0.u[23:0] * S1.u[23:0]. |
| void |
| Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| VecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| if (isSDWAInst()) { |
| VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); |
|             // keep copies of the original src0, src1 and dest for use |
|             // while applying the SDWA operand selects |
| VecOperandU32 origSrc0_sdwa(gpuDynInst, |
| extData.iFmt_VOP_SDWA.SRC0); |
| VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 origVdst(gpuDynInst, instData.VDST); |
| |
| src0_sdwa.read(); |
| origSrc0_sdwa.read(); |
| origSrc1.read(); |
| |
| DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register " |
| "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " |
| "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " |
| "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", |
| extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, |
| extData.iFmt_VOP_SDWA.DST_UNUSED, |
| extData.iFmt_VOP_SDWA.CLAMP, |
| extData.iFmt_VOP_SDWA.SRC0_SEL, |
| extData.iFmt_VOP_SDWA.SRC0_SEXT, |
| extData.iFmt_VOP_SDWA.SRC0_NEG, |
| extData.iFmt_VOP_SDWA.SRC0_ABS, |
| extData.iFmt_VOP_SDWA.SRC1_SEL, |
| extData.iFmt_VOP_SDWA.SRC1_SEXT, |
| extData.iFmt_VOP_SDWA.SRC1_NEG, |
| extData.iFmt_VOP_SDWA.SRC1_ABS); |
| |
| processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, |
| src1, origSrc1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = bits(src0_sdwa[lane], 23, 0) * |
| bits(src1[lane], 23, 0); |
| origVdst[lane] = vdst[lane]; // keep copy consistent |
| } |
| } |
| |
| processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = bits(src0[lane], 23, 0) * |
| bits(src1[lane], 23, 0); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
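| |
|     // SDWA (sub-dword addressing) lets each source and the destination |
|     // select a byte or word within the 32-bit register. processSDWA_src() |
|     // rewrites the working source copies according to those selects before |
|     // the 24-bit multiply, and processSDWA_dst() merges the result back |
|     // into the destination using the saved original vdst values. |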
| |
| Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MUL_HI_U32_U24 |
| |
| Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() |
| { |
| } // ~Inst_VOP2__V_MUL_HI_U32_U24 |
| |
| // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32. |
| void |
| Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); |
| VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); |
| vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_MIN_F32 |
| |
| Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() |
| { |
| } // ~Inst_VOP2__V_MIN_F32 |
| |
| // D.f = (S0.f < S1.f ? S0.f : S1.f). |
| void |
| Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmin(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
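| |
|     // Note: std::fmin (and std::fmax in v_max_f32 below) returns the |
|     // non-NaN operand when exactly one input is NaN, which is slightly |
|     // stronger than the literal "S0 < S1 ? S0 : S1" form in the comment; |
|     // the two agree for ordinary values. |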
| |
| Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP2__V_MAX_F32 |
| |
| Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() |
| { |
| } // ~Inst_VOP2__V_MAX_F32 |
| |
| // D.f = (S0.f >= S1.f ? S0.f : S1.f). |
| void |
| Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmax(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MIN_I32 |
| |
| Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() |
| { |
| } // ~Inst_VOP2__V_MIN_I32 |
| |
| // D.i = min(S0.i, S1.i). |
| void |
| Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MAX_I32 |
| |
| Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() |
| { |
| } // ~Inst_VOP2__V_MAX_I32 |
| |
| // D.i = max(S0.i, S1.i). |
| void |
| Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MIN_U32 |
| |
| Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() |
| { |
| } // ~Inst_VOP2__V_MIN_U32 |
| |
| // D.u = min(S0.u, S1.u). |
| void |
| Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MAX_U32 |
| |
| Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() |
| { |
| } // ~Inst_VOP2__V_MAX_U32 |
| |
| // D.u = max(S0.u, S1.u). |
| void |
| Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_lshrrev_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_LSHRREV_B32 |
| |
| Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() |
| { |
| } // ~Inst_VOP2__V_LSHRREV_B32 |
| |
| // D.u = S1.u >> S0.u[4:0]. |
| // The vacated bits are set to zero. |
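|     // The "rev" suffix means the operand roles are reversed relative to a |
|     // plain shift: S0 supplies the shift amount and S1 the value being |
|     // shifted, e.g. S0 = 4 and S1 = 0x80000000 gives 0x08000000. The same |
|     // operand order applies to v_ashrrev_i32 and v_lshlrev_b32 below. |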
| void |
| Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_ashrrev_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_ASHRREV_I32 |
| |
| Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() |
| { |
| } // ~Inst_VOP2__V_ASHRREV_I32 |
| |
| // D.i = signext(S1.i) >> S0.i[4:0]. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_lshlrev_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_LSHLREV_B32 |
| |
| Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32() |
| { |
| } // ~Inst_VOP2__V_LSHLREV_B32 |
| |
| // D.u = S1.u << S0.u[4:0]. |
| void |
| Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| VecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| if (isSDWAInst()) { |
| VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); |
|             // keep copies of the original src0, src1 and vdst for use |
|             // while applying the SDWA operand selects |
| VecOperandU32 origSrc0_sdwa(gpuDynInst, |
| extData.iFmt_VOP_SDWA.SRC0); |
| VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 origVdst(gpuDynInst, instData.VDST); |
| |
| src0_sdwa.read(); |
| origSrc0_sdwa.read(); |
| origSrc1.read(); |
| |
| DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register " |
| "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " |
| "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " |
| "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", |
| extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, |
| extData.iFmt_VOP_SDWA.DST_UNUSED, |
| extData.iFmt_VOP_SDWA.CLAMP, |
| extData.iFmt_VOP_SDWA.SRC0_SEL, |
| extData.iFmt_VOP_SDWA.SRC0_SEXT, |
| extData.iFmt_VOP_SDWA.SRC0_NEG, |
| extData.iFmt_VOP_SDWA.SRC0_ABS, |
| extData.iFmt_VOP_SDWA.SRC1_SEL, |
| extData.iFmt_VOP_SDWA.SRC1_SEXT, |
| extData.iFmt_VOP_SDWA.SRC1_NEG, |
| extData.iFmt_VOP_SDWA.SRC1_ABS); |
| |
| processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, |
| src1, origSrc1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); |
| origVdst[lane] = vdst[lane]; // keep copy consistent |
| } |
| } |
| |
| processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_and_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_AND_B32 |
| |
| Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32() |
| { |
| } // ~Inst_VOP2__V_AND_B32 |
| |
| // D.u = S0.u & S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] & src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_or_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_OR_B32 |
| |
| Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32() |
| { |
| } // ~Inst_VOP2__V_OR_B32 |
| |
| // D.u = S0.u | S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| VecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| if (isSDWAInst()) { |
| VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); |
| // use copies of original src0, src1, and dest during selecting |
| VecOperandU32 origSrc0_sdwa(gpuDynInst, |
| extData.iFmt_VOP_SDWA.SRC0); |
| VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 origVdst(gpuDynInst, instData.VDST); |
| |
| src0_sdwa.read(); |
| origSrc0_sdwa.read(); |
| origSrc1.read(); |
| |
| DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], " |
| "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " |
| "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " |
| "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", |
| extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, |
| extData.iFmt_VOP_SDWA.DST_UNUSED, |
| extData.iFmt_VOP_SDWA.CLAMP, |
| extData.iFmt_VOP_SDWA.SRC0_SEL, |
| extData.iFmt_VOP_SDWA.SRC0_SEXT, |
| extData.iFmt_VOP_SDWA.SRC0_NEG, |
| extData.iFmt_VOP_SDWA.SRC0_ABS, |
| extData.iFmt_VOP_SDWA.SRC1_SEL, |
| extData.iFmt_VOP_SDWA.SRC1_SEXT, |
| extData.iFmt_VOP_SDWA.SRC1_NEG, |
| extData.iFmt_VOP_SDWA.SRC1_ABS); |
| |
| processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, |
| src1, origSrc1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0_sdwa[lane] | src1[lane]; |
| origVdst[lane] = vdst[lane]; // keep copy consistent |
| } |
| } |
| |
| processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] | src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_xor_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_XOR_B32 |
| |
| Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32() |
| { |
| } // ~Inst_VOP2__V_XOR_B32 |
| |
| // D.u = S0.u ^ S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] ^ src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mac_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAC); |
| } // Inst_VOP2__V_MAC_F32 |
| |
| Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() |
| { |
| } // ~Inst_VOP2__V_MAC_F32 |
| |
| // D.f = S0.f * S1.f + D.f. |
| void |
| Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| VecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| vdst.read(); |
| |
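| // DPP (data-parallel primitives) permutes src0 across lanes according |
| // to DPP_CTRL before the multiply-accumulate; std::fma below keeps the |
| // multiply-add to a single rounding step, consistent with a fused MAC. |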
| if (isDPPInst()) { |
| VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); |
| src0_dpp.read(); |
| |
| DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], " |
| "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " |
| "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " |
| "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, |
| extData.iFmt_VOP_DPP.DPP_CTRL, |
| extData.iFmt_VOP_DPP.SRC0_ABS, |
| extData.iFmt_VOP_DPP.SRC0_NEG, |
| extData.iFmt_VOP_DPP.SRC1_ABS, |
| extData.iFmt_VOP_DPP.SRC1_NEG, |
| extData.iFmt_VOP_DPP.BOUND_CTRL, |
| extData.iFmt_VOP_DPP.BANK_MASK, |
| extData.iFmt_VOP_DPP.ROW_MASK); |
| |
| processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0_dpp[lane], src1[lane], |
| vdst[lane]); |
| } |
| } |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_madmk_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAD); |
| } // Inst_VOP2__V_MADMK_F32 |
| |
| Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() |
| { |
| } // ~Inst_VOP2__V_MADMK_F32 |
| |
| // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. |
| // This opcode cannot use the input/output modifiers. |
| void |
| Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| VecElemF32 k = extData.imm_f32; |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], k, src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_madak_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAD); |
| } // Inst_VOP2__V_MADAK_F32 |
| |
| Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() |
| { |
| } // ~Inst_VOP2__V_MADAK_F32 |
| |
| // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. |
| // This opcode cannot use input/output modifiers. |
| void |
| Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| VecElemF32 k = extData.imm_f32; |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], k); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_add_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP2__V_ADD_U32 |
| |
| Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() |
| { |
| } // ~Inst_VOP2__V_ADD_U32 |
| |
| // D.u = S0.u + S1.u; |
| // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED |
| // overflow or carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| VecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
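| // The per-lane carry-out is computed by widening both operands to |
| // 64 bits and testing whether the sum reaches 2^32; the result is |
| // written to the corresponding bit of VCC. |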
| if (isSDWAInst()) { |
| VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); |
| // use copies of original src0, src1, and dest during selecting |
| VecOperandU32 origSrc0_sdwa(gpuDynInst, |
| extData.iFmt_VOP_SDWA.SRC0); |
| VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 origVdst(gpuDynInst, instData.VDST); |
| |
| src0_sdwa.read(); |
| origSrc0_sdwa.read(); |
| origSrc1.read(); |
| |
| DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " |
| "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " |
| "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " |
| "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", |
| extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, |
| extData.iFmt_VOP_SDWA.DST_UNUSED, |
| extData.iFmt_VOP_SDWA.CLAMP, |
| extData.iFmt_VOP_SDWA.SRC0_SEL, |
| extData.iFmt_VOP_SDWA.SRC0_SEXT, |
| extData.iFmt_VOP_SDWA.SRC0_NEG, |
| extData.iFmt_VOP_SDWA.SRC0_ABS, |
| extData.iFmt_VOP_SDWA.SRC1_SEL, |
| extData.iFmt_VOP_SDWA.SRC1_SEXT, |
| extData.iFmt_VOP_SDWA.SRC1_NEG, |
| extData.iFmt_VOP_SDWA.SRC1_ABS); |
| |
| processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, |
| src1, origSrc1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0_sdwa[lane] + src1[lane]; |
| origVdst[lane] = vdst[lane]; // keep copy consistent |
| vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] |
| + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); |
| } |
| } |
| |
| processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| vcc.setBit(lane, ((VecElemU64)src0[lane] |
| + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); |
| } |
| } |
| } |
| |
| vcc.write(); |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_sub_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP2__V_SUB_U32 |
| |
| Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() |
| { |
| } // ~Inst_VOP2__V_SUB_U32 |
| |
| // D.u = S0.u - S1.u; |
| // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or |
| // carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subrev_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP2__V_SUBREV_U32 |
| |
| Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() |
| { |
| } // ~Inst_VOP2__V_SUBREV_U32 |
| |
| // D.u = S1.u - S0.u; |
| // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or |
| // carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP2__V_ADDC_U32::Inst_VOP2__V_ADDC_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_addc_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP2__V_ADDC_U32 |
| |
| Inst_VOP2__V_ADDC_U32::~Inst_VOP2__V_ADDC_U32() |
| { |
| } // ~Inst_VOP2__V_ADDC_U32 |
| |
| // D.u = S0.u + S1.u + VCC[threadId]; |
| // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0) |
| // is an UNSIGNED overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP2__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| vcc.read(); |
| |
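| // Add with carry-in: each lane reads its carry-in from the matching |
| // bit of VCC, and the 64-bit sum of src0 + src1 + carry determines |
| // the new carry-out written back to that VCC bit. |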
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane] |
| + bits(vcc.rawData(), lane); |
| vcc.setBit(lane, ((VecElemU64)src0[lane] |
| + (VecElemU64)src1[lane] |
| + (VecElemU64)bits(vcc.rawData(), lane, lane)) |
| >= 0x100000000ULL ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP2__V_SUBB_U32::Inst_VOP2__V_SUBB_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subb_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP2__V_SUBB_U32 |
| |
| Inst_VOP2__V_SUBB_U32::~Inst_VOP2__V_SUBB_U32() |
| { |
| } // ~Inst_VOP2__V_SUBB_U32 |
| |
| // D.u = S0.u - S1.u - VCC[threadId]; |
| // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED |
| // overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP2__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| vcc.read(); |
| |
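| // Subtract with borrow: the borrow-in comes from the lane's VCC bit, |
| // and the borrow-out is set when src1 plus the borrow exceeds src0. |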
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] |
| = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); |
| vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) |
| > src0[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP2__V_SUBBREV_U32::Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subbrev_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP2__V_SUBBREV_U32 |
| |
| Inst_VOP2__V_SUBBREV_U32::~Inst_VOP2__V_SUBBREV_U32() |
| { |
| } // ~Inst_VOP2__V_SUBBREV_U32 |
| |
| // D.u = S1.u - S0.u - VCC[threadId]; |
| // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED |
| // overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP2__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| vcc.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] |
| = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); |
| vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) |
| > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_add_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_ADD_F16 |
| |
| Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() |
| { |
| } // ~Inst_VOP2__V_ADD_F16 |
| |
| // D.f16 = S0.f16 + S1.f16. |
| void |
| Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_sub_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_SUB_F16 |
| |
| Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() |
| { |
| } // ~Inst_VOP2__V_SUB_F16 |
| |
| // D.f16 = S0.f16 - S1.f16. |
| void |
| Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subrev_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_SUBREV_F16 |
| |
| Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() |
| { |
| } // ~Inst_VOP2__V_SUBREV_F16 |
| |
| // D.f16 = S1.f16 - S0.f16. |
| void |
| Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_MUL_F16 |
| |
| Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() |
| { |
| } // ~Inst_VOP2__V_MUL_F16 |
| |
| // D.f16 = S0.f16 * S1.f16. |
| void |
| Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mac_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(MAC); |
| } // Inst_VOP2__V_MAC_F16 |
| |
| Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() |
| { |
| } // ~Inst_VOP2__V_MAC_F16 |
| |
| // D.f16 = S0.f16 * S1.f16 + D.f16. |
| void |
| Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_madmk_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(MAD); |
| } // Inst_VOP2__V_MADMK_F16 |
| |
| Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() |
| { |
| } // ~Inst_VOP2__V_MADMK_F16 |
| |
| // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored |
| // in the following literal DWORD. |
| // This opcode cannot use the VOP3 encoding and cannot use input/output |
| // modifiers. |
| void |
| Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_madak_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(MAD); |
| } // Inst_VOP2__V_MADAK_F16 |
| |
| Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() |
| { |
| } // ~Inst_VOP2__V_MADAK_F16 |
| |
| // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored |
| // in the following literal DWORD. |
| // This opcode cannot use the VOP3 encoding and cannot use input/output |
| // modifiers. |
| void |
| Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_add_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_ADD_U16 |
| |
| Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() |
| { |
| } // ~Inst_VOP2__V_ADD_U16 |
| |
| // D.u16 = S0.u16 + S1.u16. |
| void |
| Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_sub_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_SUB_U16 |
| |
| Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() |
| { |
| } // ~Inst_VOP2__V_SUB_U16 |
| |
| // D.u16 = S0.u16 - S1.u16. |
| void |
| Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_subrev_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_SUBREV_U16 |
| |
| Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() |
| { |
| } // ~Inst_VOP2__V_SUBREV_U16 |
| |
| // D.u16 = S1.u16 - S0.u16. |
| void |
| Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_mul_lo_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MUL_LO_U16 |
| |
| Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() |
| { |
| } // ~Inst_VOP2__V_MUL_LO_U16 |
| |
| // D.u16 = S0.u16 * S1.u16. |
| void |
| Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_lshlrev_b16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_LSHLREV_B16 |
| |
| Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() |
| { |
| } // ~Inst_VOP2__V_LSHLREV_B16 |
| |
| // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. |
| void |
| Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_lshrrev_b16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_LSHRREV_B16 |
| |
| Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() |
| { |
| } // ~Inst_VOP2__V_LSHRREV_B16 |
| |
| // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. |
| // The vacated bits are set to zero. |
| void |
| Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_ashrrev_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_ASHRREV_I16 |
| |
| Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() |
| { |
| } // ~Inst_VOP2__V_ASHRREV_I16 |
| |
| // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_MAX_F16 |
| |
| Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() |
| { |
| } // ~Inst_VOP2__V_MAX_F16 |
| |
| // D.f16 = max(S0.f16, S1.f16). |
| void |
| Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_MIN_F16 |
| |
| Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() |
| { |
| } // ~Inst_VOP2__V_MIN_F16 |
| |
| // D.f16 = min(S0.f16, S1.f16). |
| void |
| Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MAX_U16 |
| |
| Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() |
| { |
| } // ~Inst_VOP2__V_MAX_U16 |
| |
| // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). |
| void |
| Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_max_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MAX_I16 |
| |
| Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() |
| { |
| } // ~Inst_VOP2__V_MAX_I16 |
| |
| // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). |
| void |
| Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MIN_U16 |
| |
| Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() |
| { |
| } // ~Inst_VOP2__V_MIN_U16 |
| |
| // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). |
| void |
| Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_min_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOP2__V_MIN_I16 |
| |
| Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() |
| { |
| } // ~Inst_VOP2__V_MIN_I16 |
| |
| // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). |
| void |
| Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) |
| : Inst_VOP2(iFmt, "v_ldexp_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP2__V_LDEXP_F16 |
| |
| Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() |
| { |
| } // ~Inst_VOP2__V_LDEXP_F16 |
| |
| // D.f16 = S0.f16 * (2 ** S1.i16). |
| void |
| Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_nop") |
| { |
| setFlag(Nop); |
| setFlag(ALU); |
| } // Inst_VOP1__V_NOP |
| |
| Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() |
| { |
| } // ~Inst_VOP1__V_NOP |
| |
| // Do nothing. |
| void |
| Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_mov_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_MOV_B32 |
| |
| Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() |
| { |
| } // ~Inst_VOP1__V_MOV_B32 |
| |
| // D.u = S0.u. |
| // Input and output modifiers not supported; this is an untyped operation. |
| void |
| Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
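| // With DPP, the source value is taken from another lane as selected |
| // by DPP_CTRL (e.g. row shifts or broadcasts); processDPP performs |
| // the cross-lane selection before the plain per-lane copy below. |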
| if (isDPPInst()) { |
| VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); |
| src_dpp.read(); |
| |
| DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], " |
| "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " |
| "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " |
| "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, |
| extData.iFmt_VOP_DPP.DPP_CTRL, |
| extData.iFmt_VOP_DPP.SRC0_ABS, |
| extData.iFmt_VOP_DPP.SRC0_NEG, |
| extData.iFmt_VOP_DPP.SRC1_ABS, |
| extData.iFmt_VOP_DPP.SRC1_NEG, |
| extData.iFmt_VOP_DPP.BOUND_CTRL, |
| extData.iFmt_VOP_DPP.BANK_MASK, |
| extData.iFmt_VOP_DPP.ROW_MASK); |
| |
| // NOTE: For VOP1, there is no SRC1, so make sure we're not trying |
| // to negate it or take the absolute value of it |
| assert(!extData.iFmt_VOP_DPP.SRC1_ABS); |
| assert(!extData.iFmt_VOP_DPP.SRC1_NEG); |
| processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src_dpp[lane]; |
| } |
| } |
| } else { |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_readfirstlane_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_READFIRSTLANE_B32 |
| |
| Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() |
| { |
| } // ~Inst_VOP1__V_READFIRSTLANE_B32 |
| |
| // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data |
| // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) |
| // (Lane# = 0 if exec is zero). Ignores exec mask for the access. |
| // Input and output modifiers not supported; this is an untyped operation. |
| void |
| Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarRegI32 src_lane(0); |
| ScalarRegU64 exec_mask = wf->execMask().to_ullong(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| ScalarOperandU32 sdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
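| // Pick the lowest-numbered active lane from the exec mask; if no |
| // lanes are active, lane 0 is used, and the VGPR read itself ignores |
| // the exec mask. |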
| if (exec_mask) { |
| src_lane = findLsbSet(exec_mask); |
| } |
| |
| sdst = src[src_lane]; |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_i32_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_I32_F64 |
| |
| Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() |
| { |
| } // ~Inst_VOP1__V_CVT_I32_F64 |
| |
| // D.i = (int)S0.d. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
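| // frexp() extracts the binary exponent so out-of-range magnitudes can |
| // be detected before the float-to-int conversion: NaN produces 0, |
| // while infinities and too-large values clamp to INT_MIN/INT_MAX |
| // depending on sign. |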
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
| std::frexp(src[lane], &exp); |
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane]) || exp > 30) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = INT_MIN; |
| } else { |
| vdst[lane] = INT_MAX; |
| } |
| } else { |
| vdst[lane] = (VecElemI32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f64_i32") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_F64_I32 |
| |
| Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() |
| { |
| } // ~Inst_VOP1__V_CVT_F64_I32 |
| |
| // D.d = (double)S0.i. |
| void |
| Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_i32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_I32 |
| |
| Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_I32 |
| |
| // D.f = (float)S0.i. |
| void |
| Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_u32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_U32 |
| |
| Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_U32 |
| |
| // D.f = (float)S0.u. |
| void |
| Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_u32_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_U32_F32 |
| |
| Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_U32_F32 |
| |
| // D.u = (unsigned)S0.f. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
| std::frexp(src[lane], &exp); |
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| vdst[lane] = UINT_MAX; |
| } |
| } else if (exp > 31) { |
| vdst[lane] = UINT_MAX; |
| } else { |
| vdst[lane] = (VecElemU32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_i32_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_I32_F32 |
| |
| Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_I32_F32 |
| |
| // D.i = (int)S0.f. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
| std::frexp(src[lane], &exp); |
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane]) || exp > 30) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = INT_MIN; |
| } else { |
| vdst[lane] = INT_MAX; |
| } |
| } else { |
| vdst[lane] = (VecElemI32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_mov_fed_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_MOV_FED_B32 |
| |
| Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32() |
| { |
| } // ~Inst_VOP1__V_MOV_FED_B32 |
| |
| // D.u = S0.u; |
| // Input and output modifiers not supported; this is an untyped operation. |
| void |
| Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f16_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F16_F32 |
| |
| Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_F16_F32 |
| |
| // D.f16 = flt32_to_flt16(S0.f). |
| void |
| Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_f16") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_F16 |
| |
| Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_F16 |
| |
| // D.f = flt16_to_flt32(S0.f16). |
| void |
| Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_RPI_I32_F32 |
| |
| Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_RPI_I32_F32 |
| |
| // D.i = (int)floor(S0.f + 0.5). |
| void |
| Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_FLR_I32_F32 |
| |
| Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_FLR_I32_F32 |
| |
| // D.i = (int)floor(S0.f). |
| void |
| Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemI32)std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_OFF_F32_I4 |
| |
| Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() |
| { |
| } // ~Inst_VOP1__V_CVT_OFF_F32_I4 |
| |
| // 4-bit signed int to 32-bit float. |
| void |
| Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_F32_F64 |
| |
| Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_F64 |
| |
| // D.f = (float)S0.d. |
| void |
| Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f64_f32") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_F64_F32 |
| |
| Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() |
| { |
| } // ~Inst_VOP1__V_CVT_F64_F32 |
| |
| // D.d = (double)S0.f. |
| void |
| Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_UBYTE0 |
| |
| Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_UBYTE0 |
| |
| // D.f = (float)(S0.u[7:0]). |
| void |
| Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_UBYTE1 |
| |
| Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_UBYTE1 |
| |
| // D.f = (float)(S0.u[15:8]). |
| void |
| Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_UBYTE2 |
| |
| Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_UBYTE2 |
| |
| // D.f = (float)(S0.u[23:16]). |
| void |
| Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CVT_F32_UBYTE3 |
| |
| Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() |
| { |
| } // ~Inst_VOP1__V_CVT_F32_UBYTE3 |
| |
| // D.f = (float)(S0.u[31:24]). |
| void |
| Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_u32_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_U32_F64 |
| |
| Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() |
| { |
| } // ~Inst_VOP1__V_CVT_U32_F64 |
| |
| // D.u = (unsigned)S0.d. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
| std::frexp(src[lane], &exp); |
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| vdst[lane] = UINT_MAX; |
| } |
| } else if (exp > 31) { |
| vdst[lane] = UINT_MAX; |
| } else { |
| vdst[lane] = (VecElemU32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f64_u32") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CVT_F64_U32 |
| |
| Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() |
| { |
| } // ~Inst_VOP1__V_CVT_F64_U32 |
| |
| // D.d = (double)S0.u. |
| void |
| Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_trunc_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_TRUNC_F64 |
| |
| Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() |
| { |
| } // ~Inst_VOP1__V_TRUNC_F64 |
| |
| // D.d = trunc(S0.d), return integer part of S0.d. |
| void |
| Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::trunc(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ceil_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_CEIL_F64 |
| |
| Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() |
| { |
| } // ~Inst_VOP1__V_CEIL_F64 |
| |
| // D.d = ceil(S0.d); |
| void |
| Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::ceil(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rndne_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_RNDNE_F64 |
| |
| Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() |
| { |
| } // ~Inst_VOP1__V_RNDNE_F64 |
| |
| // D.d = round_nearest_even(S0.d). |
| void |
| Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = roundNearestEven(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_floor_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_FLOOR_F64 |
| |
| Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() |
| { |
| } // ~Inst_VOP1__V_FLOOR_F64 |
| |
| // D.d = floor(S0.d); |
| void |
| Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_fract_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_FRACT_F32 |
| |
| Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() |
| { |
| } // ~Inst_VOP1__V_FRACT_F32 |
| |
| // D.f = modf(S0.f). |
| void |
| Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
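| // std::modf() splits the value into integer and fractional parts; the |
| // integer part is discarded and the fractional part (which keeps the |
| // sign of the input) becomes the result. |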
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemF32 int_part(0.0); |
| vdst[lane] = std::modf(src[lane], &int_part); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_trunc_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_TRUNC_F32 |
| |
| Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() |
| { |
| } // ~Inst_VOP1__V_TRUNC_F32 |
| |
| // D.f = trunc(S0.f), return integer part of S0.f. |
| void |
| Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::trunc(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ceil_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_CEIL_F32 |
| |
| Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() |
| { |
| } // ~Inst_VOP1__V_CEIL_F32 |
| |
| // D.f = ceil(S0.f); |
| void |
| Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::ceil(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rndne_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_RNDNE_F32 |
| |
| Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() |
| { |
| } // ~Inst_VOP1__V_RNDNE_F32 |
| |
| // D.f = round_nearest_even(S0.f). |
| void |
| Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = roundNearestEven(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_floor_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_FLOOR_F32 |
| |
| Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() |
| { |
| } // ~Inst_VOP1__V_FLOOR_F32 |
| |
| // D.f = floor(S0.f); |
| void |
| Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_exp_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_EXP_F32 |
| |
| Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() |
| { |
| } // ~Inst_VOP1__V_EXP_F32 |
| |
| // D.f = pow(2.0, S0.f). |
| void |
| Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::pow(2.0, src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_log_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_LOG_F32 |
| |
| Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() |
| { |
| } // ~Inst_VOP1__V_LOG_F32 |
| |
| // D.f = log2(S0.f). |
| void |
| Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::log2(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rcp_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_RCP_F32 |
| |
| Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() |
| { |
| } // ~Inst_VOP1__V_RCP_F32 |
| |
| // D.f = 1.0 / S0.f. |
| void |
| Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rcp_iflag_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_RCP_IFLAG_F32 |
| |
| Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() |
| { |
| } // ~Inst_VOP1__V_RCP_IFLAG_F32 |
| |
| // D.f = 1.0 / S0.f. |
| void |
| Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rsq_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_RSQ_F32 |
| |
| Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() |
| { |
| } // ~Inst_VOP1__V_RSQ_F32 |
| |
| // D.f = 1.0 / sqrt(S0.f). |
| void |
| Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rcp_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_RCP_F64 |
| |
| Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() |
| { |
| } // ~Inst_VOP1__V_RCP_F64 |
| |
| // D.d = 1.0 / S0.d. |
| void |
| Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::fpclassify(src[lane]) == FP_ZERO) { |
| vdst[lane] = +INFINITY; |
| } else if (std::isnan(src[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = -0.0; |
| } else { |
| vdst[lane] = 0.0; |
| } |
| } else { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rsq_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_RSQ_F64 |
| |
| Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() |
| { |
| } // ~Inst_VOP1__V_RSQ_F64 |
| |
| // D.d = 1.0 / sqrt(S0.d). |
| void |
| Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::fpclassify(src[lane]) == FP_ZERO) { |
| vdst[lane] = +INFINITY; |
| } else if (std::isnan(src[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src[lane]) |
| && !std::signbit(src[lane])) { |
| vdst[lane] = 0.0; |
| } else if (std::signbit(src[lane])) { |
| vdst[lane] = NAN; |
| } else { |
| vdst[lane] = 1.0 / std::sqrt(src[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_sqrt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_SQRT_F32 |
| |
| Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() |
| { |
| } // ~Inst_VOP1__V_SQRT_F32 |
| |
| // D.f = sqrt(S0.f). |
| void |
| Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_sqrt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_SQRT_F64 |
| |
| Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() |
| { |
| } // ~Inst_VOP1__V_SQRT_F64 |
| |
| // D.d = sqrt(S0.d). |
| void |
| Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_sin_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_SIN_F32 |
| |
| Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() |
| { |
| } // ~Inst_VOP1__V_SIN_F32 |
| |
| // D.f = sin(S0.f * 2 * PI). |
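|     // Worked example: src = 0.25 is a quarter revolution, so an active |
|     // lane computes sin(0.25 * 2 * PI) = sin(PI/2) = 1.0; inputs outside |
|     // [-256.0, 256.0] produce 0.0 in the loop below. |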
| void |
| Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| ConstScalarOperandF32 pi(gpuDynInst, REG_PI); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| pi.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (src[lane] < -256.0 || src[lane] > 256.0) { |
| vdst[lane] = 0.0; |
| } else { |
| vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cos_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_COS_F32 |
| |
| Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() |
| { |
| } // ~Inst_VOP1__V_COS_F32 |
| |
| // D.f = cos(S0.f * 2 * PI). |
| void |
| Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| ConstScalarOperandF32 pi(gpuDynInst, REG_PI); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| pi.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (src[lane] < -256.0 || src[lane] > 256.0) { |
| vdst[lane] = 0.0; |
| } else { |
| vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_not_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_NOT_B32 |
| |
| Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() |
| { |
| } // ~Inst_VOP1__V_NOT_B32 |
| |
| // D.u = ~S0.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = ~src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_bfrev_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_BFREV_B32 |
| |
| Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() |
| { |
| } // ~Inst_VOP1__V_BFREV_B32 |
| |
| // D.u[31:0] = S0.u[0:31], bitfield reverse. |
| // Input and output modifiers not supported. |
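|     // Example of the documented reversal: reverseBits(0x00000001) yields |
|     // 0x80000000 and reverseBits(0x0000ffff) yields 0xffff0000. |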
| void |
| Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = reverseBits(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ffbh_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_FFBH_U32 |
| |
| Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() |
| { |
| } // ~Inst_VOP1__V_FFBH_U32 |
| |
| // D.u = position of first 1 in S0.u from MSB; |
| // D.u = 0xffffffff if S0.u == 0. |
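|     // Sketch of the documented semantics (not necessarily how the |
|     // findFirstOneMsb() helper in inst_util.hh is written): |
|     //     findFirstOneMsb(0x80000000) == 0           // MSB is set |
|     //     findFirstOneMsb(0x00000001) == 31          // only LSB is set |
|     //     findFirstOneMsb(0x00000000) == 0xffffffff  // no bits set |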
| void |
| Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = findFirstOneMsb(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ffbl_b32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_FFBL_B32 |
| |
| Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() |
| { |
| } // ~Inst_VOP1__V_FFBL_B32 |
| |
| // D.u = position of first 1 in S0.u from LSB; |
| // D.u = 0xffffffff if S0.u == 0. |
| void |
| Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = findFirstOne(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ffbh_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_FFBH_I32 |
| |
| Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() |
| { |
| } // ~Inst_VOP1__V_FFBH_I32 |
| |
| // D.u = position of first bit different from sign bit in S0.i from MSB; |
| // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. |
| void |
| Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src(gpuDynInst, instData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = firstOppositeSignBit(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_FREXP_EXP_I32_F64 |
| |
| Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() |
| { |
| } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 |
| |
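|     // frexp(S0.d, Exponent(S0.d)) |
|     // if (S0.d == INF || S0.d == NAN) then D.i = 0; |
|     // else D.i = Exponent(S0.d); |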
| void |
| Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| VecElemI32 exp = 0; |
| std::frexp(src[lane], &exp); |
| vdst[lane] = exp; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_mant_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_FREXP_MANT_F64 |
| |
| Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() |
| { |
| } // ~Inst_VOP1__V_FREXP_MANT_F64 |
| |
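|     // if (S0.d == +-INF || S0.d == NAN) then D.d = S0.d; |
|     // else D.d = frexp(S0.d, Exponent(S0.d)). |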
| void |
| Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = src[lane]; |
| } else { |
| VecElemI32 exp(0); |
| vdst[lane] = std::frexp(src[lane], &exp); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_fract_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP1__V_FRACT_F64 |
| |
| Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() |
| { |
| } // ~Inst_VOP1__V_FRACT_F64 |
| |
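|     // D.d = modf(S0.d), i.e., the fractional part of S0.d. |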
| void |
| Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, instData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemF64 int_part(0.0); |
| vdst[lane] = std::modf(src[lane], &int_part); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_FREXP_EXP_I32_F32 |
| |
| Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() |
| { |
| } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 |
| |
| // frexp(S0.f, Exponent(S0.f)) |
| // if (S0.f == INF || S0.f == NAN) then D.i = 0; |
| // else D.i = Exponent(S0.f); |
| void |
| Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| VecElemI32 exp(0); |
| std::frexp(src[lane], &exp); |
| vdst[lane] = exp; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_mant_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_FREXP_MANT_F32 |
| |
| Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() |
| { |
| } // ~Inst_VOP1__V_FREXP_MANT_F32 |
| |
| // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; |
| // else D.f = frexp(S0.f, Exponent(S0.f)). |
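|     // Note: std::frexp() decomposes a finite x as m * 2^e with |m| in |
|     // [0.5, 1.0); for example, frexp(8.0f) returns 0.5f and stores 4 in |
|     // the exponent output. |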
| void |
| Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = src[lane]; |
| } else { |
| VecElemI32 exp(0); |
| vdst[lane] = std::frexp(src[lane], &exp); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_clrexcp") |
| { |
| setFlag(ALU); |
| } // Inst_VOP1__V_CLREXCP |
| |
| Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() |
| { |
| } // ~Inst_VOP1__V_CLREXCP |
| |
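|     // Clear the wavefront's floating-point exception state. Not |
|     // implemented in this model; execution panics if this instruction |
|     // is encountered. |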
| void |
| Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f16_u16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_CVT_F16_U16 |
| |
| Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() |
| { |
| } // ~Inst_VOP1__V_CVT_F16_U16 |
| |
| // D.f16 = uint16_to_flt16(S.u16). |
| void |
| Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_f16_i16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_CVT_F16_I16 |
| |
| Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() |
| { |
| } // ~Inst_VOP1__V_CVT_F16_I16 |
| |
| // D.f16 = int16_to_flt16(S.i16). |
| void |
| Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_u16_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_CVT_U16_F16 |
| |
| Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() |
| { |
| } // ~Inst_VOP1__V_CVT_U16_F16 |
| |
| // D.u16 = flt16_to_uint16(S.f16). |
| void |
| Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cvt_i16_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_CVT_I16_F16 |
| |
| Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() |
| { |
| } // ~Inst_VOP1__V_CVT_I16_F16 |
| |
| // D.i16 = flt16_to_int16(S.f16). |
| void |
| Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rcp_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_RCP_F16 |
| |
| Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() |
| { |
| } // ~Inst_VOP1__V_RCP_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = 1 / S0.f16; |
| void |
| Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_sqrt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_SQRT_F16 |
| |
| Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() |
| { |
| } // ~Inst_VOP1__V_SQRT_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = sqrt(S0.f16); |
| void |
| Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rsq_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_RSQ_F16 |
| |
| Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() |
| { |
| } // ~Inst_VOP1__V_RSQ_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = 1 / sqrt(S0.f16); |
| void |
| Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_log_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_LOG_F16 |
| |
| Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() |
| { |
| } // ~Inst_VOP1__V_LOG_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 0.0f; |
| // else |
| // D.f16 = log2(S0.f16); |
| void |
| Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_exp_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_EXP_F16 |
| |
| Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() |
| { |
| } // ~Inst_VOP1__V_EXP_F16 |
| |
| // if (S0.f16 == 0.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = pow(2.0, S0.f16). |
| void |
| Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_mant_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_FREXP_MANT_F16 |
| |
| Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() |
| { |
| } // ~Inst_VOP1__V_FREXP_MANT_F16 |
| |
| // if (S0.f16 == +-INF || S0.f16 == NAN) |
| // D.f16 = S0.f16; |
| // else |
| // D.f16 = mantissa(S0.f16). |
| void |
| Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( |
| InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_FREXP_EXP_I16_F16 |
| |
| Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() |
| { |
| } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 |
| |
| // frexp(S0.f16, Exponent(S0.f16)) |
| // if (S0.f16 == +-INF || S0.f16 == NAN) |
| // D.i16 = 0; |
| // else |
| // D.i16 = Exponent(S0.f16); |
| void |
| Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_floor_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_FLOOR_F16 |
| |
| Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() |
| { |
| } // ~Inst_VOP1__V_FLOOR_F16 |
| |
| // D.f16 = floor(S0.f16); |
| void |
| Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_ceil_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_CEIL_F16 |
| |
| Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() |
| { |
| } // ~Inst_VOP1__V_CEIL_F16 |
| |
| // D.f16 = ceil(S0.f16); |
| void |
| Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_trunc_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_TRUNC_F16 |
| |
| Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() |
| { |
| } // ~Inst_VOP1__V_TRUNC_F16 |
| |
| // D.f16 = trunc(S0.f16). |
| void |
| Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_rndne_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_RNDNE_F16 |
| |
| Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() |
| { |
| } // ~Inst_VOP1__V_RNDNE_F16 |
| |
| // D.f16 = roundNearestEven(S0.f16); |
| void |
| Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_fract_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_FRACT_F16 |
| |
| Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() |
| { |
| } // ~Inst_VOP1__V_FRACT_F16 |
| |
| // D.f16 = S0.f16 + -floor(S0.f16). |
| void |
| Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_sin_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_SIN_F16 |
| |
| Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() |
| { |
| } // ~Inst_VOP1__V_SIN_F16 |
| |
| // D.f16 = sin(S0.f16 * 2 * PI). |
| void |
| Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_cos_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP1__V_COS_F16 |
| |
| Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() |
| { |
| } // ~Inst_VOP1__V_COS_F16 |
| |
| // D.f16 = cos(S0.f16 * 2 * PI). |
| void |
| Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_exp_legacy_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_EXP_LEGACY_F32 |
| |
| Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() |
| { |
| } // ~Inst_VOP1__V_EXP_LEGACY_F32 |
| |
| // D.f = pow(2.0, S0.f) |
| void |
| Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::pow(2.0, src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt) |
| : Inst_VOP1(iFmt, "v_log_legacy_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP1__V_LOG_LEGACY_F32 |
| |
| Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32() |
| { |
| } // ~Inst_VOP1__V_LOG_LEGACY_F32 |
| |
| // D.f = log2(S0.f). |
| void |
| Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, instData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::log2(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_class_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_CLASS_F32 |
| |
| Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_CLASS_F32 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.f |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
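|     // Worked example: a class mask of 0x003 (bits 0 and 1) reports true |
|     // only for NaN inputs, while a mask of 0x060 (bits 5 and 6) reports |
|     // true only for zero of either sign. |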
| void |
| Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
|         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_class_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_CLASS_F32 |
| |
| Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_CLASS_F32 |
| |
|     // EXEC, VCC = IEEE numeric class function specified in S1.u, performed |
|     // on S0.f. The function reports true if the floating point value is |
|     // any of the numeric types selected in S1.u according to the |
|     // following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
|         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_class_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_CLASS_F64 |
| |
| Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_CLASS_F64 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.d |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
|         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_class_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_CLASS_F64 |
| |
| Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_CLASS_F64 |
| |
|     // EXEC, VCC = IEEE numeric class function specified in S1.u, performed |
|     // on S0.d. The function reports true if the floating point value is |
|     // any of the numeric types selected in S1.u according to the |
|     // following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
|         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| vcc.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_class_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_CLASS_F16 |
| |
| Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_CLASS_F16 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_class_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_CLASS_F16 |
| |
| Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_CLASS_F16 |
| |
| // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on |
| // S0.f16 |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_F_F16 |
| |
| Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_F_F16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_LT_F16 |
| |
| Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_F16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_EQ_F16 |
| |
| Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_F16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_LE_F16 |
| |
| Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_F16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_GT_F16 |
| |
| Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_F16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lg_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_LG_F16 |
| |
| Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_LG_F16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_GE_F16 |
| |
| Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_F16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_o_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_O_F16 |
| |
| Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_O_F16 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_u_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_U_F16 |
| |
| Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_U_F16 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nge_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NGE_F16 |
| |
| Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NGE_F16 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlg_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NLG_F16 |
| |
| Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NLG_F16 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ngt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NGT_F16 |
| |
| Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NGT_F16 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nle_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NLE_F16 |
| |
| Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NLE_F16 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_neq_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NEQ_F16 |
| |
| Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NEQ_F16 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_NLT_F16 |
| |
| Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_NLT_F16 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_tru_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMP_TRU_F16 |
| |
| Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() |
| { |
| } // ~Inst_VOPC__V_CMP_TRU_F16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_F_F16 |
| |
| Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_F16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_LT_F16 |
| |
| Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_EQ_F16 |
| |
| Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_LE_F16 |
| |
| Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_GT_F16 |
| |
| Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lg_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_LG_F16 |
| |
| Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LG_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_GE_F16 |
| |
| Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_o_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_O_F16 |
| |
| Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_O_F16 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_u_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_U_F16 |
| |
| Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_U_F16 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nge_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NGE_F16 |
| |
| Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGE_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NLG_F16 |
| |
| Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLG_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NGT_F16 |
| |
| Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGT_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nle_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NLE_F16 |
| |
| Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLE_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_neq_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NEQ_F16 |
| |
| Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NEQ_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_NLT_F16 |
| |
| Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLT_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_tru_f16") |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOPC__V_CMPX_TRU_F16 |
| |
| Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() |
| { |
| } // ~Inst_VOPC__V_CMPX_TRU_F16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
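| |
|     // The half-precision (F16) VOPC compares above are left |
|     // unimplemented and simply panic. A minimal sketch of the intended |
|     // behavior for v_cmpx_u_f16, assuming a hypothetical |
|     // ConstVecOperandF16 vector operand type existed, would mirror the |
|     // F32 pattern that follows: |
|     // |
|     //     Wavefront *wf = gpuDynInst->wavefront(); |
|     //     ConstVecOperandF16 src0(gpuDynInst, instData.SRC0); |
|     //     ConstVecOperandF16 src1(gpuDynInst, instData.VSRC1); |
|     //     ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
|     // |
|     //     src0.readSrc(); |
|     //     src1.read(); |
|     // |
|     //     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
|     //         if (wf->execMask(lane)) { |
|     //             // unordered: true when either input is NaN |
|     //             vcc.setBit(lane, (std::isnan(src0[lane]) |
|     //                 || std::isnan(src1[lane])) ? 1 : 0); |
|     //         } |
|     //     } |
|     // |
|     //     vcc.write(); |
|     //     wf->execMask() = vcc.rawData(); // CMPX also updates EXEC |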
| |
| Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_F_F32 |
| |
| Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_F_F32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_LT_F32 |
| |
| Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_F32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
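| |
|     // Apart from the constant-result f/tru variants, the implemented |
|     // VOPC compares all share this shape: SRC0 is read with readSrc() |
|     // because the SRC0 field may name a VGPR, an SGPR, or an inline |
|     // constant, while VSRC1 always names a VGPR and is read with |
|     // read(). The per-lane predicate is packed into bit 'lane' of the |
|     // 64-bit VCC scalar pair, and only lanes active in EXEC are |
|     // evaluated. As an illustration (hypothetical values, not produced |
|     // by the simulator): with lanes 0..3 active, src0 = {1.0, 4.0, 2.0, |
|     // 5.0} and src1 = {2.0, 3.0, 2.0, 9.0}, v_cmp_lt_f32 yields |
|     // VCC[3:0] = 0b1001. |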
| |
| Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_EQ_F32 |
| |
| Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_F32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_LE_F32 |
| |
| Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_F32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_GT_F32 |
| |
| Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_F32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lg_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_LG_F32 |
| |
| Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_LG_F32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_GE_F32 |
| |
| Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_F32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_o_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_O_F32 |
| |
| Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_O_F32 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_u_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_U_F32 |
| |
| Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_U_F32 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nge_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NGE_F32 |
| |
| Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NGE_F32 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
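| |
|     // The negated compares (nge, nlg, ngt, nle, nlt) are not simple |
|     // aliases of their ordered counterparts once NaNs are involved: any |
|     // IEEE comparison against NaN is false, so !(S0 >= S1) is true |
|     // whenever either input is NaN, while (S0 < S1) is not. For example, |
|     // if src0[lane] is NaN, v_cmp_lt_f32 clears that lane's VCC bit but |
|     // v_cmp_nge_f32 sets it. |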
| |
| Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlg_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NLG_F32 |
| |
| Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NLG_F32 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ngt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NGT_F32 |
| |
| Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NGT_F32 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nle_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NLE_F32 |
| |
| Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NLE_F32 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_neq_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NEQ_F32 |
| |
| Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NEQ_F32 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_NLT_F32 |
| |
| Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_NLT_F32 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_tru_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMP_TRU_F32 |
| |
| Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() |
| { |
| } // ~Inst_VOPC__V_CMP_TRU_F32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_F_F32 |
| |
| Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_F32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
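| |
|     // The CMPX variants differ from the plain compares only in their |
|     // final step: after committing the packed result to VCC they also |
|     // copy it into the wavefront's EXEC mask, so subsequent vector |
|     // instructions run only on lanes where the comparison held. For |
|     // v_cmpx_f_f32 (compare "false") this deactivates every currently |
|     // active lane. |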
| |
| Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_LT_F32 |
| |
| Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_EQ_F32 |
| |
| Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_LE_F32 |
| |
| Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_GT_F32 |
| |
| Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lg_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_LG_F32 |
| |
| Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LG_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_GE_F32 |
| |
| Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_o_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_O_F32 |
| |
| Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_O_F32 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_u_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_U_F32 |
| |
| Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_U_F32 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nge_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NGE_F32 |
| |
| Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGE_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NLG_F32 |
| |
| Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLG_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NGT_F32 |
| |
| Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGT_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nle_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NLE_F32 |
| |
| Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLE_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_neq_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NEQ_F32 |
| |
| Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NEQ_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); |
| } |
| } |
| |
|         vcc.write(); |
|         wf->execMask() = vcc.rawData(); |
|     } |
| |
| Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_NLT_F32 |
| |
| Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLT_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_tru_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOPC__V_CMPX_TRU_F32 |
| |
| Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() |
| { |
| } // ~Inst_VOPC__V_CMPX_TRU_F32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_F_F64 |
| |
| Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_F_F64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_LT_F64 |
| |
| Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_F64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
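| |
|     // The F64 compares follow the same per-lane pattern with 64-bit |
|     // vector operands; each ConstVecOperandF64 source is expected to |
|     // span a pair of consecutive VGPRs starting at the encoded register |
|     // index. |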
| |
| Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_EQ_F64 |
| |
| Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_F64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_LE_F64 |
| |
| Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_F64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_GT_F64 |
| |
| Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_F64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lg_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_LG_F64 |
| |
| Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_LG_F64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_GE_F64 |
| |
| Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_F64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_o_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_O_F64 |
| |
| Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_O_F64 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_u_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_U_F64 |
| |
| Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_U_F64 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nge_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NGE_F64 |
| |
| Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NGE_F64 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlg_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NLG_F64 |
| |
| Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NLG_F64 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ngt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NGT_F64 |
| |
| Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NGT_F64 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nle_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NLE_F64 |
| |
| Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NLE_F64 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_neq_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NEQ_F64 |
| |
| Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NEQ_F64 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_nlt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_NLT_F64 |
| |
| Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_NLT_F64 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_tru_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMP_TRU_F64 |
| |
| Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() |
| { |
| } // ~Inst_VOPC__V_CMP_TRU_F64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_F_F64 |
| |
| Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_F64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_LT_F64 |
| |
| Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_EQ_F64 |
| |
| Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| wf->execMask() = vcc.rawData(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_LE_F64 |
| |
| Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
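| |
|     // From this point the EXEC update is issued before vcc.write() |
|     // rather than after it. The two orderings appear interchangeable, |
|     // since rawData() returns the locally staged value that setBit() |
|     // modified and write() only commits that value to the register |
|     // file. |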
| |
| Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_GT_F64 |
| |
| Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lg_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_LG_F64 |
| |
| Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LG_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_GE_F64 |
| |
| Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_o_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_O_F64 |
| |
| Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_O_F64 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_u_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_U_F64 |
| |
| Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_U_F64 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nge_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NGE_F64 |
| |
| Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGE_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NLG_F64 |
| |
| Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLG_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NGT_F64 |
| |
| Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NGT_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nle_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NLE_F64 |
| |
| Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLE_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_neq_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NEQ_F64 |
| |
| Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NEQ_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_NLT_F64 |
| |
| Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NLT_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_tru_f64") |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOPC__V_CMPX_TRU_F64 |
| |
| Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() |
| { |
| } // ~Inst_VOPC__V_CMPX_TRU_F64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_I16 |
| |
| Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_F_I16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
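| // Note: the "always false" compare reads no sources and simply clears |
| // the VCC bit of every active lane. |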
| void |
| Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_I16 |
| |
| Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_I16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
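| // Note: unlike the CMPX variants, the plain compares only write VCC; |
| // EXEC is left unchanged. |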
| void |
| Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_I16 |
| |
| Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_I16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_I16 |
| |
| Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_I16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_I16 |
| |
| Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_I16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_I16 |
| |
| Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_I16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_I16 |
| |
| Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_I16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_I16 |
| |
| Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() |
| { |
| } // ~Inst_VOPC__V_CMP_T_I16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
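| // Note: the "always true" compare reads no sources and sets the VCC bit |
| // of every active lane. |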
| void |
| Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_U16 |
| |
| Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_F_U16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_U16 |
| |
| Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_U16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_U16 |
| |
| Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_U16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_U16 |
| |
| Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_U16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_U16 |
| |
| Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_U16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_U16 |
| |
| Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_U16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_U16 |
| |
| Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_U16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_U16 |
| |
| Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() |
| { |
| } // ~Inst_VOPC__V_CMP_T_U16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_I16 |
| |
| Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_I16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
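| // Note: every active lane's VCC bit is cleared and EXEC is then copied |
| // from VCC, so each lane that was active is disabled afterwards. |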
| void |
| Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_I16 |
| |
| Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
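| // Example: with EXEC = 0b0101, a true compare in lane 0 sets bit 0 of |
| // VCC and a false compare in lane 2 clears bit 2; bits belonging to the |
| // inactive lanes are not touched here, and the new EXEC is taken from |
| // the final VCC value. |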
| void |
| Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_I16 |
| |
| Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_I16 |
| |
| Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_I16 |
| |
| Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_I16 |
| |
| Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_I16 |
| |
| Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_i16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_I16 |
| |
| Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_I16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_U16 |
| |
| Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_U16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_U16 |
| |
| Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_U16 |
| |
| Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_U16 |
| |
| Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_U16 |
| |
| Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_U16 |
| |
| Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_U16 |
| |
| Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_u16") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_U16 |
| |
| Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_U16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_I32 |
| |
| Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_F_I32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_I32 |
| |
| Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_I32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_I32 |
| |
| Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_I32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_I32 |
| |
| Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_I32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_I32 |
| |
| Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_I32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_I32 |
| |
| Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_I32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_I32 |
| |
| Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_I32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_I32 |
| |
| Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() |
| { |
| } // ~Inst_VOPC__V_CMP_T_I32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_U32 |
| |
| Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_F_U32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_U32 |
| |
| Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_U32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_U32 |
| |
| Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_U32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_U32 |
| |
| Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_U32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_U32 |
| |
| Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_U32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_U32 |
| |
| Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_U32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_U32 |
| |
| Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_U32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_U32 |
| |
| Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() |
| { |
| } // ~Inst_VOPC__V_CMP_T_U32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_I32 |
| |
| Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_I32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_I32 |
| |
| Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_I32 |
| |
| Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_I32 |
| |
| Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_I32 |
| |
| Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_I32 |
| |
| Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_I32 |
| |
| Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_i32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_I32 |
| |
| Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_I32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_U32 |
| |
| Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_U32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_U32 |
| |
| Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_U32 |
| |
| Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_U32 |
| |
| Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_U32 |
| |
| Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_U32 |
| |
| Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_U32 |
| |
| Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_u32") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_U32 |
| |
| Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_U32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_I64 |
| |
| Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_F_I64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_I64 |
| |
| Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_I64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
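| // Note: each 64-bit source element occupies two consecutive VGPRs per |
| // lane; the ConstVecOperandI64 wrapper presents it as a single value, |
| // so the per-lane compare loop is unchanged. |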
| void |
| Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_I64 |
| |
| Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_I64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_I64 |
| |
| Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_I64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_I64 |
| |
| Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_I64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_I64 |
| |
| Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_I64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_I64 |
| |
| Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_I64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_I64 |
| |
| Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() |
| { |
| } // ~Inst_VOPC__V_CMP_T_I64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_f_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_F_U64 |
| |
| Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_F_U64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_lt_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LT_U64 |
| |
| Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_LT_U64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_eq_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_EQ_U64 |
| |
| Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_EQ_U64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_le_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_LE_U64 |
| |
| Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_LE_U64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_gt_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GT_U64 |
| |
| Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_GT_U64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ne_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_NE_U64 |
| |
| Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_NE_U64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_ge_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_GE_U64 |
| |
| Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_GE_U64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmp_t_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMP_T_U64 |
| |
| Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() |
| { |
| } // ~Inst_VOPC__V_CMP_T_U64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_I64 |
| |
| Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_I64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
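            // CMPX variants also commit the comparison result to the EXEC
            // mask, using the raw 64-bit value accumulated in vcc above.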
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_I64 |
| |
| Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_I64 |
| |
| Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_I64 |
| |
| Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_I64 |
| |
| Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_I64 |
| |
| Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_I64 |
| |
| Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_i64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_I64 |
| |
| Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_I64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_f_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_F_U64 |
| |
| Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_F_U64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_lt_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LT_U64 |
| |
| Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LT_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_eq_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_EQ_U64 |
| |
| Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_EQ_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_le_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_LE_U64 |
| |
| Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_LE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_gt_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GT_U64 |
| |
| Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GT_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ne_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_NE_U64 |
| |
| Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_NE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_ge_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_GE_U64 |
| |
| Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_GE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) |
| : Inst_VOPC(iFmt, "v_cmpx_t_u64") |
| { |
| setFlag(ALU); |
| } // Inst_VOPC__V_CMPX_T_U64 |
| |
| Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() |
| { |
| } // ~Inst_VOPC__V_CMPX_T_U64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = vcc.rawData(); |
| vcc.write(); |
| } |
| |
| Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( |
| InFmt_VINTRP *iFmt) |
| : Inst_VINTRP(iFmt, "v_interp_p1_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VINTRP__V_INTERP_P1_F32 |
| |
| Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() |
| { |
| } // ~Inst_VINTRP__V_INTERP_P1_F32 |
| |
| // D.f = P10 * S.f + P0; parameter interpolation |
| void |
| Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( |
| InFmt_VINTRP *iFmt) |
| : Inst_VINTRP(iFmt, "v_interp_p2_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VINTRP__V_INTERP_P2_F32 |
| |
| Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() |
| { |
| } // ~Inst_VINTRP__V_INTERP_P2_F32 |
| |
| // D.f = P20 * S.f + D.f; parameter interpolation |
| void |
| Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( |
| InFmt_VINTRP *iFmt) |
| : Inst_VINTRP(iFmt, "v_interp_mov_f32") |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VINTRP__V_INTERP_MOV_F32 |
| |
| Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() |
| { |
| } // ~Inst_VINTRP__V_INTERP_MOV_F32 |
| |
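    // D.f = {P10,P20,P0}[S.u]; parameter load.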
| void |
| Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_class_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_CLASS_F32 |
| |
| Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_CLASS_F32 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.f |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
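            // In the VOP3 encoding the result is written to the SGPR pair
            // selected by VDST rather than implicitly to VCC.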
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
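            // Test the class-mask bits of S1 in order; a lane sets its
            // result bit on the first class that matches S0 and skips the
            // remaining tests.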
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_class_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_CLASS_F32 |
| |
| Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_CLASS_F32 |
| |
| // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on |
| // S0.f |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_class_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_CLASS_F64 |
| |
| Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_CLASS_F64 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.d |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_class_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_CLASS_F64 |
| |
| Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_CLASS_F64 |
| |
| // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on |
| // S0.d |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(src1[lane], 0) || bits(src1[lane], 1)) { |
| // is NaN |
| if (std::isnan(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 2)) { |
| // is -infinity |
| if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 3)) { |
| // is -normal |
| if (std::isnormal(src0[lane]) |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 4)) { |
| // is -denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 5)) { |
| // is -zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 6)) { |
| // is +zero |
| if (std::fpclassify(src0[lane]) == FP_ZERO |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 7)) { |
| // is +denormal |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 8)) { |
| // is +normal |
| if (std::isnormal(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| if (bits(src1[lane], 9)) { |
| // is +infinity |
| if (std::isinf(src0[lane]) |
| && !std::signbit(src0[lane])) { |
| sdst.setBit(lane, 1); |
| continue; |
| } |
| } |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_class_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_CLASS_F16 |
| |
| Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_CLASS_F16 |
| |
| // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_class_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_CLASS_F16 |
| |
| Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_CLASS_F16 |
| |
| // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on |
| // S0.f16 |
| // The function reports true if the floating point value is any of the |
| // numeric types selected in S1.u according to the following list: |
| // S1.u[0] -- value is a signaling NaN. |
| // S1.u[1] -- value is a quiet NaN. |
| // S1.u[2] -- value is negative infinity. |
| // S1.u[3] -- value is a negative normal value. |
| // S1.u[4] -- value is a negative denormal value. |
| // S1.u[5] -- value is negative zero. |
| // S1.u[6] -- value is positive zero. |
| // S1.u[7] -- value is a positive denormal value. |
| // S1.u[8] -- value is a positive normal value. |
| // S1.u[9] -- value is positive infinity. |
| void |
| Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_F_F16 |
| |
| Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_F_F16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_LT_F16 |
| |
| Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_F16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_EQ_F16 |
| |
| Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_F16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_LE_F16 |
| |
| Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_F16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_GT_F16 |
| |
| Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_F16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lg_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_LG_F16 |
| |
| Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_LG_F16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_GE_F16 |
| |
| Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_F16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_o_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_O_F16 |
| |
| Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_O_F16 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_u_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_U_F16 |
| |
| Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_U_F16 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nge_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NGE_F16 |
| |
| Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NGE_F16 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlg_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NLG_F16 |
| |
| Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NLG_F16 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ngt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NGT_F16 |
| |
| Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NGT_F16 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nle_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NLE_F16 |
| |
| Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NLE_F16 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_neq_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NEQ_F16 |
| |
| Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NEQ_F16 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_NLT_F16 |
| |
| Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_NLT_F16 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_tru_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMP_TRU_F16 |
| |
| Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() |
| { |
| } // ~Inst_VOP3__V_CMP_TRU_F16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_f16", true) |
| { |
        setFlag(ALU);
        setFlag(F16);
| } // Inst_VOP3__V_CMPX_F_F16 |
| |
| Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_F16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_LT_F16 |
| |
| Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_EQ_F16 |
| |
| Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_LE_F16 |
| |
| Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_GT_F16 |
| |
| Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lg_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_LG_F16 |
| |
| Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LG_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_GE_F16 |
| |
| Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_F16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_o_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_O_F16 |
| |
| Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_O_F16 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_u_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_U_F16 |
| |
| Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_U_F16 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nge_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NGE_F16 |
| |
| Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGE_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlg_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NLG_F16 |
| |
| Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLG_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ngt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NGT_F16 |
| |
| Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGT_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nle_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NLE_F16 |
| |
| Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLE_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_neq_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NEQ_F16 |
| |
| Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NEQ_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlt_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_NLT_F16 |
| |
| Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLT_F16 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_tru_f16", true) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CMPX_TRU_F16 |
| |
| Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() |
| { |
| } // ~Inst_VOP3__V_CMPX_TRU_F16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_F_F32 |
| |
| Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_F_F32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_LT_F32 |
| |
| Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_F32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
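            // Unlike the VOPC form, the VOP3 encoding allows both source
            // operands to come from any scalar, vector, or constant source,
            // so both are resolved with readSrc().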
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_EQ_F32 |
| |
| Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_F32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_LE_F32 |
| |
| Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_F32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_GT_F32 |
| |
| Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_F32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lg_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_LG_F32 |
| |
| Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_LG_F32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
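    // "<>" (LG) is an ordered compare: unlike NEQ, it is false when either
    // operand is a NaN.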
| void |
| Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
                sdst.setBit(lane, (src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_GE_F32 |
| |
| Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_F32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_o_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_O_F32 |
| |
| Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_O_F32 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
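    // "O" (ordered) is true only when neither operand is a NaN; "U"
    // (unordered, below) is its complement.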
| void |
| Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_u_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_U_F32 |
| |
| Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_U_F32 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nge_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NGE_F32 |
| |
| Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NGE_F32 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
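    // The negated compares (NGE, NLG, NGT, NLE, NEQ, NLT) are unordered:
    // they also return 1 when either operand is a NaN, since the underlying
    // ordered comparison is false in that case.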
| void |
| Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlg_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NLG_F32 |
| |
| Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NLG_F32 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
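    // Implemented as !(S0 < S1 || S0 > S1) rather than (S0 == S1) so that
    // NaN operands yield 1, matching the unordered semantics of NLG.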
| void |
| Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ngt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NGT_F32 |
| |
| Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NGT_F32 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nle_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NLE_F32 |
| |
| Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NLE_F32 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_neq_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NEQ_F32 |
| |
| Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NEQ_F32 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_NLT_F32 |
| |
| Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_NLT_F32 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_tru_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMP_TRU_F32 |
| |
| Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() |
| { |
| } // ~Inst_VOP3__V_CMP_TRU_F32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_F_F32 |
| |
| Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_F32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_LT_F32 |
| |
| Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_EQ_F32 |
| |
| Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_LE_F32 |
| |
| Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_GT_F32 |
| |
| Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lg_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_LG_F32 |
| |
| Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LG_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_GE_F32 |
| |
| Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_F32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_o_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_O_F32 |
| |
| Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_O_F32 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_u_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_U_F32 |
| |
| Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_U_F32 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nge_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NGE_F32 |
| |
| Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGE_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlg_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NLG_F32 |
| |
| Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLG_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ngt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NGT_F32 |
| |
| Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGT_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nle_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NLE_F32 |
| |
| Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLE_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_neq_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NEQ_F32 |
| |
| Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NEQ_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlt_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_NLT_F32 |
| |
| Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLT_F32 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_tru_f32", true) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CMPX_TRU_F32 |
| |
| Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() |
| { |
| } // ~Inst_VOP3__V_CMPX_TRU_F32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_F_F64 |
| |
| Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_F_F64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_LT_F64 |
| |
| Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_F64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
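    // The VOP3 ABS/NEG input modifiers are per-source bit fields: bit 0
    // applies to src0 and bit 1 to src1; bit 2 would select a third source,
    // which two-operand compares do not have.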
| void |
| Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_EQ_F64 |
| |
| Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_F64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_LE_F64 |
| |
| Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_F64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_GT_F64 |
| |
| Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_F64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lg_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_LG_F64 |
| |
| Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_LG_F64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_GE_F64 |
| |
| Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_F64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_o_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_O_F64 |
| |
| Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_O_F64 |
| |
| // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_u_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_U_F64 |
| |
| Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_U_F64 |
| |
| // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nge_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NGE_F64 |
| |
| Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NGE_F64 |
| |
| // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlg_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NLG_F64 |
| |
| Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NLG_F64 |
| |
| // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ngt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NGT_F64 |
| |
| Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NGT_F64 |
| |
| // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nle_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NLE_F64 |
| |
| Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NLE_F64 |
| |
| // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_neq_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NEQ_F64 |
| |
| Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NEQ_F64 |
| |
| // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_nlt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_NLT_F64 |
| |
| Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_NLT_F64 |
| |
| // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_tru_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMP_TRU_F64 |
| |
| Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() |
| { |
| } // ~Inst_VOP3__V_CMP_TRU_F64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_F_F64 |
| |
| Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_F64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_LT_F64 |
| |
| Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_EQ_F64 |
| |
| Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_LE_F64 |
| |
| Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_GT_F64 |
| |
| Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lg_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_LG_F64 |
| |
| Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LG_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_GE_F64 |
| |
| Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_F64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_o_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_O_F64 |
| |
| Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_O_F64 |
| |
| // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (!std::isnan(src0[lane]) |
| && !std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_u_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_U_F64 |
| |
| Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_U_F64 |
| |
| // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC |
| // encoding. |
| void |
| Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, (std::isnan(src0[lane]) |
| || std::isnan(src1[lane])) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nge_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NGE_F64 |
| |
| Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGE_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
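| // The negated ("not greater-or-equal") form is true when either input |
| // is NaN, since ordered comparisons with NaN evaluate to false. |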
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlg_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NLG_F64 |
| |
| Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLG_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
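| // "Not less or greater" holds when the operands compare equal or when |
| // either operand is NaN. |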
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane] |
| || src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ngt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NGT_F64 |
| |
| Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NGT_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nle_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NLE_F64 |
| |
| Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLE_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_neq_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NEQ_F64 |
| |
| Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NEQ_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_nlt_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_NLT_F64 |
| |
| Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NLT_F64 |
| |
| // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_tru_f64", true) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CMPX_TRU_F64 |
| |
| Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() |
| { |
| } // ~Inst_VOP3__V_CMPX_TRU_F64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_I16 |
| |
| Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_F_I16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
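| // "F" compares are constant-false: no sources are read and every |
| // active lane's result bit is cleared. |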
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_I16 |
| |
| Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_I16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
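| // Plain (non-X) compares write only the SDST mask and leave EXEC |
| // untouched. |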
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_I16 |
| |
| Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_I16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_I16 |
| |
| Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_I16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_I16 |
| |
| Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_I16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_I16 |
| |
| Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_I16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_I16 |
| |
| Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_I16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_I16 |
| |
| Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() |
| { |
| } // ~Inst_VOP3__V_CMP_T_I16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
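| // "T" compares are constant-true: every active lane's result bit is |
| // set. |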
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_U16 |
| |
| Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_F_U16 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_U16 |
| |
| Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_U16 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_U16 |
| |
| Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_U16 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_U16 |
| |
| Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_U16 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_U16 |
| |
| Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_U16 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_U16 |
| |
| Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_U16 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_U16 |
| |
| Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_U16 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_U16 |
| |
| Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() |
| { |
| } // ~Inst_VOP3__V_CMP_T_U16 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_I16 |
| |
| Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_I16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_I16 |
| |
| Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_I16 |
| |
| Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_I16 |
| |
| Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_I16 |
| |
| Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_I16 |
| |
| Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_I16 |
| |
| Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_I16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_i16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_I16 |
| |
| Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_I16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_U16 |
| |
| Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_U16 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_U16 |
| |
| Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_U16 |
| |
| Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_U16 |
| |
| Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| // Use unsigned 16-bit operands so the per-lane <= compares as |
| // unsigned, matching the _U16 mnemonic. |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_U16 |
| |
| Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_U16 |
| |
| Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_U16 |
| |
| Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_U16 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_u16", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_U16 |
| |
| Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_U16 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_I32 |
| |
| Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_F_I32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_I32 |
| |
| Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_I32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_I32 |
| |
| Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_I32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_I32 |
| |
| Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_I32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_I32 |
| |
| Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_I32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_I32 |
| |
| Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_I32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_I32 |
| |
| Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_I32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_I32 |
| |
| Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() |
| { |
| } // ~Inst_VOP3__V_CMP_T_I32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_U32 |
| |
| Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_F_U32 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_U32 |
| |
| Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_U32 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_U32 |
| |
| Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_U32 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_U32 |
| |
| Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_U32 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_U32 |
| |
| Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_U32 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_U32 |
| |
| Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_U32 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_U32 |
| |
| Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_U32 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_U32 |
| |
| Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() |
| { |
| } // ~Inst_VOP3__V_CMP_T_U32 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
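| // The V_CMPX_* comparisons that follow use the same per-lane pattern |
| // as the V_CMP_* opcodes above, but additionally commit the packed |
| // comparison result to the wavefront's EXEC mask via |
| // wf->execMask() = sdst.rawData(). |
| |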
| Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_I32 |
| |
| Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_I32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_I32 |
| |
| Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_I32 |
| |
| Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_I32 |
| |
| Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_I32 |
| |
| Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_I32 |
| |
| Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_I32 |
| |
| Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_I32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_i32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_I32 |
| |
| Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_I32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_U32 |
| |
| Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_U32 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_U32 |
| |
| Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_U32 |
| |
| Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_U32 |
| |
| Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_U32 |
| |
| Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_U32 |
| |
| Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_U32 |
| |
| Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_U32 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_u32", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_U32 |
| |
| Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_U32 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_I64 |
| |
| Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_F_I64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_I64 |
| |
| Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_I64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_I64 |
| |
| Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_I64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_I64 |
| |
| Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_I64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_I64 |
| |
| Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_I64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_I64 |
| |
| Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_I64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_I64 |
| |
| Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_I64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_I64 |
| |
| Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() |
| { |
| } // ~Inst_VOP3__V_CMP_T_I64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_f_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_F_U64 |
| |
| Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_F_U64 |
| |
| // D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_lt_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LT_U64 |
| |
| Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_LT_U64 |
| |
| // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_eq_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_EQ_U64 |
| |
| Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_EQ_U64 |
| |
| // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_le_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_LE_U64 |
| |
| Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_LE_U64 |
| |
| // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_gt_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GT_U64 |
| |
| Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_GT_U64 |
| |
| // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ne_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_NE_U64 |
| |
| Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_NE_U64 |
| |
| // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_ge_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_GE_U64 |
| |
| Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_GE_U64 |
| |
| // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmp_t_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMP_T_U64 |
| |
| Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() |
| { |
| } // ~Inst_VOP3__V_CMP_T_U64 |
| |
| // D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_I64 |
| |
| Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_I64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_I64 |
| |
| Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_I64 |
| |
| Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_I64 |
| |
| Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_I64 |
| |
| Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_I64 |
| |
| Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_I64 |
| |
| Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_I64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_i64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_I64 |
| |
| Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_I64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_f_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_F_U64 |
| |
| Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_F_U64 |
| |
| // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_lt_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LT_U64 |
| |
| Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LT_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_eq_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_EQ_U64 |
| |
| Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_EQ_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_le_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_LE_U64 |
| |
| Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_LE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_gt_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GT_U64 |
| |
| Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GT_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ne_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_NE_U64 |
| |
| Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_NE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_ge_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_GE_U64 |
| |
| Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_GE_U64 |
| |
| // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cmpx_t_u64", true) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CMPX_T_U64 |
| |
| Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() |
| { |
| } // ~Inst_VOP3__V_CMPX_T_U64 |
| |
| // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. |
| void |
| Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ScalarOperandU64 sdst(gpuDynInst, instData.VDST); |
| |
| /** |
| * input modifiers are supported by FP operations only; this opcode |
| * reads no source operands, so the asserts below merely confirm that |
| * no modifier bits are encoded |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| sdst.setBit(lane, 1); |
| } |
| } |
| |
| wf->execMask() = sdst.rawData(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cndmask_b32", false) |
| { |
| setFlag(ALU); |
| setFlag(ReadsVCC); |
| } // Inst_VOP3__V_CNDMASK_B32 |
| |
| Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() |
| { |
| } // ~Inst_VOP3__V_CNDMASK_B32 |
| |
| // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC |
| // as a scalar GPR in S2. |
| void |
| Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| vcc.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
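| // bits(vcc.rawData(), lane) extracts this lane's select bit from the |
| // scalar condition register pair named by SRC2. |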
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = bits(vcc.rawData(), lane) |
| ? src1[lane] : src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_add_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_ADD_F32 |
| |
| Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() |
| { |
| } // ~Inst_VOP3__V_ADD_F32 |
| |
| // D.f = S0.f + S1.f. |
| void |
| Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sub_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_SUB_F32 |
| |
| Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() |
| { |
| } // ~Inst_VOP3__V_SUB_F32 |
| |
| // D.f = S0.f - S1.f. |
| void |
| Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_subrev_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_SUBREV_F32 |
| |
| Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() |
| { |
| } // ~Inst_VOP3__V_SUBREV_F32 |
| |
| // D.f = S1.f - S0.f. |
| void |
| Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_legacy_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MUL_LEGACY_F32 |
| |
| Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() |
| { |
| } // ~Inst_VOP3__V_MUL_LEGACY_F32 |
| |
| // D.f = S0.f * S1.f |
| void |
| Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
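        // Handle NaN, zero/denormal, and infinity operands explicitly:
        // NaN inputs and 0 * inf give NaN, while zero and infinity results
        // take their sign from the XOR of the operand signs (denormals are
        // grouped with zero in these special cases).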
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || |
| std::isnan(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| !std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if (std::isinf(src0[lane]) && |
| !std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else if (std::isinf(src0[lane]) && |
| std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MUL_F32 |
| |
| Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() |
| { |
| } // ~Inst_VOP3__V_MUL_F32 |
| |
| // D.f = S0.f * S1.f. |
| void |
| Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
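        // Special cases mirror v_mul_legacy_f32 above: NaN inputs and
        // 0 * inf give NaN; the signs of zero and infinity results follow
        // the XOR of the operand signs, with denormals grouped with zero.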
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || |
| std::isnan(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| !std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if (std::isinf(src0[lane]) && |
| !std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else if (std::isinf(src0[lane]) && |
| std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_i32_i24", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_I32_I24 |
| |
| Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() |
| { |
| } // ~Inst_VOP3__V_MUL_I32_I24 |
| |
| // D.i = S0.i[23:0] * S1.i[23:0]. |
| void |
| Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
        src1.readSrc();
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
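        // Only the low 24 bits of each source are significant; they are
        // sign-extended before the multiply and the low 32 bits of the
        // product are written to D.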
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) |
| * sext<24>(bits(src1[lane], 23, 0)); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_hi_i32_i24", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_HI_I32_I24 |
| |
| Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() |
| { |
| } // ~Inst_VOP3__V_MUL_HI_I32_I24 |
| |
| // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32. |
| void |
| Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
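        // Widen the sign-extended 24-bit sources to 64 bits so the full
        // signed 48-bit product is available; D receives bits [63:32] of
        // that product.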
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI64 tmp_src0 |
| = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); |
| VecElemI64 tmp_src1 |
| = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); |
| |
| vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_u32_u24", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_U32_U24 |
| |
| Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() |
| { |
| } // ~Inst_VOP3__V_MUL_U32_U24 |
| |
| // D.u = S0.u[23:0] * S1.u[23:0]. |
| void |
| Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_hi_u32_u24", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_HI_U32_U24 |
| |
| Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() |
| { |
| } // ~Inst_VOP3__V_MUL_HI_U32_U24 |
| |
| // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32. |
| void |
| Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
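        // Widen to 64 bits before multiplying so the upper half of the
        // 48-bit unsigned product survives the shift right by 32.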
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); |
| VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); |
| vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MIN_F32 |
| |
| Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() |
| { |
| } // ~Inst_VOP3__V_MIN_F32 |
| |
| // D.f = (S0.f < S1.f ? S0.f : S1.f). |
| void |
| Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmin(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MAX_F32 |
| |
| Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() |
| { |
| } // ~Inst_VOP3__V_MAX_F32 |
| |
| // D.f = (S0.f >= S1.f ? S0.f : S1.f). |
| void |
| Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmax(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN_I32 |
| |
| Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() |
| { |
| } // ~Inst_VOP3__V_MIN_I32 |
| |
| // D.i = min(S0.i, S1.i). |
| void |
| Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX_I32 |
| |
| Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() |
| { |
| } // ~Inst_VOP3__V_MAX_I32 |
| |
| // D.i = max(S0.i, S1.i). |
| void |
| Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN_U32 |
| |
| Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() |
| { |
| } // ~Inst_VOP3__V_MIN_U32 |
| |
| // D.u = min(S0.u, S1.u). |
| void |
| Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX_U32 |
| |
| Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() |
| { |
| } // ~Inst_VOP3__V_MAX_U32 |
| |
| // D.u = max(S0.u, S1.u). |
| void |
| Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshrrev_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHRREV_B32 |
| |
| Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() |
| { |
| } // ~Inst_VOP3__V_LSHRREV_B32 |
| |
| // D.u = S1.u >> S0.u[4:0]. |
| // The vacated bits are set to zero. |
| void |
| Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ashrrev_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ASHRREV_I32 |
| |
| Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() |
| { |
| } // ~Inst_VOP3__V_ASHRREV_I32 |
| |
| // D.i = signext(S1.i) >> S0.i[4:0]. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshlrev_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHLREV_B32 |
| |
| Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() |
| { |
| } // ~Inst_VOP3__V_LSHLREV_B32 |
| |
| // D.u = S1.u << S0.u[4:0]. |
| void |
| Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_and_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_AND_B32 |
| |
| Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() |
| { |
| } // ~Inst_VOP3__V_AND_B32 |
| |
| // D.u = S0.u & S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] & src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_or_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_OR_B32 |
| |
| Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() |
| { |
| } // ~Inst_VOP3__V_OR_B32 |
| |
| // D.u = S0.u | S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] | src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_xor_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_XOR_B32 |
| |
| Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() |
| { |
| } // ~Inst_VOP3__V_XOR_B32 |
| |
| // D.u = S0.u ^ S1.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] ^ src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mac_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAC); |
| } // Inst_VOP3__V_MAC_F32 |
| |
| Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() |
| { |
| } // ~Inst_VOP3__V_MAC_F32 |
| |
| // D.f = S0.f * S1.f + D.f. |
| void |
| Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
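        // D.f is also a source operand for the MAC, so read the current
        // destination values before accumulating into them.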
| vdst.read(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_add_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP3__V_ADD_U32 |
| |
| Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() |
| { |
| } // ~Inst_VOP3__V_ADD_U32 |
| |
| // D.u = S0.u + S1.u; |
    // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
| // overflow or carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
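        // Compute the carry-out in 64-bit arithmetic so the 32-bit sum
        // cannot wrap before the comparison against 0x100000000ULL.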
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| vcc.setBit(lane, ((VecElemU64)src0[lane] |
| + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_sub_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP3__V_SUB_U32 |
| |
| Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() |
| { |
| } // ~Inst_VOP3__V_SUB_U32 |
| |
| // D.u = S0.u - S1.u; |
| // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or |
| // carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_subrev_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| } // Inst_VOP3__V_SUBREV_U32 |
| |
| Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() |
| { |
| } // ~Inst_VOP3__V_SUBREV_U32 |
| |
| // D.u = S1.u - S0.u; |
| // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or |
| // carry-out. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair. |
| void |
| Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| vcc.write(); |
| } |
| |
| Inst_VOP3__V_ADDC_U32::Inst_VOP3__V_ADDC_U32(InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_addc_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP3__V_ADDC_U32 |
| |
| Inst_VOP3__V_ADDC_U32::~Inst_VOP3__V_ADDC_U32() |
| { |
| } // ~Inst_VOP3__V_ADDC_U32 |
| |
| // D.u = S0.u + S1.u + VCC[threadId]; |
| // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0) |
| // is an UNSIGNED overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP3__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| vcc.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
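        // The sum S0 + S1 + carry-in is at most 2^33 - 1, so evaluating it
        // in 64-bit arithmetic cannot overflow; the carry-out is then a
        // simple comparison against 0x100000000ULL.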
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane] |
| + bits(vcc.rawData(), lane); |
| sdst.setBit(lane, ((VecElemU64)src0[lane] |
| + (VecElemU64)src1[lane] |
| + (VecElemU64)bits(vcc.rawData(), lane)) |
                    >= 0x100000000ULL ? 1 : 0);
| } |
| } |
| |
| vdst.write(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_SUBB_U32::Inst_VOP3__V_SUBB_U32(InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_subb_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP3__V_SUBB_U32 |
| |
| Inst_VOP3__V_SUBB_U32::~Inst_VOP3__V_SUBB_U32() |
| { |
| } // ~Inst_VOP3__V_SUBB_U32 |
| |
| // D.u = S0.u - S1.u - VCC[threadId]; |
| // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED |
| // overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP3__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); |
| ScalarOperandU64 sdst(gpuDynInst, instData.SDST); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| vcc.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane] |
| - bits(vcc.rawData(), lane); |
| sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) |
| > src0[lane] ? 1 : 0); |
| } |
| } |
| |
| vdst.write(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_SUBBREV_U32::Inst_VOP3__V_SUBBREV_U32( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_subbrev_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(ReadsVCC); |
| } // Inst_VOP3__V_SUBBREV_U32 |
| |
| Inst_VOP3__V_SUBBREV_U32::~Inst_VOP3__V_SUBBREV_U32() |
| { |
| } // ~Inst_VOP3__V_SUBBREV_U32 |
| |
| // D.u = S1.u - S0.u - VCC[threadId]; |
    // VCC[threadId] = (S0.u + VCC[threadId] > S1.u ? 1 : 0) is an UNSIGNED
| // overflow. |
| // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC |
| // source comes from the SGPR-pair at S2.u. |
| void |
| Inst_VOP3__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
        ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| vcc.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane] |
| - bits(vcc.rawData(), lane); |
                sdst.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane))
                    > src1[lane] ? 1 : 0);
| } |
| } |
| |
| vdst.write(); |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_add_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_ADD_F16 |
| |
| Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() |
| { |
| } // ~Inst_VOP3__V_ADD_F16 |
| |
| // D.f16 = S0.f16 + S1.f16. |
| void |
| Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sub_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_SUB_F16 |
| |
| Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() |
| { |
| } // ~Inst_VOP3__V_SUB_F16 |
| |
| // D.f16 = S0.f16 - S1.f16. |
| void |
| Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_subrev_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_SUBREV_F16 |
| |
| Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() |
| { |
| } // ~Inst_VOP3__V_SUBREV_F16 |
| |
| // D.f16 = S1.f16 - S0.f16. |
| void |
| Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_MUL_F16 |
| |
| Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() |
| { |
| } // ~Inst_VOP3__V_MUL_F16 |
| |
| // D.f16 = S0.f16 * S1.f16. |
| void |
| Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mac_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(MAC); |
| } // Inst_VOP3__V_MAC_F16 |
| |
| Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() |
| { |
| } // ~Inst_VOP3__V_MAC_F16 |
| |
| // D.f16 = S0.f16 * S1.f16 + D.f16. |
| void |
| Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_add_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ADD_U16 |
| |
| Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() |
| { |
| } // ~Inst_VOP3__V_ADD_U16 |
| |
| // D.u16 = S0.u16 + S1.u16. |
| void |
| Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] + src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sub_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SUB_U16 |
| |
| Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() |
| { |
| } // ~Inst_VOP3__V_SUB_U16 |
| |
| // D.u16 = S0.u16 - S1.u16. |
| void |
| Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] - src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_subrev_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SUBREV_U16 |
| |
| Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() |
| { |
| } // ~Inst_VOP3__V_SUBREV_U16 |
| |
| // D.u16 = S1.u16 - S0.u16. |
| void |
| Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] - src0[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_lo_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_LO_U16 |
| |
| Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() |
| { |
| } // ~Inst_VOP3__V_MUL_LO_U16 |
| |
| // D.u16 = S0.u16 * S1.u16. |
| void |
| Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshlrev_b16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHLREV_B16 |
| |
| Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() |
| { |
| } // ~Inst_VOP3__V_LSHLREV_B16 |
| |
| // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. |
| void |
| Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
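        // 16-bit shifts use only the low four bits of S0 as the shift
        // amount.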
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshrrev_b16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHRREV_B16 |
| |
| Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() |
| { |
| } // ~Inst_VOP3__V_LSHRREV_B16 |
| |
| // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. |
| // The vacated bits are set to zero. |
| void |
| Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ashrrev_i16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ASHRREV_I16 |
| |
| Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() |
| { |
| } // ~Inst_VOP3__V_ASHRREV_I16 |
| |
| // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_MAX_F16 |
| |
| Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() |
| { |
| } // ~Inst_VOP3__V_MAX_F16 |
| |
| // D.f16 = max(S0.f16, S1.f16). |
| void |
| Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_MIN_F16 |
| |
| Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() |
| { |
| } // ~Inst_VOP3__V_MIN_F16 |
| |
| // D.f16 = min(S0.f16, S1.f16). |
| void |
| Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX_U16 |
| |
| Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() |
| { |
| } // ~Inst_VOP3__V_MAX_U16 |
| |
| // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). |
| void |
| Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_i16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX_I16 |
| |
| Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() |
| { |
| } // ~Inst_VOP3__V_MAX_I16 |
| |
| // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). |
| void |
| Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::max(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN_U16 |
| |
| Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() |
| { |
| } // ~Inst_VOP3__V_MIN_U16 |
| |
| // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). |
| void |
| Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_i16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN_I16 |
| |
| Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() |
| { |
| } // ~Inst_VOP3__V_MIN_I16 |
| |
| // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). |
| void |
| Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::min(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ldexp_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_LDEXP_F16 |
| |
| Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() |
| { |
| } // ~Inst_VOP3__V_LDEXP_F16 |
| |
| // D.f16 = S0.f16 * (2 ** S1.i16). |
| void |
| Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_nop", false) |
| { |
| setFlag(Nop); |
| setFlag(ALU); |
| } // Inst_VOP3__V_NOP |
| |
| Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() |
| { |
| } // ~Inst_VOP3__V_NOP |
| |
| // Do nothing. |
| void |
| Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mov_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MOV_B32 |
| |
| Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() |
| { |
| } // ~Inst_VOP3__V_MOV_B32 |
| |
| // D.u = S0.u. |
| // Input and output modifiers not supported; this is an untyped operation. |
| void |
| Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_i32_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_I32_F64 |
| |
| Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() |
| { |
| } // ~Inst_VOP3__V_CVT_I32_F64 |
| |
| // D.i = (int)S0.d. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
| std::frexp(src[lane],&exp); |
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane]) || exp > 30) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = INT_MIN; |
| } else { |
| vdst[lane] = INT_MAX; |
| } |
| } else { |
| vdst[lane] = (VecElemI32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f64_i32", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_F64_I32 |
| |
| Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() |
| { |
| } // ~Inst_VOP3__V_CVT_F64_I32 |
| |
| // D.d = (double)S0.i. |
| void |
| Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_i32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_I32 |
| |
| Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_I32 |
| |
| // D.f = (float)S0.i. |
| void |
| Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
        ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_u32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_U32 |
| |
| Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_U32 |
| |
| // D.f = (float)S0.u. |
| void |
| Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_u32_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_U32_F32 |
| |
| Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_U32_F32 |
| |
| // D.u = (unsigned)S0.f. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
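                // as in V_CVT_I32_F64 above, the binary exponent returned
                // by frexp() decides when the conversion must saturate.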
| int exp; |
                std::frexp(src[lane], &exp);
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| vdst[lane] = UINT_MAX; |
| } |
| } else if (exp > 31) { |
| vdst[lane] = UINT_MAX; |
| } else { |
| vdst[lane] = (VecElemU32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_i32_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_I32_F32 |
| |
| Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_I32_F32 |
| |
| // D.i = (int)S0.f. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
                std::frexp(src[lane], &exp);
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane]) || exp > 30) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = INT_MIN; |
| } else { |
| vdst[lane] = INT_MAX; |
| } |
| } else { |
| vdst[lane] = (VecElemI32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mov_fed_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MOV_FED_B32 |
| |
| Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() |
| { |
| } // ~Inst_VOP3__V_MOV_FED_B32 |
| |
| // D.u = S0.u; |
| // Input and output modifiers not supported; this is an untyped operation. |
| void |
| Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f16_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F16_F32 |
| |
| Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_F16_F32 |
| |
| // D.f16 = flt32_to_flt16(S0.f). |
| void |
| Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_F16 |
| |
| Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_F16 |
| |
| // D.f = flt16_to_flt32(S0.f16). |
| void |
| Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_rpi_i32_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_RPI_I32_F32 |
| |
| Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_RPI_I32_F32 |
| |
| // D.i = (int)floor(S0.f + 0.5). |
| void |
| Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_flr_i32_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_FLR_I32_F32 |
| |
| Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_FLR_I32_F32 |
| |
| // D.i = (int)floor(S0.f). |
| void |
| Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemI32)std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_off_f32_i4", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_OFF_F32_I4 |
| |
| Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() |
| { |
| } // ~Inst_VOP3__V_CVT_OFF_F32_I4 |
| |
| // 4-bit signed int to 32-bit float. |
| void |
| Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_F32_F64 |
| |
| Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_F64 |
| |
| // D.f = (float)S0.d. |
| void |
| Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f64_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_F64_F32 |
| |
| Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_F64_F32 |
| |
| // D.d = (double)S0.f. |
| void |
| Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_ubyte0", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_UBYTE0 |
| |
| Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_UBYTE0 |
| |
| // D.f = (float)(S0.u[7:0]). |
| void |
| Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_ubyte1", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_UBYTE1 |
| |
| Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_UBYTE1 |
| |
| // D.f = (float)(S0.u[15:8]). |
| void |
| Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_ubyte2", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_UBYTE2 |
| |
| Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_UBYTE2 |
| |
| // D.f = (float)(S0.u[23:16]). |
| void |
| Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f32_ubyte3", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_F32_UBYTE3 |
| |
| Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() |
| { |
| } // ~Inst_VOP3__V_CVT_F32_UBYTE3 |
| |
| // D.f = (float)(S0.u[31:24]). |
| void |
| Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_u32_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_U32_F64 |
| |
| Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() |
| { |
| } // ~Inst_VOP3__V_CVT_U32_F64 |
| |
| // D.u = (unsigned)S0.d. |
| // Out-of-range floating point values (including infinity) saturate. NaN |
| // is converted to 0. |
| void |
| Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp; |
                std::frexp(src[lane], &exp);
| if (std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| vdst[lane] = UINT_MAX; |
| } |
| } else if (exp > 31) { |
| vdst[lane] = UINT_MAX; |
| } else { |
| vdst[lane] = (VecElemU32)src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f64_u32", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CVT_F64_U32 |
| |
| Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() |
| { |
| } // ~Inst_VOP3__V_CVT_F64_U32 |
| |
| // D.d = (double)S0.u. |
| void |
| Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (VecElemF64)src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_trunc_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_TRUNC_F64 |
| |
| Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() |
| { |
| } // ~Inst_VOP3__V_TRUNC_F64 |
| |
| // D.d = trunc(S0.d), return integer part of S0.d. |
| void |
| Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::trunc(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ceil_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_CEIL_F64 |
| |
| Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() |
| { |
| } // ~Inst_VOP3__V_CEIL_F64 |
| |
| // D.d = ceil(S0.d); |
| void |
| Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::ceil(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rndne_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_RNDNE_F64 |
| |
| Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() |
| { |
| } // ~Inst_VOP3__V_RNDNE_F64 |
| |
| // D.d = round_nearest_even(S0.d). |
| void |
| Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = roundNearestEven(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_floor_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_FLOOR_F64 |
| |
| Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() |
| { |
| } // ~Inst_VOP3__V_FLOOR_F64 |
| |
| // D.d = floor(S0.d); |
| void |
| Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fract_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_FRACT_F32 |
| |
| Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() |
| { |
| } // ~Inst_VOP3__V_FRACT_F32 |
| |
| // D.f = modf(S0.f). |
| void |
| Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemF32 int_part(0.0); |
| vdst[lane] = std::modf(src[lane], &int_part); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_trunc_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_TRUNC_F32 |
| |
| Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() |
| { |
| } // ~Inst_VOP3__V_TRUNC_F32 |
| |
| // D.f = trunc(S0.f), return integer part of S0.f. |
| void |
| Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::trunc(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ceil_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CEIL_F32 |
| |
| Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() |
| { |
| } // ~Inst_VOP3__V_CEIL_F32 |
| |
| // D.f = ceil(S0.f); |
| void |
| Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::ceil(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rndne_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_RNDNE_F32 |
| |
| Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() |
| { |
| } // ~Inst_VOP3__V_RNDNE_F32 |
| |
| // D.f = round_nearest_even(S0.f). |
| void |
| Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = roundNearestEven(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_floor_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_FLOOR_F32 |
| |
| Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() |
| { |
| } // ~Inst_VOP3__V_FLOOR_F32 |
| |
| // D.f = floor(S0.f); |
| void |
| Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::floor(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_exp_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_EXP_F32 |
| |
| Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() |
| { |
| } // ~Inst_VOP3__V_EXP_F32 |
| |
| // D.f = pow(2.0, S0.f). |
| void |
| Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::pow(2.0, src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_log_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_LOG_F32 |
| |
| Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() |
| { |
| } // ~Inst_VOP3__V_LOG_F32 |
| |
| // D.f = log2(S0.f). |
| void |
| Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::log2(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rcp_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_RCP_F32 |
| |
| Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() |
| { |
| } // ~Inst_VOP3__V_RCP_F32 |
| |
| // D.f = 1.0 / S0.f. |
| void |
| Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rcp_iflag_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_RCP_IFLAG_F32 |
| |
| Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() |
| { |
| } // ~Inst_VOP3__V_RCP_IFLAG_F32 |
| |
| // D.f = 1.0 / S0.f. |
| void |
| Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rsq_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_RSQ_F32 |
| |
| Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() |
| { |
| } // ~Inst_VOP3__V_RSQ_F32 |
| |
| // D.f = 1.0 / sqrt(S0.f). |
| void |
| Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = 1.0 / std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rcp_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_RCP_F64 |
| |
| Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() |
| { |
| } // ~Inst_VOP3__V_RCP_F64 |
| |
| // D.d = 1.0 / S0.d. |
| void |
| Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
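                // IEEE special cases are handled explicitly here:
                // 1/(+-0.0) -> +INF, 1/NaN -> NaN, 1/(+-INF) -> +-0.0.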
| if (std::fpclassify(src[lane]) == FP_ZERO) { |
| vdst[lane] = +INFINITY; |
| } else if (std::isnan(src[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src[lane])) { |
| if (std::signbit(src[lane])) { |
| vdst[lane] = -0.0; |
| } else { |
| vdst[lane] = 0.0; |
| } |
| } else { |
| vdst[lane] = 1.0 / src[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rsq_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_RSQ_F64 |
| |
| Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() |
| { |
| } // ~Inst_VOP3__V_RSQ_F64 |
| |
| // D.d = 1.0 / sqrt(S0.d). |
| void |
| Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
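                // special cases are handled explicitly here:
                // rsq(+-0.0) -> +INF, rsq(NaN) -> NaN, rsq(+INF) -> 0.0,
                // and any negative input -> NaN.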
| if (std::fpclassify(src[lane]) == FP_ZERO) { |
| vdst[lane] = +INFINITY; |
| } else if (std::isnan(src[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { |
| vdst[lane] = 0.0; |
| } else if (std::signbit(src[lane])) { |
| vdst[lane] = NAN; |
| } else { |
| vdst[lane] = 1.0 / std::sqrt(src[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sqrt_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_SQRT_F32 |
| |
| Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() |
| { |
| } // ~Inst_VOP3__V_SQRT_F32 |
| |
| // D.f = sqrt(S0.f). |
| void |
| Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sqrt_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_SQRT_F64 |
| |
| Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() |
| { |
| } // ~Inst_VOP3__V_SQRT_F64 |
| |
| // D.d = sqrt(S0.d). |
| void |
| Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::sqrt(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sin_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_SIN_F32 |
| |
| Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() |
| { |
| } // ~Inst_VOP3__V_SIN_F32 |
| |
| // D.f = sin(S0.f * 2 * PI). |
| void |
| Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| ConstScalarOperandF32 pi(gpuDynInst, REG_PI); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| pi.read(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cos_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_COS_F32 |
| |
| Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() |
| { |
| } // ~Inst_VOP3__V_COS_F32 |
| |
| // D.f = cos(S0.f * 2 * PI). |
| void |
| Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| ConstScalarOperandF32 pi(gpuDynInst, REG_PI); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| pi.read(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_not_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_NOT_B32 |
| |
| Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() |
| { |
| } // ~Inst_VOP3__V_NOT_B32 |
| |
| // D.u = ~S0.u. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = ~src[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bfrev_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BFREV_B32 |
| |
| Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() |
| { |
| } // ~Inst_VOP3__V_BFREV_B32 |
| |
| // D.u[31:0] = S0.u[0:31], bitfield reverse. |
| // Input and output modifiers not supported. |
| void |
| Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = reverseBits(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ffbh_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_FFBH_U32 |
| |
| Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() |
| { |
| } // ~Inst_VOP3__V_FFBH_U32 |
| |
| // D.u = position of first 1 in S0.u from MSB; |
| // D.u = 0xffffffff if S0.u == 0. |
| void |
| Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = findFirstOneMsb(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ffbl_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_FFBL_B32 |
| |
| Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() |
| { |
| } // ~Inst_VOP3__V_FFBL_B32 |
| |
| // D.u = position of first 1 in S0.u from LSB; |
| // D.u = 0xffffffff if S0.u == 0. |
| void |
| Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = findFirstOne(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ffbh_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_FFBH_I32 |
| |
| Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() |
| { |
| } // ~Inst_VOP3__V_FFBH_I32 |
| |
| // D.u = position of first bit different from sign bit in S0.i from MSB; |
| // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. |
| void |
| Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src(gpuDynInst, extData.SRC0); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = firstOppositeSignBit(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_exp_i32_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_FREXP_EXP_I32_F64 |
| |
| Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() |
| { |
| } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 |
| |
| // See V_FREXP_EXP_I32_F32. |
| void |
| Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = 0; |
| } else { |
| VecElemI32 exp(0); |
| std::frexp(src[lane], &exp); |
| vdst[lane] = exp; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_mant_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_FREXP_MANT_F64 |
| |
| Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() |
| { |
| } // ~Inst_VOP3__V_FREXP_MANT_F64 |
| |
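    // See V_FREXP_MANT_F32.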
| void |
| Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI32 exp(0); |
| vdst[lane] = std::frexp(src[lane], &exp); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fract_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_FRACT_F64 |
| |
| Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() |
| { |
| } // ~Inst_VOP3__V_FRACT_F64 |
| |
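    // D.d = modf(S0.d).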
| void |
| Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src(gpuDynInst, extData.SRC0); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
                VecElemF64 int_part(0.0);
                vdst[lane] = std::modf(src[lane], &int_part);
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_exp_i32_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_FREXP_EXP_I32_F32 |
| |
| Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() |
| { |
| } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 |
| |
    // frexp(S0.f, Exponent(S0.f))
    // if (S0.f == INF || S0.f == NAN) then D.i = 0;
    // else D.i = Exponent(S0.f).
| void |
| Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
| vdst[lane] = 0; |
| } else { |
| VecElemI32 exp(0); |
| std::frexp(src[lane], &exp); |
| vdst[lane] = exp; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_mant_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_FREXP_MANT_F32 |
| |
| Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() |
| { |
| } // ~Inst_VOP3__V_FREXP_MANT_F32 |
| |
| // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; |
| // else D.f = Mantissa(S0.f). |
| void |
| Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isinf(src[lane]) || std::isnan(src[lane])) { |
| vdst[lane] = src[lane]; |
| } else { |
| VecElemI32 exp(0); |
| vdst[lane] = std::frexp(src[lane], &exp); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_clrexcp", false) |
| { |
| } // Inst_VOP3__V_CLREXCP |
| |
| Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() |
| { |
| } // ~Inst_VOP3__V_CLREXCP |
| |
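    // Clear wave's exception state in SIMD (SP).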
| void |
| Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f16_u16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CVT_F16_U16 |
| |
| Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() |
| { |
| } // ~Inst_VOP3__V_CVT_F16_U16 |
| |
| // D.f16 = uint16_to_flt16(S.u16). |
| void |
| Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_f16_i16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CVT_F16_I16 |
| |
| Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() |
| { |
| } // ~Inst_VOP3__V_CVT_F16_I16 |
| |
| // D.f16 = int16_to_flt16(S.i16). |
| void |
| Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_u16_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CVT_U16_F16 |
| |
| Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() |
| { |
| } // ~Inst_VOP3__V_CVT_U16_F16 |
| |
| // D.u16 = flt16_to_uint16(S.f16). |
| void |
| Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_i16_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CVT_I16_F16 |
| |
| Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() |
| { |
| } // ~Inst_VOP3__V_CVT_I16_F16 |
| |
| // D.i16 = flt16_to_int16(S.f16). |
| void |
| Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rcp_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_RCP_F16 |
| |
| Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() |
| { |
| } // ~Inst_VOP3__V_RCP_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = 1 / S0.f16. |
| void |
| Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sqrt_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_SQRT_F16 |
| |
| Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() |
| { |
| } // ~Inst_VOP3__V_SQRT_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = sqrt(S0.f16). |
| void |
| Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rsq_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_RSQ_F16 |
| |
| Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() |
| { |
| } // ~Inst_VOP3__V_RSQ_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = 1 / sqrt(S0.f16). |
| void |
| Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_log_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_LOG_F16 |
| |
| Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() |
| { |
| } // ~Inst_VOP3__V_LOG_F16 |
| |
| // if (S0.f16 == 1.0f) |
| // D.f16 = 0.0f; |
| // else |
| // D.f16 = log2(S0.f16). |
| void |
| Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_exp_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_EXP_F16 |
| |
| Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() |
| { |
| } // ~Inst_VOP3__V_EXP_F16 |
| |
| // if (S0.f16 == 0.0f) |
| // D.f16 = 1.0f; |
| // else |
| // D.f16 = pow(2.0, S0.f16). |
| void |
| Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_mant_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_FREXP_MANT_F16 |
| |
| Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() |
| { |
| } // ~Inst_VOP3__V_FREXP_MANT_F16 |
| |
| // if (S0.f16 == +-INF || S0.f16 == NAN) |
| // D.f16 = S0.f16; |
| // else |
| // D.f16 = mantissa(S0.f16). |
| void |
| Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_frexp_exp_i16_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_FREXP_EXP_I16_F16 |
| |
| Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() |
| { |
| } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 |
| |
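    // See V_FREXP_EXP_I32_F32.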
| void |
| Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_floor_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_FLOOR_F16 |
| |
| Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() |
| { |
| } // ~Inst_VOP3__V_FLOOR_F16 |
| |
| // D.f16 = floor(S0.f16); |
| void |
| Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ceil_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_CEIL_F16 |
| |
| Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() |
| { |
| } // ~Inst_VOP3__V_CEIL_F16 |
| |
| // D.f16 = ceil(S0.f16); |
| void |
| Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_trunc_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_TRUNC_F16 |
| |
| Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() |
| { |
| } // ~Inst_VOP3__V_TRUNC_F16 |
| |
| // D.f16 = trunc(S0.f16). |
| void |
| Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_rndne_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_RNDNE_F16 |
| |
| Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() |
| { |
| } // ~Inst_VOP3__V_RNDNE_F16 |
| |
| // D.f16 = roundNearestEven(S0.f16); |
| void |
| Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fract_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_FRACT_F16 |
| |
| Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() |
| { |
| } // ~Inst_VOP3__V_FRACT_F16 |
| |
| // D.f16 = S0.f16 + -floor(S0.f16). |
| void |
| Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sin_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_SIN_F16 |
| |
| Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() |
| { |
| } // ~Inst_VOP3__V_SIN_F16 |
| |
| // D.f16 = sin(S0.f16 * 2 * PI). |
| void |
| Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cos_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_COS_F16 |
| |
| Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() |
| { |
| } // ~Inst_VOP3__V_COS_F16 |
| |
| // D.f16 = cos(S0.f16 * 2 * PI). |
| void |
| Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_exp_legacy_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_EXP_LEGACY_F32 |
| |
| Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() |
| { |
| } // ~Inst_VOP3__V_EXP_LEGACY_F32 |
| |
    // D.f = pow(2.0, S0.f).
| void |
| Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::pow(2.0, src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_log_legacy_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_LOG_LEGACY_F32 |
| |
| Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() |
| { |
| } // ~Inst_VOP3__V_LOG_LEGACY_F32 |
| |
| // D.f = log2(S0.f). |
| void |
| Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src(gpuDynInst, extData.SRC0); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::log2(src[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_legacy_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_LEGACY_F32 |
| |
| Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() |
| { |
| } // ~Inst_VOP3__V_MAD_LEGACY_F32 |
| |
    // D.f = S0.f * S1.f + S2.f.
| void |
| Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_F32 |
| |
| Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() |
| { |
| } // ~Inst_VOP3__V_MAD_F32 |
| |
| // D.f = S0.f * S1.f + S2.f. |
| void |
| Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_i32_i24", false) |
| { |
| setFlag(ALU); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_I32_I24 |
| |
| Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() |
| { |
| } // ~Inst_VOP3__V_MAD_I32_I24 |
| |
| // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. |
| void |
| Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) |
| * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_u32_u24", false) |
| { |
| setFlag(ALU); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_U32_U24 |
| |
| Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() |
| { |
| } // ~Inst_VOP3__V_MAD_U32_U24 |
| |
| // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. |
| void |
| Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) |
| + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cubeid_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CUBEID_F32 |
| |
| Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() |
| { |
| } // ~Inst_VOP3__V_CUBEID_F32 |
| |
| void |
| Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cubesc_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CUBESC_F32 |
| |
| Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() |
| { |
| } // ~Inst_VOP3__V_CUBESC_F32 |
| |
| void |
| Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cubetc_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CUBETC_F32 |
| |
| Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() |
| { |
| } // ~Inst_VOP3__V_CUBETC_F32 |
| |
| void |
| Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cubema_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CUBEMA_F32 |
| |
| Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() |
| { |
| } // ~Inst_VOP3__V_CUBEMA_F32 |
| |
| void |
| Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bfe_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BFE_U32 |
| |
| Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32() |
| { |
| } // ~Inst_VOP3__V_BFE_U32 |
| |
| // D.u = (S0.u >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1). |
| // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width. |
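| // For example, with S0 = 0x12345678, S1 = 8 (offset), and S2 = 12 |
| // (width), the extracted field is (0x12345678 >> 8) & 0xfff = 0x456. |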
| void |
| Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) |
| & ((1 << bits(src2[lane], 4, 0)) - 1); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bfe_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BFE_I32 |
| |
| Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32() |
| { |
| } // ~Inst_VOP3__V_BFE_I32 |
| |
| // D.i = (S0.i >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1). |
| // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width. |
| void |
| Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) |
| & ((1 << bits(src2[lane], 4, 0)) - 1); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bfi_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BFI_B32 |
| |
| Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32() |
| { |
| } // ~Inst_VOP3__V_BFI_B32 |
| |
| // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert. |
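| // For example, with S0 = 0x0000ff00 (mask), S1 = 0x0000ab00 (field to |
| // insert), and S2 = 0x12345678 (background), the result is |
| // (0x0000ab00 & 0x0000ff00) | (0x12345678 & ~0x0000ff00) = 0x1234ab78. |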
| void |
| Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] |
| & src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fma_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| setFlag(FMA); |
| } // Inst_VOP3__V_FMA_F32 |
| |
| Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() |
| { |
| } // ~Inst_VOP3__V_FMA_F32 |
| |
| // D.f = S0.f * S1.f + S2.f. |
| void |
| Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fma_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| setFlag(FMA); |
| } // Inst_VOP3__V_FMA_F64 |
| |
| Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() |
| { |
| } // ~Inst_VOP3__V_FMA_F64 |
| |
| // D.d = S0.d * S1.d + S2.d. |
| void |
| Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lerp_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LERP_U8 |
| |
| Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() |
| { |
| } // ~Inst_VOP3__V_LERP_U8 |
| |
| // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24; |
| // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; |
| // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; |
| // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). |
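| // Each byte of the result is the average of the corresponding bytes of |
| // S0 and S1, with a rounding bit taken from S2. For example, byte 0 |
| // with S0.u[7:0] = 10, S1.u[7:0] = 13, and S2.u[0] = 1 gives |
| // (10 + 13 + 1) >> 1 = 12. |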
| void |
| Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = ((bits(src0[lane], 31, 24) |
| + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) |
| << 24; |
| vdst[lane] += ((bits(src0[lane], 23, 16) |
| + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) |
| << 16; |
| vdst[lane] += ((bits(src0[lane], 15, 8) |
| + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) |
| << 8; |
| vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) |
| + bits(src2[lane], 0)) >> 1); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_alignbit_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ALIGNBIT_B32 |
| |
| Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() |
| { |
| } // ~Inst_VOP3__V_ALIGNBIT_B32 |
| |
| // D.u = ({S0, S1} >> S2.u[4:0]) & 0xffffffff. |
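| // Funnel shift: S0 forms the upper and S1 the lower 32 bits of a 64-bit |
| // value that is shifted right by S2[4:0] bits. For example, with |
| // S0 = 0x00000001, S1 = 0x00000000, and S2 = 1, the result is |
| // 0x80000000. |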
| void |
| Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) |
| | (VecElemU64)src1[lane]); |
| vdst[lane] = (VecElemU32)((src_0_1 |
| >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_alignbyte_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ALIGNBYTE_B32 |
| |
| Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() |
| { |
| } // ~Inst_VOP3__V_ALIGNBYTE_B32 |
| |
| // D.u = ({S0, S1} >> (8 * S2.u[4:0])) & 0xffffffff. |
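| // Byte-granularity version of v_alignbit_b32: {S0, S1} is shifted right |
| // by S2[4:0] bytes. For example, with S0 = 0xaabbccdd, S1 = 0x11223344, |
| // and S2 = 1, the result is 0xdd112233. |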
| void |
| Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) |
| | (VecElemU64)src1[lane]); |
| vdst[lane] = (VecElemU32)((src_0_1 |
| >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) |
| & 0xffffffff); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min3_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MIN3_F32 |
| |
| Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() |
| { |
| } // ~Inst_VOP3__V_MIN3_F32 |
| |
| // D.f = min(S0.f, S1.f, S2.f). |
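| // Note that std::fmin returns its non-NaN operand when exactly one |
| // input is NaN, so a single NaN source does not propagate to the |
| // result. |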
| void |
| Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); |
| vdst[lane] = std::fmin(min_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min3_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN3_I32 |
| |
| Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() |
| { |
| } // ~Inst_VOP3__V_MIN3_I32 |
| |
| // D.i = min(S0.i, S1.i, S2.i). |
| void |
| Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); |
| vdst[lane] = std::min(min_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min3_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MIN3_U32 |
| |
| Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() |
| { |
| } // ~Inst_VOP3__V_MIN3_U32 |
| |
| // D.u = min(S0.u, S1.u, S2.u). |
| void |
| Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); |
| vdst[lane] = std::min(min_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max3_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MAX3_F32 |
| |
| Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() |
| { |
| } // ~Inst_VOP3__V_MAX3_F32 |
| |
| // D.f = max(S0.f, S1.f, S2.f). |
| void |
| Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); |
| vdst[lane] = std::fmax(max_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max3_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX3_I32 |
| |
| Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() |
| { |
| } // ~Inst_VOP3__V_MAX3_I32 |
| |
| // D.i = max(S0.i, S1.i, S2.i). |
| void |
| Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); |
| vdst[lane] = std::max(max_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max3_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MAX3_U32 |
| |
| Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() |
| { |
| } // ~Inst_VOP3__V_MAX3_U32 |
| |
| // D.u = max(S0.u, S1.u, S2.u). |
| void |
| Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); |
| vdst[lane] = std::max(max_0_1, src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_med3_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_MED3_F32 |
| |
| Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() |
| { |
| } // ~Inst_VOP3__V_MED3_F32 |
| |
| // D.f = median(S0.f, S1.f, S2.f). |
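| // The median() helper (assumed to be provided by inst_util.hh) returns |
| // the middle of its three arguments, i.e. the value that is neither the |
| // minimum nor the maximum. |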
| void |
| Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = median(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_med3_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MED3_I32 |
| |
| Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() |
| { |
| } // ~Inst_VOP3__V_MED3_I32 |
| |
| // D.i = median(S0.i, S1.i, S2.i). |
| void |
| Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = median(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_med3_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MED3_U32 |
| |
| Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() |
| { |
| } // ~Inst_VOP3__V_MED3_U32 |
| |
| // D.u = median(S0.u, S1.u, S2.u). |
| void |
| Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = median(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sad_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SAD_U8 |
| |
| Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() |
| { |
| } // ~Inst_VOP3__V_SAD_U8 |
| |
| // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + |
| // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. |
| // Sum of absolute differences with accumulation, overflow into upper bits |
| // is allowed. |
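| // Worked example for one lane: with S0 = 0x01020304, S1 = 0x04030201, |
| // and S2 = 0, the per-byte absolute differences are 3 + 1 + 1 + 3, so |
| // D.u = 8. |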
| void |
| Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::abs(bits(src0[lane], 31, 24) |
| - bits(src1[lane], 31, 24)) |
| + std::abs(bits(src0[lane], 23, 16) |
| - bits(src1[lane], 23, 16)) |
| + std::abs(bits(src0[lane], 15, 8) |
| - bits(src1[lane], 15, 8)) |
| + std::abs(bits(src0[lane], 7, 0) |
| - bits(src1[lane], 7, 0)) + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sad_hi_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SAD_HI_U8 |
| |
| Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() |
| { |
| } // ~Inst_VOP3__V_SAD_HI_U8 |
| |
| // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. |
| // Sum of absolute differences with accumulation, overflow is lost. |
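| // The byte SAD of S0 and S1 lands in the upper 16 bits before the |
| // accumulator S2 is added; e.g., a SAD of 8 with S2 = 5 gives |
| // (8 << 16) + 5 = 0x00080005. |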
| void |
| Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU32 sad = std::abs((int)bits(src0[lane], 31, 24) |
| - (int)bits(src1[lane], 31, 24)) |
| + std::abs((int)bits(src0[lane], 23, 16) |
| - (int)bits(src1[lane], 23, 16)) |
| + std::abs((int)bits(src0[lane], 15, 8) |
| - (int)bits(src1[lane], 15, 8)) |
| + std::abs((int)bits(src0[lane], 7, 0) |
| - (int)bits(src1[lane], 7, 0)); |
| vdst[lane] = (sad << 16) + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sad_u16", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SAD_U16 |
| |
| Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() |
| { |
| } // ~Inst_VOP3__V_SAD_U16 |
| |
| // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) |
| // + S2.u. |
| // Word SAD with accumulation. |
| void |
| Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::abs(bits(src0[lane], 31, 16) |
| - bits(src1[lane], 31, 16)) |
| + std::abs(bits(src0[lane], 15, 0) |
| - bits(src1[lane], 15, 0)) + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_sad_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_SAD_U32 |
| |
| Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() |
| { |
| } // ~Inst_VOP3__V_SAD_U32 |
| |
| // D.u = abs(S0.i - S1.i) + S2.u. |
| // Dword SAD with accumulation. |
| void |
| Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pk_u8_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_PK_U8_F32 |
| |
| Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_PK_U8_F32 |
| |
| // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) |
| // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). |
| // Convert floating point value S0 to 8-bit unsigned integer and pack the |
| // result into byte S1 of dword S2. |
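| // For example, with S0 = 65.0f, S1 = 2 (byte select), and |
| // S2 = 0xaabbccdd, byte 2 of S2 is replaced by 65 (0x41), giving |
| // 0xaa41ccdd. |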
| void |
| Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = (((VecElemU8)src0[lane] & 0xff) |
| << (8 * bits(src1[lane], 1, 0))) |
| | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0)))); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_div_fixup_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_DIV_FIXUP_F32 |
| |
| Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32() |
| { |
| } // ~Inst_VOP3__V_DIV_FIXUP_F32 |
| |
| // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator, |
| // s2.f = Numerator. |
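| // In the usual software divide sequence, v_div_scale_* pre-scales the |
| // operands, a reciprocal plus Newton-Raphson FMA steps refine the |
| // quotient, v_div_fmas_* applies the post-scale, and v_div_fixup_* |
| // (this opcode) patches special cases such as 0/0, x/0 and inf/inf. |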
| void |
| Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::fpclassify(src1[lane]) == FP_ZERO) { |
| if (std::signbit(src1[lane])) { |
| vdst[lane] = -INFINITY; |
| } else { |
| vdst[lane] = +INFINITY; |
| } |
| } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src1[lane])) { |
| if (std::signbit(src1[lane])) { |
| vdst[lane] = -INFINITY; |
| } else { |
| vdst[lane] = +INFINITY; |
| } |
| } else { |
| vdst[lane] = src2[lane] / src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } // execute |
| // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods --- |
| |
| Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_div_fixup_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_DIV_FIXUP_F64 |
| |
| Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64() |
| { |
| } // ~Inst_VOP3__V_DIV_FIXUP_F64 |
| |
| // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator, |
| // s2.d = Numerator. |
| void |
| Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int sign_out = std::signbit(src1[lane]) |
| ^ std::signbit(src2[lane]); |
| int exp1(0); |
| int exp2(0); |
| std::frexp(src1[lane], &exp1); |
| std::frexp(src2[lane], &exp2); |
| |
| if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { |
| vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN(); |
| } else if (std::fpclassify(src1[lane]) == FP_ZERO |
| && std::fpclassify(src2[lane]) == FP_ZERO) { |
| vdst[lane] |
| = std::numeric_limits<VecElemF64>::signaling_NaN(); |
| } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { |
| vdst[lane] |
| = std::numeric_limits<VecElemF64>::signaling_NaN(); |
| } else if (std::fpclassify(src1[lane]) == FP_ZERO |
| || std::isinf(src2[lane])) { |
| vdst[lane] = sign_out ? -INFINITY : +INFINITY; |
| } else if (std::isinf(src1[lane]) |
| || std::fpclassify(src2[lane]) == FP_ZERO) { |
| vdst[lane] = sign_out ? -0.0 : +0.0; |
| } else if (exp2 - exp1 < -1075) { |
| vdst[lane] = src0[lane]; |
| } else if (exp1 == 2047) { |
| vdst[lane] = src0[lane]; |
| } else { |
| vdst[lane] = sign_out ? -std::fabs(src0[lane]) |
| : std::fabs(src0[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(F32); |
| } // Inst_VOP3__V_DIV_SCALE_F32 |
| |
| Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() |
| { |
| } // ~Inst_VOP3__V_DIV_SCALE_F32 |
| |
| // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = |
| // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a |
| // numerator and denominator, this opcode will appropriately scale inputs |
| // for division to avoid subnormal terms during the Newton-Raphson |
| // correction algorithm. This opcode produces a VCC flag used to |
| // post-scale the quotient. |
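| // Note that this implementation simply forwards S0 and clears the |
| // per-lane VCC bit; the subnormal-scaling checks modeled for the F64 |
| // variant below are not applied here. |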
| void |
| Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane]; |
| vcc.setBit(lane, 0); |
| } |
| } |
| |
| vcc.write(); |
| vdst.write(); |
| } // execute |
| // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- |
| |
| Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f64") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(F64); |
| } // Inst_VOP3__V_DIV_SCALE_F64 |
| |
| Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() |
| { |
| } // ~Inst_VOP3__V_DIV_SCALE_F64 |
| |
| // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = |
| // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a |
| // numerator and denominator, this opcode will appropriately scale inputs |
| // for division to avoid subnormal terms during the Newton-Raphson |
| // correction algorithm. This opcode produces a VCC flag used to |
| // post-scale the quotient. |
| void |
| Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| int exp1(0); |
| int exp2(0); |
| std::frexp(src1[lane], &exp1); |
| std::frexp(src2[lane], &exp2); |
| vcc.setBit(lane, 0); |
| |
| if (std::fpclassify(src1[lane]) == FP_ZERO |
| || std::fpclassify(src2[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (exp2 - exp1 >= 768) { |
| vcc.setBit(lane, 1); |
| if (src0[lane] == src1[lane]) { |
| vdst[lane] = std::ldexp(src0[lane], 128); |
| } |
| } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { |
| vdst[lane] = std::ldexp(src0[lane], 128); |
| } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL |
| && std::fpclassify(src2[lane] / src1[lane]) |
| == FP_SUBNORMAL) { |
| vcc.setBit(lane, 1); |
| if (src0[lane] == src1[lane]) { |
| vdst[lane] = std::ldexp(src0[lane], 128); |
| } |
| } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { |
| vdst[lane] = std::ldexp(src0[lane], -128); |
| } else if (std::fpclassify(src2[lane] / src1[lane]) |
| == FP_SUBNORMAL) { |
| vcc.setBit(lane, 1); |
| if (src0[lane] == src2[lane]) { |
| vdst[lane] = std::ldexp(src0[lane], 128); |
| } |
| } else if (exp2 <= 53) { |
| vdst[lane] = std::ldexp(src0[lane], 128); |
| } |
| } |
| } |
| |
| vcc.write(); |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_div_fmas_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(ReadsVCC); |
| setFlag(F32); |
| setFlag(FMA); |
| } // Inst_VOP3__V_DIV_FMAS_F32 |
| |
| Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() |
| { |
| } // ~Inst_VOP3__V_DIV_FMAS_F32 |
| |
| // D.f = Special case divide FMA with scale and flags (s0.f = Quotient, |
| // s1.f = Denominator, s2.f = Numerator). |
| void |
| Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| vcc.read(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(vcc.rawData(), lane)) { |
| // The post-scale factor is assumed to be 2**32 for the |
| // single-precision variant, mirroring the 2**64 scale |
| // applied by the F64 variant below. |
| vdst[lane] = std::pow(2, 32) |
| * std::fma(src0[lane], src1[lane], src2[lane]); |
| } else { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } // execute |
| // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- |
| |
| Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_div_fmas_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(ReadsVCC); |
| setFlag(F64); |
| setFlag(FMA); |
| } // Inst_VOP3__V_DIV_FMAS_F64 |
| |
| Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64() |
| { |
| } // ~Inst_VOP3__V_DIV_FMAS_F64 |
| |
| // D.d = Special case divide FMA with scale and flags (s0.d = Quotient, |
| // s1.d = Denominator, s2.d = Numerator). |
| void |
| Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| vcc.read(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (instData.ABS & 0x4) { |
| src2.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| if (extData.NEG & 0x4) { |
| src2.negModifier(); |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (bits(vcc.rawData(), lane)) { |
| vdst[lane] = std::pow(2, 64) |
| * std::fma(src0[lane], src1[lane], src2[lane]); |
| } else { |
| vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_msad_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MSAD_U8 |
| |
| Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8() |
| { |
| } // ~Inst_VOP3__V_MSAD_U8 |
| |
| // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u). |
| void |
| Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_qsad_pk_u16_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_QSAD_PK_U16_U8 |
| |
| Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8() |
| { |
| } // ~Inst_VOP3__V_QSAD_PK_U16_U8 |
| |
| // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], |
| // S1.u[31:0], S2.u[63:0]) |
| void |
| Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mqsad_pk_u16_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MQSAD_PK_U16_U8 |
| |
| Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8() |
| { |
| } // ~Inst_VOP3__V_MQSAD_PK_U16_U8 |
| |
| // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0], |
| // S1.u[31:0], S2.u[63:0]) |
| void |
| Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mqsad_u32_u8", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MQSAD_U32_U8 |
| |
| Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8() |
| { |
| } // ~Inst_VOP3__V_MQSAD_U32_U8 |
| |
| // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0], |
| // S1.u[31:0], S2.u[127:0]) |
| void |
| Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_mad_u64_u32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_U64_U32 |
| |
| Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32() |
| { |
| } // ~Inst_VOP3__V_MAD_U64_U32 |
| |
| // {vcc_out, D.u64} = S0.u32 * S1.u32 + S2.u64. |
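| // The muladd() helper (assumed to come from inst_util.hh) writes the |
| // full 64-bit product-plus-addend into its first argument and returns |
| // the carry-out, which is recorded in the per-lane VCC bit. |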
| void |
| Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| VecOperandU64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| vdst.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], |
| src2[lane])); |
| } |
| } |
| |
| vcc.write(); |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32( |
| InFmt_VOP3_SDST_ENC *iFmt) |
| : Inst_VOP3_SDST_ENC(iFmt, "v_mad_i64_i32") |
| { |
| setFlag(ALU); |
| setFlag(WritesVCC); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_I64_I32 |
| |
| Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32() |
| { |
| } // ~Inst_VOP3__V_MAD_I64_I32 |
| |
| // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64. |
| void |
| Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); |
| ScalarOperandU64 vcc(gpuDynInst, instData.SDST); |
| VecOperandI64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], |
| src2[lane])); |
| } |
| } |
| |
| vcc.write(); |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_F16 |
| |
| Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() |
| { |
| } // ~Inst_VOP3__V_MAD_F16 |
| |
| // D.f16 = S0.f16 * S1.f16 + S2.f16. |
| // Supports round mode, exception flags, saturation. |
| void |
| Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_u16", false) |
| { |
| setFlag(ALU); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_U16 |
| |
| Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() |
| { |
| } // ~Inst_VOP3__V_MAD_U16 |
| |
| // D.u16 = S0.u16 * S1.u16 + S2.u16. |
| // Supports saturation (unsigned 16-bit integer domain). |
| void |
| Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); |
| VecOperandU16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] * src1[lane] + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mad_i16", false) |
| { |
| setFlag(ALU); |
| setFlag(MAD); |
| } // Inst_VOP3__V_MAD_I16 |
| |
| Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() |
| { |
| } // ~Inst_VOP3__V_MAD_I16 |
| |
| // D.i16 = S0.i16 * S1.i16 + S2.i16. |
| // Supports saturation (signed 16-bit integer domain). |
| void |
| Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); |
| VecOperandI16 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src0[lane] * src1[lane] + src2[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_perm_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_PERM_B32 |
| |
| Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() |
| { |
| } // ~Inst_VOP3__V_PERM_B32 |
| |
| // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); |
| // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); |
| // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); |
| // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); |
| // byte permute(byte in[8], byte sel) { |
| // if(sel>=13) then return 0xff; |
| // elsif(sel==12) then return 0x00; |
| // elsif(sel==11) then return in[7][7] * 0xff; |
| // elsif(sel==10) then return in[5][7] * 0xff; |
| // elsif(sel==9) then return in[3][7] * 0xff; |
| // elsif(sel==8) then return in[1][7] * 0xff; |
| // else return in[sel]; |
| // } |
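| // Per the pseudo-code above, with S0 = 0x0d0c0b0a, S1 = 0x04030201, and |
| // S2 = 0x07050301, the selected bytes are in[7] = 0x0d, in[5] = 0x0b, |
| // in[3] = 0x04 and in[1] = 0x02, so D.u = 0x0d0b0402. |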
| void |
| Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| src2.readSrc(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemU64 selector = (VecElemU64)src0[lane]; |
| selector = (selector << 32) | (VecElemU64)src1[lane]; |
| vdst[lane] = 0; |
| |
| DPRINTF(GCN3, "Executing v_perm_b32 src_0 0x%08x, src_1 " |
| "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane], |
| src1[lane], src2[lane], vdst[lane]); |
| DPRINTF(GCN3, "Selector: 0x%08x \n", selector); |
| |
| for (int i = 0; i < 4 ; ++i) { |
| VecElemU32 permuted_val = permute(selector, 0xFF |
| & ((VecElemU32)src2[lane] >> (8 * i))); |
| vdst[lane] |= (permuted_val << (8 * i)); |
| } |
| |
| DPRINTF(GCN3, "v_perm result: 0x%08x\n", vdst[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_fma_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| setFlag(FMA); |
| } // Inst_VOP3__V_FMA_F16 |
| |
| Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16() |
| { |
| } // ~Inst_VOP3__V_FMA_F16 |
| |
| // D.f16 = S0.f16 * S1.f16 + S2.f16. |
| // Fused half precision multiply add. |
| void |
| Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_div_fixup_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_DIV_FIXUP_F16 |
| |
| Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16() |
| { |
| } // ~Inst_VOP3__V_DIV_FIXUP_F16 |
| |
| // sign_out = sign(S1.f16)^sign(S2.f16); |
| // if (S2.f16 == NAN) |
| // D.f16 = Quiet(S2.f16); |
| // else if (S1.f16 == NAN) |
| // D.f16 = Quiet(S1.f16); |
| // else if (S1.f16 == S2.f16 == 0) |
| // # 0/0 |
| // D.f16 = pele_nan(0xfe00); |
| // else if (abs(S1.f16) == abs(S2.f16) == +-INF) |
| // # inf/inf |
| // D.f16 = pele_nan(0xfe00); |
| // else if (S1.f16 ==0 || abs(S2.f16) == +-INF) |
| // # x/0, or inf/y |
| // D.f16 = sign_out ? -INF : INF; |
| // else if (abs(S1.f16) == +-INF || S2.f16 == 0) |
| // # x/inf, 0/y |
| // D.f16 = sign_out ? -0 : 0; |
| // else if ((exp(S2.f16) - exp(S1.f16)) < -150) |
| // D.f16 = sign_out ? -underflow : underflow; |
| // else if (exp(S1.f16) == 255) |
| // D.f16 = sign_out ? -overflow : overflow; |
| // else |
| // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16). |
| // Half precision division fixup. |
| // S0 = Quotient, S1 = Denominator, S2 = Numerator. |
| // Given a numerator, denominator, and quotient from a divide, this opcode |
| // will detect and apply special case numerics, touching up the quotient if |
| // necessary. This opcode also generates invalid, denorm and divide by |
| // zero exceptions caused by the division. |
| void |
| Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pkaccum_u8_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_PKACCUM_U8_F32 |
| |
| Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32 |
| |
| // byte = S1.u[1:0]; bit = byte * 8; |
| // D.u[bit + 7:bit] = flt32_to_uint8(S0.f); |
| // Pack converted value of S0.f into byte S1 of the destination. |
| // SQ translates to V_CVT_PK_U8_F32. |
| // Note: this opcode uses src_c to pass destination in as a source. |
| void |
| Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_p1_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_INTERP_P1_F32 |
| |
| Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32() |
| { |
| } // ~Inst_VOP3__V_INTERP_P1_F32 |
| |
| // D.f = P10 * S.f + P0; |
| void |
| Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_p2_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_INTERP_P2_F32 |
| |
| Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32() |
| { |
| } // ~Inst_VOP3__V_INTERP_P2_F32 |
| |
| // D.f = P20 * S.f + D.f; |
| void |
| Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_mov_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_INTERP_MOV_F32 |
| |
| Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32() |
| { |
| } // ~Inst_VOP3__V_INTERP_MOV_F32 |
| |
| // D.f = {P10,P20,P0}[S.u]; parameter load. |
| void |
| Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_p1ll_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_INTERP_P1LL_F16 |
| |
| Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16() |
| { |
| } // ~Inst_VOP3__V_INTERP_P1LL_F16 |
| |
| // D.f32 = P10.f16 * S0.f32 + P0.f16. |
| void |
| Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_p1lv_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_INTERP_P1LV_F16 |
| |
| Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16() |
| { |
| } // ~Inst_VOP3__V_INTERP_P1LV_F16 |
| |
| void |
| Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_interp_p2_f16", false) |
| { |
| setFlag(ALU); |
| setFlag(F16); |
| } // Inst_VOP3__V_INTERP_P2_F16 |
| |
| Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16() |
| { |
| } // ~Inst_VOP3__V_INTERP_P2_F16 |
| |
| // D.f16 = P20.f16 * S0.f32 + S2.f32. |
| void |
| Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_add_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_ADD_F64 |
| |
| Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64() |
| { |
| } // ~Inst_VOP3__V_ADD_F64 |
| |
| // D.d = S0.d + S1.d. |
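    // The loop below spells out the IEEE-754 special cases explicitly:
    // NaN propagates, (+INF) + (-INF) gives NaN, subnormal inputs are
    // treated as zeros, and (-0.0) + (-0.0) gives -0.0.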
| void |
| Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || |
| std::isnan(src1[lane]) ) { |
| vdst[lane] = NAN; |
| } else if (std::isinf(src0[lane]) && |
| std::isinf(src1[lane])) { |
| if (std::signbit(src0[lane]) != |
| std::signbit(src1[lane])) { |
| vdst[lane] = NAN; |
| } else { |
| vdst[lane] = src0[lane]; |
| } |
| } else if (std::isinf(src0[lane])) { |
| vdst[lane] = src0[lane]; |
| } else if (std::isinf(src1[lane])) { |
| vdst[lane] = src1[lane]; |
| } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| if (std::signbit(src0[lane]) && |
| std::signbit(src1[lane])) { |
| vdst[lane] = -0.0; |
| } else { |
| vdst[lane] = 0.0; |
| } |
| } else { |
| vdst[lane] = src1[lane]; |
| } |
| } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) { |
| if (std::signbit(src0[lane]) && |
| std::signbit(src1[lane])) { |
| vdst[lane] = -0.0; |
| } else { |
| vdst[lane] = 0.0; |
| } |
| } else { |
| vdst[lane] = src0[lane]; |
| } |
| } else { |
| vdst[lane] = src0[lane] + src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_MUL_F64 |
| |
| Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64() |
| { |
| } // ~Inst_VOP3__V_MUL_F64 |
| |
| // D.d = S0.d * S1.d. |
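    // As with v_add_f64, the special cases are handled explicitly below:
    // 0 * INF gives NaN, while a zero/subnormal or infinite operand yields
    // a zero or infinity whose sign is the XOR of the operand signs.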
| void |
| Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || |
| std::isnan(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| !std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src0[lane]) == FP_ZERO) && |
| std::signbit(src0[lane])) { |
| if (std::isinf(src1[lane])) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +0.0; |
| } else { |
| vdst[lane] = -0.0; |
| } |
| } else if (std::isinf(src0[lane]) && |
| !std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (!std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else if (std::isinf(src0[lane]) && |
| std::signbit(src0[lane])) { |
| if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || |
| std::fpclassify(src1[lane]) == FP_ZERO) { |
| vdst[lane] = NAN; |
| } else if (std::signbit(src1[lane])) { |
| vdst[lane] = +INFINITY; |
| } else { |
| vdst[lane] = -INFINITY; |
| } |
| } else { |
| vdst[lane] = src0[lane] * src1[lane]; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_min_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_MIN_F64 |
| |
| Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64() |
| { |
| } // ~Inst_VOP3__V_MIN_F64 |
| |
| // D.d = min(S0.d, S1.d). |
| void |
| Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmin(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_max_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_MAX_F64 |
| |
| Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64() |
| { |
| } // ~Inst_VOP3__V_MAX_F64 |
| |
| // D.d = max(S0.d, S1.d). |
| void |
| Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (instData.ABS & 0x2) { |
| src1.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| if (extData.NEG & 0x2) { |
| src1.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::fmax(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ldexp_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_LDEXP_F64 |
| |
| Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64() |
| { |
| } // ~Inst_VOP3__V_LDEXP_F64 |
| |
    // D.d = pow(2.0, S1.i[31:0]) * S0.d.
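    // e.g. S0.d = 1.5 and S1.i = 4 give D.d = 1.5 * 2^4 = 24.0.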
| void |
| Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| if (instData.ABS & 0x1) { |
| src0.absModifier(); |
| } |
| |
| if (extData.NEG & 0x1) { |
| src0.negModifier(); |
| } |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { |
| vdst[lane] = src0[lane]; |
| } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL |
| || std::fpclassify(src0[lane]) == FP_ZERO) { |
| if (std::signbit(src0[lane])) { |
| vdst[lane] = -0.0; |
| } else { |
| vdst[lane] = +0.0; |
| } |
| } else { |
| vdst[lane] = std::ldexp(src0[lane], src1[lane]); |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_lo_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_LO_U32 |
| |
| Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() |
| { |
| } // ~Inst_VOP3__V_MUL_LO_U32 |
| |
| // D.u = S0.u * S1.u. |
| void |
| Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI64 s0 = (VecElemI64)src0[lane]; |
| VecElemI64 s1 = (VecElemI64)src1[lane]; |
| vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_hi_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_HI_U32 |
| |
| Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32() |
| { |
| } // ~Inst_VOP3__V_MUL_HI_U32 |
| |
| // D.u = (S0.u * S1.u) >> 32. |
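    // e.g. S0.u = S1.u = 0x80000000: the full 64-bit product is
    // 0x4000000000000000, so D.u = 0x40000000.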
| void |
| Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI64 s0 = (VecElemI64)src0[lane]; |
| VecElemI64 s1 = (VecElemI64)src1[lane]; |
| vdst[lane] |
| = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mul_hi_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MUL_HI_I32 |
| |
| Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32() |
| { |
| } // ~Inst_VOP3__V_MUL_HI_I32 |
| |
| // D.i = (S0.i * S1.i) >> 32. |
| void |
| Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandI32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| VecElemI64 s0 = (VecElemI64)src0[lane]; |
| VecElemI64 s1 = (VecElemI64)src1[lane]; |
| vdst[lane] |
| = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ldexp_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_LDEXP_F32 |
| |
| Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32() |
| { |
| } // ~Inst_VOP3__V_LDEXP_F32 |
| |
    // D.f = pow(2.0, S1.i) * S0.f.
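    // e.g. S0.f = 0.75 and S1.i = 2 give D.f = 0.75 * 2^2 = 3.0.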
| void |
| Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); |
| VecOperandF32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = std::ldexp(src0[lane], src1[lane]); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_readlane_b32", true) |
| { |
| setFlag(ALU); |
| setFlag(IgnoreExec); |
| } // Inst_VOP3__V_READLANE_B32 |
| |
| Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32() |
| { |
| } // ~Inst_VOP3__V_READLANE_B32 |
| |
| // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR# |
| // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask. |
| // Input and output modifiers not supported; this is an untyped operation. |
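    // The lane select is masked to the wavefront size (S1 & 0x3f), so a
    // lane select of 70 reads lane 6.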
| void |
| Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); |
| ScalarOperandU32 sdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| sdst = src0[src1.rawData() & 0x3f]; |
| |
| sdst.write(); |
| } |
| |
| Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_writelane_b32", false) |
| { |
| setFlag(ALU); |
| setFlag(IgnoreExec); |
| } // Inst_VOP3__V_WRITELANE_B32 |
| |
| Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32() |
| { |
| } // ~Inst_VOP3__V_WRITELANE_B32 |
| |
| // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data |
| // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores |
| // exec mask. Input and output modifiers not supported; this is an untyped |
| // operation. |
| void |
| Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.read(); |
| src1.read(); |
| vdst.read(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| vdst[src1.rawData() & 0x3f] = src0.rawData(); |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bcnt_u32_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BCNT_U32_B32 |
| |
| Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32() |
| { |
| } // ~Inst_VOP3__V_BCNT_U32_B32 |
| |
| // D.u = CountOneBits(S0.u) + S1.u. Bit count. |
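    // e.g. S0.u = 0x0000f0f0 has 8 bits set, so D.u = 8 + S1.u.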
| void |
| Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = popCount(src0[lane]) + src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mbcnt_lo_u32_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MBCNT_LO_U32_B32 |
| |
| Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32() |
| { |
| } // ~Inst_VOP3__V_MBCNT_LO_U32_B32 |
| |
    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // wavefront (in 0..63).
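    // e.g. for lane 5, ThreadMask[31:0] = 0x0000001f, so only bits set in
    // lanes 0..4 of S0.u are counted.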
| void |
| Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| uint64_t threadMask = 0; |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| threadMask = ((1LL << lane) - 1LL); |
| vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + |
| src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // execute |
| // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- |
| |
| Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_mbcnt_hi_u32_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_MBCNT_HI_U32_B32 |
| |
| Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() |
| { |
| } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 |
| |
| // ThreadMask = (1 << ThreadPosition) - 1; |
| // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. |
| // Masked bit count, ThreadPosition is the position of this thread in the |
| // wavefront (in 0..63). |
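    // e.g. for lane 40, ThreadMask[63:32] = 0x000000ff, so only bits set in
    // lanes 32..39 of S0.u are counted.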
| void |
| Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| uint64_t threadMask = 0; |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| threadMask = ((1LL << lane) - 1LL); |
| vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + |
| src1[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // execute |
| // --- Inst_VOP3__V_LSHLREV_B64 class methods --- |
| |
| Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshlrev_b64", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHLREV_B64 |
| |
| Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() |
| { |
| } // ~Inst_VOP3__V_LSHLREV_B64 |
| |
| // D.u64 = S1.u64 << S0.u[5:0]. |
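    // Note the "rev" operand order: S0 supplies the shift amount and S1 the
    // value being shifted, e.g. S0.u = 4 and S1.u64 = 0x1 give D.u64 = 0x10.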
| void |
| Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| VecOperandU64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] << bits(src0[lane], 5, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_lshrrev_b64", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_LSHRREV_B64 |
| |
| Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64() |
| { |
| } // ~Inst_VOP3__V_LSHRREV_B64 |
| |
| // D.u64 = S1.u64 >> S0.u[5:0]. |
| // The vacated bits are set to zero. |
| void |
| Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); |
| VecOperandU64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_ashrrev_i64", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_ASHRREV_I64 |
| |
| Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64() |
| { |
| } // ~Inst_VOP3__V_ASHRREV_I64 |
| |
| // D.u64 = signext(S1.u64) >> S0.u[5:0]. |
| // The vacated bits are set to the sign bit of the input value. |
| void |
| Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); |
| VecOperandU64 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] |
| = src1[lane] >> bits(src0[lane], 5, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_trig_preop_f64", false) |
| { |
| setFlag(ALU); |
| setFlag(F64); |
| } // Inst_VOP3__V_TRIG_PREOP_F64 |
| |
| Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64() |
| { |
| } // ~Inst_VOP3__V_TRIG_PREOP_F64 |
| |
| void |
| Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_bfm_b32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_BFM_B32 |
| |
| Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32() |
| { |
| } // ~Inst_VOP3__V_BFM_B32 |
| |
| // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0]; |
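    // Bitfield mask, e.g. S0.u = 8 and S1.u = 16 give
    // D.u = ((1 << 8) - 1) << 16 = 0x00ff0000.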
| void |
| Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); |
| ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); |
| VecOperandU32 vdst(gpuDynInst, instData.VDST); |
| |
| src0.readSrc(); |
| src1.readSrc(); |
| |
| /** |
| * input modifiers are supported by FP operations only |
| */ |
| assert(!(instData.ABS & 0x1)); |
| assert(!(instData.ABS & 0x2)); |
| assert(!(instData.ABS & 0x4)); |
| assert(!(extData.NEG & 0x1)); |
| assert(!(extData.NEG & 0x2)); |
| assert(!(extData.NEG & 0x4)); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1) |
| << bits(src1[lane], 4, 0); |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pknorm_i16_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_PKNORM_I16_F32 |
| |
| Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32 |
| |
| // D = {(snorm)S1.f, (snorm)S0.f}. |
| void |
| Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pknorm_u16_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_PKNORM_U16_F32 |
| |
| Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32 |
| |
| // D = {(unorm)S1.f, (unorm)S0.f}. |
| void |
| Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32( |
| InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pkrtz_f16_f32", false) |
| { |
| setFlag(ALU); |
| setFlag(F32); |
| } // Inst_VOP3__V_CVT_PKRTZ_F16_F32 |
| |
| Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32() |
| { |
| } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32 |
| |
| void |
| Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pk_u16_u32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CVT_PK_U16_U32 |
| |
| Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32() |
| { |
| } // ~Inst_VOP3__V_CVT_PK_U16_U32 |
| |
| // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}. |
| void |
| Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3 *iFmt) |
| : Inst_VOP3(iFmt, "v_cvt_pk_i16_i32", false) |
| { |
| setFlag(ALU); |
| } // Inst_VOP3__V_CVT_PK_I16_I32 |
| |
| Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32() |
| { |
| } // ~Inst_VOP3__V_CVT_PK_I16_I32 |
| |
| // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}. |
| void |
| Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_u32") |
| { |
| } // Inst_DS__DS_ADD_U32 |
| |
| Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32() |
| { |
| } // ~Inst_DS__DS_ADD_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_u32") |
| { |
| } // Inst_DS__DS_SUB_U32 |
| |
| Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32() |
| { |
| } // ~Inst_DS__DS_SUB_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_u32") |
| { |
| } // Inst_DS__DS_RSUB_U32 |
| |
| Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32() |
| { |
| } // ~Inst_DS__DS_RSUB_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA - MEM[ADDR]; |
| // RETURN_DATA = tmp. |
| // Subtraction with reversed operands. |
| void |
| Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_u32") |
| { |
| } // Inst_DS__DS_INC_U32 |
| |
| Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() |
| { |
| } // ~Inst_DS__DS_INC_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_u32") |
| { |
| } // Inst_DS__DS_DEC_U32 |
| |
| Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() |
| { |
| } // ~Inst_DS__DS_DEC_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 |
| // (unsigned compare); RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_i32") |
| { |
| } // Inst_DS__DS_MIN_I32 |
| |
| Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() |
| { |
| } // ~Inst_DS__DS_MIN_I32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_i32") |
| { |
| } // Inst_DS__DS_MAX_I32 |
| |
| Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() |
| { |
| } // ~Inst_DS__DS_MAX_I32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_u32") |
| { |
| } // Inst_DS__DS_MIN_U32 |
| |
| Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() |
| { |
| } // ~Inst_DS__DS_MIN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_u32") |
| { |
| } // Inst_DS__DS_MAX_U32 |
| |
| Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() |
| { |
| } // ~Inst_DS__DS_MAX_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_b32") |
| { |
| } // Inst_DS__DS_AND_B32 |
| |
| Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() |
| { |
| } // ~Inst_DS__DS_AND_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_b32") |
| { |
| } // Inst_DS__DS_OR_B32 |
| |
| Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() |
| { |
| } // ~Inst_DS__DS_OR_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_b32") |
| { |
| } // Inst_DS__DS_XOR_B32 |
| |
| Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() |
| { |
| } // ~Inst_DS__DS_XOR_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_mskor_b32") |
| { |
| } // Inst_DS__DS_MSKOR_B32 |
| |
| Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() |
| { |
| } // ~Inst_DS__DS_MSKOR_B32 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2;
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B32 |
| |
| Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() |
| { |
| } // ~Inst_DS__DS_WRITE_B32 |
| |
| // MEM[ADDR] = DATA. |
| // Write dword. |
| void |
| Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA0); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemWrite<VecElemU32>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write2_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE2_B32 |
| |
| Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() |
| { |
| } // ~Inst_DS__DS_WRITE2_B32 |
| |
| // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; |
| // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. |
| // Write 2 dwords. |
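    // The offsets are in dword units, e.g. OFFSET0 = 0 and OFFSET1 = 1
    // write DATA to ADDR_BASE and DATA2 to ADDR_BASE + 4.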
| void |
| Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); |
| ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); |
| |
| addr.read(); |
| data0.read(); |
| data1.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2] |
| = data0[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0 * 4; |
| Addr offset1 = instData.OFFSET1 * 4; |
| |
| initDualMemWrite<VecElemU32>(gpuDynInst, offset0, offset1); |
| } |
| |
| void |
| Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write2st64_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE2ST64_B32 |
| |
| Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32() |
| { |
| } // ~Inst_DS__DS_WRITE2ST64_B32 |
| |
| // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA; |
| // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2; |
| // Write 2 dwords. |
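    // Here the offsets stride by 64 dwords (256 bytes), e.g. OFFSET1 = 1
    // places DATA2 at ADDR_BASE + 256.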
| void |
| Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); |
| ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); |
| |
| addr.read(); |
| data0.read(); |
| data1.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2] |
| = data0[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0 * 4 * 64; |
| Addr offset1 = instData.OFFSET1 * 4 * 64; |
| |
| initDualMemWrite<VecElemU32>(gpuDynInst, offset0, offset1); |
| } |
| |
| void |
| Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| // --- Inst_DS__DS_CMPST_B32 class methods --- |
| |
| Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_b32") |
| { |
| } // Inst_DS__DS_CMPST_B32 |
| |
| Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() |
| { |
| } // ~Inst_DS__DS_CMPST_B32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| // Compare and store. |
| void |
| Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_CMPST_F32 |
| |
| Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() |
| { |
| } // ~Inst_DS__DS_CMPST_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MIN_F32 |
| |
| Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() |
| { |
| } // ~Inst_DS__DS_MIN_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (cmp < tmp) ? src : tmp. |
| void |
| Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MAX_F32 |
| |
| Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() |
| { |
| } // ~Inst_DS__DS_MAX_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (tmp > cmp) ? src : tmp. |
| void |
| Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_nop") |
| { |
| setFlag(Nop); |
| } // Inst_DS__DS_NOP |
| |
| Inst_DS__DS_NOP::~Inst_DS__DS_NOP() |
| { |
| } // ~Inst_DS__DS_NOP |
| |
| // Do nothing. |
| void |
| Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_ADD_F32 |
| |
| Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() |
| { |
| } // ~Inst_DS__DS_ADD_F32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b8") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B8 |
| |
| Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() |
| { |
| } // ~Inst_DS__DS_WRITE_B8 |
| |
| // MEM[ADDR] = DATA[7:0]. |
| void |
| Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU8 data(gpuDynInst, extData.DATA0); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemWrite<VecElemU8>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| // --- Inst_DS__DS_WRITE_B16 class methods --- |
| |
| Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b16") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B16 |
| |
| Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() |
| { |
| } // ~Inst_DS__DS_WRITE_B16 |
| |
| // MEM[ADDR] = DATA[15:0] |
| void |
| Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU16 data(gpuDynInst, extData.DATA0); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemWrite<VecElemU16>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| // --- Inst_DS__DS_ADD_RTN_U32 class methods --- |
| |
| Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_rtn_u32") |
| { |
| } // Inst_DS__DS_ADD_RTN_U32 |
| |
| Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() |
| { |
| } // ~Inst_DS__DS_ADD_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_rtn_u32") |
| { |
| } // Inst_DS__DS_SUB_RTN_U32 |
| |
| Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() |
| { |
| } // ~Inst_DS__DS_SUB_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_rtn_u32") |
| { |
| } // Inst_DS__DS_RSUB_RTN_U32 |
| |
| Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() |
| { |
| } // ~Inst_DS__DS_RSUB_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA - MEM[ADDR]; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_rtn_u32") |
| { |
| } // Inst_DS__DS_INC_RTN_U32 |
| |
| Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() |
| { |
| } // ~Inst_DS__DS_INC_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_rtn_u32") |
| { |
| } // Inst_DS__DS_DEC_RTN_U32 |
| |
| Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() |
| { |
| } // ~Inst_DS__DS_DEC_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 |
| // (unsigned compare); RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_i32") |
| { |
| } // Inst_DS__DS_MIN_RTN_I32 |
| |
| Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_I32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_i32") |
| { |
| } // Inst_DS__DS_MAX_RTN_I32 |
| |
| Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_I32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_u32") |
| { |
| } // Inst_DS__DS_MIN_RTN_U32 |
| |
| Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_u32") |
| { |
| } // Inst_DS__DS_MAX_RTN_U32 |
| |
| Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_U32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_rtn_b32") |
| { |
| } // Inst_DS__DS_AND_RTN_B32 |
| |
| Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() |
| { |
| } // ~Inst_DS__DS_AND_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_rtn_b32") |
| { |
| } // Inst_DS__DS_OR_RTN_B32 |
| |
| Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() |
| { |
| } // ~Inst_DS__DS_OR_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_rtn_b32") |
| { |
| } // Inst_DS__DS_XOR_RTN_B32 |
| |
| Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() |
| { |
| } // ~Inst_DS__DS_XOR_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_mskor_rtn_b32") |
| { |
| } // Inst_DS__DS_MSKOR_RTN_B32 |
| |
| Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() |
| { |
| } // ~Inst_DS__DS_MSKOR_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2;
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") |
| { |
| } // Inst_DS__DS_WRXCHG_RTN_B32 |
| |
| Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() |
| { |
| } // ~Inst_DS__DS_WRXCHG_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA; |
| // RETURN_DATA = tmp. |
| // Write-exchange operation. |
| void |
| Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") |
| { |
| } // Inst_DS__DS_WRXCHG2_RTN_B32 |
| |
| Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() |
| { |
| } // ~Inst_DS__DS_WRXCHG2_RTN_B32 |
| |
| // Write-exchange 2 separate dwords. |
| void |
| Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( |
| InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") |
| { |
| } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 |
| |
| Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() |
| { |
| } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 |
| |
| // Write-exchange 2 separate dwords with a stride of 64 dwords. |
| void |
| Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_rtn_b32") |
| { |
| } // Inst_DS__DS_CMPST_RTN_B32 |
| |
| Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() |
| { |
| } // ~Inst_DS__DS_CMPST_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| // Compare and store. |
| void |
| Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_rtn_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_CMPST_RTN_F32 |
| |
| Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() |
| { |
| } // ~Inst_DS__DS_CMPST_RTN_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MIN_RTN_F32 |
| |
| Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (cmp < tmp) ? src : tmp. |
| void |
| Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MAX_RTN_F32 |
| |
| Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_F32 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (tmp > cmp) ? src : tmp. |
| void |
| Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrap_rtn_b32") |
| { |
| } // Inst_DS__DS_WRAP_RTN_B32 |
| |
| Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() |
| { |
| } // ~Inst_DS__DS_WRAP_RTN_B32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_rtn_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_ADD_RTN_F32 |
| |
| Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() |
| { |
| } // ~Inst_DS__DS_ADD_RTN_F32 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
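| // Not implemented in this model; the instruction currently executes |
| // as a no-op. |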
| } |
| |
| Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_B32 |
| |
| Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() |
| { |
| } // ~Inst_DS__DS_READ_B32 |
| |
| // RETURN_DATA = MEM[ADDR]. |
| // Dword read. |
| void |
| Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
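| // For example, OFFSET1 = 0x1 and OFFSET0 = 0x10 combine into the |
| // 16-bit byte offset (0x1 << 8) | 0x10 = 0x110, which initMemRead() |
| // then applies to each lane's LDS address. |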
| |
| initMemRead<VecElemU32>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // completeAcc |
| |
| Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read2_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ2_B32 |
| |
| Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() |
| { |
| } // ~Inst_DS__DS_READ2_B32 |
| |
| // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; |
| // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. |
| // Read 2 dwords. |
| void |
| Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0 * 4; |
| Addr offset1 = instData.OFFSET1 * 4; |
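| // Here OFFSET0 and OFFSET1 are independent dword-granularity offsets, |
| // e.g., OFFSET0 = 1 and OFFSET1 = 2 select the dwords at byte offsets |
| // 4 and 8 from each lane's base address. |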
| |
| initDualMemRead<VecElemU32>(gpuDynInst, offset0, offset1); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2 + 1]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| } // completeAcc |
| |
| Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read2st64_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ2ST64_B32 |
| |
| Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() |
| { |
| } // ~Inst_DS__DS_READ2ST64_B32 |
| |
| // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; |
| // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. |
| // Read 2 dwords. |
| void |
| Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = (instData.OFFSET0 * 4 * 64); |
| Addr offset1 = (instData.OFFSET1 * 4 * 64); |
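| // The st64 variant strides the offsets by 64 dwords, so OFFSET1 = 1 |
| // corresponds to a byte offset of 1 * 4 * 64 = 256. |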
| |
| initDualMemRead<VecElemU32>(gpuDynInst, offset0, offset1); |
| } |
| |
| void |
| Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2 + 1]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| } |
| // --- Inst_DS__DS_READ_I8 class methods --- |
| |
| Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_i8") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_I8 |
| |
| Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() |
| { |
| } // ~Inst_DS__DS_READ_I8 |
| |
| // RETURN_DATA = signext(MEM[ADDR][7:0]). |
| // Signed byte read. |
| void |
| Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_u8") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_U8 |
| |
| Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() |
| { |
| } // ~Inst_DS__DS_READ_U8 |
| |
| // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. |
| // Unsigned byte read. |
| void |
| Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemRead<VecElemU8>(gpuDynInst, offset); |
| } // initiateAcc |
| |
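| // The cast from VecElemU8 to VecElemU32 zero-extends each byte, which |
| // implements the {24'h0,MEM[ADDR][7:0]} semantics described above. |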
| void |
| Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (VecElemU32)(reinterpret_cast<VecElemU8*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // completeAcc |
| // --- Inst_DS__DS_READ_I16 class methods --- |
| |
| Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_i16") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_I16 |
| |
| Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() |
| { |
| } // ~Inst_DS__DS_READ_I16 |
| |
| // RETURN_DATA = signext(MEM[ADDR][15:0]). |
| // Signed short read. |
| void |
| Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_u16") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_U16 |
| |
| Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() |
| { |
| } // ~Inst_DS__DS_READ_U16 |
| |
| // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. |
| // Unsigned short read. |
| void |
| Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemRead<VecElemU16>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (VecElemU32)(reinterpret_cast<VecElemU16*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // completeAcc |
| // --- Inst_DS__DS_SWIZZLE_B32 class methods --- |
| |
| Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_swizzle_b32") |
| { |
| setFlag(Load); |
| } // Inst_DS__DS_SWIZZLE_B32 |
| |
| Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() |
| { |
| } // ~Inst_DS__DS_SWIZZLE_B32 |
| |
| // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). |
| // Dword swizzle; no data is written to LDS memory. |
| void |
| Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| wf->rdLmReqsInPipe--; |
| wf->validateRequestCounters(); |
| |
| if (gpuDynInst->exec_mask.none()) { |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit() |
| ->cyclesToTicks(Cycles(24))); |
| |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA0); |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| /** |
| * The "DS pattern" is comprised of both offset fields. That is, the |
| * swizzle pattern between lanes. Bit 15 of the DS pattern dictates |
| * which swizzle mode to use. There are two different swizzle |
| * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use |
| * QDMode else use Bit-masks mode. The remaining bits dictate how to |
| * swizzle the lanes. |
| * |
| * QDMode: Chunks the lanes into 4s and swizzles among them. |
| * Bits 7:6 dictate where lane 3 (of the current chunk) |
| * gets its data, 5:4 lane 2, etc. |
| * |
| * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. |
| * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 |
| * is the and_mask. Each lane is swizzled by performing |
| * the appropriate operation using these masks. |
| */ |
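| /** |
| * Worked example (derived from the index arithmetic below): a DS |
| * pattern of 0x041F has bit 15 clear, so Bit-masks mode is used with |
| * and_mask = 0x1F, or_mask = 0x0 and xor_mask = 0x1; every lane then |
| * swaps data with its neighbor (index = ((lane & 0x1F) | 0x0) ^ 0x1). |
| * A pattern of 0x801B selects QDMode and reverses each group of four |
| * lanes, since bits 1:0 = 3, 3:2 = 2, 5:4 = 1 and 7:6 = 0. |
| */ |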
| VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); |
| |
| data.read(); |
| |
| if (bits(ds_pattern, 15)) { |
| // QDMode |
| for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { |
| /** |
| * This operation allows data sharing between groups |
| * of four consecutive threads. Note the increment by |
| * 4 in the for loop. |
| */ |
| if (gpuDynInst->exec_mask[lane]) { |
| int index0 = lane + bits(ds_pattern, 1, 0); |
| panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " |
| "is out of bounds.\n", gpuDynInst->disassemble(), |
| index0); |
| vdst[lane] |
| = gpuDynInst->exec_mask[index0] ? data[index0]: 0; |
| } |
| if (gpuDynInst->exec_mask[lane + 1]) { |
| int index1 = lane + bits(ds_pattern, 3, 2); |
| panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " |
| "is out of bounds.\n", gpuDynInst->disassemble(), |
| index1); |
| vdst[lane + 1] |
| = gpuDynInst->exec_mask[index1] ? data[index1]: 0; |
| } |
| if (gpuDynInst->exec_mask[lane + 2]) { |
| int index2 = lane + bits(ds_pattern, 5, 4); |
| panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " |
| "is out of bounds.\n", gpuDynInst->disassemble(), |
| index2); |
| vdst[lane + 2] |
| = gpuDynInst->exec_mask[index2] ? data[index2]: 0; |
| } |
| if (gpuDynInst->exec_mask[lane + 3]) { |
| int index3 = lane + bits(ds_pattern, 7, 6); |
| panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " |
| "is out of bounds.\n", gpuDynInst->disassemble(), |
| index3); |
| vdst[lane + 3] |
| = gpuDynInst->exec_mask[index3] ? data[index3]: 0; |
| } |
| } |
| } else { |
| // Bit Mode |
| int and_mask = bits(ds_pattern, 4, 0); |
| int or_mask = bits(ds_pattern, 9, 5); |
| int xor_mask = bits(ds_pattern, 14, 10); |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| int index = (((lane & and_mask) | or_mask) ^ xor_mask); |
| // Adjust for the next 32 lanes. |
| if (lane > 31) { |
| index += 32; |
| } |
| panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " |
| "out of bounds.\n", gpuDynInst->disassemble(), |
| index); |
| vdst[lane] |
| = gpuDynInst->exec_mask[index] ? data[index] : 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } // execute |
| // --- Inst_DS__DS_PERMUTE_B32 class methods --- |
| |
| Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_permute_b32") |
| { |
| setFlag(MemoryRef); |
| /** |
| * While this operation doesn't actually use DS storage we classify |
| * it as a load here because it does a writeback to a VGPR, which |
| * fits in better with the LDS pipeline logic. |
| */ |
| setFlag(Load); |
| } // Inst_DS__DS_PERMUTE_B32 |
| |
| Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() |
| { |
| } // ~Inst_DS__DS_PERMUTE_B32 |
| |
| // Forward permute. |
| void |
| Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit() |
| ->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA0); |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| addr.read(); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| /** |
| * One of the offset fields can be used for the index. |
| * It is assumed OFFSET0 would be used, as OFFSET1 is |
| * typically only used for DS ops that operate on two |
| * disparate pieces of data. |
| */ |
| assert(!instData.OFFSET1); |
| /** |
| * The address provided is a byte address, but VGPRs are |
| * 4 bytes, so we must divide by 4 to get the actual VGPR |
| * index. Additionally, the index is calculated modulo the |
| * WF size, 64 in this case, so we simply extract bits 7-2. |
| */ |
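| // For instance, a byte address of 0x0C with OFFSET0 == 0 selects |
| // VGPR index 3 (bits 7:2 of 0x0C). |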
| int index = bits(addr[lane] + instData.OFFSET0, 7, 2); |
| panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " |
| "of bounds.\n", gpuDynInst->disassemble(), index); |
| /** |
| * If the shuffled index corresponds to a lane that is |
| * inactive then this instruction writes a 0 to the active |
| * lane in VDST. |
| */ |
| if (wf->execMask(index)) { |
| vdst[index] = data[lane]; |
| } else { |
| vdst[index] = 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| |
| wf->decLGKMInstsIssued(); |
| wf->rdLmReqsInPipe--; |
| wf->validateRequestCounters(); |
| } // execute |
| // --- Inst_DS__DS_BPERMUTE_B32 class methods --- |
| |
| Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_bpermute_b32") |
| { |
| setFlag(MemoryRef); |
| /** |
| * While this operation doesn't actually use DS storage we classify |
| * it as a load here because it does a writeback to a VGPR, which |
| * fits in better with the LDS pipeline logic. |
| */ |
| setFlag(Load); |
| } // Inst_DS__DS_BPERMUTE_B32 |
| |
| Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() |
| { |
| } // ~Inst_DS__DS_BPERMUTE_B32 |
| |
| // Backward permute. |
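| // Unlike ds_permute_b32, which scatters (vdst[index] = data[lane]), |
| // this instruction gathers: each active lane reads the DATA0 VGPR of |
| // the lane selected by its computed index (vdst[lane] = data[index]). |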
| void |
| Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit() |
| ->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA0); |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| addr.read(); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| /** |
| * One of the offset fields can be used for the index. |
| * It is assumed OFFSET0 would be used, as OFFSET1 is |
| * typically only used for DS ops that operate on two |
| * disparate pieces of data. |
| */ |
| assert(!instData.OFFSET1); |
| /** |
| * The address provided is a byte address, but VGPRs are |
| * 4 bytes, so we must divide by 4 to get the actual VGPR |
| * index. Additionally, the index is calculated modulo the |
| * WF size, 64 in this case, so we simply extract bits 7-2. |
| */ |
| int index = bits(addr[lane] + instData.OFFSET0, 7, 2); |
| panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " |
| "of bounds.\n", gpuDynInst->disassemble(), index); |
| /** |
| * If the shuffled index corresponds to a lane that is |
| * inactive then this instruction writes a 0 to the active |
| * lane in VDST. |
| */ |
| if (wf->execMask(index)) { |
| vdst[lane] = data[index]; |
| } else { |
| vdst[lane] = 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| |
| wf->decLGKMInstsIssued(); |
| wf->rdLmReqsInPipe--; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| // --- Inst_DS__DS_ADD_U64 class methods --- |
| |
| Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_u64") |
| { |
| } // Inst_DS__DS_ADD_U64 |
| |
| Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() |
| { |
| } // ~Inst_DS__DS_ADD_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_u64") |
| { |
| } // Inst_DS__DS_SUB_U64 |
| |
| Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() |
| { |
| } // ~Inst_DS__DS_SUB_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_u64") |
| { |
| } // Inst_DS__DS_RSUB_U64 |
| |
| Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() |
| { |
| } // ~Inst_DS__DS_RSUB_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA - MEM[ADDR]; |
| // RETURN_DATA = tmp. |
| // Subtraction with reversed operands. |
| void |
| Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_u64") |
| { |
| } // Inst_DS__DS_INC_U64 |
| |
| Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() |
| { |
| } // ~Inst_DS__DS_INC_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_u64") |
| { |
| } // Inst_DS__DS_DEC_U64 |
| |
| Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() |
| { |
| } // ~Inst_DS__DS_DEC_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 |
| // (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_i64") |
| { |
| } // Inst_DS__DS_MIN_I64 |
| |
| Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() |
| { |
| } // ~Inst_DS__DS_MIN_I64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_i64") |
| { |
| } // Inst_DS__DS_MAX_I64 |
| |
| Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() |
| { |
| } // ~Inst_DS__DS_MAX_I64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_u64") |
| { |
| } // Inst_DS__DS_MIN_U64 |
| |
| Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() |
| { |
| } // ~Inst_DS__DS_MIN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_u64") |
| { |
| } // Inst_DS__DS_MAX_U64 |
| |
| Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() |
| { |
| } // ~Inst_DS__DS_MAX_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_b64") |
| { |
| } // Inst_DS__DS_AND_B64 |
| |
| Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() |
| { |
| } // ~Inst_DS__DS_AND_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_b64") |
| { |
| } // Inst_DS__DS_OR_B64 |
| |
| Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() |
| { |
| } // ~Inst_DS__DS_OR_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_b64") |
| { |
| } // Inst_DS__DS_XOR_B64 |
| |
| Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() |
| { |
| } // ~Inst_DS__DS_XOR_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_mskor_b64") |
| { |
| } // Inst_DS__DS_MSKOR_B64 |
| |
| Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() |
| { |
| } // ~Inst_DS__DS_MSKOR_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2; |
| // RETURN_DATA = tmp. |
| void |
| Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B64 |
| |
| Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() |
| { |
| } // ~Inst_DS__DS_WRITE_B64 |
| |
| // MEM[ADDR] = DATA. |
| // Write qword. |
| void |
| Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA0); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemWrite<VecElemU64>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write2_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE2_B64 |
| |
| Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() |
| { |
| } // ~Inst_DS__DS_WRITE2_B64 |
| |
| // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; |
| // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. |
| // Write 2 qwords. |
| void |
| Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); |
| ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); |
| |
| addr.read(); |
| data0.read(); |
| data1.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (wf->execMask(lane)) { |
| (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2] = data0[lane]; |
| (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; |
| } |
| } |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0 * 8; |
| Addr offset1 = instData.OFFSET1 * 8; |
| |
| initDualMemWrite<VecElemU64>(gpuDynInst, offset0, offset1); |
| } |
| |
| void |
| Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write2st64_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE2ST64_B64 |
| |
| Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() |
| { |
| } // ~Inst_DS__DS_WRITE2ST64_B64 |
| |
| // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; |
| // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; |
| // Write 2 qwords. |
| void |
| Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_b64") |
| { |
| } // Inst_DS__DS_CMPST_B64 |
| |
| Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() |
| { |
| } // ~Inst_DS__DS_CMPST_B64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| // Compare and store. |
| void |
| Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_CMPST_F64 |
| |
| Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() |
| { |
| } // ~Inst_DS__DS_CMPST_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MIN_F64 |
| |
| Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() |
| { |
| } // ~Inst_DS__DS_MIN_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (cmp < tmp) ? src : tmp. |
| void |
| Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MAX_F64 |
| |
| Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() |
| { |
| } // ~Inst_DS__DS_MAX_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (tmp > cmp) ? src : tmp. |
| void |
| Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_rtn_u64") |
| { |
| } // Inst_DS__DS_ADD_RTN_U64 |
| |
| Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() |
| { |
| } // ~Inst_DS__DS_ADD_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_rtn_u64") |
| { |
| } // Inst_DS__DS_SUB_RTN_U64 |
| |
| Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() |
| { |
| } // ~Inst_DS__DS_SUB_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_rtn_u64") |
| { |
| } // Inst_DS__DS_RSUB_RTN_U64 |
| |
| Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() |
| { |
| } // ~Inst_DS__DS_RSUB_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA - MEM[ADDR]; |
| // RETURN_DATA = tmp. |
| // Subtraction with reversed operands. |
| void |
| Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_rtn_u64") |
| { |
| } // Inst_DS__DS_INC_RTN_U64 |
| |
| Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() |
| { |
| } // ~Inst_DS__DS_INC_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_rtn_u64") |
| { |
| } // Inst_DS__DS_DEC_RTN_U64 |
| |
| Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() |
| { |
| } // ~Inst_DS__DS_DEC_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 |
| // (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_i64") |
| { |
| } // Inst_DS__DS_MIN_RTN_I64 |
| |
| Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_I64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_i64") |
| { |
| } // Inst_DS__DS_MAX_RTN_I64 |
| |
| Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_I64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_u64") |
| { |
| } // Inst_DS__DS_MIN_RTN_U64 |
| |
| Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_u64") |
| { |
| } // Inst_DS__DS_MAX_RTN_U64 |
| |
| Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_U64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_rtn_b64") |
| { |
| } // Inst_DS__DS_AND_RTN_B64 |
| |
| Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() |
| { |
| } // ~Inst_DS__DS_AND_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_rtn_b64") |
| { |
| } // Inst_DS__DS_OR_RTN_B64 |
| |
| Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() |
| { |
| } // ~Inst_DS__DS_OR_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_rtn_b64") |
| { |
| } // Inst_DS__DS_XOR_RTN_B64 |
| |
| Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() |
| { |
| } // ~Inst_DS__DS_XOR_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_mskor_rtn_b64") |
| { |
| } // Inst_DS__DS_MSKOR_RTN_B64 |
| |
| Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() |
| { |
| } // ~Inst_DS__DS_MSKOR_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2; |
| // RETURN_DATA = tmp. |
| // Masked qword OR, D0 contains the mask and D1 contains the new value. |
| void |
| Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") |
| { |
| } // Inst_DS__DS_WRXCHG_RTN_B64 |
| |
| Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() |
| { |
| } // ~Inst_DS__DS_WRXCHG_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA; |
| // RETURN_DATA = tmp. |
| // Write-exchange operation. |
| void |
| Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") |
| { |
| } // Inst_DS__DS_WRXCHG2_RTN_B64 |
| |
| Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() |
| { |
| } // ~Inst_DS__DS_WRXCHG2_RTN_B64 |
| |
| // Write-exchange 2 separate qwords. |
| void |
| Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( |
| InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") |
| { |
| } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 |
| |
| Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() |
| { |
| } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 |
| |
| // Write-exchange 2 qwords with a stride of 64 qwords. |
| void |
| Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_rtn_b64") |
| { |
| } // Inst_DS__DS_CMPST_RTN_B64 |
| |
| Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() |
| { |
| } // ~Inst_DS__DS_CMPST_RTN_B64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| // Compare and store. |
| void |
| Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_cmpst_rtn_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_CMPST_RTN_F64 |
| |
| Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() |
| { |
| } // ~Inst_DS__DS_CMPST_RTN_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA2; |
| // cmp = DATA; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_rtn_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MIN_RTN_F64 |
| |
| Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() |
| { |
| } // ~Inst_DS__DS_MIN_RTN_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (cmp < tmp) ? src : tmp. |
| void |
| Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_rtn_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MAX_RTN_F64 |
| |
| Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() |
| { |
| } // ~Inst_DS__DS_MAX_RTN_F64 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA; |
| // cmp = DATA2; |
| // MEM[ADDR] = (tmp > cmp) ? src : tmp. |
| void |
| Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_B64 |
| |
| Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() |
| { |
| } // ~Inst_DS__DS_READ_B64 |
| |
| // RETURN_DATA = MEM[ADDR]. |
| // Read 1 qword. |
| void |
| Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0; |
| Addr offset1 = instData.OFFSET1; |
| Addr offset = (offset1 << 8) | offset0; |
| |
| initMemRead<VecElemU64>(gpuDynInst, offset); |
| } // initiateAcc |
| |
| void |
| Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU64 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } // completeAcc |
| |
| Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read2_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ2_B64 |
| |
| Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() |
| { |
| } // ~Inst_DS__DS_READ2_B64 |
| |
| // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; |
| // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. |
| // Read 2 qwords. |
| void |
| Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = instData.OFFSET0 * 8; |
| Addr offset1 = instData.OFFSET1 * 8; |
| |
| initDualMemRead<VecElemU64>(gpuDynInst, offset0, offset1); |
| } // initiateAcc |
| |
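| // Each 64-bit result occupies a pair of consecutive 32-bit VGPRs, so |
| // the second qword is written starting at VDST + 2 rather than |
| // VDST + 1. |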
| void |
| Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU64 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2]; |
| vdst1[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2 + 1]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| } // completeAcc |
| |
| Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read2st64_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ2ST64_B64 |
| |
| Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() |
| { |
| } // ~Inst_DS__DS_READ2ST64_B64 |
| |
| // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; |
| // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. |
| // Read 2 qwords. |
| void |
| Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set( |
| gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); |
| ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); |
| |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| Addr offset0 = (instData.OFFSET0 * 8 * 64); |
| Addr offset1 = (instData.OFFSET1 * 8 * 64); |
| |
| initDualMemRead<VecElemU64>(gpuDynInst, offset0, offset1); |
| } |
| |
| void |
| Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU64 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2]; |
| vdst1[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane * 2 + 1]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| } |
| |
| Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( |
| InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") |
| { |
| } // Inst_DS__DS_CONDXCHG32_RTN_B64 |
| |
| Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() |
| { |
| } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 |
| |
| // Conditional write exchange. |
| void |
| Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_src2_u32") |
| { |
| } // Inst_DS__DS_ADD_SRC2_U32 |
| |
| Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_ADD_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] + MEM[B]. |
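| // For example, with offset1[7] == 0, offset1 == 0x01 and offset0 == 0x00 |
| // the concatenation {offset1[6],offset1[6:0],offset0} is 0x0100, so |
| // B = A + 4 * 0x0100 = A + 1024 bytes. |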
| void |
| Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_src2_u32") |
| { |
| } // Inst_DS__DS_SUB_SRC2_U32 |
| |
| Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_SUB_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] - MEM[B]. |
| void |
| Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_src2_u32") |
| { |
| } // Inst_DS__DS_RSUB_SRC2_U32 |
| |
| Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_RSUB_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[B] - MEM[A]. |
| void |
| Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_src2_u32") |
| { |
| } // Inst_DS__DS_INC_SRC2_U32 |
| |
| Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_INC_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). |
| void |
| Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_src2_u32") |
| { |
| } // Inst_DS__DS_DEC_SRC2_U32 |
| |
| Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_DEC_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). |
| // Uint decrement. |
| void |
| Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_i32") |
| { |
| } // Inst_DS__DS_MIN_SRC2_I32 |
| |
| Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_I32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = min(MEM[A], MEM[B]). |
| void |
| Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_i32") |
| { |
| } // Inst_DS__DS_MAX_SRC2_I32 |
| |
| Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_I32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = max(MEM[A], MEM[B]). |
| void |
| Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_u32") |
| { |
| } // Inst_DS__DS_MIN_SRC2_U32 |
| |
| Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = min(MEM[A], MEM[B]). |
| void |
| Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_u32") |
| { |
| } // Inst_DS__DS_MAX_SRC2_U32 |
| |
| Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_U32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = max(MEM[A], MEM[B]). |
| void |
| Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_src2_b32") |
| { |
| } // Inst_DS__DS_AND_SRC2_B32 |
| |
| Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32() |
| { |
| } // ~Inst_DS__DS_AND_SRC2_B32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] & MEM[B]. |
| void |
| Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_src2_b32") |
| { |
| } // Inst_DS__DS_OR_SRC2_B32 |
| |
| Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32() |
| { |
| } // ~Inst_DS__DS_OR_SRC2_B32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] | MEM[B]. |
| void |
| Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_src2_b32") |
| { |
| } // Inst_DS__DS_XOR_SRC2_B32 |
| |
| Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32() |
| { |
| } // ~Inst_DS__DS_XOR_SRC2_B32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] ^ MEM[B]. |
| void |
| Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_src2_b32") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_SRC2_B32 |
| |
| Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() |
| { |
| } // ~Inst_DS__DS_WRITE_SRC2_B32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[B]. |
| // Write dword. |
| void |
| Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MIN_SRC2_F32 |
| |
| Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_F32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. |
| void |
| Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_MAX_SRC2_F32 |
| |
| Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_F32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. |
| void |
| Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_src2_f32") |
| { |
| setFlag(F32); |
| } // Inst_DS__DS_ADD_SRC2_F32 |
| |
| Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() |
| { |
| } // ~Inst_DS__DS_ADD_SRC2_F32 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[B] + MEM[A]. |
| void |
| Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( |
| InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_sema_release_all") |
| { |
| } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL |
| |
| Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() |
| { |
| } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL |
| |
| void |
| Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_init") |
| { |
| } // Inst_DS__DS_GWS_INIT |
| |
| Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() |
| { |
| } // ~Inst_DS__DS_GWS_INIT |
| |
| void |
| Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_sema_v") |
| { |
| } // Inst_DS__DS_GWS_SEMA_V |
| |
| Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() |
| { |
| } // ~Inst_DS__DS_GWS_SEMA_V |
| |
    // GWS semaphore V (signal): increment the semaphore count of the
    // selected GWS resource.
    void
| Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_sema_br") |
| { |
| } // Inst_DS__DS_GWS_SEMA_BR |
| |
| Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() |
| { |
| } // ~Inst_DS__DS_GWS_SEMA_BR |
| |
    // GWS semaphore bulk release: release multiple wavefronts waiting on
    // the selected GWS semaphore.
    void
| Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_sema_p") |
| { |
| } // Inst_DS__DS_GWS_SEMA_P |
| |
| Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() |
| { |
| } // ~Inst_DS__DS_GWS_SEMA_P |
| |
    // GWS semaphore P (wait): block until the semaphore count is nonzero,
    // then decrement it.
    void
| Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_gws_barrier") |
| { |
| } // Inst_DS__DS_GWS_BARRIER |
| |
| Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() |
| { |
| } // ~Inst_DS__DS_GWS_BARRIER |
| |
    // GWS barrier: synchronize wavefronts across work-groups using the
    // selected GWS resource.
    void
| Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_consume") |
| { |
| } // Inst_DS__DS_CONSUME |
| |
| Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() |
| { |
| } // ~Inst_DS__DS_CONSUME |
| |
    // Consume: atomically subtract the number of active lanes from the
    // append/consume counter and return its previous value.
    void
| Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_append") |
| { |
| } // Inst_DS__DS_APPEND |
| |
| Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() |
| { |
| } // ~Inst_DS__DS_APPEND |
| |
    // Append: atomically add the number of active lanes to the
    // append/consume counter and return its previous value.
    void
| Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_ordered_count") |
| { |
| } // Inst_DS__DS_ORDERED_COUNT |
| |
| Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() |
| { |
| } // ~Inst_DS__DS_ORDERED_COUNT |
| |
    // Ordered count: GDS-only counter operation whose requests are serviced
    // in wavefront creation order.
    void
| Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_add_src2_u64") |
| { |
| } // Inst_DS__DS_ADD_SRC2_U64 |
| |
| Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_ADD_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] + MEM[B]. |
| void |
| Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_sub_src2_u64") |
| { |
| } // Inst_DS__DS_SUB_SRC2_U64 |
| |
| Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_SUB_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] - MEM[B]. |
| void |
| Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_rsub_src2_u64") |
| { |
| } // Inst_DS__DS_RSUB_SRC2_U64 |
| |
| Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_RSUB_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[B] - MEM[A]. |
| void |
| Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_inc_src2_u64") |
| { |
| } // Inst_DS__DS_INC_SRC2_U64 |
| |
| Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_INC_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). |
| void |
| Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_dec_src2_u64") |
| { |
| } // Inst_DS__DS_DEC_SRC2_U64 |
| |
| Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_DEC_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). |
    // Unsigned 64-bit decrement.
| void |
| Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_i64") |
| { |
| } // Inst_DS__DS_MIN_SRC2_I64 |
| |
| Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_I64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
    // MEM[A] = min(MEM[A], MEM[B]) (signed comparison).
| void |
| Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_i64") |
| { |
| } // Inst_DS__DS_MAX_SRC2_I64 |
| |
| Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_I64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
    // MEM[A] = max(MEM[A], MEM[B]) (signed comparison).
| void |
| Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_u64") |
| { |
| } // Inst_DS__DS_MIN_SRC2_U64 |
| |
| Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
    // MEM[A] = min(MEM[A], MEM[B]) (unsigned comparison).
| void |
| Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_u64") |
| { |
| } // Inst_DS__DS_MAX_SRC2_U64 |
| |
| Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_U64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
    // MEM[A] = max(MEM[A], MEM[B]) (unsigned comparison).
| void |
| Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_and_src2_b64") |
| { |
| } // Inst_DS__DS_AND_SRC2_B64 |
| |
| Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() |
| { |
| } // ~Inst_DS__DS_AND_SRC2_B64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] & MEM[B]. |
| void |
| Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_or_src2_b64") |
| { |
| } // Inst_DS__DS_OR_SRC2_B64 |
| |
| Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() |
| { |
| } // ~Inst_DS__DS_OR_SRC2_B64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] | MEM[B]. |
| void |
| Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_xor_src2_b64") |
| { |
| } // Inst_DS__DS_XOR_SRC2_B64 |
| |
| Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() |
| { |
| } // ~Inst_DS__DS_XOR_SRC2_B64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[A] ^ MEM[B]. |
| void |
| Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_src2_b64") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_SRC2_B64 |
| |
| Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() |
| { |
| } // ~Inst_DS__DS_WRITE_SRC2_B64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = MEM[B]. |
| // Write qword. |
| void |
| Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_min_src2_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MIN_SRC2_F64 |
| |
| Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() |
| { |
| } // ~Inst_DS__DS_MIN_SRC2_F64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. |
| void |
| Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_max_src2_f64") |
| { |
| setFlag(F64); |
| } // Inst_DS__DS_MAX_SRC2_F64 |
| |
| Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() |
| { |
| } // ~Inst_DS__DS_MAX_SRC2_F64 |
| |
| // A = ADDR_BASE; |
| // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : |
| // {offset1[6],offset1[6:0],offset0}); |
| // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. |
| void |
| Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b96") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B96 |
| |
| Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() |
| { |
| } // ~Inst_DS__DS_WRITE_B96 |
| |
| // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. |
| // Tri-dword write. |
| void |
| Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_write_b128") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_DS__DS_WRITE_B128 |
| |
| Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() |
| { |
| } // ~Inst_DS__DS_WRITE_B128 |
| |
| // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. |
    // Four-dword (128-bit) write.
| void |
| Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_b96") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_B96 |
| |
| Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() |
| { |
| } // ~Inst_DS__DS_READ_B96 |
| |
| // Tri-dword read. |
| void |
| Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) |
| : Inst_DS(iFmt, "ds_read_b128") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_DS__DS_READ_B128 |
| |
| Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() |
| { |
| } // ~Inst_DS__DS_READ_B128 |
| |
    // Four-dword (128-bit) read.
| void |
| Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_X |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X |
| |
| // Untyped buffer load 1 dword with format conversion. |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XY |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY |
| |
| // Untyped buffer load 2 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ |
| |
| // Untyped buffer load 3 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW |
| |
| // Untyped buffer load 4 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_X |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_X |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X |
| |
| // Untyped buffer store 1 dword with format conversion. |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XY |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY |
| |
| // Untyped buffer store 2 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ |
| |
| // Untyped buffer store 3 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW |
| ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW |
| |
| // Untyped buffer store 4 dwords with format conversion. |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_d16_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X |
| ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X |
| |
    // Untyped buffer load 1 dword with format conversion to 16-bit (D16)
    // destination data.
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY |
| ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY |
| |
    // Untyped buffer load 2 dwords with format conversion to 16-bit (D16)
    // destination data.
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ |
| ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ |
| |
    // Untyped buffer load 3 dwords with format conversion to 16-bit (D16)
    // destination data.
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW |
| ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW |
| |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW |
| ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW |
| |
    // Untyped buffer load 4 dwords with format conversion to 16-bit (D16)
    // destination data.
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X |
| ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X |
| |
    // Untyped buffer store 1 dword with format conversion from 16-bit (D16)
    // source data.
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY |
| ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY |
| |
    // Untyped buffer store 2 dwords with format conversion from 16-bit (D16)
    // source data.
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ |
| ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ |
| |
    // Untyped buffer store 3 dwords with format conversion from 16-bit (D16)
    // source data.
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW |
| ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW |
| |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW |
| ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW |
| |
    // Untyped buffer store 4 dwords with format conversion from 16-bit (D16)
    // source data.
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_UBYTE |
| ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_ubyte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_UBYTE |
| |
| Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE |
| |
| // Untyped buffer load unsigned byte (zero extend to VGPR destination). |
| void |
| Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
        // MUBUF addressing: OFFEN means VADDR supplies a per-lane byte
        // offset and IDXEN means VADDR supplies a per-lane buffer index;
        // when both are set, VADDR holds the index and VADDR+1 the offset.
        // calcAddr() is called with (vgpr offset, vgpr index, resource
        // descriptor, scalar offset, immediate offset). The same pattern
        // repeats in the other MUBUF execute() methods below.
        if (!instData.IDXEN && !instData.OFFEN) {
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDATA); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
                // out-of-range buffer accesses return 0
                if (!oobMask[lane]) {
| vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>( |
| gpuDynInst->d_data))[lane]); |
| } else { |
| vdst[lane] = 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| |
| Inst_MUBUF__BUFFER_LOAD_SBYTE |
| ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_sbyte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_SBYTE |
| |
| Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE |
| |
| // Untyped buffer load signed byte (sign extend to VGPR destination). |
| void |
| Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_USHORT |
| ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_ushort") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_USHORT |
| |
| Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_USHORT |
| |
| // Untyped buffer load unsigned short (zero extend to VGPR destination). |
| void |
| Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDATA); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| if (!oobMask[lane]) { |
| vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>( |
| gpuDynInst->d_data))[lane]); |
| } else { |
| vdst[lane] = 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } |
| |
| |
| Inst_MUBUF__BUFFER_LOAD_SSHORT |
| ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_sshort") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_LOAD_SSHORT |
| |
| Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT |
| |
| // Untyped buffer load signed short (sign extend to VGPR destination). |
| void |
| Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORD |
| ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_DWORD |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_DWORD |
| |
| // Untyped buffer load dword. |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDATA); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| if (!oobMask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } else { |
| vdst[lane] = 0; |
| } |
| } |
| } |
| |
| vdst.write(); |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX2 |
| ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 |
| |
| // Untyped buffer load 2 dwords. |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<2>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDATA); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| if (!oobMask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 2 + 1]; |
| } else { |
| vdst0[lane] = 0; |
| vdst1[lane] = 0; |
| } |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX3 |
| ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_dwordx3") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 |
| |
| // Untyped buffer load 3 dwords. |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<3>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDATA); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); |
| VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| if (!oobMask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 1]; |
| vdst2[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 2]; |
| } else { |
| vdst0[lane] = 0; |
| vdst1[lane] = 0; |
| vdst2[lane] = 0; |
| } |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| vdst2.write(); |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX4 |
| ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_load_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 |
| |
| Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() |
| { |
| } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 |
| |
| // Untyped buffer load 4 dwords. |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdLmReqsInPipe--; |
| wf->outstandingReqsRdLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDATA); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); |
| VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); |
| VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| if (!oobMask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 1]; |
| vdst2[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 2]; |
| vdst3[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 3]; |
| } else { |
| vdst0[lane] = 0; |
| vdst1[lane] = 0; |
| vdst2[lane] = 0; |
| vdst3[lane] = 0; |
| } |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| vdst2.write(); |
| vdst3.write(); |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_STORE_BYTE |
| ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_byte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_BYTE |
| |
| Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_BYTE |
| |
| // Untyped buffer store byte. |
| void |
| Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandI8 data(gpuDynInst, extData.VDATA); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemI8*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemI8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MUBUF__BUFFER_STORE_SHORT |
| ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_short") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_SHORT |
| |
| Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_SHORT |
| |
| // Untyped buffer store short. |
| void |
| Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandI16 data(gpuDynInst, extData.VDATA); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemI16*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemI16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
    Inst_MUBUF__BUFFER_STORE_DWORD
        ::Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt)
| : Inst_MUBUF(iFmt, "buffer_store_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_DWORD |
| |
| Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_DWORD |
| |
| // Untyped buffer store dword. |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU32 data(gpuDynInst, extData.VDATA); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX2 |
| ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_DWORDX2 |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 |
| |
| // Untyped buffer store 2 dwords. |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); |
| ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| data0.read(); |
| data1.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
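| // Unlike buffer_store_dword, which stages its data in
| // initiateAcc(), the multi-dword stores pack the per-lane data
| // into d_data here, at a stride of four dwords per lane.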
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4] |
| = data0[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1] |
| = data1[lane]; |
| } |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemWrite<2>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX3 |
| ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_dwordx3") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_DWORDX3 |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 |
| |
| // Untyped buffer store 3 dwords. |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); |
| ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); |
| ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| data0.read(); |
| data1.read(); |
| data2.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4] |
| = data0[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1] |
| = data1[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2] |
| = data2[lane]; |
| } |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemWrite<3>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX4 |
| ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| if (instData.LDS) { |
| setFlag(GroupSegment); |
| } else { |
| setFlag(GlobalSegment); |
| } |
| } // Inst_MUBUF__BUFFER_STORE_DWORDX4 |
| |
| Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 |
| |
| // Untyped buffer store 4 dwords. |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); |
| ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); |
| ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); |
| ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); |
| ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); |
| ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); |
| ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); |
| ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); |
| |
| rsrcDesc.read(); |
| offset.read(); |
| data0.read(); |
| data1.read(); |
| data2.read(); |
| data3.read(); |
| |
| int inst_offset = instData.OFFSET; |
| |
| if (!instData.IDXEN && !instData.OFFEN) { |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (!instData.IDXEN && instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr0, addr1, rsrcDesc, offset, inst_offset); |
| } else if (instData.IDXEN && !instData.OFFEN) { |
| addr0.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } else { |
| addr0.read(); |
| addr1.read(); |
| calcAddr<ConstVecOperandU32, ConstVecOperandU32, |
| ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, |
| addr1, addr0, rsrcDesc, offset, inst_offset); |
| } |
| |
| if (isLocalMem()) { |
| gpuDynInst->computeUnit()->localMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrLmReqsInPipe--; |
| wf->outstandingReqsWrLm++; |
| } else { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4] |
| = data0[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1] |
| = data1[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2] |
| = data2[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 3] |
| = data3[lane]; |
| } |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemWrite<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_STORE_LDS_DWORD |
| ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_store_lds_dword") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD |
| |
| Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() |
| { |
| } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD |
| |
| // Store one DWORD from LDS memory to system memory without utilizing |
| // VGPRs. |
| void |
| Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_wbinvl1") |
| { |
| setFlag(MemoryRef);
| setFlag(MemSync);
| setFlag(GlobalSegment);
| } // Inst_MUBUF__BUFFER_WBINVL1 |
| |
| Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() |
| { |
| } // ~Inst_MUBUF__BUFFER_WBINVL1 |
| |
| // Write back and invalidate the shader L1. |
| // Always returns ACK to shader. |
| void |
| Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
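| // The writeback/invalidate is issued down the global memory
| // pipeline like an ordinary memory request; note that it is
| // accounted against both the read and the write request counters.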
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| |
| wf->outstandingReqsWrGm++; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| injectGlobalMemFence(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_WBINVL1_VOL |
| ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF *iFmt)
| : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol")
| {
| /**
| * This instruction is the same as buffer_wbinvl1, except that it
| * only invalidates L1 shader cache lines whose MTYPE indicates
| * system or group coherence. Since the L1 does not differentiate
| * between its cache lines, this instruction currently behaves
| * (and is implemented) exactly like buffer_wbinvl1.
| */
| setFlag(MemoryRef);
| setFlag(MemSync);
| setFlag(GlobalSegment);
| } // Inst_MUBUF__BUFFER_WBINVL1_VOL |
| |
| Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() |
| { |
| } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL |
| |
| // Write back and invalidate the shader L1 only for lines that are marked |
| // volatile. Always returns ACK to shader. |
| void |
| Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| |
| wf->outstandingReqsWrGm++; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
|
| void
| Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst)
| {
| injectGlobalMemFence(gpuDynInst);
| } // initiateAcc
|
| void
| Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP |
| ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_swap") |
| { |
| setFlag(AtomicExch); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| }
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SWAP |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
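|
| // NOTE: the buffer atomics in this section are currently
| // unimplemented; their execute() bodies simply panic. The
| // following is only a minimal sketch of how execute() could be
| // structured, assuming it mirrors the address handling of
| // buffer_store_dword above. The a_data staging buffer and the
| // request bookkeeping shown here are assumptions, not something
| // this file confirms.
| //
| //     Wavefront *wf = gpuDynInst->wavefront();
| //     gpuDynInst->execUnitId = wf->execUnitId;
| //     gpuDynInst->exec_mask = wf->execMask();
| //     gpuDynInst->latency.init(gpuDynInst->computeUnit());
| //     gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
| //
| //     ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
| //     ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
| //     ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
| //     ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
| //     ConstVecOperandU32 data(gpuDynInst, extData.VDATA);
| //
| //     rsrcDesc.read();
| //     offset.read();
| //     data.read();
| //
| //     int inst_offset = instData.OFFSET;
| //     // ... same IDXEN/OFFEN dispatch to calcAddr() as the stores ...
| //
| //     // Stage the swap operand for each active lane.
| //     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
| //         if (gpuDynInst->exec_mask[lane]) {
| //             (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
| //                 = data[lane];
| //         }
| //     }
| //
| //     gpuDynInst->computeUnit()->globalMemoryPipe
| //         .issueRequest(gpuDynInst);
| //     wf->outstandingReqs++;
| //     wf->validateRequestCounters();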
| |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP |
| ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") |
| { |
| setFlag(AtomicCAS); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP |
| |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA[0]; |
| // cmp = DATA[1]; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
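| // This is the usual compare-and-swap primitive: for example, a lock
| // could be acquired by passing cmp = 0 (unlocked) and src = 1
| // (locked) and checking that the returned value was 0.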
| void |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_ADD |
| ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_add") |
| { |
| setFlag(AtomicAdd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| }
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_ADD |
| |
| Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SUB |
| ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_sub") |
| { |
| setFlag(AtomicSub); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SUB |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN |
| ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_smin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SMIN |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN |
| ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_umin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_UMIN |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX |
| ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_smax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SMAX |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX |
| ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_umax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| }
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_UMAX |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_AND |
| ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_and") |
| { |
| setFlag(AtomicAnd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_AND |
| |
| Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_AND |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_OR |
| ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_or") |
| { |
| setFlag(AtomicOr); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_OR |
| |
| Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_OR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_XOR |
| ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_xor") |
| { |
| setFlag(AtomicXor); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_XOR |
| |
| Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_INC |
| ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_inc") |
| { |
| setFlag(AtomicInc); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_INC |
| |
| Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_INC |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA = tmp. |
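| // I.e., an unsigned counter that wraps back to 0 once it reaches
| // DATA; with DATA == 3 the stored value cycles 0, 1, 2, 3, 0, ...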
| void |
| Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_DEC |
| ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_dec") |
| { |
| setFlag(AtomicDec); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_DEC |
| |
| Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 |
| // (unsigned compare);
| // RETURN_DATA = tmp.
| void |
| Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") |
| { |
| setFlag(AtomicExch); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") |
| { |
| setFlag(AtomicCAS); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 |
| ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA[0:1]; |
| // cmp = DATA[2:3]; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") |
| { |
| setFlag(AtomicAdd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") |
| { |
| setFlag(AtomicSub); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_AND_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") |
| { |
| setFlag(AtomicAnd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_OR_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") |
| { |
| setFlag(AtomicOr); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") |
| { |
| setFlag(AtomicXor); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_INC_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") |
| { |
| setFlag(AtomicInc); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 |
| ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) |
| : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") |
| { |
| setFlag(AtomicDec); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 |
| |
| Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() |
| { |
| } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 |
| // (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
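| // The MTBUF (typed buffer) instructions below are currently
| // decode-only placeholders: their execute() implementations panic
| // and their initiateAcc()/completeAcc() hooks are empty.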
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_X |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X |
| |
| // Typed buffer load 1 dword with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY |
| |
| // Typed buffer load 2 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ |
| |
| // Typed buffer load 3 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW |
| ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW |
| |
| // Typed buffer load 4 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_X |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X |
| |
| // Typed buffer store 1 dword with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XY |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY |
| |
| // Typed buffer store 2 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ |
| |
| // Typed buffer store 3 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW |
| |
| // Typed buffer store 4 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: |
| ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X |
| |
| // Typed buffer load 1 dword with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY |
| ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY |
| |
| // Typed buffer load 2 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( |
| InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ |
| ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ |
| |
| // Typed buffer load 3 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW |
| ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( |
| InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW |
| |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW |
| ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() |
| { |
| } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW |
| |
| // Typed buffer load 4 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X |
| |
| // Typed buffer store 1 dword with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY |
| |
| // Typed buffer store 2 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ |
| |
| // Typed buffer store 3 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW |
| ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt) |
| : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW |
| |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW |
| ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW() |
| { |
| } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW |
| |
| // Typed buffer store 4 dwords with format conversion. |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute( |
| GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc( |
| GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
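| // The MIMG (image) instructions below are likewise decode-only
| // placeholders whose execute() implementations panic.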
| Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD |
| |
| Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD |
| |
| // Image memory load with format conversion specified |
| void |
| Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load_mip") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD_MIP |
| |
| Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD_MIP |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load_pck") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD_PCK |
| |
| Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD_PCK |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load_pck_sgn") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD_PCK_SGN |
| |
| Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN |
| |
| // Image memory load with no format conversion and sign extension.
| void |
| Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load_mip_pck") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD_MIP_PCK |
| |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK |
| |
| // Image memory load with user-supplied mip level, no format conversion |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN |
| |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() |
| { |
| } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN |
| |
| // Image memory load with user-supplied mip level, no format
| // conversion, and with sign extension.
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_store") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_STORE |
| |
| Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() |
| { |
| } // ~Inst_MIMG__IMAGE_STORE |
| |
| // Image memory store with format conversion specified |
| void |
| Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_store_mip") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_STORE_MIP |
| |
| Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() |
| { |
| } // ~Inst_MIMG__IMAGE_STORE_MIP |
| |
| void |
| Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_store_pck") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_STORE_PCK |
| |
| Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() |
| { |
| } // ~Inst_MIMG__IMAGE_STORE_PCK |
| |
| // Image memory store of packed data without format conversion. |
| void |
| Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_store_mip_pck") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_STORE_MIP_PCK |
| |
| Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() |
| { |
| } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK |
| |
| // Image memory store of packed data without format conversion |
| void |
| Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_get_resinfo") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GET_RESINFO |
| |
| Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() |
| { |
| } // ~Inst_MIMG__IMAGE_GET_RESINFO |
| |
| void |
| Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_swap") |
| { |
| setFlag(AtomicExch); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_SWAP |
| |
| Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_cmpswap") |
| { |
| setFlag(AtomicCAS); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP |
| |
| Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA[0]; |
| // cmp = DATA[1]; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_add") |
| { |
| setFlag(AtomicAdd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_ADD |
| |
| Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_ADD |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_sub") |
| { |
| setFlag(AtomicSub); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_SUB |
| |
| Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_SUB |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_smin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_SMIN |
| |
| Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_umin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_UMIN |
| |
| Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_smax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_SMAX |
| |
| Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_umax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_UMAX |
| |
| Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_and") |
| { |
| setFlag(AtomicAnd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_AND |
| |
| Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_AND |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_or") |
| { |
| setFlag(AtomicOr); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_OR |
| |
| Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_OR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_xor") |
| { |
| setFlag(AtomicXor); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_XOR |
| |
| Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_XOR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_inc") |
| { |
| setFlag(AtomicInc); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_INC |
| |
| Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_INC |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_atomic_dec") |
| { |
| setFlag(AtomicDec); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_ATOMIC_DEC |
| |
| Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC() |
| { |
| } // ~Inst_MIMG__IMAGE_ATOMIC_DEC |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 |
| // (unsigned compare); RETURN_DATA = tmp. |
| void |
| Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE |
| |
| Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_d") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_D |
| |
| Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_D |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_d_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_D_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_l") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_L |
| |
| Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_L |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_b") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_B |
| |
| Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_B |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_b_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_B_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_lz") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_LZ |
| |
| Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_LZ |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C |
| |
| Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_d") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_D |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_D |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_d_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_l") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_L |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_L |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_b") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_B |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_B |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_b_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_lz") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_LZ |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_d_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_D_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_D_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_d_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_l_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_L_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_L_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_b_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_B_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_B_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_b_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_lz_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_LZ_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_d_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_D_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_l_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_L_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_b_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_B_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_lz_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4 |
| |
| Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4 |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_CL |
| |
| Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_CL |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_l") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_L |
| |
| Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_L |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_b") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_B |
| |
| Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_B |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_b_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_B_CL |
| |
| Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_B_CL |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_lz") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_LZ |
| |
| Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_LZ |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C |
| |
| Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_CL |
| |
| Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_CL |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_l") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_L |
| |
| Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_L |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_b") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_B |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_B |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_b_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_B_CL |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_lz") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_LZ |
| |
| Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_O |
| |
| Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_CL_O |
| |
| Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_l_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_L_O |
| |
| Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_L_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_b_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_B_O |
| |
| Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_B_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_b_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_B_CL_O |
| |
| Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_lz_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_LZ_O |
| |
| Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_CL_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_l_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_L_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_b_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_B_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_gather4_c_lz_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O |
| |
| Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() |
| { |
| } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O |
| |
| void |
| Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_get_lod") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_GET_LOD |
| |
| Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() |
| { |
| } // ~Inst_MIMG__IMAGE_GET_LOD |
| |
| void |
| Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cd") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CD |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CD |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cd_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CD_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cd") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CD |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cd_cl") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cd_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CD_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_cd_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cd_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( |
| InFmt_MIMG *iFmt) |
| : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") |
| { |
| setFlag(GlobalSegment); |
| } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O |
| |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() |
| { |
| } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O |
| |
| void |
| Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) |
| : Inst_EXP(iFmt, "exp") |
| { |
| } // Inst_EXP__EXP |
| |
| Inst_EXP__EXP::~Inst_EXP__EXP() |
| { |
| } // ~Inst_EXP__EXP |
| |
| void |
| Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_ubyte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_UBYTE |
| |
| Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_UBYTE |
| |
| // Untyped buffer load unsigned byte (zero extend to VGPR destination). |
| void |
| Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
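        // No active lanes: undo the issue-time counter updates and still
        // schedule the destination-operand writeback so the load completes
        // without issuing a memory request.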
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = gpuDynInst->wavefront()->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
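        // Zero-extend each active lane's byte into its 32-bit destination
        // VGPR.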
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>( |
| gpuDynInst->d_data))[lane]); |
| } |
| } |
| vdst.write(); |
    } // completeAcc
| // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- |
| |
| Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_sbyte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_SBYTE |
| |
| Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_SBYTE |
| |
| // Untyped buffer load signed byte (sign extend to VGPR destination). |
| void |
| Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_ushort") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_USHORT |
| |
| Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_USHORT |
| |
| // Untyped buffer load unsigned short (zero extend to VGPR destination). |
| void |
| Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = gpuDynInst->wavefront()->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
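        // Zero-extend each active lane's 16-bit value into its 32-bit
        // destination VGPR.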
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>( |
| gpuDynInst->d_data))[lane]); |
| } |
| } |
| vdst.write(); |
| } |
| |
| Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_sshort") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_SSHORT |
| |
| Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_SSHORT |
| |
| // Untyped buffer load signed short (sign extend to VGPR destination). |
| void |
| Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_DWORD |
| |
| Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_DWORD |
| |
| // Untyped buffer load dword. |
| void |
| Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = gpuDynInst->wavefront()->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| vdst.write(); |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_DWORDX2 |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 |
| |
| // Untyped buffer load 2 dwords. |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = gpuDynInst->wavefront()->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU64 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| vdst.write(); |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_dwordx3") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_DWORDX3 |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 |
| |
| // Untyped buffer load 3 dwords. |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<3>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); |
| VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); |
| |
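        // d_data holds three consecutive dwords per lane; de-interleave
        // them into the three destination VGPRs.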
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 1]; |
| vdst2[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 2]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| vdst2.write(); |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_load_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Load); |
| } // Inst_FLAT__FLAT_LOAD_DWORDX4 |
| |
| Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() |
| { |
| } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 |
| |
| // Untyped buffer load 4 dwords. |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->rdGmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initMemRead<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| VecOperandU32 vdst0(gpuDynInst, extData.VDST); |
| VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); |
| VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); |
| VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst0[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4]; |
| vdst1[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 1]; |
| vdst2[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 2]; |
| vdst3[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 3]; |
| } |
| } |
| |
| vdst0.write(); |
| vdst1.write(); |
| vdst2.write(); |
| vdst3.write(); |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_byte") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_BYTE |
| |
| Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_BYTE |
| |
| // Untyped buffer store byte. |
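    // Unlike the loads above, a store with no active lanes only rolls back
    // the issue-time counters and returns; there is no destination-register
    // write to schedule.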
| void |
| Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } // execute |
| |
| void |
| Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU8 data(gpuDynInst, extData.DATA); |
| data.read(); |
| |
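        // Pack each active lane's byte into the staging buffer (d_data)
        // backing the memory request.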
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemU8>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } |
| |
| Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_short") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_SHORT |
| |
| Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_SHORT |
| |
| // Untyped buffer store short. |
| void |
| Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU16 data(gpuDynInst, extData.DATA); |
| |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemU16>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_dword") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_DWORD |
| |
| Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_DWORD |
| |
| // Untyped buffer store dword. |
| void |
| Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_dwordx2") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_DWORDX2 |
| |
| Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_DWORDX2 |
| |
| // Untyped buffer store 2 dwords. |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| wf->outstandingReqs++; |
| wf->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->d_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| initMemWrite<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_dwordx3") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_DWORDX3 |
| |
| Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_DWORDX3 |
| |
| // Untyped buffer store 3 dwords. |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU32 data0(gpuDynInst, extData.DATA); |
| ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); |
| ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); |
| |
| data0.read(); |
| data1.read(); |
| data2.read(); |
| |
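        // d_data is laid out with three consecutive dwords per lane, so
        // lane N's data occupies indices 3*N through 3*N + 2.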
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3] = data0[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; |
| } |
| } |
| |
| initMemWrite<3>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_store_dwordx4") |
| { |
| setFlag(MemoryRef); |
| setFlag(Store); |
| } // Inst_FLAT__FLAT_STORE_DWORDX4 |
| |
| Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() |
| { |
| } // ~Inst_FLAT__FLAT_STORE_DWORDX4 |
| |
| // Untyped buffer store 4 dwords. |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe |
| .issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| ConstVecOperandU32 data0(gpuDynInst, extData.DATA); |
| ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); |
| ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); |
| ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); |
| |
| data0.read(); |
| data1.read(); |
| data2.read(); |
| data3.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4] = data0[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; |
| (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; |
| } |
| } |
| |
| initMemWrite<4>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_swap") |
| { |
| setFlag(AtomicExch); |
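        // GLC selects a returning atomic: the pre-op memory value is
        // written back to VDST in completeAcc().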
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } // if |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SWAP |
| |
| Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SWAP |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
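        // With an all-zero exec mask the atomic performs no memory access;
        // undo the issue bookkeeping and, for a returning atomic, still
        // schedule the destination-register writeback through the VRF.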
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| |
| addr.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || |
| gpuDynInst->executedAs() == Enums::SC_PRIVATE) { |
| // TODO: additional address computation required for scratch |
| panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE, |
| "Flats to private aperture not tested yet\n"); |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| |
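        // The per-lane swap operand is carried with the request in a_data.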
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| |
| data.read(); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| } // execute |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- |
| |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP |
| ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_cmpswap") |
| { |
| setFlag(AtomicCAS); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } // if |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP |
| |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA[0]; |
| // cmp = DATA[1]; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1); |
| |
| addr.read(); |
| data.read(); |
| cmp.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
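        // For the CAS the swap value is carried in x_data and the compare
        // value in a_data; returning atomics get the pre-op memory value
        // back in d_data.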
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->x_data))[lane] |
| = data[lane]; |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = cmp[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || |
| gpuDynInst->executedAs() == Enums::SC_PRIVATE) { |
| /** |
| * TODO: If you encounter this panic, just remove this panic |
| * and restart the simulation. It should just work fine but |
| * this is to warn user that this path is never tested although |
| * all the necessary logic is implemented |
| */ |
| panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE, |
| "Flats to private aperture not tested yet\n"); |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_add") |
| { |
| setFlag(AtomicAdd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } // if |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_ADD |
| |
| Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_ADD |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_sub") |
| { |
| setFlag(AtomicSub); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } // if |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SUB |
| |
| Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SUB |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_smin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SMIN |
| |
| Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_umin") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_UMIN |
| |
| Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_UMIN |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_smax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SMAX |
| |
| Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_umax") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_UMAX |
| |
| Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_UMAX |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_and") |
| { |
| setFlag(AtomicAnd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_AND |
| |
| Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_AND |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_or") |
| { |
| setFlag(AtomicOr); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_OR |
| |
| Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_OR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_xor") |
| { |
| setFlag(AtomicXor); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_XOR |
| |
| Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_XOR |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA; |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_inc") |
| { |
| setFlag(AtomicInc); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_INC |
| |
| Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_INC |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_dec") |
| { |
| setFlag(AtomicDec); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_DEC |
| |
| Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_DEC |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare);
    // RETURN_DATA = tmp.
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU32 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU32>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU32 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU32*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_swap_x2") |
| { |
| setFlag(AtomicExch); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") |
| { |
| setFlag(AtomicCAS); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 |
| |
| // tmp = MEM[ADDR]; |
| // src = DATA[0:1]; |
| // cmp = DATA[2:3]; |
| // MEM[ADDR] = (tmp == cmp) ? src : tmp; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
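        // Each 64-bit operand occupies a VGPR pair, so the compare value
        // (DATA[2:3]) starts two registers above the swap data (DATA[0:1]).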
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2); |
| |
| addr.read(); |
| data.read(); |
| cmp.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->x_data))[lane] |
| = data[lane]; |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] |
| = cmp[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL || |
| gpuDynInst->executedAs() == Enums::SC_PRIVATE) { |
| /** |
| * TODO: If you encounter this panic, just remove this panic |
| * and restart the simulation. It should just work fine but |
| * this is to warn user that this path is never tested although |
| * all the necessary logic is implemented |
| */ |
| panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE, |
| "Flats to private aperture not tested yet\n"); |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
| VecOperandU64 vdst(gpuDynInst, extData.VDST); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_add_x2") |
| { |
| setFlag(AtomicAdd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] += DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
            VecOperandU64 vdst(gpuDynInst, extData.VDST);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_sub_x2") |
| { |
| setFlag(AtomicSub); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] -= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
            VecOperandU64 vdst(gpuDynInst, extData.VDST);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_smin_x2") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_umin_x2") |
| { |
| setFlag(AtomicMin); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_smax_x2") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_umax_x2") |
| { |
| setFlag(AtomicMax); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 |
| |
| // tmp = MEM[ADDR]; |
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_and_x2") |
| { |
| setFlag(AtomicAnd); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_AND_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] &= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_or_x2") |
| { |
| setFlag(AtomicOr); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_OR_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] |= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_xor_x2") |
| { |
| setFlag(AtomicXor); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] ^= DATA[0:1]; |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| panicUnimplemented(); |
| } |
| |
| Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_inc_x2") |
| { |
| setFlag(AtomicInc); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_INC_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
            VecOperandU64 vdst(gpuDynInst, extData.VDST);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| |
| Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( |
| InFmt_FLAT *iFmt) |
| : Inst_FLAT(iFmt, "flat_atomic_dec_x2") |
| { |
| setFlag(AtomicDec); |
| if (instData.GLC) { |
| setFlag(AtomicReturn); |
| } else { |
| setFlag(AtomicNoReturn); |
| } |
| setFlag(MemoryRef); |
| } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 |
| |
| Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() |
| { |
| } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 |
| |
| // tmp = MEM[ADDR]; |
| // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 |
| // (unsigned compare); |
| // RETURN_DATA[0:1] = tmp. |
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) |
| { |
| Wavefront *wf = gpuDynInst->wavefront(); |
| |
| if (wf->execMask().none()) { |
| wf->decVMemInstsIssued(); |
| wf->decLGKMInstsIssued(); |
| wf->wrGmReqsInPipe--; |
| wf->rdGmReqsInPipe--; |
| wf->wrLmReqsInPipe--; |
| wf->rdLmReqsInPipe--; |
| if (instData.GLC) { |
| gpuDynInst->exec_mask = wf->execMask(); |
| wf->computeUnit->vrf[wf->simdId]-> |
| scheduleWriteOperandsFromLoad(wf, gpuDynInst); |
| } |
| return; |
| } |
| |
| gpuDynInst->execUnitId = wf->execUnitId; |
| gpuDynInst->exec_mask = wf->execMask(); |
| gpuDynInst->latency.init(gpuDynInst->computeUnit()); |
| gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); |
| |
| ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); |
| ConstVecOperandU64 data(gpuDynInst, extData.DATA); |
| |
| addr.read(); |
| data.read(); |
| |
| calcAddr(gpuDynInst, addr); |
| |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| if (gpuDynInst->exec_mask[lane]) { |
| (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] |
| = data[lane]; |
| } |
| } |
| |
| if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) { |
| gpuDynInst->computeUnit()->globalMemoryPipe. |
| issueRequest(gpuDynInst); |
| wf->wrGmReqsInPipe--; |
| wf->outstandingReqsWrGm++; |
| wf->rdGmReqsInPipe--; |
| wf->outstandingReqsRdGm++; |
| } else { |
| fatal("Non global flat instructions not implemented yet.\n"); |
| } |
| |
| gpuDynInst->wavefront()->outstandingReqs++; |
| gpuDynInst->wavefront()->validateRequestCounters(); |
| } |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) |
| { |
| initAtomicAccess<VecElemU64>(gpuDynInst); |
| } // initiateAcc |
| |
| void |
| Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) |
| { |
| if (isAtomicRet()) { |
            VecOperandU64 vdst(gpuDynInst, extData.VDST);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
| if (gpuDynInst->exec_mask[lane]) { |
| vdst[lane] = (reinterpret_cast<VecElemU64*>( |
| gpuDynInst->d_data))[lane]; |
| } |
| } |
| |
| vdst.write(); |
| } |
| } // completeAcc |
| } // namespace Gcn3ISA |