// src/arch/hsail/insts/pseudo_inst.cc - public/gem5 - Git at Google

 /*
  * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Author: Marc Orr
  */

 #include <csignal>

 #include "arch/hsail/insts/decl.hh"
 #include "arch/hsail/insts/mem.hh"

 namespace HsailISA
 {
     // Pseudo (or magic) instructions are overloaded on the hsail call
     // instruction, because of its flexible parameter signature.

     // To add a new magic instruction:
     // 1. Add an entry to the enum.
     // 2. Implement it in the switch statement below (Call::execPseudoInst).
     // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
     //    so it's easy to call from an OpenCL kernel.

     // Operation codes of the pseudo (magic) instructions.
     // Call::execPseudoInst reads one of these values from operand 0 of the
     // call and dispatches on it. Only the first enumerator has an explicit
     // value, so the numbering follows declaration order.
     //
     // This enum should be identical to the enum in
     // hsa/hsail-gpu-compute/util/magicinst.h
     enum
     {
         MAGIC_PRINT_WF_32 = 0,
         MAGIC_PRINT_WF_64,
         MAGIC_PRINT_LANE,
         MAGIC_PRINT_LANE_64,
         MAGIC_PRINT_WF_FLOAT,
         MAGIC_SIM_BREAK,
         MAGIC_PREF_SUM,
         MAGIC_REDUCTION,
         MAGIC_MASKLANE_LOWER,
         MAGIC_MASKLANE_UPPER,
         MAGIC_JOIN_WF_BAR,
         MAGIC_WAIT_WF_BAR,
         MAGIC_PANIC,
         MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
         MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
         MAGIC_LOAD_GLOBAL_U32_REG,
         MAGIC_XACT_CAS_LD,
         MAGIC_MOST_SIG_THD,
         MAGIC_MOST_SIG_BROADCAST,
         MAGIC_PRINT_WFID_32,
         MAGIC_PRINT_WFID_64
     };

     /**
      * Decode and execute the magic operation carried by this call.
      *
      * The operation code is operand 0. All active lanes have to supply the
      * same code; fatal() is called if they disagree or if the code matches
      * no known magic instruction. If no lane is active the code stays 0.
      *
      * @param w wavefront executing the call
      * @param gpuDynInst dynamic instruction, handed on to the magic
      *                   operations that issue memory requests
      */
     void
     Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         int opcode = 0;
         bool opcode_seen = false;

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int lane_opcode = src1.get<int>(w, l, 0);
             if (!opcode_seen) {
                 // first active lane decides which operation runs
                 opcode = lane_opcode;
                 opcode_seen = true;
             } else if (lane_opcode != opcode) {
                 fatal("Multiple magic instructions per PC "
                       "not supported\n");
             }
         }

         switch (opcode) {
           // printing and debugging
           case MAGIC_PRINT_WF_32:        MagicPrintWF32(w);        break;
           case MAGIC_PRINT_WF_64:        MagicPrintWF64(w);        break;
           case MAGIC_PRINT_LANE:         MagicPrintLane(w);        break;
           case MAGIC_PRINT_LANE_64:      MagicPrintLane64(w);      break;
           case MAGIC_PRINT_WF_FLOAT:     MagicPrintWFFloat(w);     break;
           case MAGIC_SIM_BREAK:          MagicSimBreak(w);         break;

           // cross-lane computations
           case MAGIC_PREF_SUM:           MagicPrefixSum(w);        break;
           case MAGIC_REDUCTION:          MagicReduction(w);        break;
           case MAGIC_MASKLANE_LOWER:     MagicMaskLower(w);        break;
           case MAGIC_MASKLANE_UPPER:     MagicMaskUpper(w);        break;

           // wavefront barriers
           case MAGIC_JOIN_WF_BAR:        MagicJoinWFBar(w);        break;
           case MAGIC_WAIT_WF_BAR:        MagicWaitWFBar(w);        break;

           case MAGIC_PANIC:              MagicPanic(w);            break;

           // atomic instructions
           case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
             MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
             break;
           case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
             MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
             break;

           case MAGIC_LOAD_GLOBAL_U32_REG:
             MagicLoadGlobalU32Reg(w, gpuDynInst);
             break;

           case MAGIC_XACT_CAS_LD:        MagicXactCasLd(w);        break;
           case MAGIC_MOST_SIG_THD:       MagicMostSigThread(w);    break;
           case MAGIC_MOST_SIG_BROADCAST: MagicMostSigBroadcast(w); break;

           // prints filtered by wavefront id
           case MAGIC_PRINT_WFID_32:      MagicPrintWF32ID(w);      break;
           case MAGIC_PRINT_WFID_64:      MagicPrintWFID64(w);      break;

           default:
             fatal("unrecognized magic instruction: %d\n", opcode);
         }
     }

     /**
      * Magic op: print a 32-bit value for every active lane.
      *
      * Operand 1 is the value. A non-zero operand 2 selects hexadecimal
      * output, otherwise the value is printed in decimal. The body is only
      * compiled when TRACING_ON is set.
      */
     void
     Call::MagicPrintLane(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int value = src1.get<int>(w, l, 1);
             const int as_hex = src1.get<int>(w, l, 2);

             if (as_hex) {
                 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
                          disassemble(), w->computeUnit->cu_id, w->simdId,
                          w->wfSlotId, l, value);
             } else {
                 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
                          disassemble(), w->computeUnit->cu_id, w->simdId,
                          w->wfSlotId, l, value);
             }
         }
     #endif
     }

     /**
      * Magic op: print a 64-bit value for every active lane.
      *
      * Operand 1 is read as a 64-bit value. A non-zero operand 2 selects
      * hexadecimal output, otherwise the value is printed in decimal. The
      * body is only compiled when TRACING_ON is set.
      */
     void
     Call::MagicPrintLane64(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int64_t value = src1.get<int64_t>(w, l, 1);
             const int as_hex = src1.get<int>(w, l, 2);

             if (as_hex) {
                 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
                          disassemble(), w->computeUnit->cu_id, w->simdId,
                          w->wfSlotId, l, value);
             } else {
                 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
                          disassemble(), w->computeUnit->cu_id, w->simdId,
                          w->wfSlotId, l, value);
             }
         }
     #endif
     }

     /**
      * Magic op: dump a 32-bit value from every lane of the wavefront.
      *
      * Lanes are printed eight per row, each row prefixed with the
      * wavefront's dynamic id. Inactive lanes show up as "xxxxxxxx". Operand
      * 1 is the value; a non-zero operand 2 selects hexadecimal instead of
      * decimal. The body is only compiled when TRACING_ON is set.
      */
     void
     Call::MagicPrintWF32(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &mask = w->getPred();
         std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // start of a row of eight lanes
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
             }

             if (mask[lane]) {
                 const int src_val1 = src1.get<int>(w, lane, 1);
                 const int src_val2 = src1.get<int>(w, lane, 2);

                 res_str += src_val2 ? csprintf("%08x", src_val1)
                                     : csprintf("%08d", src_val1);
             } else {
                 res_str += "xxxxxxxx";
             }

             // newline after the eighth lane of a row, a blank otherwise
             res_str += ((lane & 7) == 7) ? "\n" : " ";
         }

         res_str += "\n\n";
         // The assembled text is data, not a format string: it embeds the
         // disassembly, and any '%' in it would otherwise be parsed as a
         // conversion directive.
         DPRINTFN("%s", res_str.c_str());
     #endif
     }

     /**
      * Magic op: dump a 32-bit value from every lane, for one wavefront only.
      *
      * Same layout as MagicPrintWF32 (eight lanes per row, "xxxxxxxx" for
      * inactive lanes, operand 1 = value, operand 2 != 0 = hexadecimal). The
      * text is only emitted when the wavefront's dynamic id equals operand 3,
      * taken from the highest-numbered active lane; with no active lane
      * the comparison is made against -1. The body is only compiled when
      * TRACING_ON is set.
      */
     void
     Call::MagicPrintWF32ID(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &mask = w->getPred();
         int src_val3 = -1;
         std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // start of a row of eight lanes
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
             }

             if (mask[lane]) {
                 const int src_val1 = src1.get<int>(w, lane, 1);
                 const int src_val2 = src1.get<int>(w, lane, 2);
                 // overwritten by every active lane; the last one wins
                 src_val3 = src1.get<int>(w, lane, 3);

                 res_str += src_val2 ? csprintf("%08x", src_val1)
                                     : csprintf("%08d", src_val1);
             } else {
                 res_str += "xxxxxxxx";
             }

             // newline after the eighth lane of a row, a blank otherwise
             res_str += ((lane & 7) == 7) ? "\n" : " ";
         }

         res_str += "\n\n";
         if (w->wfDynId == src_val3) {
             // The assembled text is data, not a format string: it embeds
             // the disassembly, and any '%' in it would otherwise be parsed
             // as a conversion directive.
             DPRINTFN("%s", res_str.c_str());
         }
     #endif
     }

     /**
      * Magic op: dump a 64-bit value from every lane of the wavefront.
      *
      * Lanes are printed four per row, each row prefixed with the
      * wavefront's dynamic id. Inactive lanes show up as sixteen 'x'
      * characters. Operand 1 is read as a 64-bit value; a non-zero operand 2
      * selects hexadecimal instead of decimal. The body is only compiled
      * when TRACING_ON is set.
      */
     void
     Call::MagicPrintWF64(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &mask = w->getPred();
         std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // start of a row of four lanes
             if (!(lane & 3)) {
                 res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
             }

             if (mask[lane]) {
                 const int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                 const int src_val2 = src1.get<int>(w, lane, 2);

                 res_str += src_val2 ? csprintf("%016x", src_val1)
                                     : csprintf("%016d", src_val1);
             } else {
                 res_str += "xxxxxxxxxxxxxxxx";
             }

             // newline after the fourth lane of a row, a blank otherwise
             res_str += ((lane & 3) == 3) ? "\n" : " ";
         }

         res_str += "\n\n";
         // The assembled text is data, not a format string: it embeds the
         // disassembly, and any '%' in it would otherwise be parsed as a
         // conversion directive.
         DPRINTFN("%s", res_str.c_str());
     #endif
     }

     /**
      * Magic op: dump a 64-bit value from every lane, for one wavefront only.
      *
      * Same layout as MagicPrintWF64 (four lanes per row, sixteen 'x' for
      * inactive lanes, operand 1 = 64-bit value, operand 2 != 0 =
      * hexadecimal). The text is only emitted when the wavefront's dynamic
      * id equals operand 3, taken from the highest-numbered active lane;
      * with no active lane the comparison is made against -1. The body is
      * only compiled when TRACING_ON is set.
      */
     void
     Call::MagicPrintWFID64(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &mask = w->getPred();
         int src_val3 = -1;
         std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // start of a row of four lanes
             if (!(lane & 3)) {
                 res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
             }

             if (mask[lane]) {
                 const int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                 const int src_val2 = src1.get<int>(w, lane, 2);
                 // overwritten by every active lane; the last one wins
                 src_val3 = src1.get<int>(w, lane, 3);

                 res_str += src_val2 ? csprintf("%016x", src_val1)
                                     : csprintf("%016d", src_val1);
             } else {
                 res_str += "xxxxxxxxxxxxxxxx";
             }

             // newline after the fourth lane of a row, a blank otherwise
             res_str += ((lane & 3) == 3) ? "\n" : " ";
         }

         res_str += "\n\n";
         if (w->wfDynId == src_val3) {
             // The assembled text is data, not a format string: it embeds
             // the disassembly, and any '%' in it would otherwise be parsed
             // as a conversion directive.
             DPRINTFN("%s", res_str.c_str());
         }
     #endif
     }

     /**
      * Magic op: dump a float value from every lane of the wavefront.
      *
      * Lanes are printed eight per row, each row prefixed with the
      * wavefront's dynamic id. Inactive lanes show up as "xxxxxxxx". Operand
      * 1 is read as a float. The body is only compiled when TRACING_ON is
      * set.
      */
     void
     Call::MagicPrintWFFloat(Wavefront *w)
     {
     #if TRACING_ON
         const VectorMask &mask = w->getPred();
         std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // start of a row of eight lanes
             if (!(lane & 7)) {
                 res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
             }

             if (mask[lane]) {
                 const float src_val1 = src1.get<float>(w, lane, 1);
                 res_str += csprintf("%08f", src_val1);
             } else {
                 res_str += "xxxxxxxx";
             }

             // newline after the eighth lane of a row, a blank otherwise
             res_str += ((lane & 7) == 7) ? "\n" : " ";
         }

         res_str += "\n\n";
         // The assembled text is data, not a format string: it embeds the
         // disassembly, and any '%' in it would otherwise be parsed as a
         // conversion directive.
         DPRINTFN("%s", res_str.c_str());
     #endif
     }

     /**
      * Magic op: report the wavefront's state and stop in the debugger.
      *
      * Builds a short description of the wavefront (slot, kernel id, SIMD
      * id, compute unit and execution mask), hands it to DPRINTFN, flushes
      * stdout and raises SIGTRAP, a signal that GDB will catch. When done
      * with the break, type "signal 0" in gdb to continue.
      */
     void
     Call::MagicSimBreak(Wavefront *w)
     {
         // print out state for this wavefront and then break
         std::string res_str =
             csprintf("Breakpoint encountered for wavefront %i\n",
                      w->wfSlotId);

         res_str += csprintf("  Kern ID: %i\n", w->kernId);
         res_str += csprintf("  Phase ID: %i\n", w->simdId);
         res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
         res_str += csprintf("  Exec mask: ");

         // Highest lane first, with a blank after every complete group of
         // eight lanes. The group is complete once its lowest lane
         // (lane % 8 == 0) has been written; testing for 7 here would put the
         // blank after the first bit of each group instead.
         for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
             res_str += w->execMask(i) ? "1" : "0";

             if ((i & 7) == 0)
                 res_str += " ";
         }

         res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());

         res_str += "\nHelpful debugging hints:\n";
         res_str += "   Check out w->s_reg / w->d_reg for register state\n";

         res_str += "\n\n";
         // Print through "%s" so that nothing in the assembled text is
         // interpreted as a conversion directive.
         DPRINTFN("%s", res_str.c_str());
         fflush(stdout);

         raise(SIGTRAP);
     }

     /**
      * Magic op: exclusive prefix sum over the active lanes.
      *
      * Walking the lanes in ascending order, each active lane receives the
      * sum of operand 1 of all lower-numbered active lanes (0 for the first
      * one). Inactive lanes are neither read nor written.
      */
     void
     Call::MagicPrefixSum(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();
         int running_sum = 0;

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             // read the input before the destination is written
             const int contribution = src1.get<int>(w, l, 1);
             dest.set<int>(w, l, running_sum);
             running_sum += contribution;
         }
     }

     /**
      * Magic op: sum reduction across the wavefront.
      *
      * Adds up operand 1 of every active lane and writes the total back to
      * each active lane. Inactive lanes neither contribute nor receive the
      * result.
      */
     void
     Call::MagicReduction(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         // first pass: accumulate the inputs
         int total = 0;
         for (int l = 0; l < num_lanes; ++l) {
             if (active[l])
                 total += src1.get<int>(w, l, 1);
         }

         // second pass: hand the total to every participating lane
         for (int l = 0; l < num_lanes; ++l) {
             if (active[l])
                 dest.set<int>(w, l, total);
         }
     }

     /**
      * Magic op: build a bit mask from the lower half of the wavefront.
      *
      * Bit N of the result is set when lane N is active, lies in the lower
      * half of the wavefront and has a non-zero operand 1. The mask is then
      * written to every active lane.
      */
     void
     Call::MagicMaskLower(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();
         const int half = w->computeUnit->wfSize() / 2;
         int lane_bits = 0;

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int flag = src1.get<int>(w, l, 1);
             if (flag && l < half)
                 lane_bits |= (uint32_t)(1) << l;
         }

         for (int l = 0; l < num_lanes; ++l) {
             if (active[l])
                 dest.set<int>(w, l, lane_bits);
         }
     }

     /**
      * Magic op: build a bit mask from the upper half of the wavefront.
      *
      * Bit N of the result is set when lane (N + half the wavefront size) is
      * active and has a non-zero operand 1. The mask is then written to
      * every active lane.
      */
     void
     Call::MagicMaskUpper(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();
         const int half = w->computeUnit->wfSize() / 2;
         int lane_bits = 0;

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int flag = src1.get<int>(w, l, 1);
             if (flag && l >= half) {
                 // upper-half lanes map onto bits 0 .. half-1
                 lane_bits |= (uint32_t)(1) << (l - half);
             }
         }

         for (int l = 0; l < num_lanes; ++l) {
             if (active[l])
                 dest.set<int>(w, l, lane_bits);
         }
     }

     /**
      * Magic op: join the wavefront barrier.
      *
      * Increments the barrier count of every active lane and raises the
      * wavefront's maxBarCnt if one of the incremented counts exceeds it.
      */
     void
     Call::MagicJoinWFBar(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();
         int highest = 0;

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             ++w->barCnt[l];
             if (w->barCnt[l] > highest)
                 highest = w->barCnt[l];
         }

         // maxBarCnt only ever grows here
         if (highest > w->maxBarCnt)
             w->maxBarCnt = highest;
     }

     /**
      * Magic op: wait at (leave) the wavefront barrier.
      *
      * Decrements the barrier count of every active lane, then lowers the
      * wavefront's maxBarCnt if the largest count over all lanes, active or
      * not, is now below it. Finally every instruction buffer entry after
      * the first is discarded, and a pending fetch is flagged to be dropped.
      */
     void
     Call::MagicWaitWFBar(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();
         int highest = 0;

         for (int l = 0; l < num_lanes; ++l) {
             if (active[l])
                 --w->barCnt[l];

             // inactive lanes take part in the maximum as well
             if (w->barCnt[l] > highest)
                 highest = w->barCnt[l];
         }

         // maxBarCnt only ever shrinks here
         if (highest < w->maxBarCnt)
             w->maxBarCnt = highest;

         // keep only the first buffered instruction
         w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
                                    w->instructionBuffer.end());

         if (w->pendingFetch)
             w->dropFetch = true;
     }

     /**
      * Magic op: report a failed kernel assertion through panic().
      *
      * Lanes are visited in ascending order and panic() is called for each
      * active one with operand 1 as the assertion number; presumably panic()
      * does not return, so only the first active lane is reported. Nothing
      * happens when no lane is active.
      */
     void
     Call::MagicPanic(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         for (int l = 0; l < num_lanes; ++l) {
             if (!active[l])
                 continue;

             const int assertion_id = src1.get<int>(w, l, 1);
             panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
                   assertion_id, l);
         }
     }

     /**
      * Compute the per-lane memory address of a magic memory operation.
      *
      * For every lane (active or not) the address is assembled from two
      * 32-bit operands: operand 1 supplies the upper 32 bits and operand 2
      * the lower 32 bits. The result is stored in m->addr[lane].
      *
      * @param w wavefront whose operands are read
      * @param m dynamic instruction receiving the addresses
      */
     void
     Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
     {
         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             // The operands are read as signed ints. Go through uint32_t so
             // that a low word with its top bit set is zero-extended; a
             // direct int -> Addr conversion would sign-extend it and
             // overwrite the high word with ones.
             const Addr hi = static_cast<uint32_t>(src1.get<int>(w, lane, 1));
             const Addr lo = static_cast<uint32_t>(src1.get<int>(w, lane, 2));

             m->addr[lane] = (hi << 32) | lo;
         }
     }

     /**
      * Magic op: issue a no-return atomic add of a 32-bit value.
      *
      * The per-lane address comes from calcAddr() and the per-lane addend
      * is operand 3. The request is tagged as a global-segment atomic add
      * without return value, handed to the compute unit's global memory
      * pipeline, and the wavefront's request counters are updated.
      *
      * @param w wavefront executing the call
      * @param gpuDynInst dynamic instruction that carries the request
      */
     void
     Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
     {
         GPUDynInstPtr req = gpuDynInst;

         calcAddr(w, req);

         // per-lane addend, written for all lanes regardless of the mask
         for (int l = 0; l < w->computeUnit->wfSize(); ++l) {
             int *addend = (int*)req->a_data;
             addend[l] = src1.get<int>(w, l, 3);
         }

         // classify the access on the static instruction
         setFlag(AtomicNoReturn);
         setFlag(AtomicAdd);
         setFlag(NoScope);
         setFlag(NoOrder);
         setFlag(GlobalSegment);

         req->m_type = U32::memType;
         req->v_type = U32::vgprType;

         req->exec_mask = w->execMask();
         req->statusBitVector = 0;
         // atomics don't have an equivalence class operand
         req->equiv = 0;
         req->n_reg = 1;

         // identify the issuing wavefront
         req->simdId = w->simdId;
         req->wfSlotId = w->wfSlotId;
         req->wfDynId = w->wfDynId;
         req->latency.init(&w->computeUnit->shader->tick_cnt);

         req->pipeId = GLBMEM_PIPE;
         req->latency.set(w->computeUnit->shader->ticks(64));
         w->computeUnit->globalMemoryPipe.issueRequest(req);

         // the request moves from "in pipe" to "outstanding" in the write,
         // read and combined counters
         ++w->outstandingReqsWrGm;
         --w->wrGmReqsInPipe;
         ++w->outstandingReqsRdGm;
         --w->rdGmReqsInPipe;
         ++w->outstandingReqs;
         --w->memReqsInPipe;
     }

     /**
      * Magic op: issue a no-return atomic add of a 32-bit value ("group"
      * variant).
      *
      * The per-lane address comes from calcAddr() and the per-lane addend is
      * read from operand 1. The request is handed to the compute unit's
      * global memory pipeline and the wavefront's request counters are
      * updated.
      *
      * NOTE(review): apart from the addend operand this is identical to
      * MagicAtomicNRAddGlobalU32Reg: it sets GlobalSegment and uses
      * GLBMEM_PIPE despite the "Group" name. Confirm whether that is
      * intended.
      *
      * @param w wavefront executing the call
      * @param gpuDynInst dynamic instruction that carries the request
      */
     void
     Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
     {
         GPUDynInstPtr m = gpuDynInst;
         calcAddr(w, m);

         // NOTE(review): operand 1 is also the upper address word consumed by
         // calcAddr(); the global variant reads the addend from operand 3.
         // Verify against the caller in magicinst.h.
         for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
             ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
         }

         // classify the access on the static instruction
         setFlag(AtomicNoReturn);
         setFlag(AtomicAdd);
         setFlag(NoScope);
         setFlag(NoOrder);
         setFlag(GlobalSegment);

         m->m_type = U32::memType;
         m->v_type = U32::vgprType;

         m->exec_mask = w->execMask();
         m->statusBitVector = 0;
         m->equiv = 0;  // atomics don't have an equivalence class operand
         m->n_reg = 1;

         // identify the issuing wavefront
         m->simdId = w->simdId;
         m->wfSlotId = w->wfSlotId;
         m->wfDynId = w->wfDynId;
         m->latency.init(&w->computeUnit->shader->tick_cnt);

         m->pipeId = GLBMEM_PIPE;
         m->latency.set(w->computeUnit->shader->ticks(64));
         w->computeUnit->globalMemoryPipe.issueRequest(m);
         // the request moves from "in pipe" to "outstanding" in the write,
         // read and combined counters
         w->outstandingReqsWrGm++;
         w->wrGmReqsInPipe--;
         w->outstandingReqsRdGm++;
         w->rdGmReqsInPipe--;
         w->outstandingReqs++;
         w->memReqsInPipe--;
     }

     /**
      * Magic op: issue a 32-bit load.
      *
      * The per-lane address comes from calcAddr(). The request is tagged as
      * a global-segment load, handed to the compute unit's global memory
      * pipeline, and the wavefront's read and combined request counters are
      * updated.
      *
      * @param w wavefront executing the call
      * @param gpuDynInst dynamic instruction that carries the request
      */
     void
     Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
     {
         GPUDynInstPtr req = gpuDynInst;

         // calculate the address
         calcAddr(w, req);

         // classify the access on the static instruction
         setFlag(Load);
         setFlag(NoScope);
         setFlag(NoOrder);
         setFlag(GlobalSegment);

         req->m_type = U32::memType;  //MemDataType::memType;
         req->v_type = U32::vgprType; //DestDataType::vgprType;

         req->exec_mask = w->execMask();
         req->statusBitVector = 0;
         req->equiv = 0;
         req->n_reg = 1;

         // FIXME
         //req->dst_reg = this->dest.regIndex();

         // identify the issuing wavefront
         req->simdId = w->simdId;
         req->wfSlotId = w->wfSlotId;
         req->wfDynId = w->wfDynId;
         req->latency.init(&w->computeUnit->shader->tick_cnt);

         req->pipeId = GLBMEM_PIPE;
         req->latency.set(w->computeUnit->shader->ticks(1));
         w->computeUnit->globalMemoryPipe.issueRequest(req);

         // the request moves from "in pipe" to "outstanding"
         ++w->outstandingReqsRdGm;
         --w->rdGmReqsInPipe;
         ++w->outstandingReqs;
         --w->memReqsInPipe;
     }

     /**
      * Magic op: register this wavefront in the compute unit's
      * xactCasLoadMap.
      *
      * The map key is operand 1 of the lowest-numbered active lane (0 when
      * no lane is active). A fresh, empty queue is created for a key that
      * is not in the map yet; the wavefront's (simdId, wfSlotId) identifier
      * is then appended to the key's queue.
      */
     void
     Call::MagicXactCasLd(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         const int num_lanes = w->computeUnit->wfSize();

         // key is taken from the first active lane only
         int key = 0;
         int l = 0;
         while (l < num_lanes && !active[l])
             ++l;
         if (l < num_lanes)
             key = src1.get<int>(w, l, 1);

         auto &load_map = w->computeUnit->xactCasLoadMap;

         if (load_map.count(key) == 0) {
             load_map[key] = ComputeUnit::waveQueue();
             load_map[key].waveIDQueue.clear();
         }

         load_map[key].waveIDQueue.push_back(
             ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
     }

     /**
      * Magic op: flag the most significant active lane.
      *
      * The highest-numbered active lane receives 1, every other active lane
      * receives 0. Inactive lanes are not written.
      */
     void
     Call::MagicMostSigThread(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         unsigned is_most_sig = true;

         // walk from the top so the first active lane found is the winner
         int l = w->computeUnit->wfSize() - 1;
         while (l >= 0) {
             if (active[l]) {
                 dest.set<int>(w, l, is_most_sig);
                 is_most_sig = false;
             }
             --l;
         }
     }

     /**
      * Magic op: broadcast the value of the most significant active lane.
      *
      * Operand 1 of the highest-numbered active lane is written to every
      * active lane. Inactive lanes are not written.
      */
     void
     Call::MagicMostSigBroadcast(Wavefront *w)
     {
         const VectorMask &active = w->getPred();
         int value = 0;
         bool have_value = false;

         // walk from the top so the first active lane found is the source
         int l = w->computeUnit->wfSize() - 1;
         while (l >= 0) {
             if (active[l]) {
                 if (!have_value) {
                     value = src1.get<int>(w, l, 1);
                     have_value = true;
                 }
                 dest.set<int>(w, l, value);
             }
             --l;
         }
     }

 } // namespace HsailISA
	/*
	* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
	* All rights reserved.
	*
	* For use for simulation and test purposes only
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice,
	* this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* 3. Neither the name of the copyright holder nor the names of its contributors
	* may be used to endorse or promote products derived from this software
	* without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*
	* Author: Marc Orr
	*/

	#include <csignal>

	#include "arch/hsail/insts/decl.hh"
	#include "arch/hsail/insts/mem.hh"

	namespace HsailISA
	{
	// Pseudo (or magic) instructions are overloaded on the hsail call
	// instruction, because of its flexible parameter signature.

	// To add a new magic instruction:
	// 1. Add an entry to the enum.
	// 2. Implement it in the switch statement below (Call::execPseudoInst).
	// 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
	// so it's easy to call from an OpenCL kernel.

	// Operation codes of the pseudo (magic) instructions.
	// Call::execPseudoInst reads one of these values from operand 0 of the
	// call and dispatches on it. Only the first enumerator has an explicit
	// value, so the numbering follows declaration order.
	//
	// This enum should be identical to the enum in
	// hsa/hsail-gpu-compute/util/magicinst.h
	enum
	{
	MAGIC_PRINT_WF_32 = 0,
	MAGIC_PRINT_WF_64,
	MAGIC_PRINT_LANE,
	MAGIC_PRINT_LANE_64,
	MAGIC_PRINT_WF_FLOAT,
	MAGIC_SIM_BREAK,
	MAGIC_PREF_SUM,
	MAGIC_REDUCTION,
	MAGIC_MASKLANE_LOWER,
	MAGIC_MASKLANE_UPPER,
	MAGIC_JOIN_WF_BAR,
	MAGIC_WAIT_WF_BAR,
	MAGIC_PANIC,
	MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
	MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
	MAGIC_LOAD_GLOBAL_U32_REG,
	MAGIC_XACT_CAS_LD,
	MAGIC_MOST_SIG_THD,
	MAGIC_MOST_SIG_BROADCAST,
	MAGIC_PRINT_WFID_32,
	MAGIC_PRINT_WFID_64
	};

	/**
	 * Decode and execute the magic operation carried by this call.
	 *
	 * The operation code is operand 0. All active lanes have to supply the
	 * same code; fatal() is called if they disagree or if the code matches
	 * no known magic instruction. If no lane is active the code stays 0.
	 *
	 * @param w wavefront executing the call
	 * @param gpuDynInst dynamic instruction, handed on to the magic
	 *                   operations that issue memory requests
	 */
	void
	Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();

	    int opcode = 0;
	    bool opcode_seen = false;

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        const int lane_opcode = src1.get<int>(w, l, 0);
	        if (!opcode_seen) {
	            // first active lane decides which operation runs
	            opcode = lane_opcode;
	            opcode_seen = true;
	        } else if (lane_opcode != opcode) {
	            fatal("Multiple magic instructions per PC "
	                  "not supported\n");
	        }
	    }

	    switch (opcode) {
	      // printing and debugging
	      case MAGIC_PRINT_WF_32:        MagicPrintWF32(w);        break;
	      case MAGIC_PRINT_WF_64:        MagicPrintWF64(w);        break;
	      case MAGIC_PRINT_LANE:         MagicPrintLane(w);        break;
	      case MAGIC_PRINT_LANE_64:      MagicPrintLane64(w);      break;
	      case MAGIC_PRINT_WF_FLOAT:     MagicPrintWFFloat(w);     break;
	      case MAGIC_SIM_BREAK:          MagicSimBreak(w);         break;

	      // cross-lane computations
	      case MAGIC_PREF_SUM:           MagicPrefixSum(w);        break;
	      case MAGIC_REDUCTION:          MagicReduction(w);        break;
	      case MAGIC_MASKLANE_LOWER:     MagicMaskLower(w);        break;
	      case MAGIC_MASKLANE_UPPER:     MagicMaskUpper(w);        break;

	      // wavefront barriers
	      case MAGIC_JOIN_WF_BAR:        MagicJoinWFBar(w);        break;
	      case MAGIC_WAIT_WF_BAR:        MagicWaitWFBar(w);        break;

	      case MAGIC_PANIC:              MagicPanic(w);            break;

	      // atomic instructions
	      case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
	        MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
	        break;
	      case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
	        MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
	        break;

	      case MAGIC_LOAD_GLOBAL_U32_REG:
	        MagicLoadGlobalU32Reg(w, gpuDynInst);
	        break;

	      case MAGIC_XACT_CAS_LD:        MagicXactCasLd(w);        break;
	      case MAGIC_MOST_SIG_THD:       MagicMostSigThread(w);    break;
	      case MAGIC_MOST_SIG_BROADCAST: MagicMostSigBroadcast(w); break;

	      // prints filtered by wavefront id
	      case MAGIC_PRINT_WFID_32:      MagicPrintWF32ID(w);      break;
	      case MAGIC_PRINT_WFID_64:      MagicPrintWFID64(w);      break;

	      default:
	        fatal("unrecognized magic instruction: %d\n", opcode);
	    }
	}

	/**
	 * Magic op: print a 32-bit value for every active lane.
	 *
	 * Operand 1 is the value. A non-zero operand 2 selects hexadecimal
	 * output, otherwise the value is printed in decimal. The body is only
	 * compiled when TRACING_ON is set.
	 */
	void
	Call::MagicPrintLane(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        const int value = src1.get<int>(w, l, 1);
	        const int as_hex = src1.get<int>(w, l, 2);

	        if (as_hex) {
	            DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
	                     disassemble(), w->computeUnit->cu_id, w->simdId,
	                     w->wfSlotId, l, value);
	        } else {
	            DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
	                     disassemble(), w->computeUnit->cu_id, w->simdId,
	                     w->wfSlotId, l, value);
	        }
	    }
	#endif
	}

	/**
	 * Magic op: print a 64-bit value for every active lane.
	 *
	 * Operand 1 is read as a 64-bit value. A non-zero operand 2 selects
	 * hexadecimal output, otherwise the value is printed in decimal. The
	 * body is only compiled when TRACING_ON is set.
	 */
	void
	Call::MagicPrintLane64(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        const int64_t value = src1.get<int64_t>(w, l, 1);
	        const int as_hex = src1.get<int>(w, l, 2);

	        if (as_hex) {
	            DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
	                     disassemble(), w->computeUnit->cu_id, w->simdId,
	                     w->wfSlotId, l, value);
	        } else {
	            DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
	                     disassemble(), w->computeUnit->cu_id, w->simdId,
	                     w->wfSlotId, l, value);
	        }
	    }
	#endif
	}

	/**
	 * Magic op: dump a 32-bit value from every lane of the wavefront.
	 *
	 * Lanes are printed eight per row, each row prefixed with the
	 * wavefront's dynamic id. Inactive lanes show up as "xxxxxxxx". Operand
	 * 1 is the value; a non-zero operand 2 selects hexadecimal instead of
	 * decimal. The body is only compiled when TRACING_ON is set.
	 */
	void
	Call::MagicPrintWF32(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &mask = w->getPred();
	    std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

	    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
	        // start of a row of eight lanes
	        if (!(lane & 7)) {
	            res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
	        }

	        if (mask[lane]) {
	            const int src_val1 = src1.get<int>(w, lane, 1);
	            const int src_val2 = src1.get<int>(w, lane, 2);

	            res_str += src_val2 ? csprintf("%08x", src_val1)
	                                : csprintf("%08d", src_val1);
	        } else {
	            res_str += "xxxxxxxx";
	        }

	        // newline after the eighth lane of a row, a blank otherwise
	        res_str += ((lane & 7) == 7) ? "\n" : " ";
	    }

	    res_str += "\n\n";
	    // The assembled text is data, not a format string: it embeds the
	    // disassembly, and any '%' in it would otherwise be parsed as a
	    // conversion directive.
	    DPRINTFN("%s", res_str.c_str());
	#endif
	}

	/**
	 * Magic op: dump a 32-bit value from every lane, for one wavefront only.
	 *
	 * Same layout as MagicPrintWF32 (eight lanes per row, "xxxxxxxx" for
	 * inactive lanes, operand 1 = value, operand 2 != 0 = hexadecimal). The
	 * text is only emitted when the wavefront's dynamic id equals operand 3,
	 * taken from the highest-numbered active lane; with no active lane
	 * the comparison is made against -1. The body is only compiled when
	 * TRACING_ON is set.
	 */
	void
	Call::MagicPrintWF32ID(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &mask = w->getPred();
	    int src_val3 = -1;
	    std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

	    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
	        // start of a row of eight lanes
	        if (!(lane & 7)) {
	            res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
	        }

	        if (mask[lane]) {
	            const int src_val1 = src1.get<int>(w, lane, 1);
	            const int src_val2 = src1.get<int>(w, lane, 2);
	            // overwritten by every active lane; the last one wins
	            src_val3 = src1.get<int>(w, lane, 3);

	            res_str += src_val2 ? csprintf("%08x", src_val1)
	                                : csprintf("%08d", src_val1);
	        } else {
	            res_str += "xxxxxxxx";
	        }

	        // newline after the eighth lane of a row, a blank otherwise
	        res_str += ((lane & 7) == 7) ? "\n" : " ";
	    }

	    res_str += "\n\n";
	    if (w->wfDynId == src_val3) {
	        // The assembled text is data, not a format string: it embeds
	        // the disassembly, and any '%' in it would otherwise be parsed
	        // as a conversion directive.
	        DPRINTFN("%s", res_str.c_str());
	    }
	#endif
	}

	/**
	 * Magic op: dump a 64-bit value from every lane of the wavefront.
	 *
	 * Lanes are printed four per row, each row prefixed with the
	 * wavefront's dynamic id. Inactive lanes show up as sixteen 'x'
	 * characters. Operand 1 is read as a 64-bit value; a non-zero operand 2
	 * selects hexadecimal instead of decimal. The body is only compiled
	 * when TRACING_ON is set.
	 */
	void
	Call::MagicPrintWF64(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &mask = w->getPred();
	    std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

	    for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
	        // start of a row of four lanes
	        if (!(lane & 3)) {
	            res_str += csprintf("DB%03d: ", static_cast<int>(w->wfDynId));
	        }

	        if (mask[lane]) {
	            const int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
	            const int src_val2 = src1.get<int>(w, lane, 2);

	            res_str += src_val2 ? csprintf("%016x", src_val1)
	                                : csprintf("%016d", src_val1);
	        } else {
	            res_str += "xxxxxxxxxxxxxxxx";
	        }

	        // newline after the fourth lane of a row, a blank otherwise
	        res_str += ((lane & 3) == 3) ? "\n" : " ";
	    }

	    res_str += "\n\n";
	    // The assembled text is data, not a format string: it embeds the
	    // disassembly, and any '%' in it would otherwise be parsed as a
	    // conversion directive.
	    DPRINTFN("%s", res_str.c_str());
	#endif
	}

	// Same dump as MagicPrintWF64 (64-bit values, four lanes per row), but
	// the result is only emitted when the wavefront's wfDynId equals call
	// argument 3.
	//
	// Argument 3 is re-read for every lane enabled in w->getPred(), so the
	// value used for the comparison is the one from the highest-numbered
	// active lane; with no active lane it stays at -1. The body is empty
	// unless TRACING_ON is set.
	void
	Call::MagicPrintWFID64(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &mask = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int src_val3 = -1;
	    std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        // four values per output row
	        const int col = lane & 3;

	        if (col == 0) {
	            res_str += csprintf("DB%03d: ", (int)w->wfDynId);
	        }

	        if (mask[lane]) {
	            int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
	            int src_val2 = src1.get<int>(w, lane, 2);
	            src_val3 = src1.get<int>(w, lane, 3);

	            res_str += src_val2 ? csprintf("%016x", src_val1)
	                                : csprintf("%016d", src_val1);
	        } else {
	            res_str += "xxxxxxxxxxxxxxxx";
	        }

	        res_str += (col == 3) ? "\n" : " ";
	    }

	    res_str += "\n\n";
	    if (w->wfDynId == src_val3) {
	        // Pass the text as an argument, never as the format itself: it
	        // embeds the disassembly, and any '%' in it must print literally.
	        DPRINTFN("%s", res_str);
	    }
	#endif
	}

	// Debug-prints call argument 1 of every lane as a float ("%08f").
	//
	// The output is a single string handed to DPRINTFN: a header line with
	// the disassembly, then eight lanes per row, each row prefixed with
	// "DB<wfDynId>: ". Lanes not enabled in w->getPred() are shown as
	// "xxxxxxxx". The body is empty unless TRACING_ON is set.
	void
	Call::MagicPrintWFFloat(Wavefront *w)
	{
	#if TRACING_ON
	    const VectorMask &mask = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    std::string res_str = csprintf("krl_prt (%s)\n", disassemble());

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        // eight values per output row
	        const int col = lane & 7;

	        if (col == 0) {
	            res_str += csprintf("DB%03d: ", (int)w->wfDynId);
	        }

	        if (mask[lane]) {
	            float src_val1 = src1.get<float>(w, lane, 1);
	            res_str += csprintf("%08f", src_val1);
	        } else {
	            res_str += "xxxxxxxx";
	        }

	        res_str += (col == 7) ? "\n" : " ";
	    }

	    res_str += "\n\n";
	    // Pass the text as an argument, never as the format itself: it
	    // embeds the disassembly, and any '%' in it must print literally.
	    DPRINTFN("%s", res_str);
	#endif
	}

	// Prints a summary of the wavefront (slot, kernel id, simd id, compute
	// unit and execution mask) and then raises SIGTRAP, a signal that GDB
	// will catch.
	// when done with the break, type "signal 0" in gdb to continue
	void
	Call::MagicSimBreak(Wavefront *w)
	{
	    // print out state for this wavefront and then break
	    std::string res_str =
	        csprintf("Breakpoint encountered for wavefront %i\n", w->wfSlotId);

	    res_str += csprintf(" Kern ID: %i\n", w->kernId);
	    res_str += csprintf(" Phase ID: %i\n", w->simdId);
	    res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
	    res_str += " Exec mask: ";

	    // Bits are listed from the highest lane down to lane 0, so a group
	    // of eight is complete once the lane whose low three bits are zero
	    // has been written; that is where the separator belongs.
	    for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
	        res_str += w->execMask(i) ? "1" : "0";

	        if ((i & 7) == 0)
	            res_str += " ";
	    }

	    res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());

	    res_str += "\nHelpful debugging hints:\n";
	    res_str += " Check out w->s_reg / w->d_reg for register state\n";

	    res_str += "\n\n";
	    // Pass the text as an argument rather than as the format string so
	    // that nothing in it can be interpreted as a conversion.
	    DPRINTFN("%s", res_str);
	    fflush(stdout);

	    raise(SIGTRAP);
	}

	// Exclusive prefix sum over the lanes enabled in w->getPred().
	//
	// Walking the lanes in ascending order, each active lane receives (via
	// dest) the sum of call argument 1 over all lower-numbered active
	// lanes; the lowest active lane therefore receives 0. Inactive lanes
	// are neither read nor written.
	void
	Call::MagicPrefixSum(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int running_sum = 0;

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        // read this lane's input, publish the sum so far, then fold the
	        // input into the running total for the lanes that follow
	        int addend = src1.get<int>(w, l, 1);
	        dest.set<int>(w, l, running_sum);
	        running_sum += addend;
	    }
	}

	// Reduction magic instruction.
	//
	// Sums call argument 1 over every lane enabled in w->getPred() (one
	// input per active lane of the wavefront) and then writes that single
	// total to dest for each of those same lanes.
	void
	Call::MagicReduction(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int total = 0;

	    // first pass: accumulate the inputs of the active lanes
	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        int addend = src1.get<int>(w, l, 1);
	        total += addend;
	    }

	    // second pass: hand the total back to every active lane
	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        dest.set<int>(w, l, total);
	    }
	}

	// Builds a bit mask from the lower half of the wavefront and broadcasts
	// it to the active lanes.
	//
	// Bit N of the result is set when lane N lies in the lower half of the
	// wavefront (N < wfSize()/2), is enabled in w->getPred(), and has a
	// non-zero call argument 1. The resulting value is then written to dest
	// for every active lane, including those in the upper half.
	void
	Call::MagicMaskLower(Wavefront *w)
	{
	    const VectorMask &mask = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    const int half_lanes = num_lanes / 2;
	    int res = 0;

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        if (!mask[lane])
	            continue;

	        int src_val1 = src1.get<int>(w, lane, 1);

	        if (src_val1 && lane < half_lanes) {
	            res = res | (static_cast<uint32_t>(1) << lane);
	        }
	    }

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        if (mask[lane]) {
	            dest.set<int>(w, lane, res);
	        }
	    }
	}

	// Builds a bit mask from the upper half of the wavefront and broadcasts
	// it to the active lanes.
	//
	// With H = wfSize()/2, bit (N - H) of the result is set when lane N
	// lies in the upper half of the wavefront (N >= H), is enabled in
	// w->getPred(), and has a non-zero call argument 1. The resulting value
	// is then written to dest for every active lane, including those in
	// the lower half.
	void
	Call::MagicMaskUpper(Wavefront *w)
	{
	    const VectorMask &mask = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    const int half_lanes = num_lanes / 2;
	    int res = 0;

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        if (!mask[lane])
	            continue;

	        int src_val1 = src1.get<int>(w, lane, 1);

	        if (src_val1 && lane >= half_lanes) {
	            // rebase the lane index so the upper half starts at bit 0
	            res = res | (static_cast<uint32_t>(1) << (lane - half_lanes));
	        }
	    }

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        if (mask[lane]) {
	            dest.set<int>(w, lane, res);
	        }
	    }
	}

	// Increments w->barCnt for every lane enabled in w->getPred() and
	// raises w->maxBarCnt to the largest incremented count if that count
	// exceeds the current maximum. Lanes that are not active are left
	// untouched and do not take part in the maximum.
	void
	Call::MagicJoinWFBar(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int highest = 0;

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        w->barCnt[l]++;

	        if (w->barCnt[l] > highest)
	            highest = w->barCnt[l];
	    }

	    // only ever raise the recorded maximum here
	    if (highest > w->maxBarCnt)
	        w->maxBarCnt = highest;
	}

	// Decrements w->barCnt for every lane enabled in w->getPred(), then
	// lowers w->maxBarCnt to the largest count found across ALL lanes
	// (active or not) when that value is below the current maximum.
	//
	// Afterwards every entry of the wavefront's instruction buffer except
	// the first is discarded, and a fetch that is still pending is marked
	// to be dropped.
	void
	Call::MagicWaitWFBar(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int highest = 0;

	    for (int l = 0; l < num_lanes; ++l) {
	        if (active[l])
	            w->barCnt[l]--;

	        // unlike the decrement, the maximum considers every lane
	        if (w->barCnt[l] > highest)
	            highest = w->barCnt[l];
	    }

	    // only ever lower the recorded maximum here
	    if (highest < w->maxBarCnt)
	        w->maxBarCnt = highest;

	    // keep only the entry at the front of the instruction buffer
	    w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
	                               w->instructionBuffer.end());

	    if (w->pendingFetch) {
	        w->dropFetch = true;
	    }
	}

	// Calls panic() for the lowest-numbered lane enabled in w->getPred(),
	// reporting call argument 1 of that lane as the failed assertion
	// number together with the lane index. Does nothing when no lane is
	// active.
	void
	Call::MagicPanic(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();

	    for (int l = 0; l < num_lanes; ++l) {
	        if (!active[l])
	            continue;

	        int assert_id = src1.get<int>(w, l, 1);
	        panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
	              assert_id, l);
	    }
	}

	// Computes the per-lane memory address of a magic memory instruction.
	//
	// The address is assembled from two 32-bit call arguments: argument 1
	// supplies the upper 32 bits and argument 2 the lower 32 bits. The
	// result is stored in m->addr[lane] for every lane of the wavefront,
	// without consulting the execution mask.
	void
	Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
	{
	    const int num_lanes = w->computeUnit->wfSize();

	    for (int lane = 0; lane < num_lanes; ++lane) {
	        // Reinterpret both halves as unsigned before widening. Widening
	        // a negative int straight to Addr sign-extends it, which for
	        // the low half would OR ones into all of the upper 32 bits.
	        const uint32_t addr_hi =
	            static_cast<uint32_t>(src1.get<int>(w, lane, 1));
	        const uint32_t addr_lo =
	            static_cast<uint32_t>(src1.get<int>(w, lane, 2));

	        m->addr[lane] =
	            (static_cast<Addr>(addr_hi) << 32) | static_cast<Addr>(addr_lo);
	    }
	}

	// Sets up gpuDynInst as a no-return atomic add on U32 data in the
	// global segment and issues it to the compute unit's global memory
	// pipeline.
	//
	// Per-lane addresses come from calcAddr(); the per-lane operand stored
	// in a_data is call argument 3. After issuing, the wavefront's
	// outstanding-request counters (write, read and total) are incremented
	// and the matching in-pipe counters are decremented.
	void
	Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
	{
	    // instruction attributes
	    setFlag(AtomicNoReturn);
	    setFlag(AtomicAdd);
	    setFlag(NoScope);
	    setFlag(NoOrder);
	    setFlag(GlobalSegment);

	    // per-lane addresses and atomic operands
	    calcAddr(w, gpuDynInst);

	    int *operands = (int*)gpuDynInst->a_data;
	    const int num_lanes = w->computeUnit->wfSize();
	    for (int l = 0; l < num_lanes; ++l) {
	        operands[l] = src1.get<int>(w, l, 3);
	    }

	    // request description
	    gpuDynInst->m_type = U32::memType;
	    gpuDynInst->v_type = U32::vgprType;
	    gpuDynInst->exec_mask = w->execMask();
	    gpuDynInst->statusBitVector = 0;
	    // atomics don't have an equivalence class operand
	    gpuDynInst->equiv = 0;
	    gpuDynInst->n_reg = 1;

	    // identify the issuing wavefront
	    gpuDynInst->simdId = w->simdId;
	    gpuDynInst->wfSlotId = w->wfSlotId;
	    gpuDynInst->wfDynId = w->wfDynId;

	    // route through the global memory pipe
	    gpuDynInst->pipeId = GLBMEM_PIPE;
	    gpuDynInst->latency.init(&w->computeUnit->shader->tick_cnt);
	    gpuDynInst->latency.set(w->computeUnit->shader->ticks(64));
	    w->computeUnit->globalMemoryPipe.issueRequest(gpuDynInst);

	    // bookkeeping: the request moves from "in pipe" to "outstanding"
	    w->outstandingReqsWrGm++;
	    w->wrGmReqsInPipe--;
	    w->outstandingReqsRdGm++;
	    w->rdGmReqsInPipe--;
	    w->outstandingReqs++;
	    w->memReqsInPipe--;
	}

	// Sets up gpuDynInst as a no-return atomic add on U32 data and issues
	// it to the compute unit's global memory pipeline.
	//
	// Per-lane addresses come from calcAddr(); the per-lane operand stored
	// in a_data is call argument 1. After issuing, the wavefront's
	// outstanding-request counters (write, read and total) are incremented
	// and the matching in-pipe counters are decremented.
	//
	// NOTE(review): despite the "Group" name this sets GlobalSegment and
	// uses GLBMEM_PIPE, and it takes the operand from argument 1, which
	// calcAddr() also reads as the upper address word - confirm that this
	// is intended.
	void
	Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
	{
	    // instruction attributes
	    setFlag(AtomicNoReturn);
	    setFlag(AtomicAdd);
	    setFlag(NoScope);
	    setFlag(NoOrder);
	    setFlag(GlobalSegment);

	    // per-lane addresses and atomic operands
	    calcAddr(w, gpuDynInst);

	    int *operands = (int*)gpuDynInst->a_data;
	    const int num_lanes = w->computeUnit->wfSize();
	    for (int l = 0; l < num_lanes; ++l) {
	        operands[l] = src1.get<int>(w, l, 1);
	    }

	    // request description
	    gpuDynInst->m_type = U32::memType;
	    gpuDynInst->v_type = U32::vgprType;
	    gpuDynInst->exec_mask = w->execMask();
	    gpuDynInst->statusBitVector = 0;
	    // atomics don't have an equivalence class operand
	    gpuDynInst->equiv = 0;
	    gpuDynInst->n_reg = 1;

	    // identify the issuing wavefront
	    gpuDynInst->simdId = w->simdId;
	    gpuDynInst->wfSlotId = w->wfSlotId;
	    gpuDynInst->wfDynId = w->wfDynId;

	    // route through the global memory pipe
	    gpuDynInst->pipeId = GLBMEM_PIPE;
	    gpuDynInst->latency.init(&w->computeUnit->shader->tick_cnt);
	    gpuDynInst->latency.set(w->computeUnit->shader->ticks(64));
	    w->computeUnit->globalMemoryPipe.issueRequest(gpuDynInst);

	    // bookkeeping: the request moves from "in pipe" to "outstanding"
	    w->outstandingReqsWrGm++;
	    w->wrGmReqsInPipe--;
	    w->outstandingReqsRdGm++;
	    w->rdGmReqsInPipe--;
	    w->outstandingReqs++;
	    w->memReqsInPipe--;
	}

	// Sets up gpuDynInst as a U32 load from the global segment and issues
	// it to the compute unit's global memory pipeline.
	//
	// Per-lane addresses come from calcAddr(). After issuing, the
	// wavefront's outstanding read and total request counters are
	// incremented and the matching in-pipe counters are decremented.
	void
	Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
	{
	    // instruction attributes
	    setFlag(Load);
	    setFlag(NoScope);
	    setFlag(NoOrder);
	    setFlag(GlobalSegment);

	    // calculate the address
	    calcAddr(w, gpuDynInst);

	    // request description
	    gpuDynInst->m_type = U32::memType; //MemDataType::memType;
	    gpuDynInst->v_type = U32::vgprType; //DestDataType::vgprType;
	    gpuDynInst->exec_mask = w->execMask();
	    gpuDynInst->statusBitVector = 0;
	    gpuDynInst->equiv = 0;
	    gpuDynInst->n_reg = 1;

	    // FIXME
	    //gpuDynInst->dst_reg = this->dest.regIndex();

	    // identify the issuing wavefront
	    gpuDynInst->simdId = w->simdId;
	    gpuDynInst->wfSlotId = w->wfSlotId;
	    gpuDynInst->wfDynId = w->wfDynId;

	    // route through the global memory pipe
	    gpuDynInst->pipeId = GLBMEM_PIPE;
	    gpuDynInst->latency.init(&w->computeUnit->shader->tick_cnt);
	    gpuDynInst->latency.set(w->computeUnit->shader->ticks(1));
	    w->computeUnit->globalMemoryPipe.issueRequest(gpuDynInst);

	    // bookkeeping: the request moves from "in pipe" to "outstanding"
	    w->outstandingReqsRdGm++;
	    w->rdGmReqsInPipe--;
	    w->outstandingReqs++;
	    w->memReqsInPipe--;
	}

	// Appends this wavefront's (simdId, wfSlotId) identifier to the wave
	// queue that the compute unit keeps in xactCasLoadMap.
	//
	// The map key is call argument 1 of the lowest-numbered lane enabled
	// in w->getPred(), or 0 when no lane is active. A key that is not yet
	// present gets a fresh, empty queue first.
	void
	Call::MagicXactCasLd(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    const int num_lanes = w->computeUnit->wfSize();
	    int key = 0;

	    // locate the first active lane, if there is one
	    int first = 0;
	    while (first < num_lanes && !active[first]) {
	        ++first;
	    }

	    if (first < num_lanes) {
	        key = src1.get<int>(w, first, 1);
	    }

	    // create an empty queue the first time this key is seen
	    if (w->computeUnit->xactCasLoadMap.count(key) == 0) {
	        w->computeUnit->xactCasLoadMap[key] = ComputeUnit::waveQueue();
	        w->computeUnit->xactCasLoadMap[key].waveIDQueue.clear();
	    }

	    w->computeUnit->xactCasLoadMap[key].waveIDQueue.push_back(
	        ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
	}

	// Flags the most significant active lane.
	//
	// Scanning from the highest lane index downwards, the first lane found
	// enabled in w->getPred() has 1 written to dest; every other active
	// lane has 0 written. Inactive lanes are not written.
	void
	Call::MagicMostSigThread(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    unsigned is_top = true;

	    for (int l = w->computeUnit->wfSize(); l-- > 0; ) {
	        if (!active[l])
	            continue;

	        dest.set<int>(w, l, is_top);
	        // every active lane below the first one found gets 0
	        is_top = false;
	    }
	}

	// Broadcasts the value held by the most significant active lane.
	//
	// Scanning from the highest lane index downwards, call argument 1 of
	// the first lane found enabled in w->getPred() is captured and then
	// written to dest for that lane and every other active lane. Inactive
	// lanes are not written.
	void
	Call::MagicMostSigBroadcast(Wavefront *w)
	{
	    const VectorMask &active = w->getPred();
	    bool captured = false;
	    int value = 0;

	    for (int l = w->computeUnit->wfSize(); l-- > 0; ) {
	        if (!active[l])
	            continue;

	        // the first active lane encountered supplies the value
	        if (!captured) {
	            value = src1.get<int>(w, l, 1);
	            captured = true;
	        }

	        dest.set<int>(w, l, value);
	    }
	}

	} // namespace HsailISA