gpu-compute, arch-gcn3: Change how waitcnts are implemented

Use a single counter per memory operation type and increment
it when the instruction is issued rather than when it executes.

Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 19c6e63..bf8f60f 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -32565,6 +32565,7 @@
 
         vdst.write();
 
+        wf->decLGKMInstsIssued();
         wf->rdLmReqsInPipe--;
         wf->validateRequestCounters();
     } // execute
@@ -32635,6 +32636,7 @@
 
         vdst.write();
 
+        wf->decLGKMInstsIssued();
         wf->rdLmReqsInPipe--;
         wf->validateRequestCounters();
     } // execute
@@ -39400,6 +39402,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
             return;
@@ -39496,6 +39500,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
             return;
@@ -39592,6 +39598,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
             return;
@@ -39660,6 +39668,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
             return;
@@ -39728,6 +39738,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
             return;
@@ -39805,6 +39817,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->rdGmReqsInPipe--;
             wf->rdLmReqsInPipe--;
         }
@@ -39884,6 +39898,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -39952,6 +39968,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -40021,6 +40039,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -40090,6 +40110,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -40159,6 +40181,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -40237,6 +40261,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->wrLmReqsInPipe--;
             return;
@@ -40325,6 +40351,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->rdGmReqsInPipe--;
             return;
@@ -40425,6 +40453,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->rdGmReqsInPipe--;
             return;
@@ -40526,6 +40556,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->rdGmReqsInPipe--;
             return;
@@ -40893,6 +40925,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->rdGmReqsInPipe--;
             return;
@@ -40995,6 +41029,8 @@
         Wavefront *wf = gpuDynInst->wavefront();
 
         if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
             wf->wrGmReqsInPipe--;
             wf->rdGmReqsInPipe--;
             return;
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index 327b5d1..eedd938 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -133,6 +133,7 @@
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                 m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);
+        w->decVMemInstsIssued();
 
         if (m->isLoad() || m->isAtomicRet()) {
             w->computeUnit->vrf[w->simdId]->
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index ef5f5dd..debbc37 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -821,6 +821,7 @@
     if (executedAs() == Enums::SC_GLOBAL) {
         // no transormation for global segment
         wavefront()->execUnitId =  wavefront()->flatGmUnitId;
+        wavefront()->decLGKMInstsIssued();
         if (isLoad()) {
             wavefront()->rdLmReqsInPipe--;
         } else if (isStore()) {
@@ -840,6 +841,7 @@
             }
         }
         wavefront()->execUnitId =  wavefront()->flatLmUnitId;
+        wavefront()->decVMemInstsIssued();
         if (isLoad()) {
             wavefront()->rdGmReqsInPipe--;
         } else if (isStore()) {
@@ -899,6 +901,7 @@
             }
         }
         wavefront()->execUnitId =  wavefront()->flatLmUnitId;
+        wavefront()->decLGKMInstsIssued();
         if (isLoad()) {
             wavefront()->rdGmReqsInPipe--;
         } else if (isStore()) {
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index e34627c..a4f62a6 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -78,6 +78,7 @@
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
                 m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);
+        w->decLGKMInstsIssued();
 
         if (m->isLoad() || m->isAtomicRet()) {
             w->computeUnit->vrf[w->simdId]->
diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc
index 35b4ca5..5e4496d 100644
--- a/src/gpu-compute/scalar_memory_pipeline.cc
+++ b/src/gpu-compute/scalar_memory_pipeline.cc
@@ -85,6 +85,7 @@
         }
 
         m->completeAcc(m);
+        w->decLGKMInstsIssued();
 
         if (m->isLoad() || m->isAtomic()) {
             returnedLoads.pop();
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index 1f4a96d..0ee6a0f 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -139,6 +139,15 @@
             // this wave spends in SCH stage.
             wf->schCycles++;
             addToSchListStalls[j]++;
+        } else {
+            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
+                wf->incLGKMInstsIssued();
+            } else {
+                wf->incVMemInstsIssued();
+                if (gpu_dyn_inst->isFlat()) {
+                    wf->incLGKMInstsIssued();
+                }
+            }
         }
     }
 
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 6f3b755..10a1505 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -55,6 +55,7 @@
   : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
     maxIbSize(p->max_ib_size), _gpuISA(*this),
     vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
+    vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
     barId(WFBarrier::InvalidID)
 {
     lastTrace = 0;
@@ -1255,37 +1256,27 @@
         return false;
     }
 
-    // If we reach here, that means waitCnt instruction is executed and
-    // the waitcnts are set by the execute method. Check if waitcnts are
-    // satisfied.
-
-    // current number of vector memory ops in flight
-    int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm;
-
-    // current number of export insts or vector memory writes in flight
-    int exp_cnt = outstandingReqsWrGm;
-
-    // current number of scalar/LDS memory ops in flight
-    // we do not consider GDS/message ops
-    int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm +
-        scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm;
-
+    /**
+     * If we reach here, that means an s_waitcnt instruction was executed
+     * and the waitcnts are set by the execute method. Check if waitcnts
+     * are satisfied.
+     */
     if (vmWaitCnt != -1) {
-        if (vm_cnt > vmWaitCnt) {
+        if (vmemInstsIssued > vmWaitCnt) {
             // vmWaitCnt not satisfied
             return false;
         }
     }
 
     if (expWaitCnt != -1) {
-        if (exp_cnt > expWaitCnt) {
+        if (expInstsIssued > expWaitCnt) {
             // expWaitCnt not satisfied
             return false;
         }
     }
 
     if (lgkmWaitCnt != -1) {
-        if (lgkm_cnt > lgkmWaitCnt) {
+        if (lgkmInstsIssued > lgkmWaitCnt) {
             // lgkmWaitCnt not satisfied
             return false;
         }
@@ -1357,6 +1348,42 @@
     status = S_RUNNING;
 }
 
+void
+Wavefront::incVMemInstsIssued()
+{
+    ++vmemInstsIssued;  // tested against vmWaitCnt by the waitcnt check
+}
+
+void
+Wavefront::incExpInstsIssued()
+{
+    ++expInstsIssued;  // tested against expWaitCnt by the waitcnt check
+}
+
+void
+Wavefront::incLGKMInstsIssued()
+{
+    ++lgkmInstsIssued;  // tested against lgkmWaitCnt by the waitcnt check
+}
+
+void
+Wavefront::decVMemInstsIssued()
+{
+    --vmemInstsIssued;  // NOTE(review): no underflow guard; assumes prior inc
+}
+
+void
+Wavefront::decExpInstsIssued()
+{
+    --expInstsIssued;  // NOTE(review): no underflow guard; assumes prior inc
+}
+
+void
+Wavefront::decLGKMInstsIssued()
+{
+    --lgkmInstsIssued;  // NOTE(review): no underflow guard; assumes prior inc
+}
+
 Addr
 Wavefront::pc() const
 {
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 59a6108..f05c17a 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -306,6 +306,13 @@
     void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
     void clearWaitCnts();
 
+    void incVMemInstsIssued();
+    void incExpInstsIssued();
+    void incLGKMInstsIssued();
+    void decVMemInstsIssued();
+    void decExpInstsIssued();
+    void decLGKMInstsIssued();
+
     /** Freeing VRF space */
     void freeRegisterFile();
 
@@ -345,6 +352,9 @@
     int vmWaitCnt;
     int expWaitCnt;
     int lgkmWaitCnt;
+    int vmemInstsIssued;
+    int expInstsIssued;
+    int lgkmInstsIssued;
     status_e status;
     Addr _pc;
     VectorMask _execMask;