riscv: fix AMO, LR and SC instructions

(1) Atomic Memory Operation (AMO)

This patch changes how RISC-V AMO instructions are implemented. Instead
of issuing a locking load and an unlocking store request to the
downstream memory system for each AMO, the patch issues a single memory
request that carries the corresponding AtomicOpFunctor. Once the memory
system receives the request, the atomic operation is executed in a
single step.
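
For example, amoadd.w now hands the memory system a functor built from
the AtomicGenericOp template added in amo.hh (a sketch taken from the
decoder change below, with the surrounding decode boilerplate omitted):

    // amoadd.w: the memory system applies the lambda to the target
    // word atomically and returns the old value in one step
    TypedAtomicOpFunctor<int32_t> *amo_op =
        new AtomicGenericOp<int32_t>(Rs2_sw,
            [](int32_t *b, int32_t a) { *b += a; });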

This patch also changes how AMO instructions handle the acquire and
release flags (e.g., amoadd.aq and amoadd.rl). If an AMO carries the
acquire flag, a memory-fence micro-op is inserted after the AMO. If it
carries the release flag, a memory-fence micro-op is inserted before
the AMO. If both flags are set, the AMO is broken down into a sequence
of three micro-ops: mem fence -> atomic RMW -> mem fence. This change
makes the AMO implementation comply with the release consistency model.
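
For instance, an AMO with both flags set expands into the following
micro-op sequence (a sketch of what the constructor templates in
amo.isa emit):

    rel_fence   // IsFirstMicroop, IsMemBarrier, IsDelayedCommit
    atomic RMW  // IsDelayedCommit
    acq_fence   // IsLastMicroop, IsMemBarrier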

(2) Load-Reserved (LR) and Store-Conditional (SC)

Addresses locked by LR instructions are tracked in a stack data
structure. An LR instruction pushes its target address onto the stack,
and an SC instruction pops the top address from the stack. As specified
by the RISC-V ISA, an SC fails if its target address does not match
that of the most recent LR.
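
In the simulator this corresponds roughly to the following (simplified
from locked_mem.hh below, ignoring fault handling and debug output):

    // LR: reserve the 16-byte-aligned target address
    locked_addr_stack.push(req->getPaddr() & ~0xF);

    // SC: succeed only if the reservation on top of the stack matches
    bool success = !locked_addr_stack.empty()
        && locked_addr_stack.top() == (req->getPaddr() & ~0xF);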

Previously, there was a single stack shared by all hardware thread
contexts. A stack shared across contexts can lead to an infinite
sequence of failed SCs if LRs from other threads keep pushing new
addresses onto it.

To address the problem, this patch gives each context its own private
stack.
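
Concretely, the single global stack becomes a map keyed by context id
(as in locked_mem.hh/.cc below):

    extern std::unordered_map<int, std::stack<Addr>> locked_addrs;

    // each hart context gets its own reservation stack
    std::stack<Addr> &locked_addr_stack = locked_addrs[xc->contextId()];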

This patch also adds extra memory-fence micro-ops to LR/SC to guarantee
correct ordering of memory instructions with respect to the release
consistency model.

Change-Id: I1e95900367c89dd866ba872a5203f63359ac51ae
Reviewed-on: https://gem5-review.googlesource.com/c/8189
Reviewed-by: Alec Roelke <ar4jc@virginia.edu>
Maintainer: Alec Roelke <ar4jc@virginia.edu>
diff --git a/src/arch/riscv/insts/amo.cc b/src/arch/riscv/insts/amo.cc
index 7f5740f..d120647 100644
--- a/src/arch/riscv/insts/amo.cc
+++ b/src/arch/riscv/insts/amo.cc
@@ -43,6 +43,22 @@
 namespace RiscvISA
 {
 
+// memfence micro instruction
+string MemFenceMicro::generateDisassembly(Addr pc,
+    const SymbolTable *symtab) const
+{
+    stringstream ss;
+    ss << csprintf("0x%08x", machInst) << ' ' << mnemonic;
+    return ss.str();
+}
+
+Fault MemFenceMicro::execute(ExecContext *xc,
+    Trace::InstRecord *traceData) const
+{
+    return NoFault;
+}
+
+// load-reserved
 string LoadReserved::generateDisassembly(Addr pc,
     const SymbolTable *symtab) const
 {
@@ -52,6 +68,16 @@
     return ss.str();
 }
 
+string LoadReservedMicro::generateDisassembly(Addr pc,
+    const SymbolTable *symtab) const
+{
+    stringstream ss;
+    ss << mnemonic << ' ' << registerName(_destRegIdx[0]) << ", ("
+            << registerName(_srcRegIdx[0]) << ')';
+    return ss.str();
+}
+
+// store-conditional
 string StoreCond::generateDisassembly(Addr pc,
     const SymbolTable *symtab) const
 {
@@ -62,6 +88,17 @@
     return ss.str();
 }
 
+string StoreCondMicro::generateDisassembly(Addr pc,
+    const SymbolTable *symtab) const
+{
+    stringstream ss;
+    ss << mnemonic << ' ' << registerName(_destRegIdx[0]) << ", "
+            << registerName(_srcRegIdx[1]) << ", ("
+            << registerName(_srcRegIdx[0]) << ')';
+    return ss.str();
+}
+
+// AMOs
 string AtomicMemOp::generateDisassembly(Addr pc,
     const SymbolTable *symtab) const
 {
@@ -76,8 +113,10 @@
     const SymbolTable *symtab) const
 {
     stringstream ss;
-    ss << csprintf("0x%08x", machInst) << ' ' << mnemonic;
+    ss << mnemonic << ' ' << registerName(_destRegIdx[0]) << ", "
+            << registerName(_srcRegIdx[1]) << ", ("
+            << registerName(_srcRegIdx[0]) << ')';
     return ss.str();
 }
 
-}
\ No newline at end of file
+}
diff --git a/src/arch/riscv/insts/amo.hh b/src/arch/riscv/insts/amo.hh
index 7c07bc2..748fe14 100644
--- a/src/arch/riscv/insts/amo.hh
+++ b/src/arch/riscv/insts/amo.hh
@@ -41,24 +41,62 @@
 namespace RiscvISA
 {
 
-class LoadReserved : public MemInst
+// memfence micro instruction
+class MemFenceMicro : public RiscvMicroInst
+{
+  public:
+    MemFenceMicro(ExtMachInst _machInst, OpClass __opClass)
+        : RiscvMicroInst("fence", _machInst, __opClass)
+    { }
+  protected:
+    using RiscvMicroInst::RiscvMicroInst;
+
+    Fault execute(ExecContext *, Trace::InstRecord *) const override;
+    std::string generateDisassembly(
+        Addr pc, const SymbolTable *symtab) const override;
+};
+
+// load-reserved
+class LoadReserved : public RiscvMacroInst
 {
   protected:
-    using MemInst::MemInst;
+    using RiscvMacroInst::RiscvMacroInst;
 
     std::string generateDisassembly(
         Addr pc, const SymbolTable *symtab) const override;
 };
 
-class StoreCond : public MemInst
+class LoadReservedMicro : public RiscvMicroInst
 {
   protected:
-    using MemInst::MemInst;
+    Request::Flags memAccessFlags;
+    using RiscvMicroInst::RiscvMicroInst;
 
     std::string generateDisassembly(
         Addr pc, const SymbolTable *symtab) const override;
 };
 
+// store-cond
+class StoreCond : public RiscvMacroInst
+{
+  protected:
+    using RiscvMacroInst::RiscvMacroInst;
+
+    std::string generateDisassembly(
+        Addr pc, const SymbolTable *symtab) const override;
+};
+
+class StoreCondMicro : public RiscvMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags;
+    using RiscvMicroInst::RiscvMicroInst;
+
+    std::string generateDisassembly(
+        Addr pc, const SymbolTable *symtab) const override;
+};
+
+// AMOs
 class AtomicMemOp : public RiscvMacroInst
 {
   protected:
@@ -78,6 +116,23 @@
         Addr pc, const SymbolTable *symtab) const override;
 };
 
+/**
+ * A generic atomic op class
+ */
+
+template<typename T>
+class AtomicGenericOp : public TypedAtomicOpFunctor<T>
+{
+  public:
+    AtomicGenericOp(T _a, std::function<void(T*,T)> _op)
+      : a(_a), op(_op) { }
+    AtomicOpFunctor* clone() { return new AtomicGenericOp<T>(*this); }
+    void execute(T *b) { op(b, a); }
+  private:
+    T a;
+    std::function<void(T*,T)> op;
+};
+
 }
 
-#endif // __ARCH_RISCV_INSTS_AMO_HH__
\ No newline at end of file
+#endif // __ARCH_RISCV_INSTS_AMO_HH__
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 3a04a02..8de4829 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -512,44 +512,69 @@
                 }}, {{
                     Rd = result;
                 }}, inst_flags=IsStoreConditional, mem_flags=LLSC);
-                format AtomicMemOp {
-                    0x0: amoadd_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = Rs2_sw + Rt_sd;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x1: amoswap_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = Rs2_uw;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x4: amoxor_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = Rs2_uw^Rt_sd;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x8: amoor_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = Rs2_uw | Rt_sd;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0xc: amoand_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = Rs2_uw&Rt_sd;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x10: amomin_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = min<int32_t>(Rs2_sw, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x14: amomax_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = max<int32_t>(Rs2_sw, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x18: amominu_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = min<uint32_t>(Rs2_uw, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x1c: amomaxu_w({{Rt_sd = Mem_sw;}}, {{
-                        Mem_sw = max<uint32_t>(Rs2_uw, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                }
+                0x0: AtomicMemOp::amoadd_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<int32_t> *amo_op =
+                          new AtomicGenericOp<int32_t>(Rs2_sw,
+                                  [](int32_t* b, int32_t a){ *b += a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x1: AtomicMemOp::amoswap_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                          new AtomicGenericOp<uint32_t>(Rs2_uw,
+                                  [](uint32_t* b, uint32_t a){ *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x4: AtomicMemOp::amoxor_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                          new AtomicGenericOp<uint32_t>(Rs2_uw,
+                                  [](uint32_t* b, uint32_t a){ *b ^= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x8: AtomicMemOp::amoor_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                          new AtomicGenericOp<uint32_t>(Rs2_uw,
+                                  [](uint32_t* b, uint32_t a){ *b |= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0xc: AtomicMemOp::amoand_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                          new AtomicGenericOp<uint32_t>(Rs2_uw,
+                                  [](uint32_t* b, uint32_t a){ *b &= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x10: AtomicMemOp::amomin_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<int32_t> *amo_op =
+                      new AtomicGenericOp<int32_t>(Rs2_sw,
+                        [](int32_t* b, int32_t a){ if (a < *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x14: AtomicMemOp::amomax_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<int32_t> *amo_op =
+                      new AtomicGenericOp<int32_t>(Rs2_sw,
+                        [](int32_t* b, int32_t a){ if (a > *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x18: AtomicMemOp::amominu_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                      new AtomicGenericOp<uint32_t>(Rs2_uw,
+                        [](uint32_t* b, uint32_t a){ if (a < *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x1c: AtomicMemOp::amomaxu_w({{
+                    Rd_sd = Mem_sw;
+                }}, {{
+                    TypedAtomicOpFunctor<uint32_t> *amo_op =
+                      new AtomicGenericOp<uint32_t>(Rs2_uw,
+                        [](uint32_t* b, uint32_t a){ if (a > *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
             }
             0x3: decode AMOFUNCT {
                 0x2: LoadReserved::lr_d({{
@@ -560,44 +585,69 @@
                 }}, {{
                     Rd = result;
                 }}, mem_flags=LLSC, inst_flags=IsStoreConditional);
-                format AtomicMemOp {
-                    0x0: amoadd_d({{Rt_sd = Mem_sd;}}, {{
-                        Mem_sd = Rs2_sd + Rt_sd;
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x1: amoswap_d({{Rt = Mem;}}, {{
-                        Mem = Rs2;
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                    0x4: amoxor_d({{Rt = Mem;}}, {{
-                        Mem = Rs2^Rt;
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                    0x8: amoor_d({{Rt = Mem;}}, {{
-                        Mem = Rs2 | Rt;
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                    0xc: amoand_d({{Rt = Mem;}}, {{
-                        Mem = Rs2&Rt;
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                    0x10: amomin_d({{Rt_sd = Mem_sd;}}, {{
-                        Mem_sd = min(Rs2_sd, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x14: amomax_d({{Rt_sd = Mem_sd;}}, {{
-                        Mem_sd = max(Rs2_sd, Rt_sd);
-                        Rd_sd = Rt_sd;
-                    }}, {{EA = Rs1;}});
-                    0x18: amominu_d({{Rt = Mem;}}, {{
-                        Mem = min(Rs2, Rt);
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                    0x1c: amomaxu_d({{Rt = Mem;}}, {{
-                        Mem = max(Rs2, Rt);
-                        Rd = Rt;
-                    }}, {{EA = Rs1;}});
-                }
+                0x0: AtomicMemOp::amoadd_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<int64_t> *amo_op =
+                          new AtomicGenericOp<int64_t>(Rs2_sd,
+                                  [](int64_t* b, int64_t a){ *b += a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x1: AtomicMemOp::amoswap_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                          new AtomicGenericOp<uint64_t>(Rs2_ud,
+                                  [](uint64_t* b, uint64_t a){ *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x4: AtomicMemOp::amoxor_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                          new AtomicGenericOp<uint64_t>(Rs2_ud,
+                                 [](uint64_t* b, uint64_t a){ *b ^= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x8: AtomicMemOp::amoor_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                          new AtomicGenericOp<uint64_t>(Rs2_ud,
+                                 [](uint64_t* b, uint64_t a){ *b |= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0xc: AtomicMemOp::amoand_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                          new AtomicGenericOp<uint64_t>(Rs2_ud,
+                                 [](uint64_t* b, uint64_t a){ *b &= a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x10: AtomicMemOp::amomin_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<int64_t> *amo_op =
+                      new AtomicGenericOp<int64_t>(Rs2_sd,
+                        [](int64_t* b, int64_t a){ if (a < *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x14: AtomicMemOp::amomax_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<int64_t> *amo_op =
+                      new AtomicGenericOp<int64_t>(Rs2_sd,
+                        [](int64_t* b, int64_t a){ if (a > *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x18: AtomicMemOp::amominu_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                      new AtomicGenericOp<uint64_t>(Rs2_ud,
+                        [](uint64_t* b, uint64_t a){ if (a < *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
+                0x1c: AtomicMemOp::amomaxu_d({{
+                    Rd_sd = Mem_sd;
+                }}, {{
+                    TypedAtomicOpFunctor<uint64_t> *amo_op =
+                      new AtomicGenericOp<uint64_t>(Rs2_ud,
+                        [](uint64_t* b, uint64_t a){ if (a > *b) *b = a; });
+                }}, mem_flags=ATOMIC_RETURN_OP);
             }
         }
         0x0c: decode FUNCT3 {
diff --git a/src/arch/riscv/isa/formats/amo.isa b/src/arch/riscv/isa/formats/amo.isa
index 1dca571..cc7346a 100644
--- a/src/arch/riscv/isa/formats/amo.isa
+++ b/src/arch/riscv/isa/formats/amo.isa
@@ -29,10 +29,7 @@
 //
 // Authors: Alec Roelke
 
-////////////////////////////////////////////////////////////////////
-//
-// Atomic memory operation instructions
-//
+// Declaration templates
 def template AtomicMemOpDeclare {{
     /**
      * Static instruction class for an AtomicMemOp operation
@@ -45,24 +42,14 @@
 
     protected:
 
-        class %(class_name)sLoad : public %(base_class)sMicro
+        /*
+         * The main RMW part of an AMO
+         */
+        class %(class_name)sRMW : public %(base_class)sMicro
         {
           public:
             // Constructor
-            %(class_name)sLoad(ExtMachInst machInst, %(class_name)s *_p);
-
-            Fault execute(ExecContext *, Trace::InstRecord *) const override;
-            Fault initiateAcc(ExecContext *,
-                              Trace::InstRecord *) const override;
-            Fault completeAcc(PacketPtr, ExecContext *,
-                              Trace::InstRecord *) const override;
-        };
-
-        class %(class_name)sStore : public %(base_class)sMicro
-        {
-          public:
-            // Constructor
-            %(class_name)sStore(ExtMachInst machInst, %(class_name)s *_p);
+            %(class_name)sRMW(ExtMachInst machInst, %(class_name)s *_p);
 
             Fault execute(ExecContext *, Trace::InstRecord *) const override;
             Fault initiateAcc(ExecContext *,
@@ -73,15 +60,90 @@
     };
 }};
 
-def template LRSCConstructor {{
+def template LRSCDeclare {{
+    /**
+     * Static instruction class for an AtomicMemOp operation
+     */
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst);
+
+    protected:
+
+        class %(class_name)sMicro : public %(base_class)sMicro
+        {
+          public:
+            // Constructor
+            %(class_name)sMicro(ExtMachInst machInst, %(class_name)s *_p);
+
+            Fault execute(ExecContext *, Trace::InstRecord *) const override;
+            Fault initiateAcc(ExecContext *,
+                              Trace::InstRecord *) const override;
+            Fault completeAcc(PacketPtr, ExecContext *,
+                              Trace::InstRecord *) const override;
+        };
+    };
+}};
+
+// Constructor templates
+def template LRSCMacroConstructor {{
     %(class_name)s::%(class_name)s(ExtMachInst machInst):
         %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
     {
         %(constructor)s;
-        if (AQ)
-            memAccessFlags = memAccessFlags | Request::ACQUIRE;
-        if (RL)
-            memAccessFlags = memAccessFlags | Request::RELEASE;
+
+        StaticInstPtr rel_fence;
+        StaticInstPtr lrsc;
+        StaticInstPtr acq_fence;
+
+        // set up release fence
+        if (RL) {
+            rel_fence = new MemFenceMicro(machInst, No_OpClass);
+            rel_fence->setFlag(IsFirstMicroop);
+            rel_fence->setFlag(IsMemBarrier);
+            rel_fence->setFlag(IsDelayedCommit);
+        }
+
+        // set up atomic rmw op
+        lrsc = new %(class_name)sMicro(machInst, this);
+
+        if (!RL) {
+            lrsc->setFlag(IsFirstMicroop);
+        }
+
+        if (!AQ) {
+            lrsc->setFlag(IsLastMicroop);
+        } else {
+            lrsc->setFlag(IsDelayedCommit);
+        }
+
+        // set up acquire fence
+        if (AQ) {
+            acq_fence = new MemFenceMicro(machInst, No_OpClass);
+            acq_fence->setFlag(IsLastMicroop);
+            acq_fence->setFlag(IsMemBarrier);
+        }
+
+        if (RL && AQ) {
+            microops = {rel_fence, lrsc, acq_fence};
+        } else if (RL) {
+            microops = {rel_fence, lrsc};
+        } else if (AQ) {
+            microops = {lrsc, acq_fence};
+        } else {
+            microops = {lrsc};
+        }
+    }
+}};
+
+def template LRSCMicroConstructor {{
+    %(class_name)s::%(class_name)sMicro::%(class_name)sMicro(
+        ExtMachInst machInst, %(class_name)s *_p)
+            : %(base_class)sMicro("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        %(constructor)s;
     }
 }};
 
@@ -90,39 +152,95 @@
             : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
     {
         %(constructor)s;
-        microops = {new %(class_name)sLoad(machInst, this),
-            new %(class_name)sStore(machInst, this)};
+
+        StaticInstPtr rel_fence;
+        StaticInstPtr rmw_op;
+        StaticInstPtr acq_fence;
+
+        // set up release fence
+        if (RL) {
+            rel_fence = new MemFenceMicro(machInst, No_OpClass);
+            rel_fence->setFlag(IsFirstMicroop);
+            rel_fence->setFlag(IsMemBarrier);
+            rel_fence->setFlag(IsDelayedCommit);
+        }
+
+        // set up atomic rmw op
+        rmw_op = new %(class_name)sRMW(machInst, this);
+
+        if (!RL) {
+            rmw_op->setFlag(IsFirstMicroop);
+        }
+
+        if (!AQ) {
+            rmw_op->setFlag(IsLastMicroop);
+        } else {
+            rmw_op->setFlag(IsDelayedCommit);
+        }
+
+        // set up acquire fence
+        if (AQ) {
+            acq_fence = new MemFenceMicro(machInst, No_OpClass);
+            acq_fence->setFlag(IsLastMicroop);
+            acq_fence->setFlag(IsMemBarrier);
+        }
+
+        if (RL && AQ) {
+            microops = {rel_fence, rmw_op, acq_fence};
+        } else if (RL) {
+            microops = {rel_fence, rmw_op};
+        } else if (AQ) {
+            microops = {rmw_op, acq_fence};
+        } else {
+            microops = {rmw_op};
+        }
     }
 }};
 
-def template AtomicMemOpLoadConstructor {{
-    %(class_name)s::%(class_name)sLoad::%(class_name)sLoad(
+def template AtomicMemOpRMWConstructor {{
+    %(class_name)s::%(class_name)sRMW::%(class_name)sRMW(
         ExtMachInst machInst, %(class_name)s *_p)
             : %(base_class)s("%(mnemonic)s[l]", machInst, %(op_class)s)
     {
         %(constructor)s;
-        flags[IsFirstMicroop] = true;
-        flags[IsDelayedCommit] = true;
-        if (AQ)
-            memAccessFlags = Request::ACQUIRE;
+
+        // overwrite default flags
+        flags[IsMemRef] = true;
+        flags[IsLoad] = false;
+        flags[IsStore] = false;
+        flags[IsAtomic] = true;
     }
 }};
 
-def template AtomicMemOpStoreConstructor {{
-    %(class_name)s::%(class_name)sStore::%(class_name)sStore(
-        ExtMachInst machInst, %(class_name)s *_p)
-            : %(base_class)s("%(mnemonic)s[s]", machInst, %(op_class)s)
+// execute() templates
+
+def template LoadReservedExecute {{
+    Fault
+    %(class_name)s::%(class_name)sMicro::execute(
+        ExecContext *xc, Trace::InstRecord *traceData) const
     {
-        %(constructor)s;
-        flags[IsLastMicroop] = true;
-        flags[IsNonSpeculative] = true;
-        if (RL)
-            memAccessFlags = Request::RELEASE;
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            fault = readMemAtomic(xc, traceData, EA, Mem, memAccessFlags);
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
     }
 }};
 
 def template StoreCondExecute {{
-    Fault %(class_name)s::execute(ExecContext *xc,
+    Fault %(class_name)s::%(class_name)sMicro::execute(ExecContext *xc,
         Trace::InstRecord *traceData) const
     {
         Addr EA;
@@ -157,8 +275,8 @@
     }
 }};
 
-def template AtomicMemOpLoadExecute {{
-    Fault %(class_name)s::%(class_name)sLoad::execute(ExecContext *xc,
+def template AtomicMemOpRMWExecute {{
+    Fault %(class_name)s::%(class_name)sRMW::execute(ExecContext *xc,
         Trace::InstRecord *traceData) const
     {
         Addr EA;
@@ -167,13 +285,18 @@
         %(op_decl)s;
         %(op_rd)s;
         %(ea_code)s;
+        %(amoop_code)s;
+
+        assert(amo_op);
 
         if (fault == NoFault) {
-            fault = readMemAtomic(xc, traceData, EA, Mem, memAccessFlags);
+            fault = amoMemAtomic(xc, traceData, Mem, EA, memAccessFlags,
+                                 amo_op);
+            %(memacc_code)s;
         }
 
         if (fault == NoFault) {
-            %(code)s;
+            %(postacc_code)s;
         }
 
         if (fault == NoFault) {
@@ -184,36 +307,11 @@
     }
 }};
 
-def template AtomicMemOpStoreExecute {{
-    Fault %(class_name)s::%(class_name)sStore::execute(ExecContext *xc,
-        Trace::InstRecord *traceData) const
-    {
-        Addr EA;
-        Fault fault = NoFault;
+// initiateAcc() templates
 
-        %(op_decl)s;
-        %(op_rd)s;
-        %(ea_code)s;
-
-        if (fault == NoFault) {
-            %(code)s;
-        }
-
-        if (fault == NoFault) {
-            fault = writeMemAtomic(xc, traceData, Mem, EA, memAccessFlags,
-                nullptr);
-        }
-
-        if (fault == NoFault) {
-            %(op_wb)s;
-        }
-
-        return fault;
-    }
-}};
-
-def template AtomicMemOpLoadInitiateAcc {{
-    Fault %(class_name)s::%(class_name)sLoad::initiateAcc(ExecContext *xc,
+def template LoadReservedInitiateAcc {{
+    Fault
+    %(class_name)s::%(class_name)sMicro::initiateAcc(ExecContext *xc,
         Trace::InstRecord *traceData) const
     {
         Addr EA;
@@ -231,9 +329,10 @@
     }
 }};
 
-def template AtomicMemOpStoreInitiateAcc {{
-    Fault %(class_name)s::%(class_name)sStore::initiateAcc(
-        ExecContext *xc, Trace::InstRecord *traceData) const
+def template StoreCondInitiateAcc {{
+    Fault
+    %(class_name)s::%(class_name)sMicro::initiateAcc(ExecContext *xc,
+        Trace::InstRecord *traceData) const
     {
         Addr EA;
         Fault fault = NoFault;
@@ -243,12 +342,62 @@
         %(ea_code)s;
 
         if (fault == NoFault) {
-            %(code)s;
+            %(memacc_code)s;
         }
 
         if (fault == NoFault) {
-            fault = writeMemTiming(xc, traceData, Mem, EA, memAccessFlags,
-                nullptr);
+            fault = writeMemTiming(xc, traceData, Mem, EA,
+                memAccessFlags, nullptr);
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template AtomicMemOpRMWInitiateAcc {{
+    Fault
+    %(class_name)s::%(class_name)sRMW::initiateAcc(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+        %(amoop_code)s;
+
+        assert(amo_op);
+
+        if (fault == NoFault) {
+            fault = initiateMemAMO(xc, traceData, EA, Mem, memAccessFlags,
+                                   amo_op);
+        }
+
+        return fault;
+    }
+}};
+
+// completeAcc() templates
+
+def template LoadReservedCompleteAcc {{
+    Fault
+    %(class_name)s::%(class_name)sMicro::completeAcc(PacketPtr pkt,
+        ExecContext *xc, Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        getMem(pkt, Mem, traceData);
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
         }
 
         if (fault == NoFault) {
@@ -260,8 +409,8 @@
 }};
 
 def template StoreCondCompleteAcc {{
-    Fault %(class_name)s::completeAcc(Packet *pkt, ExecContext *xc,
-        Trace::InstRecord *traceData) const
+    Fault %(class_name)s::%(class_name)sMicro::completeAcc(Packet *pkt,
+          ExecContext *xc, Trace::InstRecord *traceData) const
     {
         Fault fault = NoFault;
 
@@ -283,8 +432,8 @@
     }
 }};
 
-def template AtomicMemOpLoadCompleteAcc {{
-    Fault %(class_name)s::%(class_name)sLoad::completeAcc(PacketPtr pkt,
+def template AtomicMemOpRMWCompleteAcc {{
+    Fault %(class_name)s::%(class_name)sRMW::completeAcc(Packet *pkt,
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
         Fault fault = NoFault;
@@ -295,7 +444,7 @@
         getMem(pkt, Mem, traceData);
 
         if (fault == NoFault) {
-            %(code)s;
+            %(memacc_code)s;
         }
 
         if (fault == NoFault) {
@@ -306,16 +455,20 @@
     }
 }};
 
-def template AtomicMemOpStoreCompleteAcc {{
-    Fault %(class_name)s::%(class_name)sStore::completeAcc(PacketPtr pkt,
-        ExecContext *xc, Trace::InstRecord *traceData) const
-    {
-        return NoFault;
-    }
-}};
+// LR/SC/AMO decode formats
 
 def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
         mem_flags=[], inst_flags=[]) {{
+    macro_ea_code = ''
+    macro_inst_flags = []
+    macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code,
+                              macro_inst_flags)
+    header_output = LRSCDeclare.subst(macro_iop)
+    decoder_output = LRSCMacroConstructor.subst(macro_iop)
+    decode_block = BasicDecode.subst(macro_iop)
+
+    exec_output = ''
+
     mem_flags = makeList(mem_flags)
     inst_flags = makeList(inst_flags)
     iop = InstObjParams(name, Name, 'LoadReserved',
@@ -324,16 +477,25 @@
     iop.constructor += '\n\tmemAccessFlags = memAccessFlags | ' + \
         '|'.join(['Request::%s' % flag for flag in mem_flags]) + ';'
 
-    header_output = LoadStoreDeclare.subst(iop)
-    decoder_output = LRSCConstructor.subst(iop)
-    decode_block = BasicDecode.subst(iop)
-    exec_output = LoadExecute.subst(iop) \
-        + LoadInitiateAcc.subst(iop) \
-        + LoadCompleteAcc.subst(iop)
+    decoder_output += LRSCMicroConstructor.subst(iop)
+    decode_block += BasicDecode.subst(iop)
+    exec_output += LoadReservedExecute.subst(iop) \
+        + LoadReservedInitiateAcc.subst(iop) \
+        + LoadReservedCompleteAcc.subst(iop)
 }};
 
 def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
         mem_flags=[], inst_flags=[]) {{
+    macro_ea_code = ''
+    macro_inst_flags = []
+    macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code,
+                              macro_inst_flags)
+    header_output = LRSCDeclare.subst(macro_iop)
+    decoder_output = LRSCMacroConstructor.subst(macro_iop)
+    decode_block = BasicDecode.subst(macro_iop)
+
+    exec_output = ''
+
     mem_flags = makeList(mem_flags)
     inst_flags = makeList(inst_flags)
     iop = InstObjParams(name, Name, 'StoreCond',
@@ -342,37 +504,40 @@
     iop.constructor += '\n\tmemAccessFlags = memAccessFlags | ' + \
         '|'.join(['Request::%s' % flag for flag in mem_flags]) + ';'
 
-    header_output = LoadStoreDeclare.subst(iop)
-    decoder_output = LRSCConstructor.subst(iop)
-    decode_block = BasicDecode.subst(iop)
-    exec_output = StoreCondExecute.subst(iop) \
-        + StoreInitiateAcc.subst(iop) \
+    decoder_output += LRSCMicroConstructor.subst(iop)
+    decode_block += BasicDecode.subst(iop)
+    exec_output += StoreCondExecute.subst(iop) \
+        + StoreCondInitiateAcc.subst(iop) \
         + StoreCondCompleteAcc.subst(iop)
 }};
 
-def format AtomicMemOp(load_code, store_code, ea_code, load_flags=[],
-        store_flags=[], inst_flags=[]) {{
-    macro_iop = InstObjParams(name, Name, 'AtomicMemOp', ea_code, inst_flags)
+def format AtomicMemOp(memacc_code, amoop_code, postacc_code={{ }},
+        ea_code={{EA = Rs1;}}, mem_flags=[], inst_flags=[]) {{
+    macro_ea_code = ''
+    macro_inst_flags = []
+    macro_iop = InstObjParams(name, Name, 'AtomicMemOp', macro_ea_code,
+                              macro_inst_flags)
     header_output = AtomicMemOpDeclare.subst(macro_iop)
     decoder_output = AtomicMemOpMacroConstructor.subst(macro_iop)
     decode_block = BasicDecode.subst(macro_iop)
+
     exec_output = ''
 
-    load_inst_flags = makeList(inst_flags) + ["IsMemRef", "IsLoad"]
-    load_iop = InstObjParams(name, Name, 'AtomicMemOpMicro',
-        {'ea_code': ea_code, 'code': load_code, 'op_name': 'Load'},
-        load_inst_flags)
-    decoder_output += AtomicMemOpLoadConstructor.subst(load_iop)
-    exec_output += AtomicMemOpLoadExecute.subst(load_iop) \
-        + AtomicMemOpLoadInitiateAcc.subst(load_iop) \
-        + AtomicMemOpLoadCompleteAcc.subst(load_iop)
+    rmw_mem_flags = makeList(mem_flags)
+    rmw_inst_flags = makeList(inst_flags)
+    rmw_iop = InstObjParams(name, Name, 'AtomicMemOpMicro',
+                            {'ea_code': ea_code,
+                             'memacc_code': memacc_code,
+                             'postacc_code': postacc_code,
+                             'amoop_code': amoop_code},
+                            rmw_inst_flags)
 
-    store_inst_flags = makeList(inst_flags) + ["IsMemRef", "IsStore"]
-    store_iop = InstObjParams(name, Name, 'AtomicMemOpMicro',
-        {'ea_code': ea_code, 'code': store_code, 'op_name': 'Store'},
-        store_inst_flags)
-    decoder_output += AtomicMemOpStoreConstructor.subst(store_iop)
-    exec_output += AtomicMemOpStoreExecute.subst(store_iop) \
-        + AtomicMemOpStoreInitiateAcc.subst(store_iop) \
-        + AtomicMemOpStoreCompleteAcc.subst(store_iop)
+    rmw_iop.constructor += '\n\tmemAccessFlags = memAccessFlags | ' + \
+          '|'.join(['Request::%s' % flag for flag in rmw_mem_flags]) + ';'
+
+    decoder_output += AtomicMemOpRMWConstructor.subst(rmw_iop)
+    decode_block += BasicDecode.subst(rmw_iop)
+    exec_output += AtomicMemOpRMWExecute.subst(rmw_iop) \
+                 + AtomicMemOpRMWInitiateAcc.subst(rmw_iop) \
+                 + AtomicMemOpRMWCompleteAcc.subst(rmw_iop)
 }};
diff --git a/src/arch/riscv/locked_mem.cc b/src/arch/riscv/locked_mem.cc
index 3c8dbe9..957cffb 100644
--- a/src/arch/riscv/locked_mem.cc
+++ b/src/arch/riscv/locked_mem.cc
@@ -6,7 +6,5 @@
 
 namespace RiscvISA
 {
-
-std::stack<Addr> locked_addrs;
-
+    std::unordered_map<int, std::stack<Addr>> locked_addrs;
 }
diff --git a/src/arch/riscv/locked_mem.hh b/src/arch/riscv/locked_mem.hh
index b1cde34..08d27f1 100644
--- a/src/arch/riscv/locked_mem.hh
+++ b/src/arch/riscv/locked_mem.hh
@@ -49,6 +49,7 @@
 #define __ARCH_RISCV_LOCKED_MEM_HH__
 
 #include <stack>
+#include <unordered_map>
 
 #include "arch/registers.hh"
 #include "base/logging.hh"
@@ -67,24 +68,28 @@
 
 // RISC-V allows multiple locks per hart, but each SC has to unlock the most
 // recent one, so we use a stack here.
-extern std::stack<Addr> locked_addrs;
+extern std::unordered_map<int, std::stack<Addr>> locked_addrs;
 
 template <class XC> inline void
 handleLockedSnoop(XC *xc, PacketPtr pkt, Addr cacheBlockMask)
 {
-    if (locked_addrs.empty())
+    std::stack<Addr>& locked_addr_stack = locked_addrs[xc->contextId()];
+
+    if (locked_addr_stack.empty())
         return;
     Addr snoop_addr = pkt->getAddr() & cacheBlockMask;
     DPRINTF(LLSC, "Locked snoop on address %x.\n", snoop_addr);
-    if ((locked_addrs.top() & cacheBlockMask) == snoop_addr)
-        locked_addrs.pop();
+    if ((locked_addr_stack.top() & cacheBlockMask) == snoop_addr)
+        locked_addr_stack.pop();
 }
 
 
 template <class XC> inline void
 handleLockedRead(XC *xc, const RequestPtr &req)
 {
-    locked_addrs.push(req->getPaddr() & ~0xF);
+    std::stack<Addr>& locked_addr_stack = locked_addrs[xc->contextId()];
+
+    locked_addr_stack.push(req->getPaddr() & ~0xF);
     DPRINTF(LLSC, "[cid:%d]: Reserved address %x.\n",
             req->contextId(), req->getPaddr() & ~0xF);
 }
@@ -96,21 +101,23 @@
 template <class XC> inline bool
 handleLockedWrite(XC *xc, const RequestPtr &req, Addr cacheBlockMask)
 {
+    std::stack<Addr>& locked_addr_stack = locked_addrs[xc->contextId()];
+
     // Normally RISC-V uses zero to indicate success and nonzero to indicate
     // failure (right now only 1 is reserved), but in gem5 zero indicates
     // failure and one indicates success, so here we conform to that (it should
     // be switched in the instruction's implementation)
 
     DPRINTF(LLSC, "[cid:%d]: locked_addrs empty? %s.\n", req->contextId(),
-            locked_addrs.empty() ? "yes" : "no");
-    if (!locked_addrs.empty()) {
+            locked_addr_stack.empty() ? "yes" : "no");
+    if (!locked_addr_stack.empty()) {
         DPRINTF(LLSC, "[cid:%d]: addr = %x.\n", req->contextId(),
                 req->getPaddr() & ~0xF);
         DPRINTF(LLSC, "[cid:%d]: last locked addr = %x.\n", req->contextId(),
-                locked_addrs.top());
+                locked_addr_stack.top());
     }
-    if (locked_addrs.empty()
-            || locked_addrs.top() != ((req->getPaddr() & ~0xF))) {
+    if (locked_addr_stack.empty()
+            || locked_addr_stack.top() != ((req->getPaddr() & ~0xF))) {
         req->setExtraData(0);
         int stCondFailures = xc->readStCondFailures();
         xc->setStCondFailures(++stCondFailures);