arch-arm: Adding CAS/CASP AMO instr including new TypedAtomic func

CAS/CASP atomic instruction implementation
This change includes:
+ Instructions decode
+ new amo64.isa file where CAS/CASP main functional code is implemented
+ mem64.isa includes the Execute/InitiateAcc/CompleteAcc skeletons,
constructor and declaration templates
+ Added TypedAtomic function for pair register CASP instruction

Change-Id: I4a4acdec4ab1c8b888f10ef5dc1e896be8c432bf
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19811
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Giacomo Travaglini <giacomo.travaglini@arm.com>
diff --git a/src/arch/arm/insts/mem64.hh b/src/arch/arm/insts/mem64.hh
index 34eb43f..4fbbe77 100644
--- a/src/arch/arm/insts/mem64.hh
+++ b/src/arch/arm/insts/mem64.hh
@@ -306,6 +306,41 @@
     std::function<void(T*, T, T)> op;
 };
 
+/**
+ * Atomic functor for operations on a pair of elements of type T.
+ *
+ * The functor keeps private copies of two operand pairs; execute()
+ * forwards them with the memory pointer to the user supplied operation.
+ * The CASP lambda in amo64.isa reads and writes b[0] and b[1] through
+ * that pointer.
+ */
+template<typename T>
+class AtomicGenericPair3Op : public TypedAtomicOpFunctor<T>
+{
+  public:
+    /**
+     * @param _a pair copied into the functor, passed to _op by reference
+     * @param _c pair copied into the functor, passed to _op by value
+     * @param _op operation, invoked by execute() as _op(b, a, c)
+     */
+    AtomicGenericPair3Op(const std::array<T, 2>& _a, std::array<T, 2> _c,
+           std::function<void(T*, std::array<T, 2>&, std::array<T, 2>)> _op)
+        : a(_a), c(_c), op(_op)
+    {}
+    AtomicOpFunctor* clone() override
+    {
+        return new AtomicGenericPair3Op<T>(*this);
+    }
+    void execute(T* b) override
+    {
+        op(b, a, c);
+    }
+  private:
+    std::array<T, 2> a;
+    std::array<T, 2> c;
+    std::function<void(T*, std::array<T, 2>&, std::array<T, 2>)> op;
+};
+
 }
 
 #endif //__ARCH_ARM_INSTS_MEM_HH__
diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa
index 82770eb..a9e645e 100644
--- a/src/arch/arm/isa/formats/aarch64.isa
+++ b/src/arch/arm/isa/formats/aarch64.isa
@@ -587,8 +587,9 @@
                   case 0x2:
                     switch (size) {
                       case 0x0:
+                        return new CASP32(machInst, rt, rnsp, rs);
                       case 0x1:
-                        return new Unknown64(machInst);
+                        return new CASP64(machInst, rt, rnsp, rs);
                       case 0x2:
                         return new STXPW64(machInst, rs, rt, rt2, rnsp);
                       case 0x3:
@@ -600,8 +601,9 @@
                   case 0x3:
                     switch (size) {
                       case 0x0:
+                        return new CASPL32(machInst, rt, rnsp, rs);
                       case 0x1:
-                        return new Unknown64(machInst);
+                        return new CASPL64(machInst, rt, rnsp, rs);
                       case 0x2:
                         return new STLXPW64(machInst, rs, rt, rt2, rnsp);
                       case 0x3:
@@ -639,8 +641,9 @@
                   case 0x6:
                     switch (size) {
                       case 0x0:
+                        return new CASPA32(machInst, rt, rnsp, rs);
                       case 0x1:
-                        return new Unknown64(machInst);
+                        return new CASPA64(machInst, rt, rnsp, rs);
                       case 0x2:
                         return new LDXPW64(machInst, rt, rt2, rnsp);
                       case 0x3:
@@ -648,12 +651,12 @@
                       default:
                         M5_UNREACHABLE;
                     }
-
                   case 0x7:
                     switch (size) {
                       case 0x0:
+                        return new CASPAL32(machInst, rt, rnsp, rs);
                       case 0x1:
-                        return new Unknown64(machInst);
+                        return new CASPAL64(machInst, rt, rnsp, rs);
                       case 0x2:
                         return new LDAXPW64(machInst, rt, rt2, rnsp);
                       case 0x3:
@@ -661,7 +664,6 @@
                       default:
                         M5_UNREACHABLE;
                     }
-
                   case 0x9:
                     switch (size) {
                       case 0x0:
@@ -675,6 +677,32 @@
                       default:
                         M5_UNREACHABLE;
                     }
+                  case 0xa:
+                    switch (size) {
+                      case 0x0:
+                        return new CASB(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new CASH(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new CAS32(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new CAS64(machInst, rt, rnsp, rs);
+                      default:
+                        M5_UNREACHABLE;
+                    }
+                  case 0xb:
+                    switch (size) {
+                      case 0x0:
+                        return new CASLB(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new CASLH(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new CASL32(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new CASL64(machInst, rt, rnsp, rs);
+                      default:
+                        M5_UNREACHABLE;
+                    }
                   case 0xd:
                     switch (size) {
                       case 0x0:
@@ -688,6 +716,32 @@
                       default:
                         M5_UNREACHABLE;
                     }
+                  case 0xe:
+                    switch (size) {
+                      case 0x0:
+                        return new CASAB(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new CASAH(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new CASA32(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new CASA64(machInst, rt, rnsp, rs);
+                      default:
+                        M5_UNREACHABLE;
+                    }
+                  case 0xf:
+                    switch (size) {
+                      case 0x0:
+                        return new CASALB(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new CASALH(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new CASAL32(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new CASAL64(machInst, rt, rnsp, rs);
+                      default:
+                        M5_UNREACHABLE;
+                    }
                   default:
                     return new Unknown64(machInst);
                 }
diff --git a/src/arch/arm/isa/insts/amo64.isa b/src/arch/arm/isa/insts/amo64.isa
new file mode 100644
index 0000000..70604e9
--- /dev/null
+++ b/src/arch/arm/isa/insts/amo64.isa
@@ -0,0 +1,401 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2018 Metempsy Technology Consulting
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Jordi Vaquero
+
+let {{
+
+    import math
+    # C++ statements used as the body of the atomic lambda (T* b, T a, T c).
+    OP_DICT = { "CAS" : 'if (a == *b){*b = c;}',
+                "SWP" : '*b = c;',
+                "ADD" : '*b += c;',
+                "EOR" : '*b ^= c;',
+                "CLR" : '*b &= ~c;',
+                "SET" : '*b |= c;',
+                "MAX" : '*b = std::max(*b, c);',
+                "MIN" : '*b = std::min(*b, c);', }
+    # All-ones masks; each key is the mask width in bytes.
+    MASKS = { 1: 0xFF,
+              2: 0xFFFF,
+              4: 0xFFFFFFFF,
+              8: 0xFFFFFFFFFFFFFFFF,
+             }
+    # Generated code accumulates here; see AtomicInst64.emitHelper.
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    class AtomicInst64(LoadStoreInst):
+        execBase = 'AtomicInst64'
+        micro = False
+
+        def __init__(self, mnem, Name, size=4, user=False, flavor="normal",
+                    unsign=True, top = False, paired=False):
+            super(AtomicInst64, self).__init__()
+
+            self.name= mnem
+            self.Name = Name
+            self.size = size
+            self.user = user
+            self.flavor = flavor
+            self.unsign = unsign
+            self.top = top
+            self.paired = paired
+
+            self.memFlags = ["ArmISA::TLB::MustBeOne"]
+            self.instFlags = ["IsAtomic"]
+            self.codeBlobs = { "postacc_code" : "" }
+            self.codeBlobs['usrDecl'] = ""
+
+            # Add memory request flags where necessary
+            if self.user:
+                self.memFlags.append("ArmISA::TLB::UserMode")
+
+            sz = self.size*2 if paired else self.size
+            self.memFlags.append("%d" % int(math.log(sz, 2)))
+
+            if self.micro:
+                self.instFlags.append("IsMicroop")
+
+            if self.flavor in ("release", "acquire_release", "acquire"):
+                self.instFlags.append("IsMemBarrier")
+            if self.flavor in ("release", "acquire_release"):
+                self.instFlags.append("IsWriteBarrier")
+            if self.flavor in ("acquire_release", "acquire"):
+                self.instFlags.append("IsReadBarrier")
+            self.memFlags.append('Request::ATOMIC_RETURN_OP')
+
+        def emitHelper(self, base = 'Memory64', wbDecl = None, ):
+            global header_output, decoder_output, exec_output
+
+            # If this is a microop itself, don't allow anything that would
+            # require further microcoding.
+            if self.micro:
+                assert not wbDecl
+
+            fa_code = None
+            if not self.micro :
+            #and self.flavor in ("normal", "release"):
+                fa_code = '''
+                    fault->annotate(ArmFault::SAS, %s);
+                    fault->annotate(ArmFault::SSE, false);
+                    fault->annotate(ArmFault::SRT, dest);
+                    fault->annotate(ArmFault::SF, %s);
+                    fault->annotate(ArmFault::AR, %s);
+                ''' % ("0" if self.size == 1 else
+                       "1" if self.size == 2 else
+                       "2" if self.size == 4 else "3",
+                       "true" if self.size == 8 else "false",
+                       "true" if self.flavor != "normal"  else "false")
+            sas_code = "3"
+            if self.size == 1 :
+                sas_code = "0"
+            elif self.size == 2:
+                sas_code = "1"
+            elif self.size == 4:
+                sas_code = "2"
+
+            if self.paired and sas_code == "3":
+                sas_code = "4"
+            if self.paired and sas_code == "2":
+                sas_code = "3"
+
+
+            fa_code = '''
+               fault->annotate(ArmFault::SAS, %s);
+               fault->annotate(ArmFault::SSE, %s);
+               fault->annotate(ArmFault::SRT, dest);
+               fault->annotate(ArmFault::SF, %s);
+               fault->annotate(ArmFault::AR, %s);
+               ''' % (sas_code,
+                     "true" if not self.unsign else "false",
+                     "true" if self.size == 8 else "false",
+                     "true" if self.flavor != "normal"  else "false")
+
+            (newHeader, newDecoder, newExec) = \
+                self.fillTemplates(self.name, self.Name, self.codeBlobs,
+                                   self.memFlags, self.instFlags,
+                                   base, wbDecl, faCode=fa_code)
+
+            header_output += newHeader
+            decoder_output += newDecoder
+            exec_output += newExec
+
+        def buildEACode(self):
+            # Address computation
+            eaCode = SPAlignmentCheckCode + "EA = XBase"
+            if self.size == 16:
+                if self.top:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 0 : 8)"
+                else:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 8 : 0)"
+            if not self.post:
+                eaCode += self.offset
+            eaCode += ";"
+            self.codeBlobs["ea_code"] = eaCode
+
+
+    class AtomicSingleOp(AtomicInst64):
+        decConstBase = 'AmoOp'
+        base = 'ArmISA::MemoryEx64'
+        writeback = True
+        post = False
+        execBase = 'AmoOp'
+
+        def __init__(self, *args, **kargs):
+            super(AtomicSingleOp, self).__init__(*args, **kargs)
+            self.suffix = buildMemSuffix(not self.unsign, self.size)
+            if self.size == 8:
+                self.res = 'XResult_ud' #if self.unsign else 'XResult_sd'
+                self.des = 'XDest_ud' #if self.unsign else 'XDest_sd'
+                self.tp = 'uint64_t' if self.unsign else 'int64_t'
+                self.utp = 'uint64_t'
+                self.suffix = '_sd' if not self.unsign else '_ud'
+            elif self.size == 4:
+                self.res = 'XResult_uw' #if self.unsign else 'XResult_sw'
+                self.des = 'XDest_uw' #if self.unsign else 'XDest_sw'
+                self.tp = 'uint32_t' if self.unsign else 'int32_t'
+                self.utp = 'uint32_t'
+            elif self.size == 2:
+                self.res = 'XResult_uh' #if self.unsign else 'XResult_sh'
+                self.des = 'XDest_uh' #if self.unsign else 'XDest_sh'
+                self.tp = 'uint16_t' if self.unsign else 'int16_t'
+                self.utp = 'uint16_t'
+            elif self.size == 1:
+                self.res = 'XResult_ub' #if self.unsign else 'XResult_sb'
+                self.des = 'XDest_ub' #if self.unsign else 'XDest_sb'
+                self.tp = 'uint8_t' if self.unsign else 'int8_t'
+                self.utp = 'uint8_t'
+            self.offset = ""
+            store_res = '''
+                        %(result)s = cSwap(Mem%(suffix)s,
+                                         isBigEndian64(xc->tcBase()));
+                      '''
+            store_res = store_res % {"result":self.res, "suffix":self.suffix}
+            self.codeBlobs["postacc_code"] = \
+                    store_res + " SevMailbox = 1; LLSCLock = 0;"
+
+        def emit(self, op):
+            self.buildEACode()
+            usrDecl = "%(type)s valRs;\n" % {'type': self.tp}
+            self.codeBlobs['usrDecl'] = usrDecl
+
+            opcode = "valRs = cSwap(%(dest)s,"\
+                      " isBigEndian64(xc->tcBase()));\n"
+            opcode += "TypedAtomicOpFunctor<%(type)s> *amo_op = "\
+                      "new AtomicGeneric3Op<%(type)s>(Mem%(suffix)s,"\
+                      " valRs, [](%(type)s* b, %(type)s a,"\
+                      " %(type)s c){ %(op)s });\n"
+
+            opcode = opcode % {"suffix": self.suffix,
+                               "type": self.tp ,
+                               "dest": self.des,
+                               "op": op}
+            self.codeBlobs['amo_code'] = opcode
+            accCode = "Mem%(suffix)s = cSwap(%(result)s,"\
+                      " isBigEndian64(xc->tcBase()));"
+            accCode = accCode % { "result": self.res, "type":self.tp,
+                                  "suffix": self.suffix}
+            self.codeBlobs["memacc_code"] = accCode
+            self.emitHelper(self.base)
+
+    # CAS on 8-byte data: plain, acquire, acquire-release and release.
+    AtomicSingleOp("cas",   "CAS64",   8, unsign=True,
+                   flavor="normal").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casa",  "CASA64",  8, unsign=True,
+                   flavor="acquire").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casal", "CASAL64", 8, unsign=True,
+                   flavor="acquire_release").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casl",  "CASL64",  8, unsign=True,
+                   flavor="release").emit(OP_DICT['CAS'])
+    # CASB: the same four orderings on 1-byte data.
+    AtomicSingleOp("casb",   "CASB",   1, unsign=True,
+                   flavor="normal").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casab",  "CASAB",  1, unsign=True,
+                   flavor="acquire").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casalb", "CASALB", 1, unsign=True,
+                   flavor="acquire_release").emit(OP_DICT['CAS'])
+    AtomicSingleOp("caslb",  "CASLB",  1, unsign=True,
+                   flavor="release").emit(OP_DICT['CAS'])
+    # CASH: the same four orderings on 2-byte data.
+    AtomicSingleOp("cash",   "CASH",   2, unsign=True,
+                   flavor="normal").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casah",  "CASAH",  2, unsign=True,
+                   flavor="acquire").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casalh", "CASALH", 2, unsign=True,
+                   flavor="acquire_release").emit(OP_DICT['CAS'])
+    AtomicSingleOp("caslh",  "CASLH",  2, unsign=True,
+                   flavor="release").emit(OP_DICT['CAS'])
+    # CAS on 4-byte data: the same four orderings.
+    AtomicSingleOp("cas",   "CAS32",   4, unsign=True,
+                   flavor="normal").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casa",  "CASA32",  4, unsign=True,
+                   flavor="acquire").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casal", "CASAL32", 4, unsign=True,
+                   flavor="acquire_release").emit(OP_DICT['CAS'])
+    AtomicSingleOp("casl",  "CASL32",  4, unsign=True,
+                   flavor="release").emit(OP_DICT['CAS'])
+
+    class CasPair64(AtomicInst64):
+        decConstBase = 'AmoPairOp'
+        base = 'ArmISA::MemoryEx64'
+        writeback = True
+        post = False
+        execBase = 'AmoOp'
+        # CASP: compare and swap on pairs of registers; 'paired' is forced on.
+        def __init__(self, *args, **kargs):
+            super(CasPair64, self).__init__(*args, **kargs)
+            self.paired = True
+            self.offset = ""
+            if self.size == 8:
+                self.res = 'XResult_ud'
+                self.des = 'XDest_ud'
+                self.tp = 'std::array<uint64_t, 2>'
+                self.suffix = "_tud"
+                store_res = '''
+                            %(result)s = cSwap(Mem%(suffix)s[0],
+                                          isBigEndian64(xc->tcBase()));
+                            uint64_t result2 = cSwap(Mem%(suffix)s[1],
+                                           isBigEndian64(xc->tcBase()));
+                            xc->setIntRegOperand(this, r2_dst, (result2)
+                                                    & mask(aarch64 ? 64 : 32));
+                            '''
+            elif self.size == 4:
+                self.res = 'Result_uw'
+                self.des = 'WDest_uw'
+                self.tp = 'uint64_t'
+                self.suffix = "_ud"
+                store_res = '''
+                    uint64_t data = cSwap(Mem%(suffix)s,
+                                          isBigEndian64(xc->tcBase()));
+                    %(result)s = isBigEndian64(xc->tcBase())
+                                   ? (data >> 32)
+                                   : (uint32_t)data;
+                    uint32_t result2 = isBigEndian64(xc->tcBase())
+                                   ? (uint32_t)data
+                                   : (data >> 32);
+                    xc->setIntRegOperand(this, r2_dst, (result2) &
+                                                mask(aarch64 ? 64 : 32));
+                            '''
+            # Either way the second result is written through r2_dst.
+            store_res = store_res % {"result":self.res, "suffix":self.suffix}
+            usrDecl = "%(type)s valRs;\n" % {'type': self.tp}
+            self.codeBlobs['usrDecl'] = usrDecl
+            self.codeBlobs["postacc_code"] = \
+                    store_res + " SevMailbox = 1; LLSCLock = 0;"
+
+        def emit(self):
+            self.buildEACode()
+            # memacc code: read the second register of each pair, then
+            # build Mem from the Result pair and valRs from the Dest pair.
+            # amo_code then hands both to the atomic functor.
+            if self.size == 4:
+                accCode = \
+                  "uint32_t result2 = ((xc->readIntRegOperand(this, r2_src))"\
+                  " & mask(aarch64 ? 64 : 32)) ;\n"\
+                  " uint32_t dest2 = ((xc->readIntRegOperand(this, d2_src)) "\
+                  " & mask(aarch64 ? 64 : 32)) ;"
+                accCode += '''
+                     uint64_t data = dest2;
+                     data = isBigEndian64(xc->tcBase())
+                          ? ((uint64_t(WDest_uw) << 32) | data)
+                                 : ((data << 32) | WDest_uw);
+                     valRs = cSwap(data, isBigEndian64(xc->tcBase()));
+                     uint64_t data2 = result2 ;
+                     data2 = isBigEndian64(xc->tcBase())
+                          ? ((uint64_t(Result_uw) << 32) | data2)
+                                 : ((data2 << 32) | Result_uw);
+                     Mem_ud = cSwap(data2, isBigEndian64(xc->tcBase()));
+                     '''
+                # Pair packed in one uint64_t: reuse the scalar CAS op.
+                opcode = "TypedAtomicOpFunctor<%(type)s> *amo_op = "\
+                      "new AtomicGeneric3Op<%(type)s>(Mem%(suffix)s,"\
+                      " valRs, [](%(type)s* b, %(type)s a,"\
+                      " %(type)s c){ %(op)s });\n"
+
+            elif self.size == 8:
+                accCode = ""\
+                  "uint64_t result2 = ((xc->readIntRegOperand(this, r2_src))"\
+                  " & mask(aarch64 ? 64 : 32)) ;\n"\
+                  " uint64_t dest2 = ((xc->readIntRegOperand(this, d2_src)) "\
+                  " & mask(aarch64 ? 64 : 32)) ;"
+                accCode += '''
+                   // This temporary needs to be here so that the parser
+                   // will correctly identify this instruction as a store.
+                   std::array<uint64_t, 2> temp;
+                   temp[0] = cSwap(XDest_ud,isBigEndian64(xc->tcBase()));
+                   temp[1] = cSwap(dest2,isBigEndian64(xc->tcBase()));
+                   valRs = temp;
+                   std::array<uint64_t, 2> temp2;
+                   temp2[0] = cSwap(XResult_ud,isBigEndian64(xc->tcBase()));
+                   temp2[1] = cSwap(result2,isBigEndian64(xc->tcBase()));
+                   Mem_tud = temp2;
+                     '''
+                # 2 x 64 bits: the lambda compares and swaps both elements.
+                opcode = "TypedAtomicOpFunctor<uint64_t> *amo_op = "\
+                          "new AtomicGenericPair3Op<uint64_t>(Mem_tud, "\
+                          "valRs, [](uint64_t* b, std::array<uint64_t,2> a,"\
+                          '''
+                          std::array<uint64_t,2> c){
+                             if(a[0]==b[0] && a[1]==b[1]){
+                                b[0] = c[0]; b[1] = c[1];
+                             }
+                          });'''
+
+            opcode = opcode % { "suffix" : self.suffix,
+                                "type": self.tp,
+                                "op": OP_DICT['CAS']}
+            self.codeBlobs['amo_code'] = opcode
+            self.codeBlobs["memacc_code"] = accCode % {"type": self.tp}
+
+            # Push it out to the output files
+            self.emitHelper(self.base)
+    # CASP on pairs of 8-byte elements, one instruction per ordering.
+    CasPair64("casp",   "CASP64",   8, flavor="normal", paired=True).emit()
+    CasPair64("caspa",  "CASPA64",  8, flavor="acquire", paired=True).emit()
+    CasPair64("caspal", "CASPAL64", 8, flavor="acquire_release",
+            paired=True).emit()
+    CasPair64("caspl",  "CASPL64",  8, flavor="release", paired=True).emit()
+    # CASP on pairs of 4-byte elements, one instruction per ordering.
+    CasPair64("casp",   "CASP32",   4, flavor="normal", paired=True).emit()
+    CasPair64("caspa",  "CASPA32",  4, flavor="acquire", paired=True).emit()
+    CasPair64("caspal", "CASPAL32", 4, flavor="acquire_release",
+            paired=True).emit()
+    CasPair64("caspl",  "CASPL32",  4, flavor="release", paired=True).emit()
+
+}};
diff --git a/src/arch/arm/isa/insts/insts.isa b/src/arch/arm/isa/insts/insts.isa
index a1b35ef..45159e3 100644
--- a/src/arch/arm/isa/insts/insts.isa
+++ b/src/arch/arm/isa/insts/insts.isa
@@ -43,6 +43,9 @@
 //Useful bits shared by memory instructions
 ##include "mem.isa"
 
+//AArch64 atomic operations
+##include "amo64.isa"
+
 //Loads of a single item
 ##include "ldr.isa"
 
diff --git a/src/arch/arm/isa/templates/mem64.isa b/src/arch/arm/isa/templates/mem64.isa
index fd79669..dc8e0c5 100644
--- a/src/arch/arm/isa/templates/mem64.isa
+++ b/src/arch/arm/isa/templates/mem64.isa
@@ -729,3 +729,212 @@
         setExcAcRel(exclusive, acrel);
     }
 }};
+
+// Atomic operations in memory
+// execute(): sets up the operands and performs the access via amoMemAtomic.
+def template AmoOpExecute {{
+    Fault %(class_name)s::execute(ExecContext *xc,
+                                  Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        %(usrDecl)s;
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+        // NOTE(review): amo_op is built even if fault is set; confirm no leak
+        %(amo_code)s
+        assert(amo_op);
+
+        if (fault == NoFault) {
+            fault = amoMemAtomic(xc, traceData, Mem, EA,
+                memAccessFlags, amo_op);
+        }
+
+        if (fault == NoFault) {
+            %(postacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+         return fault;
+    }
+}};
+// initiateAcc(): sets up the operands and issues initiateMemAMO.
+def template AmoOpInitiateAcc {{
+    Fault %(class_name)s::initiateAcc(ExecContext *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+        %(usrDecl)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        %(amo_code)s;
+
+        assert(amo_op);
+        if (fault == NoFault) {
+            fault = initiateMemAMO(xc, traceData, EA, Mem, memAccessFlags,
+                                   amo_op);
+        }
+
+        return fault;
+     }
+}};
+// completeAcc(): takes the data from the packet and writes the results back.
+def template AmoOpCompleteAcc {{
+    Fault %(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+         Fault fault = NoFault;
+
+         %(op_decl)s;
+         %(op_rd)s;
+
+         // Read the access result from pkt into Mem (no predicate check here)
+        getMem(pkt, Mem, traceData);
+
+        if (fault == NoFault) {
+            %(postacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+         return fault;
+    }
+
+}};
+// Class declaration for atomic ops built from (dest, base, result).
+def template AmoOpDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _base, IntRegIndex _result);
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const override;
+        Fault completeAcc(PacketPtr, ExecContext *,
+                          Trace::InstRecord *) const override;
+
+        void
+        annotateFault(ArmFault *fault) override
+        {
+            %(fa_code)s
+        }
+    };
+}};
+
+// Constructor: forwards mnemonic, op class and registers to the base class.
+def template AmoOpConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _result)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                          _dest, _base, _result)
+    {
+        %(constructor)s;
+
+    }
+}};
+// Declaration for pair ops; adds operand slots for the second registers.
+def template AmoPairOpDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        uint32_t d2_src ;  // source operand slot of register dest + 1
+        uint32_t r2_src ;  // source operand slot of register result + 1
+        uint32_t r2_dst ;  // destination operand slot of register result + 1
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _base, IntRegIndex _result);
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const override;
+        Fault completeAcc(PacketPtr, ExecContext *,
+                          Trace::InstRecord *) const override;
+
+        void
+        annotateFault(ArmFault *fault) override
+        {
+            %(fa_code)s
+        }
+    };
+}};
+
+// Constructor for pair ops: registers the second register of each pair.
+def template AmoPairOpConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _result)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                          _dest,  _base, _result)
+    {
+        %(constructor)s;
+        // Second register of each pair: index of the first one plus one
+        uint32_t d2 = RegId(IntRegClass, dest).index() + 1 ;
+        uint32_t r2 = RegId(IntRegClass, result).index() + 1 ;
+        // Record the operand slots; the generated code reads them back
+        d2_src = _numSrcRegs ;
+        _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, d2);
+        r2_src = _numSrcRegs ;
+        _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, r2);
+        r2_dst = _numDestRegs ;
+        _destRegIdx[_numDestRegs++] = RegId(IntRegClass, r2);
+
+    }
+}};
+// Declaration for arithmetic atomics; isXZR is set by the constructor.
+def template AmoArithmeticOpDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        bool isXZR ;
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _base, IntRegIndex _result);
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const override;
+        Fault completeAcc(PacketPtr, ExecContext *,
+                          Trace::InstRecord *) const override;
+
+        void
+        annotateFault(ArmFault *fault) override
+        {
+            %(fa_code)s
+        }
+    };
+}};
+// Constructor: a dest index of 31 clears IsReadBarrier and sets isXZR.
+def template AmoArithmeticOpConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _result)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                          _dest,  _base, _result)
+    {
+        %(constructor)s;
+        isXZR = false;
+        uint32_t r2 = RegId(IntRegClass, dest).index() ;
+        if (r2 == 31){
+            flags[IsReadBarrier] = false;
+            isXZR = true;
+        }
+    }
+}};