arch-arm: Added LD/ST<op> atomic instruction family and SWP instrs

Add the LD<op>/ST<op> and SWP families of atomic instructions. The LD/ST
variants cover the operations ADD/CLR/EOR/SET/UMAX/UMIN/SMAX/SMIN.

This commit includes:
+ Instruction decode
+ Instruction functional code
+ New set of skeletons (templates) for the execute (Ex), completeAcc (Com)
  and initiateAcc (Ini) methods, the constructor and the declaration.

Change-Id: Ieea8d4256807e004d2f8aca8f421b3df8d76b116
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19812
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Giacomo Travaglini <giacomo.travaglini@arm.com>
diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa
index a9e645e..dbe7e28 100644
--- a/src/arch/arm/isa/formats/aarch64.isa
+++ b/src/arch/arm/isa/formats/aarch64.isa
@@ -541,6 +541,544 @@
 namespace Aarch64
 {
     StaticInstPtr
+    decodeAtomicArithOp(ExtMachInst machInst) // Maps machInst to one SWP*, ST<op>* or LD<op>* class, or Unknown64
+    {
+        uint8_t opc  = bits(machInst, 14, 12); // bits [14:12]: key of the outer switch (operation group)
+        uint8_t o3  = bits(machInst, 15); // bit 15: only read under opc == 0x0, where 1 selects the SWP* classes
+        uint8_t size_ar = bits(machInst, 23, 22)<<0 | bits(machInst, 31, 30)<<2; // 4-bit key: bits [31:30] on top, bits [23:22] below
+        IntRegIndex rt = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+        IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+        IntRegIndex rnsp = makeSP(rn); // rn passed through makeSP(); this is what every constructor below receives
+        IntRegIndex rs = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+        uint8_t  A_rt = bits(machInst, 4, 0)<<0 | bits(machInst, 23)<<5; // 6-bit key: rt field plus bit 23; 0x1f means bit 23 clear and rt field all ones
+
+        switch(opc) { // NOTE(review): o3 is ignored for opc != 0x0 - confirm the caller filters o3 == 1 there
+            case 0x0: // SWP* when o3 == 1, otherwise STADD* or LDADD*
+                switch(size_ar){
+                    case 0x0:
+                        if (o3 == 1)
+                            return new SWPB(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (o3 == 1)
+                            return new SWPLB(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        if (o3 == 1)
+                            return new SWPAB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        if (o3 == 1)
+                            return new SWPLAB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (o3 == 1)
+                            return new SWPH(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (o3 == 1)
+                            return new SWPLH(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        if (o3 == 1)
+                            return new SWPAH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        if (o3 == 1)
+                            return new SWPLAH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (o3 == 1)
+                            return new SWP(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADD(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADD(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (o3 == 1)
+                            return new SWPL(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        if (o3 == 1)
+                            return new SWPA(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        if (o3 == 1)
+                            return new SWPLA(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (o3 == 1)
+                            return new SWP64(machInst, rt, rnsp, rs);
+
+                        else if (A_rt == 0x1f)
+                            return new STADD64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADD64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (o3 == 1)
+                            return new SWPL64(machInst, rt, rnsp, rs);
+                        else if (A_rt == 0x1f)
+                            return new STADDL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        if (o3 == 1)
+                            return new SWPA64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        if (o3 == 1)
+                            return new SWPLA64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDADDLA64(machInst, rt, rnsp, rs);
+                } // size_ar is 4 bits and all 16 cases return, so control never falls into the next case
+            case 0x1: // STCLR* or LDCLR*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STCLRB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STCLRLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDCLRAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDCLRLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STCLRH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STCLRLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDCLRAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDCLRLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STCLR(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLR(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STCLRL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDCLRA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDCLRLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STCLR64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLR64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STCLRL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDCLRL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDCLRA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDCLRLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x2: // STEOR* or LDEOR*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STEORB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STEORLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDEORAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDEORLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STEORH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STEORLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDEORAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDEORLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STEOR(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEOR(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STEORL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDEORA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDEORLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STEOR64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEOR64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STEORL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDEORL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDEORA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDEORLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x3: // STSET* or LDSET*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STSETB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STSETLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDSETAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDSETLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STSETH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STSETLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDSETAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDSETLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STSET(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSET(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STSETL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDSETA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDSETLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STSET64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSET64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STSETL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSETL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDSETA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDSETLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x4: // STSMAX* or LDSMAX*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STSMAXB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STSMAXLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDSMAXAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDSMAXLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STSMAXH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STSMAXLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDSMAXAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDSMAXLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STSMAX(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAX(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STSMAXL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDSMAXA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDSMAXLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STSMAX64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAX64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STSMAXL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMAXL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDSMAXA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDSMAXLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x5: // STSMIN* or LDSMIN*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STSMINB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STSMINLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDSMINAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDSMINLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STSMINH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STSMINLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDSMINAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDSMINLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STSMIN(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMIN(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STSMINL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDSMINA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDSMINLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STSMIN64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMIN64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STSMINL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDSMINL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDSMINA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDSMINLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x6: // STUMAX* or LDUMAX*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STUMAXB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STUMAXLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDUMAXAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDUMAXLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STUMAXH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STUMAXLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDUMAXAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDUMAXLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STUMAX(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAX(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STUMAXL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDUMAXA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDUMAXLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STUMAX64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAX64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STUMAXL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMAXL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDUMAXA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDUMAXLA64(machInst, rt, rnsp, rs);
+                }
+            case 0x7: // STUMIN* or LDUMIN*
+                switch(size_ar){
+                    case 0x0:
+                        if (A_rt == 0x1f)
+                            return new STUMINB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINB(machInst, rt, rnsp, rs);
+                    case 0x1 :
+                        if (A_rt == 0x1f)
+                            return new STUMINLB(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINLB(machInst, rt, rnsp, rs);
+                    case 0x2:
+                        return new LDUMINAB(machInst, rt, rnsp, rs);
+                    case 0x3:
+                        return new LDUMINLAB(machInst, rt, rnsp, rs);
+                    case 0x4:
+                        if (A_rt == 0x1f)
+                            return new STUMINH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINH(machInst, rt, rnsp, rs);
+                    case 0x5 :
+                        if (A_rt == 0x1f)
+                            return new STUMINLH(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINLH(machInst, rt, rnsp, rs);
+                    case 0x6:
+                        return new LDUMINAH(machInst, rt, rnsp, rs);
+                    case 0x7:
+                        return new LDUMINLAH(machInst, rt, rnsp, rs);
+                    case 0x8:
+                        if (A_rt == 0x1f)
+                            return new STUMIN(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMIN(machInst, rt, rnsp, rs);
+                    case 0x9 :
+                        if (A_rt == 0x1f)
+                            return new STUMINL(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINL(machInst, rt, rnsp, rs);
+                    case 0xa:
+                        return new LDUMINA(machInst, rt, rnsp, rs);
+                    case 0xb:
+                        return new LDUMINLA(machInst, rt, rnsp, rs);
+                    case 0xc:
+                        if (A_rt == 0x1f)
+                            return new STUMIN64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMIN64(machInst, rt, rnsp, rs);
+                    case 0xd :
+                        if (A_rt == 0x1f)
+                            return new STUMINL64(machInst, rt, rnsp, rs);
+                        else
+                            return new LDUMINL64(machInst, rt, rnsp, rs);
+                    case 0xe:
+                        return new LDUMINA64(machInst, rt, rnsp, rs);
+                    case 0xf:
+                        return new LDUMINLA64(machInst, rt, rnsp, rs);
+                }
+            default: // opc is three bits wide and cases 0x0-0x7 all return, so this is only a safety net
+                return new Unknown64(machInst);
+        }
+    }
+}
+}};
+
+
+output decoder {{
+namespace Aarch64
+{
+
+    StaticInstPtr
     decodeLoadsStores(ExtMachInst machInst)
     {
         // bit 27,25=10
@@ -925,7 +1463,7 @@
                                 M5_UNREACHABLE;
                             }
                         } else {
-                            return new Unknown64(machInst);
+                            return decodeAtomicArithOp(machInst);
                         }
                     }
                   case 0x2:
diff --git a/src/arch/arm/isa/insts/amo64.isa b/src/arch/arm/isa/insts/amo64.isa
index 70604e9..28136a8 100644
--- a/src/arch/arm/isa/insts/amo64.isa
+++ b/src/arch/arm/isa/insts/amo64.isa
@@ -65,7 +65,7 @@
         micro = False
 
         def __init__(self, mnem, Name, size=4, user=False, flavor="normal",
-                    unsign=True, top = False, paired=False):
+                    unsign=True, top = False, paired=False, ret_op=True):
             super(AtomicInst64, self).__init__()
 
             self.name= mnem
@@ -98,7 +98,10 @@
                 self.instFlags.append("IsWriteBarrier")
             if self.flavor in ("acquire_release", "acquire"):
                 self.instFlags.append("IsReadBarrier")
-            self.memFlags.append('Request::ATOMIC_RETURN_OP')
+            if ret_op:
+                self.memFlags.append('Request::ATOMIC_RETURN_OP')
+            else:
+                self.memFlags.append('Request::ATOMIC_NO_RETURN_OP')
 
         def emitHelper(self, base = 'Memory64', wbDecl = None, ):
             global header_output, decoder_output, exec_output
@@ -108,20 +111,6 @@
             if self.micro:
                 assert not wbDecl
 
-            fa_code = None
-            if not self.micro :
-            #and self.flavor in ("normal", "release"):
-                fa_code = '''
-                    fault->annotate(ArmFault::SAS, %s);
-                    fault->annotate(ArmFault::SSE, false);
-                    fault->annotate(ArmFault::SRT, dest);
-                    fault->annotate(ArmFault::SF, %s);
-                    fault->annotate(ArmFault::AR, %s);
-                ''' % ("0" if self.size == 1 else
-                       "1" if self.size == 2 else
-                       "2" if self.size == 4 else "3",
-                       "true" if self.size == 8 else "false",
-                       "true" if self.flavor != "normal"  else "false")
             sas_code = "3"
             if self.size == 1 :
                 sas_code = "0"
@@ -398,4 +387,105 @@
             paired=True).emit()
     CasPair64("caspl",  "CASPL32",  4, flavor="release", paired=True).emit()
 
+    # LD<op>, ST<op> and SWP atomic instructions
+
+    class AtomicArithmeticSingleOp(AtomicSingleOp):
+        # Describes one LD<op>/ST<op>/SWP instruction variant.  emit()
+        # receives the C++ lambda body that is handed to AtomicGeneric3Op;
+        # the constructor prepares the postacc_code blob that copies Mem
+        # into the destination register.
+        decConstBase = 'AmoArithmeticOp'
+        base = 'ArmISA::MemoryEx64'
+        writeback = True
+        post = False
+        execBase = 'AmoOp'
+
+        def __init__(self, *args, **kargs):
+            super(AtomicArithmeticSingleOp, self).__init__(*args, **kargs)
+            # postacc_code: read Mem (masked with MASKS[size] unless the
+            # access is 8 bytes) and, unless isXZR is set, write it to the
+            # destination register via cSwap().
+            store_res = "%(utype)s unsMem = Mem%(suffix)s"
+
+            if self.size != 8:
+                store_res += " & %(mask)s"
+
+            store_res += ";\n"
+            store_res += ''' if (!isXZR) %(dest)s = cSwap(unsMem,
+                                  isBigEndian64(xc->tcBase()));
+                      '''
+            store_res = store_res % { "dest": self.des, "suffix":self.suffix,
+                    "mask": MASKS[self.size], "utype": self.utp}
+            self.codeBlobs["postacc_code"] = \
+                    store_res + " SevMailbox = 1; LLSCLock = 0;"
+
+        def emit(self, op):
+            # op: C++ statements spliced in as the body of the lambda
+            # [](T* b, T a, T c){ ... } that is passed to AtomicGeneric3Op.
+            self.buildEACode()
+
+            opcode = "%(type)s val = cSwap(%(result)s,"\
+                      " isBigEndian64(xc->tcBase()));\n"
+            opcode += "TypedAtomicOpFunctor<%(type)s> *amo_op = "\
+                      "new AtomicGeneric3Op<%(type)s>(Mem%(suffix)s,"\
+                      " val, [](%(type)s* b, %(type)s a,"\
+                      " %(type)s c){ %(op)s });\n"
+
+            opcode = opcode % { "suffix" : self.suffix,
+                    "type": self.tp , "result": self.res, "op": op}
+            self.codeBlobs['amo_code'] = opcode
+            accCode = "Mem%(suffix)s = cSwap(%(dest)s,"\
+                                       "isBigEndian64(xc->tcBase()));"
+            accCode = accCode % { "dest": self.des, "suffix":self.suffix}
+            self.codeBlobs["memacc_code"] = accCode
+            self.emitHelper(self.base)
+
+
+    def emitAtomicArithmeticOps():
+        # Instantiates and emits every LD<op>, ST<op> and SWP variant.  A
+        # mnemonic is <stem><flavor letters><size suffix>; the class name is
+        # the upper-cased mnemonic.  Emission order is operation, then
+        # access size, then flavor.
+        sizes = ((1, "b"), (2, "h"), (4, ""), (8, "64"))
+        all_flavors = (("normal", ""), ("release", "l"),
+                       ("acquire", "a"), ("acquire_release", "la"))
+        # ST<op> normally comes only in the plain and release flavors.
+        st_flavors = all_flavors[:2]
+        # (mnemonic stem, OP_DICT key, unsign)
+        ops = (("add",  'ADD', True),  ("clr",  'CLR', True),
+               ("eor",  'EOR', True),  ("set",  'SET', True),
+               ("smax", 'MAX', False), ("smin", 'MIN', False),
+               ("umax", 'MAX', True),  ("umin", 'MIN', True))
+
+        def emitFamily(stem, op_key, unsign, flavors, byte_flavors=None,
+                       **kargs):
+            # byte_flavors, when given, replaces flavors for 1-byte accesses.
+            for size, size_sfx in sizes:
+                size_flavors = flavors
+                if size == 1 and byte_flavors is not None:
+                    size_flavors = byte_flavors
+                for flavor, flavor_sfx in size_flavors:
+                    mnem = stem + flavor_sfx + size_sfx
+                    AtomicArithmeticSingleOp(mnem, mnem.upper(), size,
+                            unsign=unsign, flavor=flavor,
+                            **kargs).emit(OP_DICT[op_key])
+
+        for stem, op_key, unsign in ops:
+            emitFamily("ld" + stem, op_key, unsign, all_flavors)
+
+        for stem, op_key, unsign in ops:
+            # Byte-sized STSET additionally gets the acquire flavors
+            # (stsetab, stsetlab).
+            # NOTE(review): no other ST<op> has acquire forms - confirm
+            # the decoder never needs these two before dropping them.
+            byte_flavors = all_flavors if stem == "set" else None
+            emitFamily("st" + stem, op_key, unsign, st_flavors,
+                       byte_flavors, ret_op=False)
+
+        # NOTE(review): SWP is decoded with a real destination register,
+        # yet it is flagged ATOMIC_NO_RETURN_OP (ret_op=False) like the
+        # ST<op> forms - confirm this is intended.
+        emitFamily("swp", 'SWP', True, all_flavors, ret_op=False)
+
+    emitAtomicArithmeticOps()
 }};
diff --git a/src/arch/arm/isa/templates/mem64.isa b/src/arch/arm/isa/templates/mem64.isa
index dc8e0c5..e5df801 100644
--- a/src/arch/arm/isa/templates/mem64.isa
+++ b/src/arch/arm/isa/templates/mem64.isa
@@ -851,7 +851,8 @@
                           _dest, _base, _result)
     {
         %(constructor)s;
-
+        flags[IsStore] = false;
+        flags[IsLoad] = false;
     }
 }};
 
@@ -897,7 +898,8 @@
         _srcRegIdx[_numSrcRegs++] = RegId(IntRegClass, r2);
         r2_dst = _numDestRegs ;
         _destRegIdx[_numDestRegs++] = RegId(IntRegClass, r2);
-
+        flags[IsStore] = false;
+        flags[IsLoad] = false;
     }
 }};
 
@@ -932,8 +934,9 @@
         %(constructor)s;
         isXZR = false;
         uint32_t r2 = RegId(IntRegClass, dest).index() ;
+        flags[IsStore] = false;
+        flags[IsLoad] = false;
         if (r2 == 31){
-            flags[IsReadBarrier] = false;
             isXZR = true;
         }
     }