arm: Add support for ARMv8 (AArch64 & AArch32)

Note: AArch64 and AArch32 interworking is not supported. If you use an AArch64
kernel, you are restricted to AArch64 user-mode binaries. This will be addressed
in a later patch.

Note: Virtualization is only supported in AArch32 mode. This will also be fixed
in a later patch.

Contributors:
Giacomo Gabrielli    (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation)
Thomas Grocutt       (AArch32 Virtualization, AArch64 FP, validation)
Mbou Eyole           (AArch64 NEON, validation)
Ali Saidi            (AArch64 Linux support, code integration, validation)
Edmund Grimley-Evans (AArch64 FP)
William Wang         (AArch64 Linux support)
Rene De Jong         (AArch64 Linux support, performance opt.)
Matt Horsnell        (AArch64 MP, validation)
Matt Evans           (device models, code integration, validation)
Chris Adeniyi-Jones  (AArch64 syscall-emulation)
Prakash Ramrakhyani  (validation)
Dam Sunwoo           (validation)
Chander Sudanthi     (validation)
Stephan Diestelhorst (validation)
Andreas Hansson      (code integration, performance opt.)
Eric Van Hensbergen  (performance opt.)
Gabe Black
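
For reference, the trickiest piece of new decode logic is the expansion of the
A64 "logical immediate" operand (the N:immr:imms fields handled in
decodeDataProcImm() in formats/aarch64.isa below). A minimal standalone sketch
of that expansion in plain C++, for illustration only (the authoritative
version is the one in the patch):

    #include <cassert>
    #include <cstdint>

    // Expand an A64 logical immediate (n, immr, imms) into the 32- or 64-bit
    // pattern it encodes: a run of s+1 ones, rotated right by r within a
    // size-bit element, then replicated across the register.
    uint64_t expandLogicalImm(bool sf, bool n, uint8_t immr, uint8_t imms)
    {
        int len;
        if (n) {
            len = 6;
        } else {
            assert(imms != 0x3f && imms != 0x3e);  // reserved encodings
            len = 5;
            while (!((imms ^ 0x3f) & (1 << len)))  // MSB of NOT(imms)
                --len;
        }
        unsigned size = 1u << len;             // element size, 2..64 bits
        uint64_t r = immr & (size - 1);
        uint64_t s = imms & (size - 1);
        assert(s != size - 1);                 // all-ones element is reserved
        uint64_t pattern = (1ull << (s + 1)) - 1;
        if (r) {                               // rotate right within element
            pattern = (pattern >> r) | (pattern << (size - r));
            if (size < 64)
                pattern &= (1ull << size) - 1;
        }
        unsigned width = sf ? 64 : 32;         // replicate across register
        for (unsigned i = size; i < width; i *= 2)
            pattern |= pattern << i;
        return pattern;
    }
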
diff --git a/src/arch/arm/isa/bitfields.isa b/src/arch/arm/isa/bitfields.isa
index 5a8b5db..6006cfb 100644
--- a/src/arch/arm/isa/bitfields.isa
+++ b/src/arch/arm/isa/bitfields.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -73,6 +73,7 @@
 
 def bitfield THUMB         thumb;
 def bitfield BIGTHUMB      bigThumb;
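+// True in AArch64 state; selects the A64 decode tree in decoder.isa.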
+def bitfield AARCH64       aarch64;
 
 // Other
 def bitfield COND_CODE     condCode;
diff --git a/src/arch/arm/isa/decoder/aarch64.isa b/src/arch/arm/isa/decoder/aarch64.isa
new file mode 100644
index 0000000..a6c0fa2
--- /dev/null
+++ b/src/arch/arm/isa/decoder/aarch64.isa
@@ -0,0 +1,48 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+////////////////////////////////////////////////////////////////////
+//
+// The 64-bit ARM decoder
+// ----------------------
+//
+
+
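+// Hand all A64 decoding to the Aarch64 format (see formats/aarch64.isa).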
+Aarch64::aarch64();
+
diff --git a/src/arch/arm/isa/decoder/arm.isa b/src/arch/arm/isa/decoder/arm.isa
index 4bd9d5c..f0c0dec 100644
--- a/src/arch/arm/isa/decoder/arm.isa
+++ b/src/arch/arm/isa/decoder/arm.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2012 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -73,7 +73,11 @@
                         0x9: ArmBlxReg::armBlxReg();
                     }
                     0x5: ArmSatAddSub::armSatAddSub();
-                    0x7: Breakpoint::bkpt();
+                    0x6: ArmERet::armERet();
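+                    // Bit 22 splits BKPT (0) from the SMC/HVC encodings (1).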
+                    0x7: decode OPCODE_22 {
+                        0: Breakpoint::bkpt();
+                        1: ArmSmcHyp::armSmcHyp();
+                    }
                 }
                 0x1: ArmHalfWordMultAndMultAcc::armHalfWordMultAndMultAcc();
             }
@@ -105,6 +109,10 @@
     }
     0x6: decode CPNUM {
         0xa, 0xb: ExtensionRegLoadStore::extensionRegLoadStore();
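+        // CP15 64-bit transfers: bit 20 (L) selects MRRC (reads, 1) vs
+        // MCRR (writes, 0).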
+        0xf: decode OPCODE_20 {
+            0: Mcrr15::Mcrr15();
+            1: Mrrc15::Mrrc15();
+        }
     }
     0x7: decode OPCODE_24 {
         0: decode OPCODE_4 {
diff --git a/src/arch/arm/isa/decoder/decoder.isa b/src/arch/arm/isa/decoder/decoder.isa
index cf7d178..94685b9 100644
--- a/src/arch/arm/isa/decoder/decoder.isa
+++ b/src/arch/arm/isa/decoder/decoder.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -41,8 +41,12 @@
 // Authors: Gabe Black
 
 decode THUMB default Unknown::unknown() {
-0:
-##include "arm.isa"
+0: decode AARCH64 {
+    0:
+    ##include "arm.isa"
+    1:
+    ##include "aarch64.isa"
+}
 1:
 ##include "thumb.isa"
 }
diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa
index f54cc72..3149579 100644
--- a/src/arch/arm/isa/decoder/thumb.isa
+++ b/src/arch/arm/isa/decoder/thumb.isa
@@ -95,8 +95,14 @@
                     0xa, 0xb: ExtensionRegLoadStore::extensionRegLoadStre();
                     0xf: decode HTOPCODE_9_4 {
                         0x00: Unknown::undefined();
-                        0x04: WarnUnimpl::mcrr(); // mcrr2
-                        0x05: WarnUnimpl::mrrc(); // mrrc2
+                        0x04: decode LTCOPROC {
+                            0xf: Mcrr15::Mcrr15();
+                            default: WarnUnimpl::mcrr(); // mcrr2
+                        }
+                        0x05: decode LTCOPROC {
+                            0xf: Mrrc15::Mrrc15();
+                            default: WarnUnimpl::mrrc(); // mrrc2
+                        }
                         0x02, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10,
                         0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e:
                             WarnUnimpl::stc(); // stc2
diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa
new file mode 100644
index 0000000..3ed70ce
--- /dev/null
+++ b/src/arch/arm/isa/formats/aarch64.isa
@@ -0,0 +1,2035 @@
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+//          Thomas Grocutt
+//          Mbou Eyole
+//          Giacomo Gabrielli
+
+output header {{
+namespace Aarch64
+{
+    StaticInstPtr decodeDataProcImm(ExtMachInst machInst);
+    StaticInstPtr decodeBranchExcSys(ExtMachInst machInst);
+    StaticInstPtr decodeLoadsStores(ExtMachInst machInst);
+    StaticInstPtr decodeDataProcReg(ExtMachInst machInst);
+
+    StaticInstPtr decodeFpAdvSIMD(ExtMachInst machInst);
+    StaticInstPtr decodeFp(ExtMachInst machInst);
+    StaticInstPtr decodeAdvSIMD(ExtMachInst machInst);
+    StaticInstPtr decodeAdvSIMDScalar(ExtMachInst machInst);
+
+    StaticInstPtr decodeGem5Ops(ExtMachInst machInst);
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeDataProcImm(ExtMachInst machInst)
+    {
+        IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+        IntRegIndex rdsp = makeSP(rd);
+        IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+        IntRegIndex rnsp = makeSP(rn);
+
+        uint8_t opc = bits(machInst, 30, 29);
+        bool sf = bits(machInst, 31);
+        bool n = bits(machInst, 22);
+        uint8_t immr = bits(machInst, 21, 16);
+        uint8_t imms = bits(machInst, 15, 10);
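+        // bits(25,23) selects the immediate group: 0x0-0x1 PC-relative
+        // (ADR/ADRP), 0x2-0x3 add/subtract, 0x4 logical, 0x5 move wide,
+        // 0x6 bitfield, 0x7 extract.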
+        switch (bits(machInst, 25, 23)) {
+          case 0x0:
+          case 0x1:
+          {
+            uint64_t immlo = bits(machInst, 30, 29);
+            uint64_t immhi = bits(machInst, 23, 5);
+            uint64_t imm = (immlo << 0) | (immhi << 2);
+            if (bits(machInst, 31) == 0)
+                return new AdrXImm(machInst, rd, INTREG_ZERO, sext<21>(imm));
+            else
+                return new AdrpXImm(machInst, rd, INTREG_ZERO,
+                                    sext<33>(imm << 12));
+          }
+          case 0x2:
+          case 0x3:
+          {
+            uint32_t imm12 = bits(machInst, 21, 10);
+            uint8_t shift = bits(machInst, 23, 22);
+            uint32_t imm;
+            if (shift == 0x0)
+                imm = imm12 << 0;
+            else if (shift == 0x1)
+                imm = imm12 << 12;
+            else
+                return new Unknown64(machInst);
+            switch (opc) {
+              case 0x0:
+                return new AddXImm(machInst, rdsp, rnsp, imm);
+              case 0x1:
+                return new AddXImmCc(machInst, rd, rnsp, imm);
+              case 0x2:
+                return new SubXImm(machInst, rdsp, rnsp, imm);
+              case 0x3:
+                return new SubXImmCc(machInst, rd, rnsp, imm);
+            }
+          }
+          case 0x4:
+          {
+            if (!sf && n)
+                return new Unknown64(machInst);
+            // len = MSB(n:NOT(imms)), len < 1 is undefined.
+            uint8_t len = 0;
+            if (n) {
+                len = 6;
+            } else if (imms == 0x3f || imms == 0x3e) {
+                return new Unknown64(machInst);
+            } else {
+                len = findMsbSet(imms ^ 0x3f);
+            }
+            // Generate r, s, and size.
+            uint64_t r = bits(immr, len - 1, 0);
+            uint64_t s = bits(imms, len - 1, 0);
+            uint8_t size = 1 << len;
+            if (s == size - 1)
+                return new Unknown64(machInst);
+            // Generate the pattern with s+1 ones, rotated by r, with size bits.
+            uint64_t pattern = mask(s + 1);
+            if (r) {
+                pattern = (pattern >> r) | (pattern << (size - r));
+                pattern &= mask(size);
+            }
+            uint8_t width = sf ? 64 : 32;
+            // Replicate that to fill up the immediate.
+            for (unsigned i = 1; i < (width / size); i *= 2)
+                pattern |= (pattern << (i * size));
+            uint64_t imm = pattern;
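+            // Worked example (illustrative): n=0, imms=0b100111, immr=0b000010
+            // gives len=4, size=16, s=7, r=2; 0x00ff rotated right by 2 is
+            // 0xc03f, replicated to 0xc03fc03fc03fc03f when sf=1.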
+
+            switch (opc) {
+              case 0x0:
+                return new AndXImm(machInst, rdsp, rn, imm);
+              case 0x1:
+                return new OrrXImm(machInst, rdsp, rn, imm);
+              case 0x2:
+                return new EorXImm(machInst, rdsp, rn, imm);
+              case 0x3:
+                return new AndXImmCc(machInst, rd, rn, imm);
+            }
+          }
+          case 0x5:
+          {
+            IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+            uint32_t imm16 = bits(machInst, 20, 5);
+            uint32_t hw = bits(machInst, 22, 21);
+            switch (opc) {
+              case 0x0:
+                return new Movn(machInst, rd, imm16, hw * 16);
+              case 0x1:
+                return new Unknown64(machInst);
+              case 0x2:
+                return new Movz(machInst, rd, imm16, hw * 16);
+              case 0x3:
+                return new Movk(machInst, rd, imm16, hw * 16);
+            }
+          }
+          case 0x6:
+            if ((sf != n) || (!sf && (bits(immr, 5) || bits(imms, 5))))
+                return new Unknown64(machInst);
+            switch (opc) {
+              case 0x0:
+                return new Sbfm64(machInst, rd, rn, immr, imms);
+              case 0x1:
+                return new Bfm64(machInst, rd, rn, immr, imms);
+              case 0x2:
+                return new Ubfm64(machInst, rd, rn, immr, imms);
+              case 0x3:
+                return new Unknown64(machInst);
+            }
+          case 0x7:
+          {
+            IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+            if (opc || bits(machInst, 21))
+                return new Unknown64(machInst);
+            else
+                return new Extr64(machInst, rd, rn, rm, imms);
+          }
+        }
+        return new FailUnimplemented("Unhandled Case8", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeBranchExcSys(ExtMachInst machInst)
+    {
+        switch (bits(machInst, 30, 29)) {
+          case 0x0:
+          {
+            int64_t imm = sext<26>(bits(machInst, 25, 0)) << 2;
+            if (bits(machInst, 31) == 0)
+                return new B64(machInst, imm);
+            else
+                return new Bl64(machInst, imm);
+          }
+          case 0x1:
+          {
+            IntRegIndex rt = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+            if (bits(machInst, 25) == 0) {
+                int64_t imm = sext<19>(bits(machInst, 23, 5)) << 2;
+                if (bits(machInst, 24) == 0)
+                    return new Cbz64(machInst, imm, rt);
+                else
+                    return new Cbnz64(machInst, imm, rt);
+            } else {
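+                // TBZ/TBNZ: the tested bit number is machInst<31>:machInst<23:19>.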
+                uint64_t bitmask = 0x1;
+                bitmask <<= bits(machInst, 23, 19);
+                int64_t imm = sext<14>(bits(machInst, 18, 5)) << 2;
+                if (bits(machInst, 31))
+                    bitmask <<= 32;
+                if (bits(machInst, 24) == 0)
+                    return new Tbz64(machInst, bitmask, imm, rt);
+                else
+                    return new Tbnz64(machInst, bitmask, imm, rt);
+            }
+          }
+          case 0x2:
+            // bit 30:26=10101
+            if (bits(machInst, 31) == 0) {
+                if (bits(machInst, 25, 24) || bits(machInst, 4))
+                    return new Unknown64(machInst);
+                int64_t imm = sext<19>(bits(machInst, 23, 5)) << 2;
+                ConditionCode condCode =
+                    (ConditionCode)(uint8_t)(bits(machInst, 3, 0));
+                return new BCond64(machInst, imm, condCode);
+            } else if (bits(machInst, 25, 24) == 0x0) {
+                if (bits(machInst, 4, 2))
+                    return new Unknown64(machInst);
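+                // Exception generation: decVal = opc<23:21> : LL<1:0>.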
+                uint8_t decVal = (bits(machInst, 1, 0) << 0) |
+                                 (bits(machInst, 23, 21) << 2);
+                switch (decVal) {
+                  case 0x01:
+                    return new Svc64(machInst);
+                  case 0x02:
+                    return new FailUnimplemented("hvc", machInst);
+                  case 0x03:
+                    return new Smc64(machInst);
+                  case 0x04:
+                    return new FailUnimplemented("brk", machInst);
+                  case 0x08:
+                    return new FailUnimplemented("hlt", machInst);
+                  case 0x15:
+                    return new FailUnimplemented("dcps1", machInst);
+                  case 0x16:
+                    return new FailUnimplemented("dcps2", machInst);
+                  case 0x17:
+                    return new FailUnimplemented("dcps3", machInst);
+                  default:
+                    return new Unknown64(machInst);
+                }
+            } else if (bits(machInst, 25, 22) == 0x4) {
+                // bit 31:22=1101010100
+                bool l = bits(machInst, 21);
+                uint8_t op0 = bits(machInst, 20, 19);
+                uint8_t op1 = bits(machInst, 18, 16);
+                uint8_t crn = bits(machInst, 15, 12);
+                uint8_t crm = bits(machInst, 11, 8);
+                uint8_t op2 = bits(machInst, 7, 5);
+                IntRegIndex rt = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+                switch (op0) {
+                  case 0x0:
+                    if (rt != 0x1f || l)
+                        return new Unknown64(machInst);
+                    if (crn == 0x2 && op1 == 0x3) {
+                        switch (op2) {
+                          case 0x0:
+                            return new NopInst(machInst);
+                          case 0x1:
+                            return new YieldInst(machInst);
+                          case 0x2:
+                            return new WfeInst(machInst);
+                          case 0x3:
+                            return new WfiInst(machInst);
+                          case 0x4:
+                            return new SevInst(machInst);
+                          case 0x5:
+                            return new SevlInst(machInst);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                    } else if (crn == 0x3 && op1 == 0x3) {
+                        switch (op2) {
+                          case 0x2:
+                            return new Clrex64(machInst);
+                          case 0x4:
+                            return new Dsb64(machInst);
+                          case 0x5:
+                            return new Dmb64(machInst);
+                          case 0x6:
+                            return new Isb64(machInst);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                    } else if (crn == 0x4) {
+                        // MSR immediate
+                        switch (op1 << 3 | op2) {
+                          case 0x5:
+                            // SP
+                            return new MsrSP64(machInst,
+                                               (IntRegIndex) MISCREG_SPSEL,
+                                               INTREG_ZERO,
+                                               crm & 0x1);
+                          case 0x1e:
+                            // DAIFSet
+                            return new MsrDAIFSet64(
+                                machInst,
+                                (IntRegIndex) MISCREG_DAIF,
+                                INTREG_ZERO,
+                                crm);
+                          case 0x1f:
+                            // DAIFClr
+                            return new MsrDAIFClr64(
+                                machInst,
+                                (IntRegIndex) MISCREG_DAIF,
+                                INTREG_ZERO,
+                                crm);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                    } else {
+                        return new Unknown64(machInst);
+                    }
+                    break;
+                  case 0x1:
+                  case 0x2:
+                  case 0x3:
+                  {
+                    // bit 31:22=1101010100, 20:19=11
+                    bool read = l;
+                    MiscRegIndex miscReg =
+                        decodeAArch64SysReg(op0, op1, crn, crm, op2);
+                    if (read) {
+                        if ((miscReg == MISCREG_DC_CIVAC_Xt) ||
+                            (miscReg == MISCREG_DC_CVAC_Xt) ||
+                            (miscReg == MISCREG_DC_ZVA_Xt)) {
+                            return new Unknown64(machInst);
+                        }
+                    }
+                    // Check for invalid registers
+                    if (miscReg == MISCREG_UNKNOWN) {
+                        return new Unknown64(machInst);
+                    } else if (miscRegInfo[miscReg][MISCREG_IMPLEMENTED]) {
+                        if (miscReg == MISCREG_NZCV) {
+                            if (read)
+                                return new MrsNZCV64(machInst, rt, (IntRegIndex) miscReg);
+                            else
+                                return new MsrNZCV64(machInst, (IntRegIndex) miscReg, rt);
+                        }
+                        uint32_t iss = msrMrs64IssBuild(read, op0, op1, crn, crm, op2, rt);
+                        if (miscReg == MISCREG_DC_ZVA_Xt && !read)
+                            return new Dczva(machInst, rt, (IntRegIndex) miscReg, iss);
+
+                        if (read)
+                            return new Mrs64(machInst, rt, (IntRegIndex) miscReg, iss);
+                        else
+                            return new Msr64(machInst, (IntRegIndex) miscReg, rt, iss);
+                    } else if (miscRegInfo[miscReg][MISCREG_WARN_NOT_FAIL]) {
+                        std::string full_mnem = csprintf("%s %s",
+                            read ? "mrs" : "msr", miscRegName[miscReg]);
+                        return new WarnUnimplemented(read ? "mrs" : "msr",
+                                                     machInst, full_mnem);
+                    } else {
+                        return new FailUnimplemented(csprintf("%s %s",
+                            read ? "mrs" : "msr", miscRegName[miscReg]).c_str(),
+                            machInst);
+                    }
+                  }
+                  break;
+                }
+            } else if (bits(machInst, 25) == 0x1) {
+                uint8_t opc = bits(machInst, 24, 21);
+                uint8_t op2 = bits(machInst, 20, 16);
+                uint8_t op3 = bits(machInst, 15, 10);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                uint8_t op4 = bits(machInst, 4, 0);
+                if (op2 != 0x1f || op3 != 0x0 || op4 != 0x0)
+                    return new Unknown64(machInst);
+                switch (opc) {
+                  case 0x0:
+                    return new Br64(machInst, rn);
+                  case 0x1:
+                    return new Blr64(machInst, rn);
+                  case 0x2:
+                    return new Ret64(machInst, rn);
+                  case 0x4:
+                    if (rn != 0x1f)
+                        return new Unknown64(machInst);
+                    return new Eret64(machInst);
+                  case 0x5:
+                    if (rn != 0x1f)
+                        return new Unknown64(machInst);
+                    return new FailUnimplemented("dret", machInst);
+                }
+            }
+          default:
+            return new Unknown64(machInst);
+        }
+        return new FailUnimplemented("Unhandled Case7", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeLoadsStores(ExtMachInst machInst)
+    {
+        // bit 27,25=10
+        switch (bits(machInst, 29, 28)) {
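+          // bits(29,28): 0x0 exclusive/ordered (or AdvSIMD memory), 0x1 load
+          // literal, 0x2 register pair, 0x3 single register forms.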
+          case 0x0:
+            if (bits(machInst, 26) == 0) {
+                if (bits(machInst, 24) != 0)
+                    return new Unknown64(machInst);
+                IntRegIndex rt = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                IntRegIndex rnsp = makeSP(rn);
+                IntRegIndex rt2 = (IntRegIndex)(uint8_t)bits(machInst, 14, 10);
+                IntRegIndex rs = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+                uint8_t opc = (bits(machInst, 15) << 0) |
+                              (bits(machInst, 23, 21) << 1);
+                uint8_t size = bits(machInst, 31, 30);
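+                // opc = machInst<23:21> : machInst<15>, selecting the
+                // exclusive/ordered variant.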
+                switch (opc) {
+                  case 0x0:
+                    switch (size) {
+                      case 0x0:
+                        return new STXRB64(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new STXRH64(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new STXRW64(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new STXRX64(machInst, rt, rnsp, rs);
+                    }
+                  case 0x1:
+                    switch (size) {
+                      case 0x0:
+                        return new STLXRB64(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new STLXRH64(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new STLXRW64(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new STLXRX64(machInst, rt, rnsp, rs);
+                    }
+                  case 0x2:
+                    switch (size) {
+                      case 0x0:
+                      case 0x1:
+                        return new Unknown64(machInst);
+                      case 0x2:
+                        return new STXPW64(machInst, rs, rt, rt2, rnsp);
+                      case 0x3:
+                        return new STXPX64(machInst, rs, rt, rt2, rnsp);
+                    }
+
+                  case 0x3:
+                    switch (size) {
+                      case 0x0:
+                      case 0x1:
+                        return new Unknown64(machInst);
+                      case 0x2:
+                        return new STLXPW64(machInst, rs, rt, rt2, rnsp);
+                      case 0x3:
+                        return new STLXPX64(machInst, rs, rt, rt2, rnsp);
+                    }
+
+                  case 0x4:
+                    switch (size) {
+                      case 0x0:
+                        return new LDXRB64(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new LDXRH64(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new LDXRW64(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new LDXRX64(machInst, rt, rnsp, rs);
+                    }
+                  case 0x5:
+                    switch (size) {
+                      case 0x0:
+                        return new LDAXRB64(machInst, rt, rnsp, rs);
+                      case 0x1:
+                        return new LDAXRH64(machInst, rt, rnsp, rs);
+                      case 0x2:
+                        return new LDAXRW64(machInst, rt, rnsp, rs);
+                      case 0x3:
+                        return new LDAXRX64(machInst, rt, rnsp, rs);
+                    }
+                  case 0x6:
+                    switch (size) {
+                      case 0x0:
+                      case 0x1:
+                        return new Unknown64(machInst);
+                      case 0x2:
+                        return new LDXPW64(machInst, rt, rt2, rnsp);
+                      case 0x3:
+                        return new LDXPX64(machInst, rt, rt2, rnsp);
+                    }
+
+                  case 0x7:
+                    switch (size) {
+                      case 0x0:
+                      case 0x1:
+                        return new Unknown64(machInst);
+                      case 0x2:
+                        return new LDAXPW64(machInst, rt, rt2, rnsp);
+                      case 0x3:
+                        return new LDAXPX64(machInst, rt, rt2, rnsp);
+                    }
+
+                  case 0x9:
+                    switch (size) {
+                      case 0x0:
+                        return new STLRB64(machInst, rt, rnsp);
+                      case 0x1:
+                        return new STLRH64(machInst, rt, rnsp);
+                      case 0x2:
+                        return new STLRW64(machInst, rt, rnsp);
+                      case 0x3:
+                        return new STLRX64(machInst, rt, rnsp);
+                    }
+                  case 0xd:
+                    switch (size) {
+                      case 0x0:
+                        return new LDARB64(machInst, rt, rnsp);
+                      case 0x1:
+                        return new LDARH64(machInst, rt, rnsp);
+                      case 0x2:
+                        return new LDARW64(machInst, rt, rnsp);
+                      case 0x3:
+                        return new LDARX64(machInst, rt, rnsp);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+            } else if (bits(machInst, 31)) {
+                return new Unknown64(machInst);
+            } else {
+                return decodeNeonMem(machInst);
+            }
+          case 0x1:
+          {
+            if (bits(machInst, 24) != 0)
+                return new Unknown64(machInst);
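+            // PC-relative literal forms: switchVal = opc<31:30> : V<26>.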
+            uint8_t switchVal = (bits(machInst, 26) << 0) |
+                                (bits(machInst, 31, 30) << 1);
+            int64_t imm = sext<19>(bits(machInst, 23, 5)) << 2;
+            IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+            switch (switchVal) {
+              case 0x0:
+                return new LDRWL64_LIT(machInst, rt, imm);
+              case 0x1:
+                return new LDRSFP64_LIT(machInst, rt, imm);
+              case 0x2:
+                return new LDRXL64_LIT(machInst, rt, imm);
+              case 0x3:
+                return new LDRDFP64_LIT(machInst, rt, imm);
+              case 0x4:
+                return new LDRSWL64_LIT(machInst, rt, imm);
+              case 0x5:
+                return new BigFpMemLit("ldr", machInst, rt, imm);
+              case 0x6:
+                return new PRFM64_LIT(machInst, rt, imm);
+              default:
+                return new Unknown64(machInst);
+            }
+          }
+          case 0x2:
+          {
+            uint8_t opc = bits(machInst, 31, 30);
+            if (opc >= 3)
+                return new Unknown64(machInst);
+            uint32_t size = 0;
+            bool fp = bits(machInst, 26);
+            bool load = bits(machInst, 22);
+            if (fp) {
+                size = 4 << opc;
+            } else {
+                if ((opc == 1) && !load)
+                    return new Unknown64(machInst);
+                size = (opc == 0 || opc == 1) ? 4 : 8;
+            }
+            uint8_t type = bits(machInst, 24, 23);
+            int64_t imm = sext<7>(bits(machInst, 21, 15)) * size;
+
+            IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+            IntRegIndex rt = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+            IntRegIndex rt2 = (IntRegIndex)(uint8_t)bits(machInst, 14, 10);
+
+            bool noAlloc = (type == 0);
+            bool signExt = !noAlloc && !fp && opc == 1;
+            PairMemOp::AddrMode mode;
+            const char *mnemonic = NULL;
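+            // type<24:23>: 0x0 non-temporal pair, 0x1 post-indexed,
+            // 0x2 signed offset, 0x3 pre-indexed.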
+            switch (type) {
+              case 0x0:
+              case 0x2:
+                mode = PairMemOp::AddrMd_Offset;
+                break;
+              case 0x1:
+                mode = PairMemOp::AddrMd_PostIndex;
+                break;
+              case 0x3:
+                mode = PairMemOp::AddrMd_PreIndex;
+                break;
+              default:
+                return new Unknown64(machInst);
+            }
+            if (load) {
+                if (noAlloc)
+                    mnemonic = "ldnp";
+                else if (signExt)
+                    mnemonic = "ldpsw";
+                else
+                    mnemonic = "ldp";
+            } else {
+                if (noAlloc)
+                    mnemonic = "stnp";
+                else
+                    mnemonic = "stp";
+            }
+
+            return new LdpStp(mnemonic, machInst, size, fp, load, noAlloc,
+                    signExt, false, false, imm, mode, rn, rt, rt2);
+          }
+          // bit 29:27=111, 25=0
+          case 0x3:
+          {
+            uint8_t switchVal = (bits(machInst, 23, 22) << 0) |
+                                (bits(machInst, 26) << 2) |
+                                (bits(machInst, 31, 30) << 3);
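+            // switchVal = size<31:30> : V<26> : opc<23:22>.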
+            if (bits(machInst, 24) == 1) {
+                uint64_t imm12 = bits(machInst, 21, 10);
+                IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                IntRegIndex rnsp = makeSP(rn);
+                switch (switchVal) {
+                  case 0x00:
+                    return new STRB64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x01:
+                    return new LDRB64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x02:
+                    return new LDRSBX64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x03:
+                    return new LDRSBW64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x04:
+                    return new STRBFP64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x05:
+                    return new LDRBFP64_IMM(machInst, rt, rnsp, imm12);
+                  case 0x06:
+                    return new BigFpMemImm("str", machInst, false,
+                                           rt, rnsp, imm12 << 4);
+                  case 0x07:
+                    return new BigFpMemImm("ldr", machInst, true,
+                                           rt, rnsp, imm12 << 4);
+                  case 0x08:
+                    return new STRH64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x09:
+                    return new LDRH64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x0a:
+                    return new LDRSHX64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x0b:
+                    return new LDRSHW64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x0c:
+                    return new STRHFP64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x0d:
+                    return new LDRHFP64_IMM(machInst, rt, rnsp, imm12 << 1);
+                  case 0x10:
+                    return new STRW64_IMM(machInst, rt, rnsp, imm12 << 2);
+                  case 0x11:
+                    return new LDRW64_IMM(machInst, rt, rnsp, imm12 << 2);
+                  case 0x12:
+                    return new LDRSW64_IMM(machInst, rt, rnsp, imm12 << 2);
+                  case 0x14:
+                    return new STRSFP64_IMM(machInst, rt, rnsp, imm12 << 2);
+                  case 0x15:
+                    return new LDRSFP64_IMM(machInst, rt, rnsp, imm12 << 2);
+                  case 0x18:
+                    return new STRX64_IMM(machInst, rt, rnsp, imm12 << 3);
+                  case 0x19:
+                    return new LDRX64_IMM(machInst, rt, rnsp, imm12 << 3);
+                  case 0x1a:
+                    return new PRFM64_IMM(machInst, rt, rnsp, imm12 << 3);
+                  case 0x1c:
+                    return new STRDFP64_IMM(machInst, rt, rnsp, imm12 << 3);
+                  case 0x1d:
+                    return new LDRDFP64_IMM(machInst, rt, rnsp, imm12 << 3);
+                  default:
+                    return new Unknown64(machInst);
+                }
+            } else if (bits(machInst, 21) == 1) {
+                if (bits(machInst, 11, 10) != 0x2)
+                    return new Unknown64(machInst);
+                if (!bits(machInst, 14))
+                    return new Unknown64(machInst);
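+                // Only extend types with bit 14 set (UXTW, LSL, SXTW, SXTX)
+                // are valid register offsets.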
+                IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                IntRegIndex rnsp = makeSP(rn);
+                IntRegIndex rm = (IntRegIndex)(uint32_t)bits(machInst, 20, 16);
+                ArmExtendType type =
+                    (ArmExtendType)(uint32_t)bits(machInst, 15, 13);
+                uint8_t s = bits(machInst, 12);
+                switch (switchVal) {
+                  case 0x00:
+                    return new STRB64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x01:
+                    return new LDRB64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x02:
+                    return new LDRSBX64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x03:
+                    return new LDRSBW64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x04:
+                    return new STRBFP64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x05:
+                    return new LDRBFP64_REG(machInst, rt, rnsp, rm, type, 0);
+                  case 0x06:
+                    return new BigFpMemReg("str", machInst, false,
+                                           rt, rnsp, rm, type, s * 4);
+                  case 0x07:
+                    return new BigFpMemReg("ldr", machInst, true,
+                                           rt, rnsp, rm, type, s * 4);
+                  case 0x08:
+                    return new STRH64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x09:
+                    return new LDRH64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x0a:
+                    return new LDRSHX64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x0b:
+                    return new LDRSHW64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x0c:
+                    return new STRHFP64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x0d:
+                    return new LDRHFP64_REG(machInst, rt, rnsp, rm, type, s);
+                  case 0x10:
+                    return new STRW64_REG(machInst, rt, rnsp, rm, type, s * 2);
+                  case 0x11:
+                    return new LDRW64_REG(machInst, rt, rnsp, rm, type, s * 2);
+                  case 0x12:
+                    return new LDRSW64_REG(machInst, rt, rnsp, rm, type, s * 2);
+                  case 0x14:
+                    return new STRSFP64_REG(machInst, rt, rnsp, rm, type, s * 2);
+                  case 0x15:
+                    return new LDRSFP64_REG(machInst, rt, rnsp, rm, type, s * 2);
+                  case 0x18:
+                    return new STRX64_REG(machInst, rt, rnsp, rm, type, s * 3);
+                  case 0x19:
+                    return new LDRX64_REG(machInst, rt, rnsp, rm, type, s * 3);
+                  case 0x1a:
+                    return new PRFM64_REG(machInst, rt, rnsp, rm, type, s * 3);
+                  case 0x1c:
+                    return new STRDFP64_REG(machInst, rt, rnsp, rm, type, s * 3);
+                  case 0x1d:
+                    return new LDRDFP64_REG(machInst, rt, rnsp, rm, type, s * 3);
+                  default:
+                    return new Unknown64(machInst);
+                }
+            } else {
+                // bit 29:27=111, 25:24=00, 21=0
+                switch (bits(machInst, 11, 10)) {
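+                  // bits(11,10): 0x0 unscaled (LDUR/STUR), 0x1 post-indexed,
+                  // 0x2 unprivileged (LDTR/STTR), 0x3 pre-indexed.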
+                  case 0x0:
+                  {
+                    IntRegIndex rt =
+                        (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                    IntRegIndex rn =
+                        (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                    IntRegIndex rnsp = makeSP(rn);
+                    uint64_t imm = sext<9>(bits(machInst, 20, 12));
+                    switch (switchVal) {
+                      case 0x00:
+                        return new STURB64_IMM(machInst, rt, rnsp, imm);
+                      case 0x01:
+                        return new LDURB64_IMM(machInst, rt, rnsp, imm);
+                      case 0x02:
+                        return new LDURSBX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x03:
+                        return new LDURSBW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x04:
+                        return new STURBFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x05:
+                        return new LDURBFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x06:
+                        return new BigFpMemImm("stur", machInst, false,
+                                               rt, rnsp, imm);
+                      case 0x07:
+                        return new BigFpMemImm("ldur", machInst, true,
+                                               rt, rnsp, imm);
+                      case 0x08:
+                        return new STURH64_IMM(machInst, rt, rnsp, imm);
+                      case 0x09:
+                        return new LDURH64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0a:
+                        return new LDURSHX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0b:
+                        return new LDURSHW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0c:
+                        return new STURHFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0d:
+                        return new LDURHFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x10:
+                        return new STURW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x11:
+                        return new LDURW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x12:
+                        return new LDURSW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x14:
+                        return new STURSFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x15:
+                        return new LDURSFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x18:
+                        return new STURX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x19:
+                        return new LDURX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x1a:
+                        return new PRFUM64_IMM(machInst, rt, rnsp, imm);
+                      case 0x1c:
+                        return new STURDFP64_IMM(machInst, rt, rnsp, imm);
+                      case 0x1d:
+                        return new LDURDFP64_IMM(machInst, rt, rnsp, imm);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                  }
+                  // bit 29:27=111, 25:24=00, 21=0, 11:10=01
+                  case 0x1:
+                  {
+                    IntRegIndex rt =
+                        (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                    IntRegIndex rn =
+                        (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                    IntRegIndex rnsp = makeSP(rn);
+                    uint64_t imm = sext<9>(bits(machInst, 20, 12));
+                    switch (switchVal) {
+                      case 0x00:
+                        return new STRB64_POST(machInst, rt, rnsp, imm);
+                      case 0x01:
+                        return new LDRB64_POST(machInst, rt, rnsp, imm);
+                      case 0x02:
+                        return new LDRSBX64_POST(machInst, rt, rnsp, imm);
+                      case 0x03:
+                        return new LDRSBW64_POST(machInst, rt, rnsp, imm);
+                      case 0x04:
+                        return new STRBFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x05:
+                        return new LDRBFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x06:
+                        return new BigFpMemPost("str", machInst, false,
+                                                rt, rnsp, imm);
+                      case 0x07:
+                        return new BigFpMemPost("ldr", machInst, true,
+                                                rt, rnsp, imm);
+                      case 0x08:
+                        return new STRH64_POST(machInst, rt, rnsp, imm);
+                      case 0x09:
+                        return new LDRH64_POST(machInst, rt, rnsp, imm);
+                      case 0x0a:
+                        return new LDRSHX64_POST(machInst, rt, rnsp, imm);
+                      case 0x0b:
+                        return new LDRSHW64_POST(machInst, rt, rnsp, imm);
+                      case 0x0c:
+                        return new STRHFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x0d:
+                        return new LDRHFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x10:
+                        return new STRW64_POST(machInst, rt, rnsp, imm);
+                      case 0x11:
+                        return new LDRW64_POST(machInst, rt, rnsp, imm);
+                      case 0x12:
+                        return new LDRSW64_POST(machInst, rt, rnsp, imm);
+                      case 0x14:
+                        return new STRSFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x15:
+                        return new LDRSFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x18:
+                        return new STRX64_POST(machInst, rt, rnsp, imm);
+                      case 0x19:
+                        return new LDRX64_POST(machInst, rt, rnsp, imm);
+                      case 0x1c:
+                        return new STRDFP64_POST(machInst, rt, rnsp, imm);
+                      case 0x1d:
+                        return new LDRDFP64_POST(machInst, rt, rnsp, imm);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                  }
+                  case 0x2:
+                  {
+                    IntRegIndex rt =
+                        (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                    IntRegIndex rn =
+                        (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                    IntRegIndex rnsp = makeSP(rn);
+                    uint64_t imm = sext<9>(bits(machInst, 20, 12));
+                    switch (switchVal) {
+                      case 0x00:
+                        return new STTRB64_IMM(machInst, rt, rnsp, imm);
+                      case 0x01:
+                        return new LDTRB64_IMM(machInst, rt, rnsp, imm);
+                      case 0x02:
+                        return new LDTRSBX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x03:
+                        return new LDTRSBW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x08:
+                        return new STTRH64_IMM(machInst, rt, rnsp, imm);
+                      case 0x09:
+                        return new LDTRH64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0a:
+                        return new LDTRSHX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x0b:
+                        return new LDTRSHW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x10:
+                        return new STTRW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x11:
+                        return new LDTRW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x12:
+                        return new LDTRSW64_IMM(machInst, rt, rnsp, imm);
+                      case 0x18:
+                        return new STTRX64_IMM(machInst, rt, rnsp, imm);
+                      case 0x19:
+                        return new LDTRX64_IMM(machInst, rt, rnsp, imm);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                  }
+                  case 0x3:
+                  {
+                    IntRegIndex rt =
+                        (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+                    IntRegIndex rn =
+                        (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+                    IntRegIndex rnsp = makeSP(rn);
+                    uint64_t imm = sext<9>(bits(machInst, 20, 12));
+                    switch (switchVal) {
+                      case 0x00:
+                        return new STRB64_PRE(machInst, rt, rnsp, imm);
+                      case 0x01:
+                        return new LDRB64_PRE(machInst, rt, rnsp, imm);
+                      case 0x02:
+                        return new LDRSBX64_PRE(machInst, rt, rnsp, imm);
+                      case 0x03:
+                        return new LDRSBW64_PRE(machInst, rt, rnsp, imm);
+                      case 0x04:
+                        return new STRBFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x05:
+                        return new LDRBFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x06:
+                        return new BigFpMemPre("str", machInst, false,
+                                               rt, rnsp, imm);
+                      case 0x07:
+                        return new BigFpMemPre("ldr", machInst, true,
+                                               rt, rnsp, imm);
+                      case 0x08:
+                        return new STRH64_PRE(machInst, rt, rnsp, imm);
+                      case 0x09:
+                        return new LDRH64_PRE(machInst, rt, rnsp, imm);
+                      case 0x0a:
+                        return new LDRSHX64_PRE(machInst, rt, rnsp, imm);
+                      case 0x0b:
+                        return new LDRSHW64_PRE(machInst, rt, rnsp, imm);
+                      case 0x0c:
+                        return new STRHFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x0d:
+                        return new LDRHFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x10:
+                        return new STRW64_PRE(machInst, rt, rnsp, imm);
+                      case 0x11:
+                        return new LDRW64_PRE(machInst, rt, rnsp, imm);
+                      case 0x12:
+                        return new LDRSW64_PRE(machInst, rt, rnsp, imm);
+                      case 0x14:
+                        return new STRSFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x15:
+                        return new LDRSFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x18:
+                        return new STRX64_PRE(machInst, rt, rnsp, imm);
+                      case 0x19:
+                        return new LDRX64_PRE(machInst, rt, rnsp, imm);
+                      case 0x1c:
+                        return new STRDFP64_PRE(machInst, rt, rnsp, imm);
+                      case 0x1d:
+                        return new LDRDFP64_PRE(machInst, rt, rnsp, imm);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                  }
+                }
+            }
+          }
+        }
+        return new FailUnimplemented("Unhandled Case1", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeDataProcReg(ExtMachInst machInst)
+    {
+        uint8_t switchVal = (bits(machInst, 28) << 1) |
+                            (bits(machInst, 24) << 0);
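+        // switchVal = machInst<28> : machInst<24>: 0x0 logical (shifted
+        // register), 0x1 add/subtract, 0x2 the remaining register groups.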
+        switch (switchVal) {
+          case 0x0:
+          {
+            uint8_t switchVal = (bits(machInst, 21) << 0) |
+                                (bits(machInst, 30, 29) << 1);
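+            // switchVal = opc<30:29> : N<21>: AND, BIC, ORR, ORN, EOR, EON,
+            // ANDS, BICS.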
+            ArmShiftType type = (ArmShiftType)(uint8_t)bits(machInst, 23, 22);
+            uint8_t imm6 = bits(machInst, 15, 10);
+            bool sf = bits(machInst, 31);
+            if (!sf && (imm6 & 0x20))
+                return new Unknown64(machInst);
+            IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+            IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+
+            switch (switchVal) {
+              case 0x0:
+                return new AndXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x1:
+                return new BicXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x2:
+                return new OrrXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x3:
+                return new OrnXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x4:
+                return new EorXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x5:
+                return new EonXSReg(machInst, rd, rn, rm, imm6, type);
+              case 0x6:
+                return new AndXSRegCc(machInst, rd, rn, rm, imm6, type);
+              case 0x7:
+                return new BicXSRegCc(machInst, rd, rn, rm, imm6, type);
+            }
+          }
+          case 0x1:
+          {
+            uint8_t switchVal = bits(machInst, 30, 29);
+            if (bits(machInst, 21) == 0) {
+                ArmShiftType type =
+                    (ArmShiftType)(uint8_t)bits(machInst, 23, 22);
+                if (type == ROR)
+                    return new Unknown64(machInst);
+                uint8_t imm6 = bits(machInst, 15, 10);
+                if (!bits(machInst, 31) && bits(imm6, 5))
+                    return new Unknown64(machInst);
+                IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+                switch (switchVal) {
+                  case 0x0:
+                    return new AddXSReg(machInst, rd, rn, rm, imm6, type);
+                  case 0x1:
+                    return new AddXSRegCc(machInst, rd, rn, rm, imm6, type);
+                  case 0x2:
+                    return new SubXSReg(machInst, rd, rn, rm, imm6, type);
+                  case 0x3:
+                    return new SubXSRegCc(machInst, rd, rn, rm, imm6, type);
+                }
+            } else {
+                if (bits(machInst, 23, 22) != 0 || bits(machInst, 12, 10) > 0x4)
+                   return new Unknown64(machInst);
+                ArmExtendType type =
+                    (ArmExtendType)(uint8_t)bits(machInst, 15, 13);
+                uint8_t imm3 = bits(machInst, 12, 10);
+                IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+                IntRegIndex rdsp = makeSP(rd);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                IntRegIndex rnsp = makeSP(rn);
+                IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+
+                switch (switchVal) {
+                  case 0x0:
+                    return new AddXEReg(machInst, rdsp, rnsp, rm, type, imm3);
+                  case 0x1:
+                    return new AddXERegCc(machInst, rd, rnsp, rm, type, imm3);
+                  case 0x2:
+                    return new SubXEReg(machInst, rdsp, rnsp, rm, type, imm3);
+                  case 0x3:
+                    return new SubXERegCc(machInst, rd, rnsp, rm, type, imm3);
+                }
+            }
+          }
+          case 0x2:
+          {
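+            // Add/sub with carry, conditional compare, conditional select,
+            // and the two-source/one-source data-processing ops.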
+            if (bits(machInst, 21) == 1)
+                return new Unknown64(machInst);
+            IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+            IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+            switch (bits(machInst, 23, 22)) {
+              case 0x0:
+              {
+                if (bits(machInst, 15, 10))
+                    return new Unknown64(machInst);
+                uint8_t switchVal = bits(machInst, 30, 29);
+                switch (switchVal) {
+                  case 0x0:
+                    return new AdcXSReg(machInst, rd, rn, rm, 0, LSL);
+                  case 0x1:
+                    return new AdcXSRegCc(machInst, rd, rn, rm, 0, LSL);
+                  case 0x2:
+                    return new SbcXSReg(machInst, rd, rn, rm, 0, LSL);
+                  case 0x3:
+                    return new SbcXSRegCc(machInst, rd, rn, rm, 0, LSL);
+                }
+              }
+              case 0x1:
+              {
+                if ((bits(machInst, 4) == 1) ||
+                        (bits(machInst, 10) == 1) ||
+                        (bits(machInst, 29) == 0)) {
+                    return new Unknown64(machInst);
+                }
+                ConditionCode cond =
+                    (ConditionCode)(uint8_t)bits(machInst, 15, 12);
+                uint8_t flags = bits(machInst, 3, 0);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                if (bits(machInst, 11) == 0) {
+                    IntRegIndex rm =
+                        (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+                    if (bits(machInst, 30) == 0) {
+                        return new CcmnReg64(machInst, rn, rm, cond, flags);
+                    } else {
+                        return new CcmpReg64(machInst, rn, rm, cond, flags);
+                    }
+                } else {
+                    uint8_t imm5 = bits(machInst, 20, 16);
+                    if (bits(machInst, 30) == 0) {
+                        return new CcmnImm64(machInst, rn, imm5, cond, flags);
+                    } else {
+                        return new CcmpImm64(machInst, rn, imm5, cond, flags);
+                    }
+                }
+              }
+              case 0x2:
+              {
+                if (bits(machInst, 29) == 1 ||
+                        bits(machInst, 11) == 1) {
+                    return new Unknown64(machInst);
+                }
+                uint8_t switchVal = (bits(machInst, 10) << 0) |
+                                    (bits(machInst, 30) << 1);
+                IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+                IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+                IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+                ConditionCode cond =
+                    (ConditionCode)(uint8_t)bits(machInst, 15, 12);
+                switch (switchVal) {
+                  case 0x0:
+                    return new Csel64(machInst, rd, rn, rm, cond);
+                  case 0x1:
+                    return new Csinc64(machInst, rd, rn, rm, cond);
+                  case 0x2:
+                    return new Csinv64(machInst, rd, rn, rm, cond);
+                  case 0x3:
+                    return new Csneg64(machInst, rd, rn, rm, cond);
+                }
+              }
+              case 0x3:
+                if (bits(machInst, 30) == 0) {
+                    if (bits(machInst, 29) != 0)
+                        return new Unknown64(machInst);
+                    uint8_t switchVal = bits(machInst, 15, 10);
+                    switch (switchVal) {
+                      case 0x2:
+                        return new Udiv64(machInst, rd, rn, rm);
+                      case 0x3:
+                        return new Sdiv64(machInst, rd, rn, rm);
+                      case 0x8:
+                        return new Lslv64(machInst, rd, rn, rm);
+                      case 0x9:
+                        return new Lsrv64(machInst, rd, rn, rm);
+                      case 0xa:
+                        return new Asrv64(machInst, rd, rn, rm);
+                      case 0xb:
+                        return new Rorv64(machInst, rd, rn, rm);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                } else {
+                    if (bits(machInst, 20, 16) != 0 ||
+                            bits(machInst, 29) != 0) {
+                        return new Unknown64(machInst);
+                    }
+                    uint8_t switchVal = bits(machInst, 15, 10);
+                    switch (switchVal) {
+                      case 0x0:
+                        return new Rbit64(machInst, rd, rn);
+                      case 0x1:
+                        return new Rev1664(machInst, rd, rn);
+                      case 0x2:
+                        if (bits(machInst, 31) == 0)
+                            return new Rev64(machInst, rd, rn);
+                        else
+                            return new Rev3264(machInst, rd, rn);
+                      case 0x3:
+                        if (bits(machInst, 31) != 1)
+                            return new Unknown64(machInst);
+                        return new Rev64(machInst, rd, rn);
+                      case 0x4:
+                        return new Clz64(machInst, rd, rn);
+                      case 0x5:
+                        return new Cls64(machInst, rd, rn);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                }
+            }
+          }
+          case 0x3:
+          {
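+            // Multiply-add/subtract and the widening/high multiplies
+            // (MADD/MSUB, SMADDL/SMSUBL/SMULH, UMADDL/UMSUBL/UMULH).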
+            if (bits(machInst, 30, 29) != 0x0 ||
+                    (bits(machInst, 23, 21) != 0 && bits(machInst, 31) == 0))
+                return new Unknown64(machInst);
+            IntRegIndex rd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+            IntRegIndex ra = (IntRegIndex)(uint8_t)bits(machInst, 14, 10);
+            IntRegIndex rm = (IntRegIndex)(uint8_t)bits(machInst, 20, 16);
+            switch (bits(machInst, 23, 21)) {
+              case 0x0:
+                if (bits(machInst, 15) == 0)
+                    return new Madd64(machInst, rd, ra, rn, rm);
+                else
+                    return new Msub64(machInst, rd, ra, rn, rm);
+              case 0x1:
+                if (bits(machInst, 15) == 0)
+                    return new Smaddl64(machInst, rd, ra, rn, rm);
+                else
+                    return new Smsubl64(machInst, rd, ra, rn, rm);
+              case 0x2:
+                if (bits(machInst, 15) != 0)
+                    return new Unknown64(machInst);
+                return new Smulh64(machInst, rd, rn, rm);
+              case 0x5:
+                if (bits(machInst, 15) == 0)
+                    return new Umaddl64(machInst, rd, ra, rn, rm);
+                else
+                    return new Umsubl64(machInst, rd, ra, rn, rm);
+              case 0x6:
+                if (bits(machInst, 15) != 0)
+                    return new Unknown64(machInst);
+                return new Umulh64(machInst, rd, rn, rm);
+              default:
+                return new Unknown64(machInst);
+            }
+          }
+        }
+        return new FailUnimplemented("Unhandled Case2", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeAdvSIMD(ExtMachInst machInst)
+    {
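+        // Dispatch vector AdvSIMD encodings to the NEON64 helper decoders
+        // based on the instruction class bits.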
+        if (bits(machInst, 24) == 1) {
+            if (bits(machInst, 10) == 0) {
+                return decodeNeonIndexedElem(machInst);
+            } else if (bits(machInst, 23) == 1) {
+                return new Unknown64(machInst);
+            } else {
+                if (bits(machInst, 22, 19)) {
+                    return decodeNeonShiftByImm(machInst);
+                } else {
+                    return decodeNeonModImm(machInst);
+                }
+            }
+        } else if (bits(machInst, 21) == 1) {
+            if (bits(machInst, 10) == 1) {
+                return decodeNeon3Same(machInst);
+            } else if (bits(machInst, 11) == 0) {
+                return decodeNeon3Diff(machInst);
+            } else if (bits(machInst, 20, 17) == 0x0) {
+                return decodeNeon2RegMisc(machInst);
+            } else if (bits(machInst, 20, 17) == 0x8) {
+                return decodeNeonAcrossLanes(machInst);
+            } else {
+                return new Unknown64(machInst);
+            }
+        } else if (bits(machInst, 24) ||
+                   bits(machInst, 21) ||
+                   bits(machInst, 15)) {
+            return new Unknown64(machInst);
+        } else if (bits(machInst, 10) == 1) {
+            if (bits(machInst, 23, 22))
+                return new Unknown64(machInst);
+            return decodeNeonCopy(machInst);
+        } else if (bits(machInst, 29) == 1) {
+            return decodeNeonExt(machInst);
+        } else if (bits(machInst, 11) == 1) {
+            return decodeNeonZipUzpTrn(machInst);
+        } else if (bits(machInst, 23, 22) == 0x0) {
+            return decodeNeonTblTbx(machInst);
+        } else {
+            return new Unknown64(machInst);
+        }
+        return new FailUnimplemented("Unhandled Case3", machInst);
+    }
+}
+}};
+
+
+output decoder {{
+namespace Aarch64
+{
+    // bit 30=0, 28:25=1111
+    StaticInstPtr
+    decodeFp(ExtMachInst machInst)
+    {
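+        // Bit 24 selects fused multiply-add; otherwise bit 21 separates
+        // fixed-point conversions from the remaining FP data-processing,
+        // compare, immediate, and conditional encodings.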
+        if (bits(machInst, 24) == 1) {
+            if (bits(machInst, 31) || bits(machInst, 29))
+                return new Unknown64(machInst);
+            IntRegIndex rd    = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+            IntRegIndex rn    = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+            IntRegIndex rm    = (IntRegIndex)(uint32_t)bits(machInst, 20, 16);
+            IntRegIndex ra    = (IntRegIndex)(uint32_t)bits(machInst, 14, 10);
+            uint8_t switchVal = (bits(machInst, 23, 21) << 1) |
+                                (bits(machInst, 15)     << 0);
+            switch (switchVal) {
+              case 0x0: // FMADD Sd = Sa + Sn*Sm
+                return new FMAddS(machInst, rd, rn, rm, ra);
+              case 0x1: // FMSUB Sd = Sa + (-Sn)*Sm
+                return new FMSubS(machInst, rd, rn, rm, ra);
+              case 0x2: // FNMADD Sd = (-Sa) + (-Sn)*Sm
+                return new FNMAddS(machInst, rd, rn, rm, ra);
+              case 0x3: // FNMSUB Sd = (-Sa) + Sn*Sm
+                return new FNMSubS(machInst, rd, rn, rm, ra);
+              case 0x4: // FMADD Dd = Da + Dn*Dm
+                return new FMAddD(machInst, rd, rn, rm, ra);
+              case 0x5: // FMSUB Dd = Da + (-Dn)*Dm
+                return new FMSubD(machInst, rd, rn, rm, ra);
+              case 0x6: // FNMADD Dd = (-Da) + (-Dn)*Dm
+                return new FNMAddD(machInst, rd, rn, rm, ra);
+              case 0x7: // FNMSUB Dd = (-Da) + Dn*Dm
+                return new FNMSubD(machInst, rd, rn, rm, ra);
+              default:
+                return new Unknown64(machInst);
+            }
+        } else if (bits(machInst, 21) == 0) {
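+            // Floating-point <-> fixed-point conversions
+            // (SCVTF/UCVTF and FCVTZS/FCVTZU with an fbits scale).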
+            bool s = bits(machInst, 29);
+            if (s)
+                return new Unknown64(machInst);
+            uint8_t switchVal = bits(machInst, 20, 16);
+            uint8_t type      = bits(machInst, 23, 22);
+            uint8_t scale     = bits(machInst, 15, 10);
+            IntRegIndex rd    = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+            IntRegIndex rn    = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+            if (bits(machInst, 18, 17) == 3 && scale != 0)
+                return new Unknown64(machInst);
+            // 30:24=0011110, 21=0
+            switch (switchVal) {
+              case 0x00:
+                return new FailUnimplemented("fcvtns", machInst);
+              case 0x01:
+                return new FailUnimplemented("fcvtnu", machInst);
+              case 0x02:
+                switch ((bits(machInst, 31) << 2) | type) {
+                  case 0: // SCVTF Sd = convertFromInt(Wn/(2^fbits))
+                    return new FcvtSFixedFpSW(machInst, rd, rn, scale);
+                  case 1: // SCVTF Dd = convertFromInt(Wn/(2^fbits))
+                    return new FcvtSFixedFpDW(machInst, rd, rn, scale);
+                  case 4: // SCVTF Sd = convertFromInt(Xn/(2^fbits))
+                    return new FcvtSFixedFpSX(machInst, rd, rn, scale);
+                  case 5: // SCVTF Dd = convertFromInt(Xn/(2^fbits))
+                    return new FcvtSFixedFpDX(machInst, rd, rn, scale);
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0x03:
+                switch ((bits(machInst, 31) << 2) | type) {
+                  case 0: // UCVTF Sd = convertFromInt(Wn/(2^fbits))
+                    return new FcvtUFixedFpSW(machInst, rd, rn, scale);
+                  case 1: // UCVTF Dd = convertFromInt(Wn/(2^fbits))
+                    return new FcvtUFixedFpDW(machInst, rd, rn, scale);
+                  case 4: // UCVTF Sd = convertFromInt(Xn/(2^fbits))
+                    return new FcvtUFixedFpSX(machInst, rd, rn, scale);
+                  case 5: // UCVTF Dd = convertFromInt(Xn/(2^fbits))
+                    return new FcvtUFixedFpDX(machInst, rd, rn, scale);
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0x04:
+                return new FailUnimplemented("fcvtas", machInst);
+              case 0x05:
+                return new FailUnimplemented("fcvtau", machInst);
+              case 0x08:
+                return new FailUnimplemented("fcvtps", machInst);
+              case 0x09:
+                return new FailUnimplemented("fcvtpu", machInst);
+              case 0x0e:
+                return new FailUnimplemented("fmov elem. to 64", machInst);
+              case 0x0f:
+                return new FailUnimplemented("fmov 64 bit", machInst);
+              case 0x10:
+                return new FailUnimplemented("fcvtms", machInst);
+              case 0x11:
+                return new FailUnimplemented("fcvtmu", machInst);
+              case 0x18:
+                switch ((bits(machInst, 31) << 2) | type) {
+                  case 0: // FCVTZS Wd = convertToIntExactTowardZero(Sn*(2^fbits))
+                    return new FcvtFpSFixedSW(machInst, rd, rn, scale);
+                  case 1: // FCVTZS Wd = convertToIntExactTowardZero(Dn*(2^fbits))
+                    return new FcvtFpSFixedDW(machInst, rd, rn, scale);
+                  case 4: // FCVTZS Xd = convertToIntExactTowardZero(Sn*(2^fbits))
+                    return new FcvtFpSFixedSX(machInst, rd, rn, scale);
+                  case 5: // FCVTZS Xd = convertToIntExactTowardZero(Dn*(2^fbits))
+                    return new FcvtFpSFixedDX(machInst, rd, rn, scale);
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0x19:
+                switch ((bits(machInst, 31) << 2) | type) {
+                  case 0: // FCVTZU Wd = convertToIntExactTowardZero(Sn*(2^fbits))
+                    return new FcvtFpUFixedSW(machInst, rd, rn, scale);
+                  case 1: // FCVTZU Wd = convertToIntExactTowardZero(Dn*(2^fbits))
+                    return new FcvtFpUFixedDW(machInst, rd, rn, scale);
+                  case 4: // FCVTZU Xd = convertToIntExactTowardZero(Sn*(2^fbits))
+                    return new FcvtFpUFixedSX(machInst, rd, rn, scale);
+                  case 5: // FCVTZU Xd = convertToIntExactTowardZero(Dn*(2^fbits))
+                    return new FcvtFpUFixedDX(machInst, rd, rn, scale);
+                  default:
+                    return new Unknown64(machInst);
+                }
+            }
+        } else {
+            // 30=0, 28:24=11110, 21=1
+            uint8_t type   = bits(machInst, 23, 22);
+            uint8_t imm8   = bits(machInst, 20, 13);
+            IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst, 9, 5);
+            switch (bits(machInst, 11, 10)) {
+              case 0x0:
+                if (bits(machInst, 12) == 1) {
+                    if (bits(machInst, 31) ||
+                            bits(machInst, 29) ||
+                            bits(machInst, 9, 5)) {
+                        return new Unknown64(machInst);
+                    }
+                    // 31:29=000, 28:24=11110, 21=1, 12:10=100
+                    if (type == 0) {
+                        // FMOV S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>,5)
+                        //             :imm8<5:0>:Zeros(19)
+                        uint32_t imm = vfp_modified_imm(imm8, false);
+                        return new FmovImmS(machInst, rd, imm);
+                    } else if (type == 1) {
+                        // FMOV D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>,8)
+                        //             :imm8<5:0>:Zeros(48)
+                        uint64_t imm = vfp_modified_imm(imm8, true);
+                        return new FmovImmD(machInst, rd, imm);
+                    } else {
+                        return new Unknown64(machInst);
+                    }
+                } else if (bits(machInst, 13) == 1) {
+                    if (bits(machInst, 31) ||
+                            bits(machInst, 29) ||
+                            bits(machInst, 15, 14) ||
+                            bits(machInst, 23) ||
+                            bits(machInst, 2, 0)) {
+                        return new Unknown64(machInst);
+                    }
+                    uint8_t switchVal = (bits(machInst, 4, 3) << 0) |
+                                        (bits(machInst, 22) << 2);
+                    IntRegIndex rm = (IntRegIndex)(uint32_t)
+                                        bits(machInst, 20, 16);
+                    // 28:23=000111100, 21=1, 15:10=001000, 2:0=000
+                    switch (switchVal) {
+                      case 0x0:
+                        // FCMP flags = compareQuiet(Sn,Sm)
+                        return new FCmpRegS(machInst, rn, rm);
+                      case 0x1:
+                        // FCMP flags = compareQuiet(Sn,0.0)
+                        return new FCmpImmS(machInst, rn, 0);
+                      case 0x2:
+                        // FCMPE flags = compareSignaling(Sn,Sm)
+                        return new FCmpERegS(machInst, rn, rm);
+                      case 0x3:
+                        // FCMPE flags = compareSignaling(Sn,0.0)
+                        return new FCmpEImmS(machInst, rn, 0);
+                      case 0x4:
+                        // FCMP flags = compareQuiet(Dn,Dm)
+                        return new FCmpRegD(machInst, rn, rm);
+                      case 0x5:
+                        // FCMP flags = compareQuiet(Dn,0.0)
+                        return new FCmpImmD(machInst, rn, 0);
+                      case 0x6:
+                        // FCMPE flags = compareSignaling(Dn,Dm)
+                        return new FCmpERegD(machInst, rn, rm);
+                      case 0x7:
+                        // FCMPE flags = compareSignaling(Dn,0.0)
+                        return new FCmpEImmD(machInst, rn, 0);
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                } else if (bits(machInst, 14) == 1) {
+                    if (bits(machInst, 31) || bits(machInst, 29))
+                        return new Unknown64(machInst);
+                    uint8_t opcode = bits(machInst, 20, 15);
+                    // Bits 31:24=00011110, 21=1, 14:10=10000
+                    switch (opcode) {
+                      case 0x0:
+                        if (type == 0)
+                            // FMOV Sd = Sn
+                            return new FmovRegS(machInst, rd, rn);
+                        else if (type == 1)
+                            // FMOV Dd = Dn
+                            return new FmovRegD(machInst, rd, rn);
+                        break;
+                      case 0x1:
+                        if (type == 0)
+                            // FABS Sd = abs(Sn)
+                            return new FAbsS(machInst, rd, rn);
+                        else if (type == 1)
+                            // FABS Dd = abs(Dn)
+                            return new FAbsD(machInst, rd, rn);
+                        break;
+                      case 0x2:
+                        if (type == 0)
+                            // FNEG Sd = -Sn
+                            return new FNegS(machInst, rd, rn);
+                        else if (type == 1)
+                            // FNEG Dd = -Dn
+                            return new FNegD(machInst, rd, rn);
+                        break;
+                      case 0x3:
+                        if (type == 0)
+                            // FSQRT Sd = sqrt(Sn)
+                            return new FSqrtS(machInst, rd, rn);
+                        else if (type == 1)
+                            // FSQRT Dd = sqrt(Dn)
+                            return new FSqrtD(machInst, rd, rn);
+                        break;
+                      case 0x4:
+                        if (type == 1)
+                            // FCVT Sd = convertFormat(Dn)
+                            return new FcvtFpDFpS(machInst, rd, rn);
+                        else if (type == 3)
+                            // FCVT Sd = convertFormat(Hn)
+                            return new FcvtFpHFpS(machInst, rd, rn);
+                        break;
+                      case 0x5:
+                        if (type == 0)
+                            // FCVT Dd = convertFormat(Sn)
+                            return new FCvtFpSFpD(machInst, rd, rn);
+                        else if (type == 3)
+                            // FCVT Dd = convertFormat(Hn)
+                            return new FcvtFpHFpD(machInst, rd, rn);
+                        break;
+                      case 0x7:
+                        if (type == 0)
+                            // FCVT Hd = convertFormat(Sn)
+                            return new FcvtFpSFpH(machInst, rd, rn);
+                        else if (type == 1)
+                            // FCVT Hd = convertFormat(Dn)
+                            return new FcvtFpDFpH(machInst, rd, rn);
+                        break;
+                      case 0x8:
+                        if (type == 0) // FRINTN Sd = roundToIntegralTiesToEven(Sn)
+                            return new FRIntNS(machInst, rd, rn);
+                        else if (type == 1) // FRINTN Dd = roundToIntegralTiesToEven(Dn)
+                            return new FRIntND(machInst, rd, rn);
+                        break;
+                      case 0x9:
+                        if (type == 0) // FRINTP Sd = roundToIntegralTowardPlusInf(Sn)
+                            return new FRIntPS(machInst, rd, rn);
+                        else if (type == 1) // FRINTP Dd = roundToIntegralTowardPlusInf(Dn)
+                            return new FRIntPD(machInst, rd, rn);
+                        break;
+                      case 0xa:
+                        if (type == 0) // FRINTM Sd = roundToIntegralTowardMinusInf(Sn)
+                            return new FRIntMS(machInst, rd, rn);
+                        else if (type == 1) // FRINTM Dd = roundToIntegralTowardMinusInf(Dn)
+                            return new FRIntMD(machInst, rd, rn);
+                        break;
+                      case 0xb:
+                        if (type == 0) // FRINTZ Sd = roundToIntegralTowardZero(Sn)
+                            return new FRIntZS(machInst, rd, rn);
+                        else if (type == 1) // FRINTZ Dd = roundToIntegralTowardZero(Dn)
+                            return new FRIntZD(machInst, rd, rn);
+                        break;
+                      case 0xc:
+                        if (type == 0) // FRINTA Sd = roundToIntegralTiesToAway(Sn)
+                            return new FRIntAS(machInst, rd, rn);
+                        else if (type == 1) // FRINTA Dd = roundToIntegralTiesToAway(Dn)
+                            return new FRIntAD(machInst, rd, rn);
+                        break;
+                      case 0xe:
+                        if (type == 0) // FRINTX Sd = roundToIntegralExact(Sn)
+                            return new FRIntXS(machInst, rd, rn);
+                        else if (type == 1) // FRINTX Dd = roundToIntegralExact(Dn)
+                            return new FRIntXD(machInst, rd, rn);
+                        break;
+                      case 0xf:
+                        if (type == 0) // FRINTI Sd = roundToIntegral(Sn)
+                            return new FRIntIS(machInst, rd, rn);
+                        else if (type == 1) // FRINTI Dd = roundToIntegral(Dn)
+                            return new FRIntID(machInst, rd, rn);
+                        break;
+                      default:
+                        return new Unknown64(machInst);
+                    }
+                    return new Unknown64(machInst);
+                } else if (bits(machInst, 15) == 1) {
+                    return new Unknown64(machInst);
+                } else {
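+                    // FP <-> integer conversions and FMOV between core
+                    // and FP registers.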
+                    if (bits(machInst, 29))
+                        return new Unknown64(machInst);
+                    uint8_t rmode      = bits(machInst, 20, 19);
+                    uint8_t switchVal1 = bits(machInst, 18, 16);
+                    uint8_t switchVal2 = (type << 1) | bits(machInst, 31);
+                    // 30:24=0011110, 21=1, 15:10=000000
+                    switch (switchVal1) {
+                      case 0x0:
+                        switch ((switchVal2 << 2) | rmode) {
+                          case 0x0: //FCVTNS Wd = convertToIntExactTiesToEven(Sn)
+                            return new FcvtFpSIntWSN(machInst, rd, rn);
+                          case 0x1: //FCVTPS Wd = convertToIntExactTowardPlusInf(Sn)
+                            return new FcvtFpSIntWSP(machInst, rd, rn);
+                          case 0x2: //FCVTMS Wd = convertToIntExactTowardMinusInf(Sn)
+                            return new FcvtFpSIntWSM(machInst, rd, rn);
+                          case 0x3: //FCVTZS Wd = convertToIntExactTowardZero(Sn)
+                            return new FcvtFpSIntWSZ(machInst, rd, rn);
+                          case 0x4: //FCVTNS Xd = convertToIntExactTiesToEven(Sn)
+                            return new FcvtFpSIntXSN(machInst, rd, rn);
+                          case 0x5: //FCVTPS Xd = convertToIntExactTowardPlusInf(Sn)
+                            return new FcvtFpSIntXSP(machInst, rd, rn);
+                          case 0x6: //FCVTMS Xd = convertToIntExactTowardMinusInf(Sn)
+                            return new FcvtFpSIntXSM(machInst, rd, rn);
+                          case 0x7: //FCVTZS Xd = convertToIntExactTowardZero(Sn)
+                            return new FcvtFpSIntXSZ(machInst, rd, rn);
+                          case 0x8: //FCVTNS Wd = convertToIntExactTiesToEven(Dn)
+                            return new FcvtFpSIntWDN(machInst, rd, rn);
+                          case 0x9: //FCVTPS Wd = convertToIntExactTowardPlusInf(Dn)
+                            return new FcvtFpSIntWDP(machInst, rd, rn);
+                          case 0xA: //FCVTMS Wd = convertToIntExactTowardMinusInf(Dn)
+                            return new FcvtFpSIntWDM(machInst, rd, rn);
+                          case 0xB: //FCVTZS Wd = convertToIntExactTowardZero(Dn)
+                            return new FcvtFpSIntWDZ(machInst, rd, rn);
+                          case 0xC: //FCVTNS Xd = convertToIntExactTiesToEven(Dn)
+                            return new FcvtFpSIntXDN(machInst, rd, rn);
+                          case 0xD: //FCVTPS Xd = convertToIntExactTowardPlusInf(Dn)
+                            return new FcvtFpSIntXDP(machInst, rd, rn);
+                          case 0xE: //FCVTMS Xd = convertToIntExactTowardMinusInf(Dn)
+                            return new FcvtFpSIntXDM(machInst, rd, rn);
+                          case 0xF: //FCVTZS Xd = convertToIntExactTowardZero(Dn)
+                            return new FcvtFpSIntXDZ(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x1:
+                        switch ((switchVal2 << 2) | rmode) {
+                          case 0x0: //FCVTNU Wd = convertToIntExactTiesToEven(Sn)
+                            return new FcvtFpUIntWSN(machInst, rd, rn);
+                          case 0x1: //FCVTPU Wd = convertToIntExactTowardPlusInf(Sn)
+                            return new FcvtFpUIntWSP(machInst, rd, rn);
+                          case 0x2: //FCVTMU Wd = convertToIntExactTowardMinusInf(Sn)
+                            return new FcvtFpUIntWSM(machInst, rd, rn);
+                          case 0x3: //FCVTZU Wd = convertToIntExactTowardZero(Sn)
+                            return new FcvtFpUIntWSZ(machInst, rd, rn);
+                          case 0x4: //FCVTNU Xd = convertToIntExactTiesToEven(Sn)
+                            return new FcvtFpUIntXSN(machInst, rd, rn);
+                          case 0x5: //FCVTPU Xd = convertToIntExactTowardPlusInf(Sn)
+                            return new FcvtFpUIntXSP(machInst, rd, rn);
+                          case 0x6: //FCVTMU Xd = convertToIntExactTowardMinusInf(Sn)
+                            return new FcvtFpUIntXSM(machInst, rd, rn);
+                          case 0x7: //FCVTZU Xd = convertToIntExactTowardZero(Sn)
+                            return new FcvtFpUIntXSZ(machInst, rd, rn);
+                          case 0x8: //FCVTNU Wd = convertToIntExactTiesToEven(Dn)
+                            return new FcvtFpUIntWDN(machInst, rd, rn);
+                          case 0x9: //FCVTPU Wd = convertToIntExactTowardPlusInf(Dn)
+                            return new FcvtFpUIntWDP(machInst, rd, rn);
+                          case 0xA: //FCVTMU Wd = convertToIntExactTowardMinusInf(Dn)
+                            return new FcvtFpUIntWDM(machInst, rd, rn);
+                          case 0xB: //FCVTZU Wd = convertToIntExactTowardZero(Dn)
+                            return new FcvtFpUIntWDZ(machInst, rd, rn);
+                          case 0xC: //FCVTNU Xd = convertToIntExactTiesToEven(Dn)
+                            return new FcvtFpUIntXDN(machInst, rd, rn);
+                          case 0xD: //FCVTPU Xd = convertToIntExactTowardPlusInf(Dn)
+                            return new FcvtFpUIntXDP(machInst, rd, rn);
+                          case 0xE: //FCVTMU Xd = convertToIntExactTowardMinusInf(Dn)
+                            return new FcvtFpUIntXDM(machInst, rd, rn);
+                          case 0xF: //FCVTZU Xd = convertToIntExactTowardZero(Dn)
+                            return new FcvtFpUIntXDZ(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x2:
+                        if (rmode != 0)
+                            return new Unknown64(machInst);
+                        switch (switchVal2) {
+                          case 0: // SCVTF Sd = convertFromInt(Wn)
+                            return new FcvtWSIntFpS(machInst, rd, rn);
+                          case 1: // SCVTF Sd = convertFromInt(Xn)
+                            return new FcvtXSIntFpS(machInst, rd, rn);
+                          case 2: // SCVTF Dd = convertFromInt(Wn)
+                            return new FcvtWSIntFpD(machInst, rd, rn);
+                          case 3: // SCVTF Dd = convertFromInt(Xn)
+                            return new FcvtXSIntFpD(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x3:
+                        switch (switchVal2) {
+                          case 0: // UCVTF Sd = convertFromInt(Wn)
+                            return new FcvtWUIntFpS(machInst, rd, rn);
+                          case 1: // UCVTF Sd = convertFromInt(Xn)
+                            return new FcvtXUIntFpS(machInst, rd, rn);
+                          case 2: // UCVTF Dd = convertFromInt(Wn)
+                            return new FcvtWUIntFpD(machInst, rd, rn);
+                          case 3: // UCVTF Dd = convertFromInt(Xn)
+                            return new FcvtXUIntFpD(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x4:
+                        if (rmode != 0)
+                            return new Unknown64(machInst);
+                        switch (switchVal2) {
+                          case 0: // FCVTAS Wd = convertToIntExactTiesToAway(Sn)
+                            return new FcvtFpSIntWSA(machInst, rd, rn);
+                          case 1: // FCVTAS Xd = convertToIntExactTiesToAway(Sn)
+                            return new FcvtFpSIntXSA(machInst, rd, rn);
+                          case 2: // FCVTAS Wd = convertToIntExactTiesToAway(Dn)
+                            return new FcvtFpSIntWDA(machInst, rd, rn);
+                          case 3: // FCVTAS Xd = convertToIntExactTiesToAway(Dn)
+                            return new FcvtFpSIntXDA(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x5:
+                        switch (switchVal2) {
+                          case 0: // FCVTAU Wd = convertToIntExactTiesToAway(Sn)
+                            return new FcvtFpUIntWSA(machInst, rd, rn);
+                          case 1: // FCVTAU Xd = convertToIntExactTiesToAway(Sn)
+                            return new FcvtFpUIntXSA(machInst, rd, rn);
+                          case 2: // FCVTAU Wd = convertToIntExactTiesToAway(Dn)
+                            return new FcvtFpUIntWDA(machInst, rd, rn);
+                          case 3: // FCVTAU Xd = convertToIntExactTiesToAway(Dn)
+                            return new FcvtFpUIntXDA(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                      case 0x6:
+                        switch (switchVal2) {
+                          case 0: // FMOV Wd = Sn
+                            if (rmode != 0)
+                                return new Unknown64(machInst);
+                            return new FmovRegCoreW(machInst, rd, rn);
+                          case 3: // FMOV Xd = Dn
+                            if (rmode != 0)
+                                return new Unknown64(machInst);
+                            return new FmovRegCoreX(machInst, rd, rn);
+                          case 5: // FMOV Xd = Vn<127:64>
+                            if (rmode != 1)
+                                return new Unknown64(machInst);
+                            return new FmovURegCoreX(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                        break;
+                      case 0x7:
+                        switch (switchVal2) {
+                          case 0: // FMOV Sd = Wn
+                            if (rmode != 0)
+                                return new Unknown64(machInst);
+                            return new FmovCoreRegW(machInst, rd, rn);
+                          case 3: // FMOV Dd = Xn
+                            if (rmode != 0)
+                                return new Unknown64(machInst);
+                            return new FmovCoreRegX(machInst, rd, rn);
+                          case 5: // FMOV Vd<127:64> = Xn
+                            if (rmode != 1)
+                                return new Unknown64(machInst);
+                            return new FmovUCoreRegX(machInst, rd, rn);
+                          default:
+                            return new Unknown64(machInst);
+                        }
+                        break;
+                      default: // Warning: cases missing from the switch above still need to be added
+                        return new Unknown64(machInst);
+                    }
+                }
+              case 0x1:
+              {
+                if (bits(machInst, 31) ||
+                    bits(machInst, 29) ||
+                    bits(machInst, 23)) {
+                    return new Unknown64(machInst);
+                }
+                IntRegIndex rm = (IntRegIndex)(uint32_t) bits(machInst, 20, 16);
+                IntRegIndex rn = (IntRegIndex)(uint32_t) bits(machInst, 9, 5);
+                uint8_t    imm = (uint8_t)bits(machInst, 3, 0);
+                ConditionCode cond =
+                    (ConditionCode)(uint8_t)(bits(machInst, 15, 12));
+                uint8_t switchVal = (bits(machInst, 4) << 0) |
+                                    (bits(machInst, 22) << 1);
+                // 31:23=000111100, 21=1, 11:10=01
+                switch (switchVal) {
+                  case 0x0:
+                    // FCCMP flags = if cond then compareQuiet(Sn,Sm) else #nzcv
+                    return new FCCmpRegS(machInst, rn, rm, cond, imm);
+                  case 0x1:
+                    // FCCMPE flags = if cond then compareSignaling(Sn,Sm)
+                    //                else #nzcv
+                    return new FCCmpERegS(machInst, rn, rm, cond, imm);
+                  case 0x2:
+                    // FCCMP flags = if cond then compareQuiet(Dn,Dm) else #nzcv
+                    return new FCCmpRegD(machInst, rn, rm, cond, imm);
+                  case 0x3:
+                    // FCCMPE flags = if cond then compareSignaling(Dn,Dm)
+                    //                else #nzcv
+                    return new FCCmpERegD(machInst, rn, rm, cond, imm);
+                  default:
+                    return new Unknown64(machInst);
+                }
+              }
+              case 0x2:
+              {
+                if (bits(machInst, 31) ||
+                        bits(machInst, 29) ||
+                        bits(machInst, 23)) {
+                    return new Unknown64(machInst);
+                }
+                IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst,  4,  0);
+                IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst,  9,  5);
+                IntRegIndex rm = (IntRegIndex)(uint32_t)bits(machInst, 20, 16);
+                uint8_t switchVal = (bits(machInst, 15, 12) << 0) |
+                                    (bits(machInst, 22) << 4);
+                switch (switchVal) {
+                  case 0x00: // FMUL Sd = Sn * Sm
+                    return new FMulS(machInst, rd, rn, rm);
+                  case 0x10: // FMUL Dd = Dn * Dm
+                    return new FMulD(machInst, rd, rn, rm);
+                  case 0x01: // FDIV Sd = Sn / Sm
+                    return new FDivS(machInst, rd, rn, rm);
+                  case 0x11: // FDIV Dd = Dn / Dm
+                    return new FDivD(machInst, rd, rn, rm);
+                  case 0x02: // FADD Sd = Sn + Sm
+                    return new FAddS(machInst, rd, rn, rm);
+                  case 0x12: // FADD Dd = Dn + Dm
+                    return new FAddD(machInst, rd, rn, rm);
+                  case 0x03: // FSUB Sd = Sn - Sm
+                    return new FSubS(machInst, rd, rn, rm);
+                  case 0x13: // FSUB Dd = Dn - Dm
+                    return new FSubD(machInst, rd, rn, rm);
+                  case 0x04: // FMAX Sd = max(Sn, Sm)
+                    return new FMaxS(machInst, rd, rn, rm);
+                  case 0x14: // FMAX Dd = max(Dn, Dm)
+                    return new FMaxD(machInst, rd, rn, rm);
+                  case 0x05: // FMIN Sd = min(Sn, Sm)
+                    return new FMinS(machInst, rd, rn, rm);
+                  case 0x15: // FMIN Dd = min(Dn, Dm)
+                    return new FMinD(machInst, rd, rn, rm);
+                  case 0x06: // FMAXNM Sd = maxNum(Sn, Sm)
+                    return new FMaxNMS(machInst, rd, rn, rm);
+                  case 0x16: // FMAXNM Dd = maxNum(Dn, Dm)
+                    return new FMaxNMD(machInst, rd, rn, rm);
+                  case 0x07: // FMINNM Sd = minNum(Sn, Sm)
+                    return new FMinNMS(machInst, rd, rn, rm);
+                  case 0x17: // FMINNM Dd = minNum(Dn, Dm)
+                    return new FMinNMD(machInst, rd, rn, rm);
+                  case 0x08: // FNMUL Sd = -(Sn * Sm)
+                    return new FNMulS(machInst, rd, rn, rm);
+                  case 0x18: // FNMUL Dd = -(Dn * Dm)
+                    return new FNMulD(machInst, rd, rn, rm);
+                  default:
+                    return new Unknown64(machInst);
+                }
+              }
+              case 0x3:
+              {
+                if (bits(machInst, 31) || bits(machInst, 29))
+                    return new Unknown64(machInst);
+                uint8_t type = bits(machInst, 23, 22);
+                IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst,  4,  0);
+                IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst,  9,  5);
+                IntRegIndex rm = (IntRegIndex)(uint32_t)bits(machInst, 20, 16);
+                ConditionCode cond =
+                    (ConditionCode)(uint8_t)(bits(machInst, 15, 12));
+                if (type == 0) // FCSEL Sd = if cond then Sn else Sm
+                    return new FCSelS(machInst, rd, rn, rm, cond);
+                else if (type == 1) // FCSEL Dd = if cond then Dn else Dm
+                    return new FCSelD(machInst, rd, rn, rm, cond);
+                else
+                    return new Unknown64(machInst);
+              }
+            }
+        }
+        return new FailUnimplemented("Unhandled Case4", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeAdvSIMDScalar(ExtMachInst machInst)
+    {
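+        // Dispatch scalar AdvSIMD encodings to the NEON64 scalar helper
+        // decoders, mirroring the vector decode above.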
+        if (bits(machInst, 24) == 1) {
+            if (bits(machInst, 10) == 0) {
+                return decodeNeonScIndexedElem(machInst);
+            } else if (bits(machInst, 23) == 0) {
+                return decodeNeonScShiftByImm(machInst);
+            }
+        } else if (bits(machInst, 21) == 1) {
+            if (bits(machInst, 10) == 1) {
+                return decodeNeonSc3Same(machInst);
+            } else if (bits(machInst, 11) == 0) {
+                return decodeNeonSc3Diff(machInst);
+            } else if (bits(machInst, 20, 17) == 0x0) {
+                return decodeNeonSc2RegMisc(machInst);
+            } else if (bits(machInst, 20, 17) == 0x8) {
+                return decodeNeonScPwise(machInst);
+            } else {
+                return new Unknown64(machInst);
+            }
+        } else if (bits(machInst, 23, 22) == 0 &&
+                   bits(machInst, 15) == 0 &&
+                   bits(machInst, 10) == 1) {
+            return decodeNeonScCopy(machInst);
+        } else {
+            return new Unknown64(machInst);
+        }
+        return new FailUnimplemented("Unhandled Case6", machInst);
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeFpAdvSIMD(ExtMachInst machInst)
+    {
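+        // Top-level split between vector AdvSIMD, scalar FP, and scalar
+        // AdvSIMD groups based on bits 31, 30, and 28.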
+        if (bits(machInst, 28) == 0) {
+            if (bits(machInst, 31) == 0) {
+                return decodeAdvSIMD(machInst);
+            } else {
+                return new Unknown64(machInst);
+            }
+        } else if (bits(machInst, 30) == 0) {
+            return decodeFp(machInst);
+        } else if (bits(machInst, 31) == 0) {
+            return decodeAdvSIMDScalar(machInst);
+        } else {
+            return new Unknown64(machInst);
+        }
+    }
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeGem5Ops(ExtMachInst machInst)
+    {
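+        // gem5 pseudo-ops: the function number is encoded in bits 23:16
+        // of the instruction.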
+        const uint32_t m5func = bits(machInst, 23, 16);
+        switch (m5func) {
+          case 0x00: return new Arm(machInst);
+          case 0x01: return new Quiesce(machInst);
+          case 0x02: return new QuiesceNs64(machInst);
+          case 0x03: return new QuiesceCycles64(machInst);
+          case 0x04: return new QuiesceTime64(machInst);
+          case 0x07: return new Rpns64(machInst);
+          case 0x09: return new WakeCPU64(machInst);
+          case 0x10: return new Deprecated_ivlb(machInst);
+          case 0x11: return new Deprecated_ivle(machInst);
+          case 0x20: return new Deprecated_exit(machInst);
+          case 0x21: return new M5exit64(machInst);
+          case 0x30: return new Initparam64(machInst);
+          case 0x31: return new Loadsymbol(machInst);
+          case 0x40: return new Resetstats64(machInst);
+          case 0x41: return new Dumpstats64(machInst);
+          case 0x42: return new Dumpresetstats64(machInst);
+          case 0x43: return new M5checkpoint64(machInst);
+          case 0x4F: return new M5writefile64(machInst);
+          case 0x50: return new M5readfile64(machInst);
+          case 0x51: return new M5break(machInst);
+          case 0x52: return new M5switchcpu(machInst);
+          case 0x53: return new M5addsymbol64(machInst);
+          case 0x54: return new M5panic(machInst);
+          case 0x5a: return new M5workbegin64(machInst);
+          case 0x5b: return new M5workend64(machInst);
+          default: return new Unknown64(machInst);
+        }
+    }
+}
+}};
+
+def format Aarch64() {{
+    decode_block = '''
+    {
+        using namespace Aarch64;
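+        // Top-level A64 decode: bits 28:25 select the major instruction
+        // groups (data-processing immediate, branch/exception/system,
+        // loads and stores, data-processing register, and FP/SIMD), with
+        // a carve-out for gem5 pseudo-ops.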
+        if (bits(machInst, 27) == 0x0) {
+            if (bits(machInst, 28) == 0x0)
+                return new Unknown64(machInst);
+            else if (bits(machInst, 26) == 0)
+                // bit 28:26=100
+                return decodeDataProcImm(machInst);
+            else
+                // bit 28:26=101
+                return decodeBranchExcSys(machInst);
+        } else if (bits(machInst, 25) == 0) {
+            // bit 27=1, 25=0
+            return decodeLoadsStores(machInst);
+        } else if (bits(machInst, 26) == 0) {
+            // bit 27:25=101
+            return decodeDataProcReg(machInst);
+        } else if (bits(machInst, 24) == 1 &&
+                   bits(machInst, 31, 28) == 0xF) {
+            return decodeGem5Ops(machInst);
+        } else {
+            // bit 27:25=111
+            return decodeFpAdvSIMD(machInst);
+        }
+    }
+    '''
+}};
diff --git a/src/arch/arm/isa/formats/branch.isa b/src/arch/arm/isa/formats/branch.isa
index f1b17ec..513506d 100644
--- a/src/arch/arm/isa/formats/branch.isa
+++ b/src/arch/arm/isa/formats/branch.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2012-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -101,7 +101,7 @@
             return new B(machInst, sext<9>(bits(machInst, 7, 0) << 1),
                          (ConditionCode)(uint32_t)bits(machInst, 11, 8));
         } else if (bits(machInst, 8)) {
-            return new Svc(machInst);
+            return new Svc(machInst, bits(machInst, 7, 0));
         } else {
             // This space will not be allocated in the future.
             return new Unknown(machInst);
@@ -127,7 +127,7 @@
                     // Permanently undefined.
                     return new Unknown(machInst);
                 } else {
-                    return new WarnUnimplemented("smc", machInst);
+                    return new Smc(machInst);
                 }
             } else if ((op & 0x38) != 0x38) {
                 const uint32_t s = bits(machInst, 26);
@@ -141,20 +141,26 @@
                 return new B(machInst, imm,
                              (ConditionCode)(uint32_t)bits(machInst, 25, 22));
             } else {
+                // HIGH: 12-11=10, LOW: 15-14=00, 12=0
                 switch (op) {
                   case 0x38:
-                    {
-                        const IntRegIndex rn =
-                            (IntRegIndex)(uint32_t)bits(machInst, 19, 16);
-                        const uint8_t byteMask = bits(machInst, 11, 8);
-                        return new MsrCpsrReg(machInst, rn, byteMask);
-                    }
                   case 0x39:
                     {
                         const IntRegIndex rn =
                             (IntRegIndex)(uint32_t)bits(machInst, 19, 16);
                         const uint8_t byteMask = bits(machInst, 11, 8);
-                        return new MsrSpsrReg(machInst, rn, byteMask);
+                        const bool    r        = bits(machInst, 20);
+                        if (bits(machInst, 5)) {
+                            const uint8_t sysM = (bits(machInst, 4) << 4) |
+                                                  byteMask;
+                            return new MsrBankedReg(machInst, rn, sysM, r);
+                        } else {
+                            if (r) {
+                                return new MsrSpsrReg(machInst, rn, byteMask);
+                            } else {
+                                return new MsrCpsrReg(machInst, rn, byteMask);
+                            }
+                        }
                     }
                   case 0x3a:
                     {
@@ -196,11 +202,11 @@
                           case 0x2:
                             return new Clrex(machInst);
                           case 0x4:
-                            return new Dsb(machInst);
+                            return new Dsb(machInst, 0);
                           case 0x5:
-                            return new Dmb(machInst);
+                            return new Dmb(machInst, 0);
                           case 0x6:
-                            return new Isb(machInst);
+                            return new Isb(machInst, 0);
                           default:
                             break;
                         }
@@ -208,28 +214,44 @@
                     }
                   case 0x3c:
                     {
-                        // On systems that don't support bxj, bxj == bx
-                        return new BxReg(machInst,
+                        return new BxjReg(machInst,
                                  (IntRegIndex)(uint32_t)bits(machInst, 19, 16),
                                  COND_UC);
                     }
                   case 0x3d:
                     {
                         const uint32_t imm32 = bits(machInst, 7, 0);
-                        return new SubsImmPclr(machInst, INTREG_PC, INTREG_LR,
-                                               imm32, false);
+                        if (imm32 == 0) {
+                            return new Eret(machInst);
+                        } else {
+                            return new SubsImmPclr(machInst, INTREG_PC,
+                                                   INTREG_LR, imm32, false);
+                        }
                     }
                   case 0x3e:
-                    {
-                        const IntRegIndex rd =
-                            (IntRegIndex)(uint32_t)bits(machInst, 11, 8);
-                        return new MrsCpsr(machInst, rd);
-                    }
                   case 0x3f:
                     {
                         const IntRegIndex rd =
                             (IntRegIndex)(uint32_t)bits(machInst, 11, 8);
-                        return new MrsSpsr(machInst, rd);
+                        const bool    r        = bits(machInst, 20);
+                        if (bits(machInst, 5)) {
+                            const uint8_t sysM = (bits(machInst, 4) << 4) |
+                                                  bits(machInst, 11, 8);
+                            return new MrsBankedReg(machInst, rd, sysM, r);
+                        } else {
+                            if (r) {
+                                return new MrsSpsr(machInst, rd);
+                            } else {
+                                return new MrsCpsr(machInst, rd);
+                            }
+                        }
+                    }
+                  case 0xfe:
+                    {
+                        uint32_t imm16 = (bits(machInst, 19, 16) << 12) |
+                                         (bits(machInst, 11,  0) <<  0);
+                        return new Hvc(machInst, imm16);
                     }
                 }
                 break;
diff --git a/src/arch/arm/isa/formats/formats.isa b/src/arch/arm/isa/formats/formats.isa
index 90144c1..44e9c5b 100644
--- a/src/arch/arm/isa/formats/formats.isa
+++ b/src/arch/arm/isa/formats/formats.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -44,6 +44,12 @@
 //Include the basic format
 ##include "basic.isa"
 
+//Include support for decoding AArch64 instructions
+##include "aarch64.isa"
+
+//Include support for decoding AArch64 NEON instructions
+##include "neon64.isa"
+
 //Include support for predicated instructions
 ##include "pred.isa"
 
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index 6d779e5..ccd4589 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -151,8 +151,7 @@
             if (singleAll) {
                 size = bits(machInst, 7, 6);
                 bool t = bits(machInst, 5);
-                unsigned eBytes = (1 << size);
-                align = (eBytes - 1) | TLB::AllowUnaligned;
+                align = size | TLB::AllowUnaligned;
                 if (width == 1) {
                     regs = t ? 2 : 1;
                     inc = 1;
@@ -164,7 +163,7 @@
                   case 1:
                   case 2:
                     if (bits(machInst, 4))
-                        align = width * eBytes - 1;
+                        align = size + width - 1;
                     break;
                   case 3:
                     break;
@@ -173,20 +172,19 @@
                         if (bits(machInst, 4) == 0)
                             return new Unknown(machInst);
                         size = 2;
-                        align = 0xf;
+                        align = 0x4;
                     } else if (size == 2) {
                         if (bits(machInst, 4))
-                            align = 7;
+                            align = 0x3;
                     } else {
                         if (bits(machInst, 4))
-                            align = 4 * eBytes - 1;
+                            align = size + 2;
                     }
                     break;
                 }
             } else {
                 size = bits(machInst, 11, 10);
-                unsigned eBytes = (1 << size);
-                align = (eBytes - 1) | TLB::AllowUnaligned;
+                align = size | TLB::AllowUnaligned;
                 regs = width;
                 unsigned indexAlign = bits(machInst, 7, 4);
                 // If width is 1, inc is always 1. That's overridden later.
@@ -219,13 +217,13 @@
                         break;
                       case 2:
                         if (bits(indexAlign, 1, 0))
-                            align = 3;
+                            align = 2;
                         break;
                     }
                     break;
                   case 2:
                     if (bits(indexAlign, 0))
-                        align = (2 * eBytes) - 1;
+                        align = size + 1;
                     break;
                   case 3:
                     break;
@@ -234,11 +232,11 @@
                       case 0:
                       case 1:
                         if (bits(indexAlign, 0))
-                            align = (4 * eBytes) - 1;
+                            align = size + 2;
                         break;
                       case 2:
                         if (bits(indexAlign, 0))
-                            align = (4 << bits(indexAlign, 1, 0)) - 1;
+                            align = bits(indexAlign, 1, 0) + 2;
                         break;
                     }
                     break;
@@ -252,9 +250,9 @@
             align = bits(machInst, 5, 4);
             if (align == 0) {
                 // @align wasn't specified, so alignment can be turned off.
-                align = ((1 << size) - 1) | TLB::AllowUnaligned;
+                align = size | TLB::AllowUnaligned;
             } else {
-                align = ((4 << align) - 1);
+                align = align + 2;
             }
             switch (width) {
               case 1:
@@ -588,6 +586,23 @@
                 }
             }
           case 0xc:
+            if (b) {
+                if (!u) {
+                    if (bits(c, 1) == 0) {
+                        if (q) {
+                            return new NVfmaQFp<float>(machInst, vd, vn, vm);
+                        } else {
+                            return new NVfmaDFp<float>(machInst, vd, vn, vm);
+                        }
+                    } else {
+                        if (q) {
+                            return new NVfmsQFp<float>(machInst, vd, vn, vm);
+                        } else {
+                            return new NVfmsDFp<float>(machInst, vd, vn, vm);
+                        }
+                    }
+                }
+            }
             return new Unknown(machInst);
           case 0xd:
             if (b) {
@@ -1827,7 +1842,7 @@
             break;
           case 0x1:
             {
-                if (offset == 0 || vd + offset/2 > NumFloatArchRegs) {
+                if (offset == 0 || vd + offset/2 > NumFloatV7ArchRegs) {
                     break;
                 }
                 switch (bits(opcode, 1, 0)) {
@@ -1951,8 +1966,9 @@
             } else if (a == 0x7) {
                 const IntRegIndex rt =
                     (IntRegIndex)(uint32_t)bits(machInst, 15, 12);
-                uint32_t specReg = bits(machInst, 19, 16);
-                switch (specReg) {
+                uint32_t reg = bits(machInst, 19, 16);
+                uint32_t specReg;
+                switch (reg) {
                   case 0:
                     specReg = MISCREG_FPSID;
                     break;
@@ -1974,7 +1990,9 @@
                 if (specReg == MISCREG_FPSCR) {
                     return new VmsrFpscr(machInst, (IntRegIndex)specReg, rt);
                 } else {
-                    return new Vmsr(machInst, (IntRegIndex)specReg, rt);
+                    uint32_t iss = mcrMrcIssBuild(0, bits(machInst, 3, 0), rt,
+                        reg, a, bits(machInst, 7, 5));
+                    return new Vmsr(machInst, (IntRegIndex)specReg, rt, iss);
                 }
             }
         } else if (l == 0 && c == 1) {
@@ -2041,8 +2059,9 @@
             } else if (a == 7) {
                 const IntRegIndex rt =
                     (IntRegIndex)(uint32_t)bits(machInst, 15, 12);
-                uint32_t specReg = bits(machInst, 19, 16);
-                switch (specReg) {
+                uint32_t reg = bits(machInst, 19, 16);
+                uint32_t specReg;
+                switch (reg) {
                   case 0:
                     specReg = MISCREG_FPSID;
                     break;
@@ -2070,7 +2089,9 @@
                 } else if (specReg == MISCREG_FPSCR) {
                     return new VmrsFpscr(machInst, rt, (IntRegIndex)specReg);
                 } else {
-                    return new Vmrs(machInst, rt, (IntRegIndex)specReg);
+                    uint32_t iss = mcrMrcIssBuild(l, bits(machInst, 3, 0), rt,
+                        reg, a, bits(machInst, 7, 5));
+                    return new Vmrs(machInst, rt, (IntRegIndex)specReg, iss);
                 }
             }
         } else {
@@ -2235,6 +2256,44 @@
                 }
             }
             break;
+          case 0x9:
+            if ((opc3 & 0x1) == 0) {
+                if (single) {
+                    return decodeVfpRegRegRegOp<VfnmaS>(
+                            machInst, vd, vn, vm, false);
+                } else {
+                    return decodeVfpRegRegRegOp<VfnmaD>(
+                            machInst, vd, vn, vm, true);
+                }
+            } else {
+                if (single) {
+                    return decodeVfpRegRegRegOp<VfnmsS>(
+                            machInst, vd, vn, vm, false);
+                } else {
+                    return decodeVfpRegRegRegOp<VfnmsD>(
+                            machInst, vd, vn, vm, true);
+                }
+            }
+            break;
+          case 0xa:
+            if ((opc3 & 0x1) == 0) {
+                if (single) {
+                    return decodeVfpRegRegRegOp<VfmaS>(
+                            machInst, vd, vn, vm, false);
+                } else {
+                    return decodeVfpRegRegRegOp<VfmaD>(
+                            machInst, vd, vn, vm, true);
+                }
+            } else {
+                if (single) {
+                    return decodeVfpRegRegRegOp<VfmsS>(
+                            machInst, vd, vn, vm, false);
+                } else {
+                    return decodeVfpRegRegRegOp<VfmsD>(
+                            machInst, vd, vn, vm, true);
+                }
+            }
+            break;
           case 0xb:
             if ((opc3 & 0x1) == 0) {
                 const uint32_t baseImm =
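
Note: the fp.isa hunks above change the NEON load/store align field from a
byte mask (eBytes - 1, optionally OR'd with TLB::AllowUnaligned) to a log2
encoding (size, i.e. log2(eBytes)), so consumers now rebuild the mask as
(1 << align) - 1. A sketch showing the two encodings enforce the same check
(the AllowUnaligned flag bit is left out for brevity):

    #include <cassert>
    #include <cstdint>

    // Old scheme: align carried the byte mask directly.
    static bool alignedOld(uint32_t addr, uint32_t mask)
    {
        return (addr & mask) == 0;
    }

    // New scheme: align carries log2 of the access size; the mask is
    // reconstructed at the point of use.
    static bool alignedNew(uint32_t addr, uint32_t logSize)
    {
        return (addr & ((1u << logSize) - 1)) == 0;
    }

    int main()
    {
        for (uint32_t size = 0; size <= 3; ++size)
            for (uint32_t addr = 0; addr < 64; ++addr)
                assert(alignedOld(addr, (1u << size) - 1) ==
                       alignedNew(addr, size));
        return 0;
    }
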
diff --git a/src/arch/arm/isa/formats/mem.isa b/src/arch/arm/isa/formats/mem.isa
index f7830ef..abac270 100644
--- a/src/arch/arm/isa/formats/mem.isa
+++ b/src/arch/arm/isa/formats/mem.isa
@@ -282,7 +282,7 @@
             }
         } else {
             const uint32_t mode = bits(machInst, 4, 0);
-            if (badMode((OperatingMode)mode))
+            if (badMode32((OperatingMode)mode))
                 return new Unknown(machInst);
             if (!add && !wb) {
                 return new %(srs)s(machInst, mode,
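
Note: the SRS path above now validates its 5-bit mode field with badMode32()
rather than badMode(), since only AArch32 modes are legal there. A
simplified sketch of such a predicate over the architectural M<4:0>
encodings (the real helper would also account for which extensions, e.g.
Security or Virtualization, are implemented):

    #include <cassert>
    #include <cstdint>

    // AArch32 PSR mode encodings.
    enum OperatingMode : uint8_t {
        MODE_USER = 0x10, MODE_FIQ = 0x11, MODE_IRQ = 0x12,
        MODE_SVC = 0x13, MODE_MON = 0x16, MODE_ABORT = 0x17,
        MODE_HYP = 0x1a, MODE_UNDEFINED = 0x1b, MODE_SYSTEM = 0x1f
    };

    // True if the 5-bit mode field is not a defined AArch32 mode.
    static bool badMode32(uint8_t mode)
    {
        switch (mode) {
          case MODE_USER: case MODE_FIQ: case MODE_IRQ: case MODE_SVC:
          case MODE_MON: case MODE_ABORT: case MODE_HYP:
          case MODE_UNDEFINED: case MODE_SYSTEM:
            return false;
          default:
            return true;
        }
    }

    int main()
    {
        assert(!badMode32(MODE_SVC));
        assert(badMode32(0x14)); // 0b10100 is not a defined mode
        return 0;
    }
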
diff --git a/src/arch/arm/isa/formats/misc.isa b/src/arch/arm/isa/formats/misc.isa
index 00a37d1..647f984 100644
--- a/src/arch/arm/isa/formats/misc.isa
+++ b/src/arch/arm/isa/formats/misc.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2012 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -36,19 +36,42 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: Gabe Black
+//          Giacomo Gabrielli
+
+def format ArmERet() {{
+    decode_block = "return new Eret(machInst);"
+}};
 
 def format Svc() {{
-    decode_block = "return new Svc(machInst);"
+    decode_block = "return new Svc(machInst, bits(machInst, 23, 0));"
+}};
+
+def format ArmSmcHyp() {{
+    decode_block = '''
+    {
+        if (bits(machInst, 21)) {
+            return new Smc(machInst);
+        } else {
+            uint32_t imm16 = (bits(machInst, 19, 8) << 4) |
+                             (bits(machInst,  3, 0) << 0);
+            return new Hvc(machInst, imm16);
+        }
+    }
+    '''
 }};
 
 def format ArmMsrMrs() {{
     decode_block = '''
     {
         const uint8_t byteMask = bits(machInst, 19, 16);
+        const uint8_t sysM     = byteMask | (bits(machInst, 8) << 4);
         const IntRegIndex rn = (IntRegIndex)(uint32_t)bits(machInst, 3, 0);
         const IntRegIndex rd = (IntRegIndex)(uint32_t)bits(machInst, 15, 12);
         const uint32_t opcode = bits(machInst, 24, 21);
         const bool useImm = bits(machInst, 25);
+        const bool r      = bits(machInst, 22);
+        const bool isBanked = bits(machInst, 9);
 
         const uint32_t unrotated = bits(machInst, 7, 0);
         const uint32_t rotation = (bits(machInst, 11, 8) << 1);
@@ -56,20 +79,36 @@
 
         switch (opcode) {
           case 0x8:
-            return new MrsCpsr(machInst, rd);
+            if (isBanked) {
+                return new MrsBankedReg(machInst, rd, sysM, r != 0);
+            } else {
+                return new MrsCpsr(machInst, rd);
+            }
           case 0x9:
             if (useImm) {
                 return new MsrCpsrImm(machInst, imm, byteMask);
             } else {
-                return new MsrCpsrReg(machInst, rn, byteMask);
+                if (isBanked) {
+                    return new MsrBankedReg(machInst, rn, sysM, r != 0);
+                } else {
+                    return new MsrCpsrReg(machInst, rn, byteMask);
+                }
             }
           case 0xa:
-            return new MrsSpsr(machInst, rd);
+            if (isBanked) {
+                return new MrsBankedReg(machInst, rd, sysM, r != 0);
+            } else {
+                return new MrsSpsr(machInst, rd);
+            }
           case 0xb:
             if (useImm) {
                 return new MsrSpsrImm(machInst, imm, byteMask);
             } else {
-                return new MsrSpsrReg(machInst, rn, byteMask);
+                if (isBanked) {
+                    return new MsrBankedReg(machInst, rn, sysM, r != 0);
+                } else {
+                    return new MsrSpsrReg(machInst, rn, byteMask);
+                }
             }
           default:
             return new Unknown(machInst);
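
Note: for the banked-register forms of MRS/MSR the 5-bit SYSm selector is
split across the encoding: m<3:0> reuses the byte-mask field
(machInst<19:16>) and m<4> sits at machInst<8>, while the R bit
(machInst<22>) selects between the SPSR and non-SPSR views. A standalone
sketch of the selector assembly (the function name is illustrative, not
gem5's):

    #include <cassert>
    #include <cstdint>

    // Rebuild SYSm from an A32 banked MRS/MSR encoding.
    static inline uint8_t bankedSysM(uint32_t machInst)
    {
        uint8_t m3_0 = (machInst >> 16) & 0xf; // m<3:0>
        uint8_t m4 = (machInst >> 8) & 0x1;    // m<4>
        return (m4 << 4) | m3_0;
    }

    int main()
    {
        // m<4> = 1, m<3:0> = 0x5 -> SYSm = 0x15
        assert(bankedSysM((1u << 8) | (0x5u << 16)) == 0x15);
        return 0;
    }
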
@@ -99,16 +138,17 @@
         switch (miscReg) {
           case MISCREG_NOP:
             return new NopInst(machInst);
-          case NUM_MISCREGS:
+          case MISCREG_CP14_UNIMPL:
             return new FailUnimplemented(
                     csprintf("miscreg crn:%d opc1:%d crm:%d opc2:%d %s unknown",
                     crn, opc1, crm, opc2, isRead ? "read" : "write").c_str(),
                     machInst);
           default:
+            uint32_t iss = mcrMrcIssBuild(isRead, crm, rt, crn, opc1, opc2);
             if (isRead) {
-                return new Mrc14(machInst, rt, (IntRegIndex)miscReg);
+                return new Mrc14(machInst, rt, (IntRegIndex)miscReg, iss);
             } else {
-                return new Mcr14(machInst, (IntRegIndex)miscReg, rt);
+                return new Mcr14(machInst, (IntRegIndex)miscReg, rt, iss);
             }
         }
     }
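
Note: CP14 and CP15 accessors now carry an ISS (instruction specific
syndrome) so a trapped access can be reported precisely. A sketch of the
packing, assuming the HSR/ESR field layout the ARM ARM defines for trapped
MCR/MRC (gem5's mcrMrcIssBuild may order or name things differently):

    #include <cassert>
    #include <cstdint>

    // Opc2<19:17>, Opc1<16:14>, CRn<13:10>, Rt<9:5>, CRm<4:1>,
    // Direction<0> (1 = read, i.e. MRC).
    static inline uint32_t
    packMcrMrcIss(bool isRead, uint32_t crm, uint32_t rt,
                  uint32_t crn, uint32_t opc1, uint32_t opc2)
    {
        return (opc2 << 17) | (opc1 << 14) | (crn << 10) |
               (rt << 5) | (crm << 1) | (isRead ? 1 : 0);
    }

    int main()
    {
        // MRC with Rt = 1, everything else zero.
        assert(packMcrMrcIss(true, 0, 1, 0, 0, 0) == 0x21);
        return 0;
    }
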
@@ -123,8 +163,8 @@
 
 let {{
     header_output = '''
-    StaticInstPtr
-    decodeMcrMrc15(ExtMachInst machInst);
+    StaticInstPtr decodeMcrMrc14(ExtMachInst machInst);
+    StaticInstPtr decodeMcrMrc15(ExtMachInst machInst);
     '''
     decoder_output = '''
     StaticInstPtr
@@ -136,107 +176,50 @@
         const uint32_t crm = bits(machInst, 3, 0);
         const MiscRegIndex miscReg = decodeCP15Reg(crn, opc1, crm, opc2);
         const IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 15, 12);
-
         const bool isRead = bits(machInst, 20);
+        uint32_t iss = mcrMrcIssBuild(isRead, crm, rt, crn, opc1, opc2);
 
         switch (miscReg) {
           case MISCREG_NOP:
             return new NopInst(machInst);
-          case NUM_MISCREGS:
+          case MISCREG_CP15_UNIMPL:
             return new FailUnimplemented(
                     csprintf("miscreg crn:%d opc1:%d crm:%d opc2:%d %s unknown",
                     crn, opc1, crm, opc2, isRead ? "read" : "write").c_str(),
                     machInst);
-          case MISCREG_DCCISW:
-            return new WarnUnimplemented(
-                    isRead ? "mrc dccisw" : "mcr dcisw", machInst);
-          case MISCREG_DCCIMVAC:
-            return new WarnUnimplemented(
-                    isRead ? "mrc dccimvac" : "mcr dccimvac", machInst);
-          case MISCREG_DCIMVAC:
-            return new WarnUnimplemented(
-                    isRead ? "mrc dcimvac" : "mcr dcimvac", machInst);
           case MISCREG_DCCMVAC:
             return new FlushPipeInst(
                     isRead ? "mrc dccmvac" : "mcr dccmvac", machInst);
-          case MISCREG_DCCMVAU:
-            return new WarnUnimplemented(
-                    isRead ? "mrc dccmvau" : "mcr dccmvau", machInst);
           case MISCREG_CP15ISB:
-            return new Isb(machInst);
+            return new Isb(machInst, iss);
           case MISCREG_CP15DSB:
-            return new Dsb(machInst);
+            return new Dsb(machInst, iss);
           case MISCREG_CP15DMB:
-            return new Dmb(machInst);
-          case MISCREG_ICIALLUIS:
-            return new WarnUnimplemented(
-                    isRead ? "mrc icialluis" : "mcr icialluis", machInst);
-          case MISCREG_ICIMVAU:
-            return new WarnUnimplemented(
-                    isRead ? "mrc icimvau" : "mcr icimvau", machInst);
-          case MISCREG_BPIMVA:
-            return new WarnUnimplemented(
-                    isRead ? "mrc bpimva" : "mcr bpimva", machInst);
-          case MISCREG_BPIALLIS:
-            return new WarnUnimplemented(
-                    isRead ? "mrc bpiallis" : "mcr bpiallis", machInst);
-          case MISCREG_BPIALL:
-            return new WarnUnimplemented(
-                    isRead ? "mrc bpiall" : "mcr bpiall", machInst);
-          case MISCREG_L2LATENCY:
-            return new WarnUnimplemented(
-                    isRead ? "mrc l2latency" : "mcr l2latency", machInst);
-          case MISCREG_CRN15:
-            return new WarnUnimplemented(
-                    isRead ? "mrc crn15" : "mcr crn15", machInst);
-
-            // Write only.
-          case MISCREG_TLBIALLIS:
-          case MISCREG_TLBIMVAIS:
-          case MISCREG_TLBIASIDIS:
-          case MISCREG_TLBIMVAAIS:
-          case MISCREG_ITLBIALL:
-          case MISCREG_ITLBIMVA:
-          case MISCREG_ITLBIASID:
-          case MISCREG_DTLBIALL:
-          case MISCREG_DTLBIMVA:
-          case MISCREG_DTLBIASID:
-          case MISCREG_TLBIALL:
-          case MISCREG_TLBIMVA:
-          case MISCREG_TLBIASID:
-          case MISCREG_TLBIMVAA:
-            if (isRead) {
-                return new Unknown(machInst);
-            } else {
-                return new Mcr15(machInst, (IntRegIndex)miscReg, rt);
-            }
-
-            // Read only in user mode.
-          case MISCREG_TPIDRURO:
-            if (isRead) {
-                return new Mrc15User(machInst, rt, (IntRegIndex)miscReg);
-            } else {
-                return new Mcr15(machInst, (IntRegIndex)miscReg, rt);
-            }
-
-            // Read/write in user mode.
-          case MISCREG_TPIDRURW:
-            if (isRead) {
-                return new Mrc15User(machInst, rt, (IntRegIndex)miscReg);
-            } else {
-                return new Mcr15User(machInst, (IntRegIndex)miscReg, rt);
-            }
-
-            // Read/write, priveleged only.
+            return new Dmb(machInst, iss);
           default:
-            if (miscReg >= MISCREG_CP15_UNIMP_START)
+            if (miscRegInfo[miscReg][MISCREG_WARN_NOT_FAIL]) {
+                std::string full_mnem = csprintf("%s %s",
+                    isRead ? "mrc" : "mcr", miscRegName[miscReg]);
+                warn("\\tinstruction '%s' unimplemented\\n", full_mnem);
+
+                // Remove the warn flag and set the implemented flag. This
+                // prevents the instruction from warning a second time; it
+                // also means the instruction is actually generated. Actually
+                // creating an instruction to access a register that isn't
+                // implemented sounds a bit silly, but it's required to get
+                // the correct behaviour for hyp traps and undef exceptions.
+                miscRegInfo[miscReg][MISCREG_IMPLEMENTED]   = true;
+                miscRegInfo[miscReg][MISCREG_WARN_NOT_FAIL] = false;
+            }
+
+            if (miscRegInfo[miscReg][MISCREG_IMPLEMENTED]) {
+                if (isRead)
+                    return new Mrc15(machInst, rt, (IntRegIndex)miscReg, iss);
+                return new Mcr15(machInst, (IntRegIndex)miscReg, rt, iss);
+            } else {
                 return new FailUnimplemented(csprintf("%s %s",
                     isRead ? "mrc" : "mcr", miscRegName[miscReg]).c_str(),
                     machInst);
-            if (isRead) {
-                return new Mrc15(machInst, rt, (IntRegIndex)miscReg);
-            } else {
-                return new Mcr15(machInst, (IntRegIndex)miscReg, rt);
             }
         }
     }
@@ -248,3 +231,70 @@
     return decodeMcrMrc15(machInst);
     '''
 }};
+
+let {{
+    header_output = '''
+    StaticInstPtr
+    decodeMcrrMrrc15(ExtMachInst machInst);
+    '''
+    decoder_output = '''
+    StaticInstPtr
+    decodeMcrrMrrc15(ExtMachInst machInst)
+    {
+        const uint32_t crm = bits(machInst, 3, 0);
+        const uint32_t opc1 = bits(machInst, 7, 4);
+        const MiscRegIndex miscReg = decodeCP15Reg64(crm, opc1);
+        const IntRegIndex rt = (IntRegIndex) (uint32_t) bits(machInst, 15, 12);
+        const IntRegIndex rt2 = (IntRegIndex) (uint32_t) bits(machInst, 19, 16);
+
+        const bool isRead = bits(machInst, 20);
+
+        switch (miscReg) {
+          case MISCREG_CP15_UNIMPL:
+            return new FailUnimplemented(
+                    csprintf("miscreg crm:%d opc1:%d 64-bit %s unknown",
+                    crm, opc1, isRead ? "read" : "write").c_str(),
+                    machInst);
+          default:
+            if (miscRegInfo[miscReg][MISCREG_WARN_NOT_FAIL]) {
+                std::string full_mnem = csprintf("%s %s",
+                    isRead ? "mrrc" : "mcrr", miscRegName[miscReg]);
+                warn("\\tinstruction '%s' unimplemented\\n", full_mnem);
+
+                // Remove the warn flag and set the implemented flag. This
+                // prevents the instruction from warning a second time; it
+                // also means the instruction is actually generated. Actually
+                // creating an instruction to access a register that isn't
+                // implemented sounds a bit silly, but it's required to get
+                // the correct behaviour for hyp traps and undef exceptions.
+                miscRegInfo[miscReg][MISCREG_IMPLEMENTED]   = true;
+                miscRegInfo[miscReg][MISCREG_WARN_NOT_FAIL] = false;
+            }
+
+            if (miscRegInfo[miscReg][MISCREG_IMPLEMENTED]) {
+                uint32_t iss = mcrrMrrcIssBuild(isRead, crm, rt, rt2, opc1);
+
+                if (isRead)
+                    return new Mrrc15(machInst, (IntRegIndex) miscReg, rt2, rt, iss);
+                return new Mcrr15(machInst, rt2, rt, (IntRegIndex) miscReg, iss);
+            } else {
+                return new FailUnimplemented(csprintf("%s %s",
+                    isRead ? "mrrc" : "mcrr", miscRegName[miscReg]).c_str(),
+                    machInst);
+            }
+        }
+    }
+    '''
+}};
+
+def format Mcrr15() {{
+    decode_block = '''
+    return decodeMcrrMrrc15(machInst);
+    '''
+}};
+
+def format Mrrc15() {{
+    decode_block = '''
+    return decodeMcrrMrrc15(machInst);
+    '''
+}};
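
Note: both decode helpers above rely on the same warn-once idiom: a
register flagged MISCREG_WARN_NOT_FAIL warns on first use and is then
re-flagged as implemented, so later accesses decode normally and can still
take hyp traps or undef exceptions. Reduced to its essentials, with
hypothetical names:

    #include <cstdio>

    struct RegFlags { bool implemented; bool warnNotFail; };

    // Warn exactly once, then let the access be generated anyway.
    static void noteAccess(RegFlags &f, const char *name)
    {
        if (f.warnNotFail) {
            std::printf("instruction '%s' unimplemented\n", name);
            f.implemented = true;  // generate the access from now on
            f.warnNotFail = false; // ...but never warn again
        }
    }

    int main()
    {
        RegFlags f = {false, true};
        noteAccess(f, "l2latency"); // warns
        noteAccess(f, "l2latency"); // silent
        return 0;
    }
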
diff --git a/src/arch/arm/isa/formats/neon64.isa b/src/arch/arm/isa/formats/neon64.isa
new file mode 100644
index 0000000..72bbd0c
--- /dev/null
+++ b/src/arch/arm/isa/formats/neon64.isa
@@ -0,0 +1,2626 @@
+// Copyright (c) 2012-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Giacomo Gabrielli
+//          Mbou Eyole
+
+output header {{
+namespace Aarch64
+{
+    // AdvSIMD three same
+    StaticInstPtr decodeNeon3Same(ExtMachInst machInst);
+    // AdvSIMD three different
+    StaticInstPtr decodeNeon3Diff(ExtMachInst machInst);
+    // AdvSIMD two-reg misc
+    StaticInstPtr decodeNeon2RegMisc(ExtMachInst machInst);
+    // AdvSIMD across lanes
+    StaticInstPtr decodeNeonAcrossLanes(ExtMachInst machInst);
+    // AdvSIMD copy
+    StaticInstPtr decodeNeonCopy(ExtMachInst machInst);
+    // AdvSIMD vector x indexed element
+    StaticInstPtr decodeNeonIndexedElem(ExtMachInst machInst);
+    // AdvSIMD modified immediate
+    StaticInstPtr decodeNeonModImm(ExtMachInst machInst);
+    // AdvSIMD shift by immediate
+    StaticInstPtr decodeNeonShiftByImm(ExtMachInst machInst);
+    // AdvSIMD TBL/TBX
+    StaticInstPtr decodeNeonTblTbx(ExtMachInst machInst);
+    // AdvSIMD ZIP/UZP/TRN
+    StaticInstPtr decodeNeonZipUzpTrn(ExtMachInst machInst);
+    // AdvSIMD EXT
+    StaticInstPtr decodeNeonExt(ExtMachInst machInst);
+
+    // AdvSIMD scalar three same
+    StaticInstPtr decodeNeonSc3Same(ExtMachInst machInst);
+    // AdvSIMD scalar three different
+    StaticInstPtr decodeNeonSc3Diff(ExtMachInst machInst);
+    // AdvSIMD scalar two-reg misc
+    StaticInstPtr decodeNeonSc2RegMisc(ExtMachInst machInst);
+    // AdvSIMD scalar pairwise
+    StaticInstPtr decodeNeonScPwise(ExtMachInst machInst);
+    // AdvSIMD scalar copy
+    StaticInstPtr decodeNeonScCopy(ExtMachInst machInst);
+    // AdvSIMD scalar x indexed element
+    StaticInstPtr decodeNeonScIndexedElem(ExtMachInst machInst);
+    // AdvSIMD scalar shift by immediate
+    StaticInstPtr decodeNeonScShiftByImm(ExtMachInst machInst);
+
+    // AdvSIMD load/store
+    StaticInstPtr decodeNeonMem(ExtMachInst machInst);
+}
+}};
+
+output decoder {{
+namespace Aarch64
+{
+    StaticInstPtr
+    decodeNeon3Same(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 15, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        uint8_t size_q = (size << 1) | q;
+        uint8_t sz_q = size_q & 0x3;
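+        // size_q folds the element-size and vector-length fields into one
+        // value: for operations that allow 64-bit elements, the only
+        // illegal combination (a 64-bit element in a 64-bit D vector) is
+        // the single case size_q == 0x6. sz_q does the same for the FP
+        // forms, where only size<0> is the element size (sz) and
+        // sz_q == 0x2 marks the illegal double-in-D combination.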
+
+        switch (opcode) {
+          case 0x00:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UhaddDX, UhaddQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<ShaddDX, ShaddQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x01:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UqaddDX, UqaddQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SqaddDX, SqaddQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x02:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UrhaddDX, UrhaddQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SrhaddDX, SrhaddQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x03:
+            switch (size) {
+              case 0x0:
+                if (u) {
+                    if (q)
+                        return new EorQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new EorDX<uint64_t>(machInst, vd, vn, vm);
+                } else {
+                    if (q)
+                        return new AndQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new AndDX<uint64_t>(machInst, vd, vn, vm);
+                }
+              case 0x1:
+                if (u) {
+                    if (q)
+                        return new BslQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new BslDX<uint64_t>(machInst, vd, vn, vm);
+                } else {
+                    if (q)
+                        return new BicQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new BicDX<uint64_t>(machInst, vd, vn, vm);
+                }
+              case 0x2:
+                if (u) {
+                    if (q)
+                        return new BitQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new BitDX<uint64_t>(machInst, vd, vn, vm);
+                } else {
+                    if (q)
+                        return new OrrQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new OrrDX<uint64_t>(machInst, vd, vn, vm);
+                }
+              case 0x3:
+                if (u) {
+                    if (q)
+                        return new BifQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new BifDX<uint64_t>(machInst, vd, vn, vm);
+                } else {
+                    if (q)
+                        return new OrnQX<uint64_t>(machInst, vd, vn, vm);
+                    else
+                        return new OrnDX<uint64_t>(machInst, vd, vn, vm);
+                }
+            }
+          case 0x04:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UhsubDX, UhsubQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<ShsubDX, ShsubQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x05:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UqsubDX, UqsubQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SqsubDX, SqsubQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x06:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<CmhiDX, CmhiQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<CmgtDX, CmgtQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x07:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<CmhsDX, CmhsQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<CmgeDX, CmgeQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x08:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UshlDX, UshlQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SshlDX, SshlQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x09:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UqshlDX, UqshlQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SqshlDX, SqshlQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0a:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UrshlDX, UrshlQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SrshlDX, SrshlQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0b:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<UqrshlDX, UqrshlQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeXReg<SqrshlDX, SqrshlQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0c:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UmaxDX, UmaxQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SmaxDX, SmaxQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0d:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UminDX, UminQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SminDX, SminQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0e:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UabdDX, UabdQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SabdDX, SabdQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x0f:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UabaDX, UabaQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SabaDX, SabaQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x10:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<SubDX, SubQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeXReg<AddDX, AddQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x11:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeXReg<CmeqDX, CmeqQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeXReg<CmtstDX, CmtstQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x12:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<MlsDX, MlsQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeSReg<MlaDX, MlaQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x13:
+            if (size == 0x3 || (size != 0x0 && bits(machInst, 29)))
+                return new Unknown64(machInst);
+            if (u) {
+                if (q)
+                    return new PmulQX<uint8_t>(machInst, vd, vn, vm);
+                else
+                    return new PmulDX<uint8_t>(machInst, vd, vn, vm);
+            } else {
+                return decodeNeonUThreeSReg<MulDX, MulQX>(
+                    q, size, machInst, vd, vn, vm);
+            }
+          case 0x14:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UmaxpDX, UmaxpQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SmaxpDX, SmaxpQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x15:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UminpDX, UminpQX>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SminpDX, SminpQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x16:
+            if (size == 0x3 || size == 0x0)
+                return new Unknown64(machInst);
+            if (u) {
+                if (q)
+                    return decodeNeonSThreeHAndWReg<SqrdmulhQX>(
+                        size, machInst, vd, vn, vm);
+                else
+                    return decodeNeonSThreeHAndWReg<SqrdmulhDX>(
+                        size, machInst, vd, vn, vm);
+            } else {
+                if (q)
+                    return decodeNeonSThreeHAndWReg<SqdmulhQX>(
+                        size, machInst, vd, vn, vm);
+                else
+                    return decodeNeonSThreeHAndWReg<SqdmulhDX>(
+                        size, machInst, vd, vn, vm);
+            }
+          case 0x17:
+            if (u || size_q == 0x6)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeXReg<AddpDX, AddpQX>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x18:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FmaxnmpDX, FmaxnmpQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FmaxnmDX, FmaxnmQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return decodeNeonUThreeFpReg<FminnmpDX, FminnmpQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FminnmDX, FminnmQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            }
+          case 0x19:
+            if (size < 0x2) {
+                if (u || sz_q == 0x2)
+                    return new Unknown64(machInst);
+                else
+                    return decodeNeonUThreeFpReg<FmlaDX, FmlaQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u || sz_q == 0x2)
+                    return new Unknown64(machInst);
+                else
+                    return decodeNeonUThreeFpReg<FmlsDX, FmlsQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            }
+          case 0x1a:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FaddpDX, FaddpQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FaddDX, FaddQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return decodeNeonUThreeFpReg<FabdDX, FabdQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FsubDX, FsubQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            }
+          case 0x1b:
+            if (size < 0x2 && sz_q != 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FmulDX, FmulQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FmulxDX, FmulxQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x1c:
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FcmgeDX, FcmgeQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FcmeqDX, FcmeqQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return decodeNeonUThreeFpReg<FcmgtDX, FcmgtQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return new Unknown64(machInst);
+            }
+          case 0x1d:
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FacgeDX, FacgeQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return new Unknown64(machInst);
+            } else {
+                if (u)
+                    return decodeNeonUThreeFpReg<FacgtDX, FacgtQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return new Unknown64(machInst);
+            }
+          case 0x1e:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FmaxpDX, FmaxpQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FmaxDX, FmaxQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return decodeNeonUThreeFpReg<FminpDX, FminpQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FminDX, FminQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            }
+          case 0x1f:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeFpReg<FdivDX, FdivQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeFpReg<FrecpsDX, FrecpsQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return new Unknown64(machInst);
+                else
+                    return decodeNeonUThreeFpReg<FrsqrtsDX, FrsqrtsQX>(
+                        q, size & 0x1, machInst, vd, vn, vm);
+            }
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
+    StaticInstPtr
+    decodeNeon3Diff(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 15, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
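+        // Results in this class are double width: the plain (X) forms read
+        // the low 64 bits of their sources, while the "2" (2X) forms,
+        // selected by q, read the high 64 bits (e.g. UADDL vs. UADDL2).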
+        switch (opcode) {
+          case 0x0:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UaddlX, Uaddl2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SaddlX, Saddl2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x1:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UaddwX, Uaddw2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SaddwX, Saddw2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x2:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UsublX, Usubl2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SsublX, Ssubl2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x3:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UsubwX, Usubw2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SsubwX, Ssubw2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x4:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<RaddhnX, Raddhn2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeSReg<AddhnX, Addhn2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x5:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UabalX, Uabal2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SabalX, Sabal2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x6:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<RsubhnX, Rsubhn2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeSReg<SubhnX, Subhn2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x7:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UabdlX, Uabdl2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SabdlX, Sabdl2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x8:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UmlalX, Umlal2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SmlalX, Smlal2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0x9:
+            if (u || (size == 0x0 || size == 0x3)) {
+                return new Unknown64(machInst);
+            } else {
+                if (q) {
+                    return decodeNeonSThreeHAndWReg<Sqdmlal2X>(
+                        size, machInst, vd, vn, vm);
+                } else {
+                    return decodeNeonSThreeHAndWReg<SqdmlalX>(
+                        size, machInst, vd, vn, vm);
+                }
+            }
+          case 0xa:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UmlslX, Umlsl2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SmlslX, Smlsl2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0xb:
+            if (u || (size == 0x0 || size == 0x3)) {
+                return new Unknown64(machInst);
+            } else {
+                if (q) {
+                    return decodeNeonSThreeHAndWReg<Sqdmlsl2X>(
+                        size, machInst, vd, vn, vm);
+                } else {
+                    return decodeNeonSThreeHAndWReg<SqdmlslX>(
+                        size, machInst, vd, vn, vm);
+                }
+            }
+          case 0xc:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeSReg<UmullX, Umull2X>(
+                    q, size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeSReg<SmullX, Smull2X>(
+                    q, size, machInst, vd, vn, vm);
+          case 0xd:
+            if (u || (size == 0x0 || size == 0x3)) {
+                return new Unknown64(machInst);
+            } else {
+                if (q) {
+                    return decodeNeonSThreeHAndWReg<Sqdmull2X>(
+                        size, machInst, vd, vn, vm);
+                } else {
+                    return decodeNeonSThreeHAndWReg<SqdmullX>(
+                        size, machInst, vd, vn, vm);
+                }
+            }
+          case 0xe:
+            if (u || size != 0) {
+                return new Unknown64(machInst);
+            } else {
+                if (q)
+                    return new Pmull2X<uint8_t>(machInst, vd, vn, vm);
+                else
+                    return new PmullX<uint8_t>(machInst, vd, vn, vm);
+            }
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
+    StaticInstPtr
+    decodeNeon2RegMisc(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 16, 12);
+
+        IntRegIndex vd = (IntRegIndex)(uint8_t)bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex)(uint8_t)bits(machInst, 9, 5);
+
+        uint8_t size_q = (size << 1) | q;
+        uint8_t sz_q = size_q & 0x3;
+        uint8_t op = (uint8_t)((bits(machInst, 12) << 1) |
+                               bits(machInst, 29));
+        uint8_t switchVal = opcode | ((u ? 1 : 0) << 5);
+
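+        // Fold the U bit into bit 5 of the opcode so one switch covers
+        // both the U == 0 (0x00-0x1f) and U == 1 (0x20-0x3f) halves of
+        // this class.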
+        switch (switchVal) {
+          case 0x00:
+            if (op + size >= 3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<Rev64DX, Rev64QX>(
+                q, size, machInst, vd, vn);
+          case 0x01:
+            if (op + size >= 3)
+                return new Unknown64(machInst);
+            if (q)
+                return new Rev16QX<uint8_t>(machInst, vd, vn);
+            else
+                return new Rev16DX<uint8_t>(machInst, vd, vn);
+          case 0x02:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<SaddlpDX, SaddlpQX>(
+                q, size, machInst, vd, vn);
+          case 0x03:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscXReg<SuqaddDX, SuqaddQX>(
+                q, size, machInst, vd, vn);
+          case 0x04:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<ClsDX, ClsQX>(
+                q, size, machInst, vd, vn);
+          case 0x05:
+            if (size != 0x0)
+                return new Unknown64(machInst);
+            if (q)
+                return new CntQX<uint8_t>(machInst, vd, vn);
+            else
+                return new CntDX<uint8_t>(machInst, vd, vn);
+          case 0x06:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<SadalpDX, SadalpQX>(
+                q, size, machInst, vd, vn);
+          case 0x07:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<SqabsDX, SqabsQX>(
+                q, size, machInst, vd, vn);
+          case 0x08:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<CmgtZeroDX, CmgtZeroQX>(
+                q, size, machInst, vd, vn);
+          case 0x09:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<CmeqZeroDX, CmeqZeroQX>(
+                q, size, machInst, vd, vn);
+          case 0x0a:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<CmltZeroDX, CmltZeroQX>(
+                q, size, machInst, vd, vn);
+          case 0x0b:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<AbsDX, AbsQX>(
+                q, size, machInst, vd, vn);
+          case 0x0c:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FcmgtZeroDX, FcmgtZeroQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x0d:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FcmeqZeroDX, FcmeqZeroQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x0e:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FcmltZeroDX, FcmltZeroQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x0f:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FabsDX, FabsQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x12:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<XtnX, Xtn2X>(
+                q, size, machInst, vd, vn);
+          case 0x14:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<SqxtnX, Sqxtn2X>(
+                q, size, machInst, vd, vn);
+          case 0x16:
+            if (size > 0x1)
+                return new Unknown64(machInst);
+            if (q) {
+                if (size)
+                    return new Fcvtn2X<uint32_t>(machInst, vd, vn);
+                else
+                    return new Fcvtn2X<uint16_t>(machInst, vd, vn);
+            } else {
+                if (size)
+                    return new FcvtnX<uint32_t>(machInst, vd, vn);
+                else
+                    return new FcvtnX<uint16_t>(machInst, vd, vn);
+            }
+          case 0x17:
+            if (size > 0x1)
+                return new Unknown64(machInst);
+            if (q) {
+                if (size)
+                    return new Fcvtl2X<uint32_t>(machInst, vd, vn);
+                else
+                    return new Fcvtl2X<uint16_t>(machInst, vd, vn);
+            } else {
+                if (size)
+                    return new FcvtlX<uint32_t>(machInst, vd, vn);
+                else
+                    return new FcvtlX<uint16_t>(machInst, vd, vn);
+            }
+          case 0x18:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FrintnDX, FrintnQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FrintpDX, FrintpQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x19:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FrintmDX, FrintmQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FrintzDX, FrintzQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x1a:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FcvtnsDX, FcvtnsQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FcvtpsDX, FcvtpsQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x1b:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FcvtmsDX, FcvtmsQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FcvtzsIntDX, FcvtzsIntQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x1c:
+            if (size < 0x2) {
+                if (sz_q == 0x2)
+                    return new Unknown64(machInst);
+                return decodeNeonUTwoMiscFpReg<FcvtasDX, FcvtasQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            } else {
+                if (size & 0x1)
+                    return new Unknown64(machInst);
+                if (q)
+                    return new UrecpeQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new UrecpeDX<uint32_t>(machInst, vd, vn);
+            }
+          case 0x1d:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (q) {
+                    if (size & 0x1)
+                        return new ScvtfIntDQX<uint64_t>(machInst, vd, vn);
+                    else
+                        return new ScvtfIntSQX<uint32_t>(machInst, vd, vn);
+                } else {
+                    if (size & 0x1)
+                        return new Unknown64(machInst);
+                    else
+                        return new ScvtfIntDX<uint32_t>(machInst, vd, vn);
+                }
+            } else {
+                return decodeNeonUTwoMiscFpReg<FrecpeDX, FrecpeQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            }
+          case 0x20:
+            if (op + size >= 3)
+                return new Unknown64(machInst);
+            if (q) {
+                if (size & 0x1)
+                    return new Rev32QX<uint16_t>(machInst, vd, vn);
+                else
+                    return new Rev32QX<uint8_t>(machInst, vd, vn);
+            } else {
+                if (size & 0x1)
+                    return new Rev32DX<uint16_t>(machInst, vd, vn);
+                else
+                    return new Rev32DX<uint8_t>(machInst, vd, vn);
+            }
+          case 0x22:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<UaddlpDX, UaddlpQX>(
+                q, size, machInst, vd, vn);
+          case 0x23:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscXReg<UsqaddDX, UsqaddQX>(
+                q, size, machInst, vd, vn);
+          case 0x24:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<ClzDX, ClzQX>(
+                q, size, machInst, vd, vn);
+          case 0x25:
+            if (size == 0x0) {
+                if (q)
+                    return new MvnQX<uint64_t>(machInst, vd, vn);
+                else
+                    return new MvnDX<uint64_t>(machInst, vd, vn);
+            } else if (size == 0x1) {
+                if (q)
+                    return new RbitQX<uint8_t>(machInst, vd, vn);
+                else
+                    return new RbitDX<uint8_t>(machInst, vd, vn);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x26:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<UadalpDX, UadalpQX>(
+                q, size, machInst, vd, vn);
+          case 0x27:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<SqnegDX, SqnegQX>(
+                q, size, machInst, vd, vn);
+          case 0x28:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<CmgeZeroDX, CmgeZeroQX>(
+                q, size, machInst, vd, vn);
+          case 0x29:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<CmleZeroDX, CmleZeroQX>(
+                q, size, machInst, vd, vn);
+          case 0x2b:
+            if (size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscXReg<NegDX, NegQX>(
+                q, size, machInst, vd, vn);
+          case 0x2c:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FcmgeZeroDX, FcmgeZeroQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x2d:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FcmleZeroDX, FcmleZeroQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x2f:
+            if (size < 0x2 || size_q == 0x6)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FnegDX, FnegQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x32:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSTwoMiscSReg<SqxtunX, Sqxtun2X>(
+                q, size, machInst, vd, vn);
+          case 0x33:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<ShllX, Shll2X>(
+                q, size, machInst, vd, vn);
+          case 0x34:
+            if (size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscSReg<UqxtnX, Uqxtn2X>(
+                q, size, machInst, vd, vn);
+          case 0x36:
+            if (size != 0x1)
+                return new Unknown64(machInst);
+            if (q)
+                return new Fcvtxn2X<uint32_t>(machInst, vd, vn);
+            else
+                return new FcvtxnX<uint32_t>(machInst, vd, vn);
+          case 0x38:
+            if (size > 0x1 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FrintaDX, FrintaQX>(
+                q, size & 0x1, machInst, vd, vn);
+          case 0x39:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FrintxDX, FrintxQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FrintiDX, FrintiQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x3a:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FcvtnuDX, FcvtnuQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FcvtpuDX, FcvtpuQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x3b:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<FcvtmuDX, FcvtmuQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FcvtzuIntDX, FcvtzuIntQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x3c:
+            if (size < 0x2) {
+                return decodeNeonUTwoMiscFpReg<FcvtauDX, FcvtauQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            } else if (size == 0x2) {
+                if (q)
+                    return new UrsqrteQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new UrsqrteDX<uint32_t>(machInst, vd, vn);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x3d:
+            if (sz_q == 0x2)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUTwoMiscFpReg<UcvtfIntDX, UcvtfIntQX>(
+                    q, size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscFpReg<FrsqrteDX, FrsqrteQX>(
+                    q, size & 0x1, machInst, vd, vn);
+          case 0x3f:
+            if (size < 0x2 || sz_q == 0x2)
+                return new Unknown64(machInst);
+            return decodeNeonUTwoMiscFpReg<FsqrtDX, FsqrtQX>(
+                q, size & 0x1, machInst, vd, vn);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
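+    // Decode AArch64 Advanced SIMD across-lanes instructions: integer
+    // reductions over all source elements (SADDLV/UADDLV, SMAXV/UMAXV,
+    // SMINV/UMINV, ADDV) and the FP maximum/minimum reductions
+    // (FMAXNMV/FMINNMV, FMAXV/FMINV). The U bit is folded into bit 5 of
+    // the switch value so signed and unsigned forms share one switch.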
+    StaticInstPtr
+    decodeNeonAcrossLanes(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 16, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        uint8_t size_q = (size << 1) | q;
+        uint8_t sz_q = size_q & 0x3;
+        uint8_t switchVal = opcode | ((u ? 1 : 0) << 5);
+
+        switch (switchVal) {
+          case 0x03:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSAcrossLanesLongReg<SaddlvDX, SaddlvQX,
+                                                 SaddlvBQX>(
+                q, size, machInst, vd, vn);
+          case 0x0a:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSAcrossLanesReg<SmaxvDX, SmaxvQX>(
+                q, size, machInst, vd, vn);
+          case 0x1a:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonSAcrossLanesReg<SminvDX, SminvQX>(
+                q, size, machInst, vd, vn);
+          case 0x1b:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUAcrossLanesReg<AddvDX, AddvQX>(
+                q, size, machInst, vd, vn);
+          case 0x23:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUAcrossLanesLongReg<UaddlvDX, UaddlvQX,
+                                                 UaddlvBQX>(
+                q, size, machInst, vd, vn);
+          case 0x2a:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUAcrossLanesReg<UmaxvDX, UmaxvQX>(
+                q, size, machInst, vd, vn);
+          case 0x2c:
+            if (sz_q != 0x1)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (q)
+                    return new FmaxnmvQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new Unknown64(machInst);
+            } else {
+                if (q)
+                    return new FminnmvQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new Unknown64(machInst);
+            }
+          case 0x2f:
+            if (sz_q != 0x1)
+                return new Unknown64(machInst);
+            if (size < 0x2) {
+                if (q)
+                    return new FmaxvQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new Unknown64(machInst);
+            } else {
+                if (q)
+                    return new FminvQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new Unknown64(machInst);
+            }
+          case 0x3a:
+            if (size_q == 0x4 || size == 0x3)
+                return new Unknown64(machInst);
+            return decodeNeonUAcrossLanesReg<UminvDX, UminvQX>(
+                q, size, machInst, vd, vn);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
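+    // Decode AArch64 Advanced SIMD copy instructions: DUP (element and
+    // general register), SMOV/UMOV to a general register, and INS
+    // (element and general register). The element size is given by the
+    // position of the lowest set bit of imm5, and the element index by
+    // the bits of imm5 above it.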
+    StaticInstPtr
+    decodeNeonCopy(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t op = bits(machInst, 29);
+        uint8_t imm5 = bits(machInst, 20, 16);
+        uint8_t imm4 = bits(machInst, 14, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        uint8_t imm5_pos = findLsbSet(imm5);
+        uint8_t index1 = 0, index2 = 0;
+
+        if (op) {
+            if (!q || (imm4 & mask(imm5_pos)))
+                return new Unknown64(machInst);
+
+            index1 = bits(imm5, 4, imm5_pos + 1);  // dst
+            index2 = bits(imm4, 3, imm5_pos);  // src
+
+            switch (imm5_pos) {
+              case 0:
+                return new InsElemX<uint8_t>(machInst, vd, vn,
+                                             index1, index2);
+              case 1:
+                return new InsElemX<uint16_t>(machInst, vd, vn,
+                                              index1, index2);
+              case 2:
+                return new InsElemX<uint32_t>(machInst, vd, vn,
+                                              index1, index2);
+              case 3:
+                return new InsElemX<uint64_t>(machInst, vd, vn,
+                                              index1, index2);
+              default:
+                return new Unknown64(machInst);
+            }
+        }
+
+        switch (imm4) {
+          case 0x0:
+            index1 = bits(imm5, 4, imm5_pos + 1);
+            switch (imm5_pos) {
+              case 0:
+                if (q)
+                    return new DupElemQX<uint8_t>(machInst, vd, vn, index1);
+                else
+                    return new DupElemDX<uint8_t>(machInst, vd, vn, index1);
+              case 1:
+                if (q)
+                    return new DupElemQX<uint16_t>(machInst, vd, vn, index1);
+                else
+                    return new DupElemDX<uint16_t>(machInst, vd, vn, index1);
+              case 2:
+                if (q)
+                    return new DupElemQX<uint32_t>(machInst, vd, vn, index1);
+                else
+                    return new DupElemDX<uint32_t>(machInst, vd, vn, index1);
+              case 3:
+                if (q)
+                    return new DupElemQX<uint64_t>(machInst, vd, vn, index1);
+                else
+                    return new Unknown64(machInst);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x1:
+            switch (imm5) {
+              case 0x1:
+                if (q)
+                    return new DupGprWQX<uint8_t>(machInst, vd, vn);
+                else
+                    return new DupGprWDX<uint8_t>(machInst, vd, vn);
+              case 0x2:
+                if (q)
+                    return new DupGprWQX<uint16_t>(machInst, vd, vn);
+                else
+                    return new DupGprWDX<uint16_t>(machInst, vd, vn);
+              case 0x4:
+                if (q)
+                    return new DupGprWQX<uint32_t>(machInst, vd, vn);
+                else
+                    return new DupGprWDX<uint32_t>(machInst, vd, vn);
+              case 0x8:
+                if (q)
+                    return new DupGprXQX<uint64_t>(machInst, vd, vn);
+                else
+                    return new Unknown64(machInst);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x3:
+            index1 = imm5 >> (imm5_pos + 1);
+            switch (imm5_pos) {
+              case 0:
+                return new InsGprWX<uint8_t>(machInst, vd, vn, index1);
+              case 1:
+                return new InsGprWX<uint16_t>(machInst, vd, vn, index1);
+              case 2:
+                return new InsGprWX<uint32_t>(machInst, vd, vn, index1);
+              case 3:
+                return new InsGprXX<uint64_t>(machInst, vd, vn, index1);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x5:
+            index1 = bits(imm5, 4, imm5_pos + 1);
+            switch (imm5_pos) {
+              case 0:
+                if (q)
+                    return new SmovXX<int8_t>(machInst, vd, vn, index1);
+                else
+                    return new SmovWX<int8_t>(machInst, vd, vn, index1);
+              case 1:
+                if (q)
+                    return new SmovXX<int16_t>(machInst, vd, vn, index1);
+                else
+                    return new SmovWX<int16_t>(machInst, vd, vn, index1);
+              case 2:
+                if (q)
+                    return new SmovXX<int32_t>(machInst, vd, vn, index1);
+                else
+                    return new Unknown64(machInst);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x7:
+            index1 = imm5 >> (imm5_pos + 1);
+
+            if ((q && imm5_pos != 3) || (!q && imm5_pos >= 3))
+                return new Unknown64(machInst);
+
+            switch (imm5_pos) {
+              case 0:
+                return new UmovWX<uint8_t>(machInst, vd, vn, index1);
+              case 1:
+                return new UmovWX<uint16_t>(machInst, vd, vn, index1);
+              case 2:
+                return new UmovWX<uint32_t>(machInst, vd, vn, index1);
+              case 3:
+                return new UmovXX<uint64_t>(machInst, vd, vn, index1);
+              default:
+                return new Unknown64(machInst);
+            }
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
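+    // Decode AArch64 Advanced SIMD vector-by-element instructions:
+    // multiplies and multiply-accumulates (MLA/MLS, MUL, S/UMLAL,
+    // S/UMLSL, S/UMULL, SQDMLAL/SQDMLSL/SQDMULL, SQDMULH/SQRDMULH,
+    // FMLA/FMLS, FMUL/FMULX) whose second operand is a single element
+    // selected by an index built from the H, L and M bits.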
+    StaticInstPtr
+    decodeNeonIndexedElem(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t L = bits(machInst, 21);
+        uint8_t M = bits(machInst, 20);
+        uint8_t opcode = bits(machInst, 15, 12);
+        uint8_t H = bits(machInst, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm_bf = (IntRegIndex) (uint8_t) bits(machInst, 19, 16);
+
+        uint8_t index = 0;
+        uint8_t index_fp = 0;
+        uint8_t vmh = 0;
+        uint8_t sz = size & 0x1;
+        uint8_t sz_q = (sz << 1) | q;
+        uint8_t sz_L = (sz << 1) | L;
+
+        // Index and 2nd register operand for integer instructions
+        if (size == 0x1) {
+            index = (H << 2) | (L << 1) | M;
+            // vmh = 0;
+        } else if (size == 0x2) {
+            index = (H << 1) | L;
+            vmh = M;
+        }
+        IntRegIndex vm = (IntRegIndex) (uint8_t) (vmh << 4 | vm_bf);
+
+        // Index and 2nd register operand for FP instructions
+        vmh = M;
+        if ((size & 0x1) == 0) {
+            index_fp = (H << 1) | L;
+        } else if (L == 0) {
+            index_fp = H;
+        }
+        IntRegIndex vm_fp = (IntRegIndex) (uint8_t) (vmh << 4 | vm_bf);
+
+        switch (opcode) {
+          case 0x0:
+            if (!u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeImmHAndWReg<MlaElemDX, MlaElemQX>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x1:
+            if (!u && size >= 2 && sz_q != 0x2 && sz_L != 0x3)
+                return decodeNeonUThreeImmFpReg<FmlaElemDX, FmlaElemQX>(
+                    q, sz, machInst, vd, vn, vm_fp, index_fp);
+            else
+                return new Unknown64(machInst);
+          case 0x2:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeImmHAndWReg<UmlalElemX, UmlalElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+            else
+                return decodeNeonSThreeImmHAndWReg<SmlalElemX, SmlalElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x3:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmlalElemX,
+                                                   SqdmlalElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x4:
+            if (u && !(size == 0x0 || size == 0x3))
+                return decodeNeonUThreeImmHAndWReg<MlsElemDX, MlsElemQX>(
+                    q, size, machInst, vd, vn, vm, index);
+            else
+                return new Unknown64(machInst);
+          case 0x5:
+            if (!u && size >= 0x2 && sz_L != 0x3 && sz_q != 0x2)
+                return decodeNeonUThreeImmFpReg<FmlsElemDX, FmlsElemQX>(
+                    q, sz, machInst, vd, vn, vm_fp, index_fp);
+            else
+                return new Unknown64(machInst);
+          case 0x6:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeImmHAndWReg<UmlslElemX, UmlslElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+            else
+                return decodeNeonSThreeImmHAndWReg<SmlslElemX, SmlslElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x7:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmlslElemX,
+                                                   SqdmlslElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x8:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeImmHAndWReg<MulElemDX, MulElemQX>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0x9:
+            if (size >= 2 && sz_q != 0x2 && sz_L != 0x3) {
+                if (u)
+                    return decodeNeonUThreeImmFpReg<FmulxElemDX, FmulxElemQX>(
+                        q, sz, machInst, vd, vn, vm_fp, index_fp);
+                else
+                    return decodeNeonUThreeImmFpReg<FmulElemDX, FmulElemQX>(
+                        q, sz, machInst, vd, vn, vm_fp, index_fp);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0xa:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeImmHAndWReg<UmullElemX, UmullElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+            else
+                return decodeNeonSThreeImmHAndWReg<SmullElemX, SmullElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0xb:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmullElemX, SqdmullElem2X>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0xc:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmulhElemDX,
+                                                   SqdmulhElemQX>(
+                    q, size, machInst, vd, vn, vm, index);
+          case 0xd:
+            if (u || (size == 0x0 || size == 0x3))
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqrdmulhElemDX,
+                                                   SqrdmulhElemQX>(
+                    q, size, machInst, vd, vn, vm, index);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
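+    // Decode AArch64 Advanced SIMD modified-immediate instructions:
+    // MOVI/MVNI, ORR/BIC (vector, immediate) and FMOV (vector,
+    // immediate). The abcdefgh field is expanded to a 64-bit immediate
+    // by simd_modified_imm() according to op and cmode before the
+    // specific instruction is chosen.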
+    StaticInstPtr
+    decodeNeonModImm(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t op = bits(machInst, 29);
+        uint8_t abcdefgh = (bits(machInst, 18, 16) << 5) |
+                           bits(machInst, 9, 5);
+        uint8_t cmode = bits(machInst, 15, 12);
+        uint8_t o2 = bits(machInst, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+
+        if (o2 == 0x1 || (op == 0x1 && cmode == 0xf && !q))
+            return new Unknown64(machInst);
+
+        bool immValid = true;
+        const uint64_t bigImm = simd_modified_imm(op, cmode, abcdefgh,
+                                                  immValid,
+                                                  true /* isAarch64 */);
+        if (!immValid) {
+            return new Unknown64(machInst);
+        }
+
+        if (op) {
+            if (bits(cmode, 3) == 0) {
+                if (bits(cmode, 0) == 0) {
+                    if (q)
+                        return new MvniQX<uint64_t>(machInst, vd, bigImm);
+                    else
+                        return new MvniDX<uint64_t>(machInst, vd, bigImm);
+                } else {
+                    if (q)
+                        return new BicImmQX<uint64_t>(machInst, vd, bigImm);
+                    else
+                        return new BicImmDX<uint64_t>(machInst, vd, bigImm);
+                }
+            } else {
+                if (bits(cmode, 2) == 1) {
+                    switch (bits(cmode, 1, 0)) {
+                      case 0:
+                      case 1:
+                        if (q)
+                            return new MvniQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MvniDX<uint64_t>(machInst, vd, bigImm);
+                      case 2:
+                        if (q)
+                            return new MoviQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MoviDX<uint64_t>(machInst, vd, bigImm);
+                      case 3:
+                        if (q)
+                            return new FmovQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MoviDX<uint64_t>(machInst, vd, bigImm);
+                    }
+                } else {
+                    if (bits(cmode, 0) == 0) {
+                        if (q)
+                            return new MvniQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MvniDX<uint64_t>(machInst, vd, bigImm);
+                    } else {
+                        if (q)
+                            return new BicImmQX<uint64_t>(machInst, vd,
+                                                          bigImm);
+                        else
+                            return new BicImmDX<uint64_t>(machInst, vd,
+                                                          bigImm);
+                    }
+                }
+            }
+        } else {
+            if (bits(cmode, 3) == 0) {
+                if (bits(cmode, 0) == 0) {
+                    if (q)
+                        return new MoviQX<uint64_t>(machInst, vd, bigImm);
+                    else
+                        return new MoviDX<uint64_t>(machInst, vd, bigImm);
+                } else {
+                    if (q)
+                        return new OrrImmQX<uint64_t>(machInst, vd, bigImm);
+                    else
+                        return new OrrImmDX<uint64_t>(machInst, vd, bigImm);
+                }
+            } else {
+                if (bits(cmode, 2) == 1) {
+                    if (bits(cmode, 1, 0) == 0x3) {
+                        if (q)
+                            return new FmovQX<uint32_t>(machInst, vd, bigImm);
+                        else
+                            return new FmovDX<uint32_t>(machInst, vd, bigImm);
+                    } else {
+                        if (q)
+                            return new MoviQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MoviDX<uint64_t>(machInst, vd, bigImm);
+                    }
+                } else {
+                    if (bits(cmode, 0) == 0) {
+                        if (q)
+                            return new MoviQX<uint64_t>(machInst, vd, bigImm);
+                        else
+                            return new MoviDX<uint64_t>(machInst, vd, bigImm);
+                    } else {
+                        if (q)
+                            return new OrrImmQX<uint64_t>(machInst, vd,
+                                                          bigImm);
+                        else
+                            return new OrrImmDX<uint64_t>(machInst, vd, bigImm);
+                    }
+                }
+            }
+        }
+        return new Unknown64(machInst);
+    }
+
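+    // Decode AArch64 Advanced SIMD shift-by-immediate instructions.
+    // The element size is the position of the most significant set bit
+    // of immh; immh:immb encodes the shift amount, interpreted as a
+    // right-shift distance for the shift-right/accumulate/narrowing
+    // forms and a left-shift distance for SHL/SLI and the long forms.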
+    StaticInstPtr
+    decodeNeonShiftByImm(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t u = bits(machInst, 29);
+        uint8_t immh = bits(machInst, 22, 19);
+        uint8_t immb = bits(machInst, 18, 16);
+        uint8_t opcode = bits(machInst, 15, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        uint8_t immh3 = bits(machInst, 22);
+        uint8_t immh3_q = (immh3 << 1) | q;
+        uint8_t op_u = (bits(machInst, 12) << 1) | u;
+        uint8_t size = findMsbSet(immh);
+        int shiftAmt = 0;
+
+        switch (opcode) {
+          case 0x00:
+            if (immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftXReg<UshrDX, UshrQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftXReg<SshrDX, SshrQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x02:
+            if (immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftXReg<UsraDX, UsraQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftXReg<SsraDX, SsraQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x04:
+            if (immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftXReg<UrshrDX, UrshrQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftXReg<SrshrDX, SrshrQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x06:
+            if (immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftXReg<UrsraDX, UrsraQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftXReg<SrsraDX, SrsraQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x08:
+            if (u && !(immh3_q == 0x2)) {
+                shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+                return decodeNeonUTwoShiftXReg<SriDX, SriQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x0a:
+            if (immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = ((immh << 3) | immb) - (8 << size);
+            if (u)
+                return decodeNeonUTwoShiftXReg<SliDX, SliQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonUTwoShiftXReg<ShlDX, ShlQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x0c:
+            if (u && !(immh3_q == 0x2 || op_u == 0x0)) {
+                shiftAmt = ((immh << 3) | immb) - (8 << size);
+                return decodeNeonSTwoShiftXReg<SqshluDX, SqshluQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x0e:
+            if (immh3_q == 0x2 || op_u == 0x0)
+                return new Unknown64(machInst);
+            shiftAmt = ((immh << 3) | immb) - (8 << size);
+            if (u)
+                return decodeNeonUTwoShiftXReg<UqshlImmDX, UqshlImmQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftXReg<SqshlImmDX, SqshlImmQX>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x10:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonSTwoShiftSReg<SqshrunX, Sqshrun2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonUTwoShiftSReg<ShrnX, Shrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x11:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonSTwoShiftSReg<SqrshrunX, Sqrshrun2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonUTwoShiftSReg<RshrnX, Rshrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x12:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftSReg<UqshrnX, Uqshrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftSReg<SqshrnX, Sqshrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x13:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftSReg<UqrshrnX, Uqrshrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftSReg<SqrshrnX, Sqrshrn2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x14:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = ((immh << 3) | immb) - (8 << size);
+            if (u)
+                return decodeNeonUTwoShiftSReg<UshllX, Ushll2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftSReg<SshllX, Sshll2X>(
+                    q, size, machInst, vd, vn, shiftAmt);
+          case 0x1c:
+            if (immh < 0x4 || immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u) {
+                return decodeNeonUTwoShiftFpReg<UcvtfFixedDX, UcvtfFixedQX>(
+                    q, size & 0x1, machInst, vd, vn, shiftAmt);
+            } else {
+                if (q) {
+                    if (size & 0x1)
+                        return new ScvtfFixedDQX<uint64_t>(machInst, vd, vn,
+                                                           shiftAmt);
+                    else
+                        return new ScvtfFixedSQX<uint32_t>(machInst, vd, vn,
+                                                           shiftAmt);
+                } else {
+                    if (size & 0x1)
+                        return new Unknown64(machInst);
+                    else
+                        return new ScvtfFixedDX<uint32_t>(machInst, vd, vn,
+                                                          shiftAmt);
+                }
+            }
+          case 0x1f:
+            if (immh < 0x4 || immh3_q == 0x2)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftFpReg<FcvtzuFixedDX, FcvtzuFixedQX>(
+                    q, size & 0x1, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonUTwoShiftFpReg<FcvtzsFixedDX, FcvtzsFixedQX>(
+                    q, size & 0x1, machInst, vd, vn, shiftAmt);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
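+    // Decode AArch64 Advanced SIMD table-lookup instructions: TBL and
+    // TBX with one to four consecutive table registers, selected by
+    // bits 14:12.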
+    StaticInstPtr
+    decodeNeonTblTbx(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        uint8_t switchVal = bits(machInst, 14, 12);
+
+        switch (switchVal) {
+          case 0x0:
+            if (q)
+                return new Tbl1QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbl1DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x1:
+            if (q)
+                return new Tbx1QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbx1DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x2:
+            if (q)
+                return new Tbl2QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbl2DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x3:
+            if (q)
+                return new Tbx2QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbx2DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x4:
+            if (q)
+                return new Tbl3QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbl3DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x5:
+            if (q)
+                return new Tbx3QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbx3DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x6:
+            if (q)
+                return new Tbl4QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbl4DX<uint8_t>(machInst, vd, vn, vm);
+          case 0x7:
+            if (q)
+                return new Tbx4QX<uint8_t>(machInst, vd, vn, vm);
+            else
+                return new Tbx4DX<uint8_t>(machInst, vd, vn, vm);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
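+    // Decode AArch64 Advanced SIMD permute instructions: UZP1/UZP2,
+    // TRN1/TRN2 and ZIP1/ZIP2.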
+    StaticInstPtr
+    decodeNeonZipUzpTrn(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 14, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        switch (opcode) {
+          case 0x1:
+            return decodeNeonUThreeXReg<Uzp1DX, Uzp1QX>(
+                q, size, machInst, vd, vn, vm);
+          case 0x2:
+            return decodeNeonUThreeXReg<Trn1DX, Trn1QX>(
+                q, size, machInst, vd, vn, vm);
+          case 0x3:
+            return decodeNeonUThreeXReg<Zip1DX, Zip1QX>(
+                q, size, machInst, vd, vn, vm);
+          case 0x5:
+            return decodeNeonUThreeXReg<Uzp2DX, Uzp2QX>(
+                q, size, machInst, vd, vn, vm);
+          case 0x6:
+            return decodeNeonUThreeXReg<Trn2DX, Trn2QX>(
+                q, size, machInst, vd, vn, vm);
+          case 0x7:
+            return decodeNeonUThreeXReg<Zip2DX, Zip2QX>(
+                q, size, machInst, vd, vn, vm);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
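+    // Decode the AArch64 Advanced SIMD extract instruction (EXT):
+    // concatenate Vn:Vm and extract a byte-aligned slice starting at
+    // imm4. Only the low three bits of the index are valid for the
+    // 64-bit (Q == 0) form.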
+    StaticInstPtr
+    decodeNeonExt(ExtMachInst machInst)
+    {
+        uint8_t q = bits(machInst, 30);
+        uint8_t op2 = bits(machInst, 23, 22);
+        uint8_t imm4 = bits(machInst, 14, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        if (op2 != 0 || (q == 0x0 && bits(imm4, 3) == 0x1))
+            return new Unknown64(machInst);
+
+        uint8_t index = q ? imm4 : imm4 & 0x7;
+
+        if (q) {
+            return new ExtQX<uint8_t>(machInst, vd, vn, vm, index);
+        } else {
+            return new ExtDX<uint8_t>(machInst, vd, vn, vm, index);
+        }
+    }
+
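+    // Decode AArch64 Advanced SIMD scalar three-same instructions:
+    // saturating adds, subtracts and shifts (SQADD/UQADD, SQSUB/UQSUB,
+    // SQSHL/UQSHL, SQRSHL/UQRSHL), 64-bit compares, shifts and
+    // add/subtract, SQDMULH/SQRDMULH, and the scalar FP forms (FABD,
+    // FMULX, FCM*, FAC*, FRECPS, FRSQRTS).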
+    StaticInstPtr
+    decodeNeonSc3Same(ExtMachInst machInst)
+    {
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 15, 11);
+        uint8_t s = bits(machInst, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        switch (opcode) {
+          case 0x01:
+            if (u)
+                return decodeNeonUThreeUReg<UqaddScX>(
+                    size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeUReg<SqaddScX>(
+                    size, machInst, vd, vn, vm);
+          case 0x05:
+            if (u)
+                return decodeNeonUThreeUReg<UqsubScX>(
+                    size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeUReg<SqsubScX>(
+                    size, machInst, vd, vn, vm);
+          case 0x06:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new CmhiDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new CmgtDX<int64_t>(machInst, vd, vn, vm);
+          case 0x07:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new CmhsDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new CmgeDX<int64_t>(machInst, vd, vn, vm);
+          case 0x08:
+            if (!s && size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new UshlDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new SshlDX<int64_t>(machInst, vd, vn, vm);
+          case 0x09:
+            if (!s && size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeUReg<UqshlScX>(
+                    size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeUReg<SqshlScX>(
+                    size, machInst, vd, vn, vm);
+          case 0x0a:
+            if (!s && size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new UrshlDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new SrshlDX<int64_t>(machInst, vd, vn, vm);
+          case 0x0b:
+            if (!s && size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeUReg<UqrshlScX>(
+                    size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeUReg<SqrshlScX>(
+                    size, machInst, vd, vn, vm);
+          case 0x10:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new SubDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new AddDX<uint64_t>(machInst, vd, vn, vm);
+          case 0x11:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return new CmeqDX<uint64_t>(machInst, vd, vn, vm);
+            else
+                return new CmtstDX<uint64_t>(machInst, vd, vn, vm);
+          case 0x16:
+            if (size == 0x3 || size == 0x0)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonSThreeHAndWReg<SqrdmulhScX>(
+                    size, machInst, vd, vn, vm);
+            else
+                return decodeNeonSThreeHAndWReg<SqdmulhScX>(
+                    size, machInst, vd, vn, vm);
+          case 0x1a:
+            if (!u || size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeScFpReg<FabdScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+          case 0x1b:
+            if (u || size > 0x1)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeScFpReg<FmulxScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+          case 0x1c:
+            if (size < 0x2) {
+                if (u)
+                    return decodeNeonUThreeScFpReg<FcmgeScX>(
+                        size & 0x1, machInst, vd, vn, vm);
+                else
+                    return decodeNeonUThreeScFpReg<FcmeqScX>(
+                        size & 0x1, machInst, vd, vn, vm);
+            } else {
+                if (u)
+                    return decodeNeonUThreeScFpReg<FcmgtScX>(
+                        size & 0x1, machInst, vd, vn, vm);
+                else
+                    return new Unknown64(machInst);
+            }
+          case 0x1d:
+            if (!u)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUThreeScFpReg<FacgeScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeScFpReg<FacgtScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+          case 0x1f:
+            if (u)
+                return new Unknown64(machInst);
+            if (size < 0x2)
+                return decodeNeonUThreeScFpReg<FrecpsScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+            else
+                return decodeNeonUThreeScFpReg<FrsqrtsScX>(
+                    size & 0x1, machInst, vd, vn, vm);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
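+    // Decode AArch64 Advanced SIMD scalar three-different instructions:
+    // the widening saturating doubling multiplies SQDMLAL, SQDMLSL and
+    // SQDMULL.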
+    StaticInstPtr
+    decodeNeonSc3Diff(ExtMachInst machInst)
+    {
+        if (bits(machInst, 29))
+            return new Unknown64(machInst);
+
+        uint8_t size = bits(machInst, 23, 22);
+        if (size == 0x0 || size == 0x3)
+            return new Unknown64(machInst);
+
+        uint8_t opcode = bits(machInst, 15, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        switch (opcode) {
+          case 0x9:
+            return decodeNeonSThreeHAndWReg<SqdmlalScX>(
+                size, machInst, vd, vn, vm);
+          case 0xb:
+            return decodeNeonSThreeHAndWReg<SqdmlslScX>(
+                size, machInst, vd, vn, vm);
+          case 0xd:
+            return decodeNeonSThreeHAndWReg<SqdmullScX>(
+                size, machInst, vd, vn, vm);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
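+    // Decode AArch64 Advanced SIMD scalar two-register miscellaneous
+    // instructions: saturating accumulate, absolute value and negate
+    // (SUQADD/USQADD, SQABS/SQNEG), 64-bit compares against zero,
+    // narrowing (SQXTN, SQXTUN, UQXTN, FCVTXN) and the scalar FP
+    // conversions, compares and reciprocal estimates.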
+    StaticInstPtr
+    decodeNeonSc2RegMisc(ExtMachInst machInst)
+    {
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 16, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        uint8_t switchVal = opcode | ((u ? 1 : 0) << 5);
+        switch (switchVal) {
+          case 0x03:
+            return decodeNeonUTwoMiscUReg<SuqaddScX>(size, machInst, vd, vn);
+          case 0x07:
+            return decodeNeonSTwoMiscUReg<SqabsScX>(size, machInst, vd, vn);
+          case 0x08:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new CmgtZeroDX<int64_t>(machInst, vd, vn);
+          case 0x09:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new CmeqZeroDX<int64_t>(machInst, vd, vn);
+          case 0x0a:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new CmltZeroDX<int64_t>(machInst, vd, vn);
+          case 0x0b:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new AbsDX<int64_t>(machInst, vd, vn);
+          case 0x0c:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcmgtZeroScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x0d:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcmeqZeroScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x0e:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcmltZeroScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x14:
+            switch (size) {
+              case 0x0:
+                return new SqxtnScX<int8_t>(machInst, vd, vn);
+              case 0x1:
+                return new SqxtnScX<int16_t>(machInst, vd, vn);
+              case 0x2:
+                return new SqxtnScX<int32_t>(machInst, vd, vn);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x1a:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtnsScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcvtpsScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x1b:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtmsScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcvtzsIntScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x1c:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtasScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return new Unknown64(machInst);
+          case 0x1d:
+            if (size < 0x2) {
+                if (size & 0x1)
+                    return new ScvtfIntScDX<uint64_t>(machInst, vd, vn);
+                else
+                    return new ScvtfIntScSX<uint32_t>(machInst, vd, vn);
+            } else {
+                return decodeNeonUTwoMiscScFpReg<FrecpeScX>(
+                    size & 0x1, machInst, vd, vn);
+            }
+          case 0x1f:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FrecpxX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x23:
+            return decodeNeonUTwoMiscUReg<UsqaddScX>(size, machInst, vd, vn);
+          case 0x27:
+            return decodeNeonSTwoMiscUReg<SqnegScX>(size, machInst, vd, vn);
+          case 0x28:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new CmgeZeroDX<int64_t>(machInst, vd, vn);
+          case 0x29:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new CmleZeroDX<int64_t>(machInst, vd, vn);
+          case 0x2b:
+            if (size != 0x3)
+                return new Unknown64(machInst);
+            else
+                return new NegDX<int64_t>(machInst, vd, vn);
+          case 0x2c:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcmgeZeroScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x2d:
+            if (size < 0x2)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcmleZeroScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x32:
+            switch (size) {
+              case 0x0:
+                return new SqxtunScX<int8_t>(machInst, vd, vn);
+              case 0x1:
+                return new SqxtunScX<int16_t>(machInst, vd, vn);
+              case 0x2:
+                return new SqxtunScX<int32_t>(machInst, vd, vn);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x34:
+            switch (size) {
+              case 0x0:
+                return new UqxtnScX<uint8_t>(machInst, vd, vn);
+              case 0x1:
+                return new UqxtnScX<uint16_t>(machInst, vd, vn);
+              case 0x2:
+                return new UqxtnScX<uint32_t>(machInst, vd, vn);
+              default:
+                return new Unknown64(machInst);
+            }
+          case 0x36:
+            if (size != 0x1) {
+                return new Unknown64(machInst);
+            } else {
+                return new FcvtxnScX<uint32_t>(machInst, vd, vn);
+            }
+          case 0x3a:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtnuScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcvtpuScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x3b:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtmuScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscScFpReg<FcvtzuIntScX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x3c:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<FcvtauScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return new Unknown64(machInst);
+          case 0x3d:
+            if (size < 0x2)
+                return decodeNeonUTwoMiscScFpReg<UcvtfIntScX>(
+                    size & 0x1, machInst, vd, vn);
+            else
+                return decodeNeonUTwoMiscScFpReg<FrsqrteScX>(
+                    size & 0x1, machInst, vd, vn);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
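+    // Decode AArch64 Advanced SIMD scalar pairwise instructions: ADDP
+    // (scalar) and the pairwise FP reductions FADDP, FMAXP/FMAXNMP and
+    // FMINP/FMINNMP.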
+    StaticInstPtr
+    decodeNeonScPwise(ExtMachInst machInst)
+    {
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 16, 12);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        if (!u) {
+            if (opcode == 0x1b && size == 0x3)
+                return new AddpScQX<uint64_t>(machInst, vd, vn);
+            else
+                return new Unknown64(machInst);
+        }
+
+        uint8_t switchVal = opcode | (size << 5);
+        switch (switchVal) {
+          case 0x0c:
+          case 0x2c:
+            return decodeNeonUTwoMiscPwiseScFpReg<FmaxnmpScDX, FmaxnmpScQX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x0d:
+          case 0x2d:
+            return decodeNeonUTwoMiscPwiseScFpReg<FaddpScDX, FaddpScQX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x0f:
+          case 0x2f:
+            return decodeNeonUTwoMiscPwiseScFpReg<FmaxpScDX, FmaxpScQX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x4c:
+          case 0x6c:
+            return decodeNeonUTwoMiscPwiseScFpReg<FminnmpScDX, FminnmpScQX>(
+                    size & 0x1, machInst, vd, vn);
+          case 0x4f:
+          case 0x6f:
+            return decodeNeonUTwoMiscPwiseScFpReg<FminpScDX, FminpScQX>(
+                    size & 0x1, machInst, vd, vn);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
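+    // Decode the AArch64 Advanced SIMD scalar copy instruction, DUP
+    // (element to scalar); the element size and index are derived from
+    // imm5 as in decodeNeonCopy.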
+    StaticInstPtr
+    decodeNeonScCopy(ExtMachInst machInst)
+    {
+        if (bits(machInst, 14, 11) != 0 || bits(machInst, 29))
+            return new Unknown64(machInst);
+
+        uint8_t imm5 = bits(machInst, 20, 16);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        uint8_t size = findLsbSet(imm5);
+        uint8_t index = bits(imm5, 4, size + 1);
+
+        return decodeNeonUTwoShiftUReg<DupElemScX>(
+            size, machInst, vd, vn, index);
+    }
+
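+    // Decode AArch64 Advanced SIMD scalar-by-element instructions: the
+    // scalar forms of the indexed multiplies (SQDMLAL, SQDMLSL,
+    // SQDMULL, SQDMULH, SQRDMULH, FMLA, FMLS, FMUL, FMULX), with the
+    // element index built from the H, L and M bits as in the vector
+    // version.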
+    StaticInstPtr
+    decodeNeonScIndexedElem(ExtMachInst machInst)
+    {
+        uint8_t u = bits(machInst, 29);
+        uint8_t size = bits(machInst, 23, 22);
+        uint8_t L = bits(machInst, 21);
+        uint8_t M = bits(machInst, 20);
+        uint8_t opcode = bits(machInst, 15, 12);
+        uint8_t H = bits(machInst, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm_bf = (IntRegIndex) (uint8_t) bits(machInst, 19, 16);
+
+        uint8_t index = 0;
+        uint8_t index_fp = 0;
+        uint8_t vmh = 0;
+        uint8_t sz_L = bits(machInst, 22, 21);
+
+        // Index and 2nd register operand for integer instructions
+        if (size == 0x1) {
+            index = (H << 2) | (L << 1) | M;
+            // vmh = 0;
+        } else if (size == 0x2) {
+            index = (H << 1) | L;
+            vmh = M;
+        } else if (size == 0x3) {
+            index = H;
+            vmh = M;
+        }
+        IntRegIndex vm = (IntRegIndex) (uint8_t) (vmh << 4 | vm_bf);
+
+        // Index and 2nd register operand for FP instructions
+        vmh = M;
+        if ((size & 0x1) == 0) {
+            index_fp = (H << 1) | L;
+        } else if (L == 0) {
+            index_fp = H;
+        }
+        IntRegIndex vm_fp = (IntRegIndex) (uint8_t) (vmh << 4 | vm_bf);
+
+        if (u && opcode != 9)
+            return new Unknown64(machInst);
+
+        switch (opcode) {
+          case 0x1:
+            if (size < 2 || sz_L == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeImmScFpReg<FmlaElemScX>(
+                    size & 0x1, machInst, vd, vn, vm_fp, index_fp);
+          case 0x3:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmlalElemScX>(
+                    size, machInst, vd, vn, vm, index);
+          case 0x5:
+            if (size < 2 || sz_L == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonUThreeImmScFpReg<FmlsElemScX>(
+                    size & 0x1, machInst, vd, vn, vm_fp, index_fp);
+          case 0x7:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmlslElemScX>(
+                    size, machInst, vd, vn, vm, index);
+          case 0x9:
+            if (size < 2 || sz_L == 0x3)
+                return new Unknown64(machInst);
+            if (u)
+                return decodeNeonUThreeImmScFpReg<FmulxElemScX>(
+                    size & 0x1, machInst, vd, vn, vm_fp, index_fp);
+            else
+                return decodeNeonUThreeImmScFpReg<FmulElemScX>(
+                    size & 0x1, machInst, vd, vn, vm_fp, index_fp);
+          case 0xb:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmullElemScX>(
+                    size, machInst, vd, vn, vm, index);
+          case 0xc:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqdmulhElemScX>(
+                    size, machInst, vd, vn, vm, index);
+          case 0xd:
+            if (size == 0x0 || size == 0x3)
+                return new Unknown64(machInst);
+            else
+                return decodeNeonSThreeImmHAndWReg<SqrdmulhElemScX>(
+                    size, machInst, vd, vn, vm, index);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
+    StaticInstPtr
+    decodeNeonScShiftByImm(ExtMachInst machInst)
+    {
+        bool u = bits(machInst, 29);
+        uint8_t immh = bits(machInst, 22, 19);
+        uint8_t immb = bits(machInst, 18, 16);
+        uint8_t opcode = bits(machInst, 15, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+
+        if (immh == 0x0)
+            return new Unknown64(machInst);
+
+        uint8_t immh3 = bits(machInst, 22);
+        uint8_t size = findMsbSet(immh);
+        int shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
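+        // Example: immh:immb = 0b1000:0b101 gives size = 3 and a right-
+        // shift amount of (8 << 4) - 69 = 59; the left-shift form used
+        // below, ((immh << 3) | immb) - (8 << size), gives 69 - 64 = 5.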
+
+        switch (opcode) {
+          case 0x00:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return new UshrDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new SshrDX<int64_t>(machInst, vd, vn, shiftAmt);
+          case 0x02:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return new UsraDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new SsraDX<int64_t>(machInst, vd, vn, shiftAmt);
+          case 0x04:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return new UrshrDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new SrshrDX<int64_t>(machInst, vd, vn, shiftAmt);
+          case 0x06:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return new UrsraDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new SrsraDX<int64_t>(machInst, vd, vn, shiftAmt);
+          case 0x08:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return new SriDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new Unknown64(machInst);
+          case 0x0a:
+            if (!immh3)
+                return new Unknown64(machInst);
+            shiftAmt = ((immh << 3) | immb) - (8 << size);
+            if (u)
+                return new SliDX<uint64_t>(machInst, vd, vn, shiftAmt);
+            else
+                return new ShlDX<uint64_t>(machInst, vd, vn, shiftAmt);
+          case 0x0c:
+            if (u) {
+                shiftAmt = ((immh << 3) | immb) - (8 << size);
+                return decodeNeonSTwoShiftUReg<SqshluScX>(
+                    size, machInst, vd, vn, shiftAmt);
+            } else {
+                return new Unknown64(machInst);
+            }
+          case 0x0e:
+            shiftAmt = ((immh << 3) | immb) - (8 << size);
+            if (u)
+                return decodeNeonUTwoShiftUReg<UqshlImmScX>(
+                    size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftUReg<SqshlImmScX>(
+                    size, machInst, vd, vn, shiftAmt);
+          case 0x10:
+            if (!u || immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            return decodeNeonSTwoShiftUSReg<SqshrunScX>(
+                size, machInst, vd, vn, shiftAmt);
+          case 0x11:
+            if (!u || immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            return decodeNeonSTwoShiftUSReg<SqrshrunScX>(
+                size, machInst, vd, vn, shiftAmt);
+          case 0x12:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftUSReg<UqshrnScX>(
+                    size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftUSReg<SqshrnScX>(
+                    size, machInst, vd, vn, shiftAmt);
+          case 0x13:
+            if (immh3)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftUSReg<UqrshrnScX>(
+                    size, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonSTwoShiftUSReg<SqrshrnScX>(
+                    size, machInst, vd, vn, shiftAmt);
+          case 0x1c:
+            if (immh < 0x4)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u) {
+                return decodeNeonUTwoShiftUFpReg<UcvtfFixedScX>(
+                    size & 0x1, machInst, vd, vn, shiftAmt);
+            } else {
+                if (size & 0x1)
+                    return new ScvtfFixedScDX<uint64_t>(machInst, vd, vn,
+                                                        shiftAmt);
+                else
+                    return new ScvtfFixedScSX<uint32_t>(machInst, vd, vn,
+                                                        shiftAmt);
+            }
+          case 0x1f:
+            if (immh < 0x4)
+                return new Unknown64(machInst);
+            shiftAmt = (8 << (size + 1)) - ((immh << 3) | immb);
+            if (u)
+                return decodeNeonUTwoShiftUFpReg<FcvtzuFixedScX>(
+                    size & 0x1, machInst, vd, vn, shiftAmt);
+            else
+                return decodeNeonUTwoShiftUFpReg<FcvtzsFixedScX>(
+                    size & 0x1, machInst, vd, vn, shiftAmt);
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
+    StaticInstPtr
+    decodeNeonMem(ExtMachInst machInst)
+    {
+        uint8_t dataSize = bits(machInst, 30) ? 128 : 64;
+        bool multiple = bits(machInst, 24, 23) < 0x2;
+        bool load = bits(machInst, 22);
+
+        uint8_t numStructElems = 0;
+        uint8_t numRegs = 0;
+
+        if (multiple) {  // AdvSIMD load/store multiple structures
+            uint8_t opcode = bits(machInst, 15, 12);
+            uint8_t eSize = bits(machInst, 11, 10);
+            bool wb = !(bits(machInst, 20, 16) == 0x0 && !bits(machInst, 23));
+
+            switch (opcode) {
+              case 0x0:  // LD/ST4 (4 regs)
+                numStructElems = 4;
+                numRegs = 4;
+                break;
+              case 0x2:  // LD/ST1 (4 regs)
+                numStructElems = 1;
+                numRegs = 4;
+                break;
+              case 0x4:  // LD/ST3 (3 regs)
+                numStructElems = 3;
+                numRegs = 3;
+                break;
+              case 0x6:  // LD/ST1 (3 regs)
+                numStructElems = 1;
+                numRegs = 3;
+                break;
+              case 0x7:  // LD/ST1 (1 reg)
+                numStructElems = 1;
+                numRegs = 1;
+                break;
+              case 0x8:  // LD/ST2 (2 regs)
+                numStructElems = 2;
+                numRegs = 2;
+                break;
+              case 0xa:  // LD/ST1 (2 regs)
+                numStructElems = 1;
+                numRegs = 2;
+                break;
+              default:
+                return new Unknown64(machInst);
+            }
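+
+            // Example: LD4 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] decodes to
+            // opcode 0x0 (numStructElems = numRegs = 4), eSize = 0x2
+            // (32-bit elements) and dataSize = 128.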
+
+            IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+            IntRegIndex rm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+            if (load) {
+                return new VldMult64(machInst, rn, vd, rm, eSize, dataSize,
+                                     numStructElems, numRegs, wb);
+            } else {
+                return new VstMult64(machInst, rn, vd, rm, eSize, dataSize,
+                                     numStructElems, numRegs, wb);
+            }
+        } else {  // AdvSIMD load/store single structure
+            uint8_t scale = bits(machInst, 15, 14);
+            numStructElems = (((uint8_t) bits(machInst, 13) << 1) |
+                              (uint8_t) bits(machInst, 21)) + 1;
+            uint8_t index = 0;
+            bool wb = !(bits(machInst, 20, 16) == 0x0 && !bits(machInst, 23));
+            bool replicate = false;
+
+            switch (scale) {
+              case 0x0:
+                index = ((uint8_t) bits(machInst, 30) << 3) |
+                    ((uint8_t) bits(machInst, 12) << 2) |
+                    (uint8_t) bits(machInst, 11, 10);
+                break;
+              case 0x1:
+                index = ((uint8_t) bits(machInst, 30) << 2) |
+                    ((uint8_t) bits(machInst, 12) << 1) |
+                    (uint8_t) bits(machInst, 11);
+                break;
+              case 0x2:
+                if (bits(machInst, 10) == 0x0) {
+                    index = ((uint8_t) bits(machInst, 30) << 1) |
+                        bits(machInst, 12);
+                } else {
+                    index = (uint8_t) bits(machInst, 30);
+                    scale = 0x3;
+                }
+                break;
+              case 0x3:
+                scale = bits(machInst, 11, 10);
+                replicate = true;
+                break;
+              default:
+                return new Unknown64(machInst);
+            }
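+
+            // Example: LD1 {v2.s}[1], [x0] has scale = 0x2 with bit 10
+            // clear, so index = Q:S = 0b01 = 1 and eSize = 2 (32-bit).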
+
+            uint8_t eSize = scale;
+
+            IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+            IntRegIndex rn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+            IntRegIndex rm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+            if (load) {
+                return new VldSingle64(machInst, rn, vd, rm, eSize, dataSize,
+                                       numStructElems, index, wb, replicate);
+            } else {
+                return new VstSingle64(machInst, rn, vd, rm, eSize, dataSize,
+                                       numStructElems, index, wb, replicate);
+            }
+        }
+    }
+}
+}};
diff --git a/src/arch/arm/isa/formats/uncond.isa b/src/arch/arm/isa/formats/uncond.isa
index 4a18a55..c376cd9 100644
--- a/src/arch/arm/isa/formats/uncond.isa
+++ b/src/arch/arm/isa/formats/uncond.isa
@@ -99,11 +99,11 @@
                       case 0x1:
                         return new Clrex(machInst);
                       case 0x4:
-                        return new Dsb(machInst);
+                        return new Dsb(machInst, 0);
                       case 0x5:
-                        return new Dmb(machInst);
+                        return new Dmb(machInst, 0);
                       case 0x6:
-                        return new Isb(machInst);
+                        return new Isb(machInst, 0);
                     }
                 }
             } else if (bits(op2, 0) == 0) {
@@ -166,7 +166,7 @@
                     const uint32_t val = ((machInst >> 20) & 0x5);
                     if (val == 0x4) {
                         const uint32_t mode = bits(machInst, 4, 0);
-                        if (badMode((OperatingMode)mode))
+                        if (badMode32((OperatingMode)mode))
                             return new Unknown(machInst);
                         switch (bits(machInst, 24, 21)) {
                           case 0x2:
@@ -250,17 +250,10 @@
                                         "ldc, ldc2 (immediate)", machInst);
                             }
                         }
-                        if (op1 == 0xC5) {
-                            return new WarnUnimplemented(
-                                    "mrrc, mrrc2", machInst);
-                        }
                     } else {
                         if (bits(op1, 4, 3) != 0 || bits(op1, 1) == 1) {
                             return new WarnUnimplemented(
                                     "stc, stc2", machInst);
-                        } else if (op1 == 0xC4) {
-                            return new WarnUnimplemented(
-                                    "mcrr, mcrrc", machInst);
                         }
                     }
                 }
diff --git a/src/arch/arm/isa/formats/unimp.isa b/src/arch/arm/isa/formats/unimp.isa
index 1c9a4b4..8e34611 100644
--- a/src/arch/arm/isa/formats/unimp.isa
+++ b/src/arch/arm/isa/formats/unimp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -85,6 +85,9 @@
       private:
         /// Have we warned on this instruction yet?
         mutable bool warned;
+        /// Full mnemonic for MRC and MCR instructions including the
+        /// coproc. register name
+        std::string fullMnemonic;
 
       public:
         /// Constructor
@@ -96,6 +99,16 @@
             flags[IsNonSpeculative] = true;
         }
 
+        WarnUnimplemented(const char *_mnemonic, ExtMachInst _machInst,
+                          const std::string& _fullMnemonic)
+            : ArmStaticInst(_mnemonic, _machInst, No_OpClass), warned(false),
+              fullMnemonic(_fullMnemonic)
+        {
+            // don't call execute() (which panics) if we're on a
+            // speculative path
+            flags[IsNonSpeculative] = true;
+        }
+
         %(BasicExecDeclare)s
 
         std::string
@@ -147,10 +160,7 @@
     FailUnimplemented::execute(%(CPU_exec_context)s *xc,
                                Trace::InstRecord *traceData) const
     {
-        if (FullSystem)
-            return new UndefinedInstruction;
-        else
-            return new UndefinedInstruction(machInst, false, mnemonic);
+        return new UndefinedInstruction(machInst, false, mnemonic);
     }
 
     Fault
@@ -158,7 +168,8 @@
                                Trace::InstRecord *traceData) const
     {
         if (!warned) {
-            warn("\tinstruction '%s' unimplemented\n", mnemonic);
+            warn("\tinstruction '%s' unimplemented\n",
+                 fullMnemonic.size() ? fullMnemonic.c_str() : mnemonic);
             warned = true;
         }
 
diff --git a/src/arch/arm/isa/includes.isa b/src/arch/arm/isa/includes.isa
index 5dd13d6..a2ce843 100644
--- a/src/arch/arm/isa/includes.isa
+++ b/src/arch/arm/isa/includes.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -50,10 +50,16 @@
 #include <sstream>
 
 #include "arch/arm/insts/branch.hh"
+#include "arch/arm/insts/branch64.hh"
+#include "arch/arm/insts/data64.hh"
+#include "arch/arm/insts/fplib.hh"
 #include "arch/arm/insts/macromem.hh"
 #include "arch/arm/insts/mem.hh"
+#include "arch/arm/insts/mem64.hh"
 #include "arch/arm/insts/misc.hh"
+#include "arch/arm/insts/misc64.hh"
 #include "arch/arm/insts/mult.hh"
+#include "arch/arm/insts/neon64_mem.hh"
 #include "arch/arm/insts/pred_inst.hh"
 #include "arch/arm/insts/static_inst.hh"
 #include "arch/arm/insts/vfp.hh"
@@ -63,6 +69,7 @@
 }};
 
 output decoder {{
+#include <string>
 #include "arch/arm/decoder.hh"
 #include "arch/arm/faults.hh"
 #include "arch/arm/intregs.hh"
diff --git a/src/arch/arm/isa/insts/aarch64.isa b/src/arch/arm/isa/insts/aarch64.isa
new file mode 100644
index 0000000..6fcf9b5
--- /dev/null
+++ b/src/arch/arm/isa/insts/aarch64.isa
@@ -0,0 +1,58 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
+    movzCode = 'Dest64 = ((uint64_t)imm1) << imm2;'
+    movzIop = InstObjParams("movz", "Movz", "RegImmImmOp", movzCode, [])
+    header_output += RegImmImmOpDeclare.subst(movzIop)
+    decoder_output += RegImmImmOpConstructor.subst(movzIop)
+    exec_output += BasicExecute.subst(movzIop)
+
+    movkCode = 'Dest64 = insertBits(Dest64, imm2 + 15, imm2, imm1);'
+    movkIop = InstObjParams("movk", "Movk", "RegImmImmOp", movkCode, [])
+    header_output += RegImmImmOpDeclare.subst(movkIop)
+    decoder_output += RegImmImmOpConstructor.subst(movkIop)
+    exec_output += BasicExecute.subst(movkIop)
+
+    movnCode = 'Dest64 = ~(((uint64_t)imm1) << imm2);'
+    movnIop = InstObjParams("movn", "Movn", "RegImmImmOp", movnCode, [])
+    header_output += RegImmImmOpDeclare.subst(movnIop)
+    decoder_output += RegImmImmOpConstructor.subst(movnIop)
+    exec_output += BasicExecute.subst(movnIop)
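+
+    # Worked example of the wide-move semantics above: MOVZ x0, #0x1234,
+    # LSL #16 leaves x0 = 0x12340000; a following MOVK x0, #0xabcd
+    # (imm2 = 0) replaces only bits 15:0, giving 0x1234abcd; MOVN writes
+    # the bitwise NOT of the MOVZ value.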
+}};
diff --git a/src/arch/arm/isa/insts/branch.isa b/src/arch/arm/isa/insts/branch.isa
index e360f45..3ee9d88 100644
--- a/src/arch/arm/isa/insts/branch.isa
+++ b/src/arch/arm/isa/insts/branch.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -48,7 +48,7 @@
         bCode = '''
         NPC = (uint32_t)(PC + imm);
         '''
-        br_tgt_code = '''pcs.instNPC(branchPC.instPC() + imm);'''
+        br_tgt_code = '''pcs.instNPC((uint32_t)(branchPC.instPC() + imm));'''
         instFlags = ["IsDirectControl"]
         if (link):
             bCode += '''
@@ -86,9 +86,9 @@
             Name += "Imm"
             # Since we're switching ISAs, the target ISA will be the opposite
             # of the current ISA. Thumb is whether the target is ARM.
-            newPC = '(Thumb ? (roundDown(PC, 4) + imm) : (PC + imm))'
+            newPC = '(uint32_t)(Thumb ? (roundDown(PC, 4) + imm) : (PC + imm))'
             br_tgt_code = '''
-            pcs.instNPC((branchPC.thumb() ? (roundDown(branchPC.instPC(),4) + imm) :
+            pcs.instNPC((uint32_t)(branchPC.thumb() ? (roundDown(branchPC.instPC(),4) + imm) :
                                 (branchPC.instPC() + imm)));
             '''
             base = "BranchImmCond"
@@ -150,7 +150,26 @@
         if imm:
             decoder_output += BranchTarget.subst(blxIop)
 
-    #Ignore BXJ for now
+    bxjcode = '''
+    HSTR hstr = Hstr;
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr;
+
+    if (ArmSystem::haveVirtualization(xc->tcBase()) && hstr.tjdbx &&
+        !inSecureState(scr, cpsr) && (cpsr.mode != MODE_HYP)) {
+        fault = new HypervisorTrap(machInst, op1, EC_TRAPPED_BXJ);
+    }
+    IWNPC = Op1;
+    '''
+
+    bxjIop = InstObjParams("bxj", "BxjReg", "BranchRegCond",
+                           {"code": bxjcode,
+                            "predicate_test": predicateTest,
+                            "is_ras_pop": "op1 == INTREG_LR" },
+                           ["IsIndirectControl"])
+    header_output += BranchRegCondDeclare.subst(bxjIop)
+    decoder_output += BranchRegCondConstructor.subst(bxjIop)
+    exec_output += PredOpExecute.subst(bxjIop)
 
     #CBNZ, CBZ. These are always unconditional as far as predicates
     for (mnem, test) in (("cbz", "=="), ("cbnz", "!=")):
diff --git a/src/arch/arm/isa/insts/branch64.isa b/src/arch/arm/isa/insts/branch64.isa
new file mode 100644
index 0000000..89cee6c
--- /dev/null
+++ b/src/arch/arm/isa/insts/branch64.isa
@@ -0,0 +1,248 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+//          Giacomo Gabrielli
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    # B, BL
+    for (mnem, link) in (("b", False), ("bl", True)):
+        bCode = ('NPC = purifyTaggedAddr(RawPC + imm, xc->tcBase(), '
+                 'currEL(xc->tcBase()));\n')
+        instFlags = ['IsDirectControl', 'IsUncondControl']
+        if (link):
+            bCode += 'XLR = RawPC + 4;\n'
+            instFlags += ['IsCall']
+
+        bIop = InstObjParams(mnem, mnem.capitalize() + "64",
+                             "BranchImm64", bCode, instFlags)
+        header_output += BranchImm64Declare.subst(bIop)
+        decoder_output += BranchImm64Constructor.subst(bIop)
+        exec_output += BasicExecute.subst(bIop)
+
+    # BR, BLR
+    for (mnem, link) in (("br", False), ("blr", True)):
+        bCode = ('NPC = purifyTaggedAddr(XOp1, xc->tcBase(), '
+                 'currEL(xc->tcBase()));\n')
+        instFlags = ['IsIndirectControl', 'IsUncondControl']
+        if (link):
+            bCode += 'XLR = RawPC + 4;\n'
+            instFlags += ['IsCall']
+
+        bIop = InstObjParams(mnem, mnem.capitalize() + "64",
+                             "BranchReg64", bCode, instFlags)
+        header_output += BranchReg64Declare.subst(bIop)
+        decoder_output += BranchReg64Constructor.subst(bIop)
+        exec_output += BasicExecute.subst(bIop)
+
+    # B conditional
+    bCode = '''
+        if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode))
+            NPC = purifyTaggedAddr(RawPC + imm, xc->tcBase(),
+                                   currEL(xc->tcBase()));
+        else
+            NPC = NPC;
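+            // The not-taken assignment of NPC to itself is deliberate: it
+            // keeps NPC as a destination on both paths so the PC state is
+            // written uniformly.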
+    '''
+    bIop = InstObjParams("b", "BCond64", "BranchImmCond64", bCode,
+                         ['IsCondControl', 'IsDirectControl'])
+    header_output += BranchImmCond64Declare.subst(bIop)
+    decoder_output += BranchImmCond64Constructor.subst(bIop)
+    exec_output += BasicExecute.subst(bIop)
+
+    # RET
+    bCode = ('NPC = purifyTaggedAddr(XOp1, xc->tcBase(), '
+             'currEL(xc->tcBase()));\n')
+    instFlags = ['IsIndirectControl', 'IsUncondControl', 'IsReturn']
+
+    bIop = InstObjParams('ret', 'Ret64', "BranchRet64", bCode, instFlags)
+    header_output += BranchReg64Declare.subst(bIop)
+    decoder_output += BranchReg64Constructor.subst(bIop)
+    exec_output += BasicExecute.subst(bIop)
+
+    # ERET
+    bCode = '''Addr newPc;
+                CPSR cpsr = Cpsr;
+                CPSR spsr = Spsr;
+
+                ExceptionLevel curr_el =
+                    opModeToEL((OperatingMode) (uint8_t) cpsr.mode);
+                switch (curr_el) {
+                  case EL3:
+                    newPc = xc->tcBase()->readMiscReg(MISCREG_ELR_EL3);
+                    break;
+                  case EL2:
+                    newPc = xc->tcBase()->readMiscReg(MISCREG_ELR_EL2);
+                    break;
+                  case EL1:
+                    newPc = xc->tcBase()->readMiscReg(MISCREG_ELR_EL1);
+                    break;
+                  default:
+                    return new UndefinedInstruction(machInst, false, mnemonic);
+                    break;
+                }
+                if (spsr.width && (newPc & mask(2))) {
+                    // To avoid PC Alignment fault when returning to AArch32
+                    if (spsr.t)
+                        newPc = newPc & ~mask(1);
+                    else
+                        newPc = newPc & ~mask(2);
+                }
+                spsr.q = 0;
+                spsr.it1 = 0;
+                spsr.j = 0;
+                spsr.res0_23_22 = 0;
+                spsr.ge = 0;
+                spsr.it2 = 0;
+                spsr.t = 0;
+
+                OperatingMode mode = (OperatingMode) (uint8_t) spsr.mode;
+                bool illegal = false;
+                ExceptionLevel target_el;
+                if (badMode(mode)) {
+                    illegal = true;
+                } else {
+                    target_el = opModeToEL(mode);
+                    if (((target_el == EL2) &&
+                         !ArmSystem::haveVirtualization(xc->tcBase())) ||
+                            (target_el > curr_el) ||
+                            (spsr.width == 1)) {
+                        illegal = true;
+                    } else {
+                        bool known = true;
+                        bool from32 = (spsr.width == 1);
+                        bool to32 = false;
+                        if (false) { // TODO: !haveAArch32EL
+                            to32 = false;
+                        } else if (!ArmSystem::highestELIs64(xc->tcBase())) {
+                            to32 = true;
+                        } else {
+                            bool scr_rw, hcr_rw;
+                            if (ArmSystem::haveSecurity(xc->tcBase())) {
+                                SCR scr = xc->tcBase()->readMiscReg(MISCREG_SCR_EL3);
+                                scr_rw = scr.rw;
+                            } else {
+                                scr_rw = true;
+                            }
+
+                            if (ArmSystem::haveVirtualization(xc->tcBase())) {
+                                HCR hcr = xc->tcBase()->readMiscReg(MISCREG_HCR_EL2);
+                                hcr_rw = hcr.rw;
+                            } else {
+                                hcr_rw = scr_rw;
+                            }
+
+                            switch (target_el) {
+                              case EL3:
+                                to32 = false;
+                                break;
+                              case EL2:
+                                to32 = !scr_rw;
+                                break;
+                              case EL1:
+                                to32 = !scr_rw || !hcr_rw;
+                                break;
+                              case EL0:
+                                if (curr_el == EL0) {
+                                    to32 = cpsr.width;
+                                } else if (!scr_rw || !hcr_rw) {
+                                    // EL0 using AArch32 if EL1 using AArch32
+                                    to32 = true;
+                                } else {
+                                    known = false;
+                                    to32 = false;
+                                }
+                            }
+                        }
+                        if (known)
+                            illegal = (from32 != to32);
+                    }
+                }
+
+                if (illegal) {
+                    uint8_t old_mode = cpsr.mode;
+                    spsr.mode = old_mode; // Preserve old mode when invalid
+                    spsr.il = 1;
+                } else {
+                    if (cpsr.width != spsr.width)
+                        panic("AArch32/AArch64 interprocessing not supported yet");
+                }
+                Cpsr = spsr;
+
+                CondCodesNZ = spsr.nz;
+                CondCodesC  = spsr.c;
+                CondCodesV  = spsr.v;
+                NPC = purifyTaggedAddr(newPc, xc->tcBase(),
+                    opModeToEL((OperatingMode) (uint8_t) spsr.mode));
+                LLSCLock = 0;  // Clear exclusive monitor
+                SevMailbox = 1;  // Set event register
+    '''
+    instFlags = ['IsSerializeAfter', 'IsNonSpeculative', 'IsSquashAfter']
+    bIop = InstObjParams('eret', 'Eret64', "BranchEret64", bCode, instFlags)
+    header_output += BasicDeclare.subst(bIop)
+    decoder_output += BasicConstructor64.subst(bIop)
+    exec_output += BasicExecute.subst(bIop)
+
+    # CBNZ, CBZ
+    for (mnem, test) in (("cbz", "=="), ("cbnz", "!=")):
+        code = ('NPC = (Op164 %(test)s 0) ? '
+                'purifyTaggedAddr(RawPC + imm, xc->tcBase(), '
+                'currEL(xc->tcBase())) : NPC;\n')
+        code = code % {"test": test}
+        iop = InstObjParams(mnem, mnem.capitalize() + "64",
+                            "BranchImmReg64", code,
+                            ['IsCondControl', 'IsDirectControl'])
+        header_output += BranchImmReg64Declare.subst(iop)
+        decoder_output += BranchImmReg64Constructor.subst(iop)
+        exec_output += BasicExecute.subst(iop)
+
+    # TBNZ, TBZ
+    for (mnem, test) in (("tbz", "=="), ("tbnz", "!=")):
+        code = ('NPC = ((Op164 & imm1) %(test)s 0) ? '
+                'purifyTaggedAddr(RawPC + imm2, xc->tcBase(), '
+                'currEL(xc->tcBase())) : NPC;\n')
+        code = code % {"test": test}
+        iop = InstObjParams(mnem, mnem.capitalize() + "64",
+                            "BranchImmImmReg64", code,
+                            ['IsCondControl', 'IsDirectControl'])
+        header_output += BranchImmImmReg64Declare.subst(iop)
+        decoder_output += BranchImmImmReg64Constructor.subst(iop)
+        exec_output += BasicExecute.subst(iop)
+}};
diff --git a/src/arch/arm/isa/insts/data.isa b/src/arch/arm/isa/insts/data.isa
index be56554..8816764 100644
--- a/src/arch/arm/isa/insts/data.isa
+++ b/src/arch/arm/isa/insts/data.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -257,7 +257,8 @@
             CPSR old_cpsr = Cpsr;
 
             CPSR new_cpsr =
-                cpsrWriteByInstr(old_cpsr, Spsr, 0xF, true, sctlr.nmfi);
+                cpsrWriteByInstr(old_cpsr, Spsr, Scr, Nsacr, 0xF, true,
+                                 sctlr.nmfi, xc->tcBase());
             Cpsr = ~CondCodesMask & new_cpsr;
             CondCodesNZ = new_cpsr.nz;
             CondCodesC = new_cpsr.c;
diff --git a/src/arch/arm/isa/insts/data64.isa b/src/arch/arm/isa/insts/data64.isa
new file mode 100644
index 0000000..77d7541
--- /dev/null
+++ b/src/arch/arm/isa/insts/data64.isa
@@ -0,0 +1,465 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    def createCcCode64(carry, overflow):
+        code = ""
+        code += '''
+            uint16_t _iz, _in;
+            _in = bits(resTemp, intWidth - 1);
+            _iz = ((resTemp & mask(intWidth)) == 0);
+            CondCodesNZ = (_in << 1) | _iz;
+            DPRINTF(Arm, "(in, iz) = (%%d, %%d)\\n", _in, _iz);
+        '''
+        if overflow and overflow != "none":
+            code +=  '''
+                uint16_t _iv;
+                _iv = %s & 1;
+                CondCodesV = _iv;
+                DPRINTF(Arm, "(iv) = (%%d)\\n", _iv);
+            ''' % overflow
+        if carry and carry != "none":
+            code += '''
+                uint16_t _ic;
+                _ic = %s & 1;
+                CondCodesC = _ic;
+                DPRINTF(Arm, "(ic) = (%%d)\\n", _ic);
+            ''' % carry
+        return code
+
+    oldC = 'CondCodesC'
+    oldV = 'CondCodesV'
+    # Dict of ways to set the carry flag.
+    carryCode64 = {
+        "none": "none",
+        "add": 'findCarry(intWidth, resTemp, Op164, secOp)',
+        "sub": 'findCarry(intWidth, resTemp, Op164, ~secOp)',
+        "logic": '0'
+    }
+    # Dict of ways to set the overflow flag.
+    overflowCode64 = {
+        "none": "none",
+        "add": 'findOverflow(intWidth, resTemp, Op164, secOp)',
+        "sub": 'findOverflow(intWidth, resTemp, Op164, ~secOp)',
+        "logic": '0'
+    }
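+
+    # Sketch of the helpers' behaviour: a 32-bit ADDS of 0xffffffff and
+    # 0x1 produces resTemp = 0x100000000, so findCarry(32, ...) yields
+    # C = 1 while findOverflow(32, ...) yields V = 0; subtraction reuses
+    # the adder form with ~secOp, per a - b = a + ~b + 1.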
+
+    immOp2 = "uint64_t secOp M5_VAR_USED = imm;"
+    sRegOp2 = "uint64_t secOp M5_VAR_USED = " + \
+              "shiftReg64(Op264, shiftAmt, shiftType, intWidth);"
+    eRegOp2 = "uint64_t secOp M5_VAR_USED = " + \
+              "extendReg64(Op264, extendType, shiftAmt, intWidth);"
+
+    def buildDataWork(mnem, code, flagType, suffix, buildCc, buildNonCc,
+                      base, templateBase):
+        code = '''
+        uint64_t resTemp M5_VAR_USED = 0;
+        ''' + code
+        ccCode = createCcCode64(carryCode64[flagType], overflowCode64[flagType])
+        Name = mnem.capitalize() + suffix
+        iop = InstObjParams(mnem, Name, base, code)
+        iopCc = InstObjParams(mnem + "s", Name + "Cc", base, code + ccCode)
+
+        def subst(iop):
+            global header_output, decoder_output, exec_output
+            header_output += eval(templateBase + "Declare").subst(iop)
+            decoder_output += eval(templateBase + "Constructor").subst(iop)
+            exec_output += BasicExecute.subst(iop)
+
+        if buildNonCc:
+            subst(iop)
+        if buildCc:
+            subst(iopCc)
+
+    def buildXImmDataInst(mnem, code, flagType = "logic", \
+                          buildCc = True, buildNonCc = True, \
+                          suffix = "XImm"):
+        buildDataWork(mnem, immOp2 + code, flagType, suffix,
+                      buildCc, buildNonCc, "DataXImmOp", "DataXImm")
+
+    def buildXSRegDataInst(mnem, code, flagType = "logic", \
+                           buildCc = True, buildNonCc = True, \
+                           suffix = "XSReg"):
+        buildDataWork(mnem, sRegOp2 + code, flagType, suffix,
+                      buildCc, buildNonCc, "DataXSRegOp", "DataXSReg")
+
+    def buildXERegDataInst(mnem, code, flagType = "logic", \
+                           buildCc = True, buildNonCc = True, \
+                           suffix = "XEReg"):
+        buildDataWork(mnem, eRegOp2 + code, flagType, suffix,
+                      buildCc, buildNonCc, "DataXERegOp", "DataXEReg")
+
+    def buildDataInst(mnem, code, flagType = "logic",
+                      buildCc = True, buildNonCc = True):
+        buildXImmDataInst(mnem, code, flagType, buildCc, buildNonCc)
+        buildXSRegDataInst(mnem, code, flagType, buildCc, buildNonCc)
+        buildXERegDataInst(mnem, code, flagType, buildCc, buildNonCc)
+
+    buildXImmDataInst("adr", "Dest64 = RawPC + imm", buildCc = False);
+    buildXImmDataInst("adrp", "Dest64 = (RawPC & ~mask(12)) + imm",
+                      buildCc = False);
+    buildDataInst("and", "Dest64 = resTemp = Op164 & secOp;")
+    buildDataInst("eor", "Dest64 = Op164 ^ secOp;", buildCc = False)
+    buildXSRegDataInst("eon", "Dest64 = Op164 ^ ~secOp;", buildCc = False)
+    buildDataInst("sub", "Dest64 = resTemp = Op164 - secOp;", "sub")
+    buildDataInst("add", "Dest64 = resTemp = Op164 + secOp;", "add")
+    buildXSRegDataInst("adc",
+            "Dest64 = resTemp = Op164 + secOp + %s;" % oldC, "add")
+    buildXSRegDataInst("sbc",
+            "Dest64 = resTemp = Op164 - secOp - !%s;" % oldC, "sub")
+    buildDataInst("orr", "Dest64 = Op164 | secOp;", buildCc = False)
+    buildXSRegDataInst("orn", "Dest64 = Op164 | ~secOp;", buildCc = False)
+    buildXSRegDataInst("bic", "Dest64 = resTemp = Op164 & ~secOp;")
+
+    def buildDataXImmInst(mnem, code, optArgs = []):
+        global header_output, decoder_output, exec_output
+        classNamePrefix = mnem[0].upper() + mnem[1:]
+        templateBase = "DataXImm"
+        iop = InstObjParams(mnem, classNamePrefix + "64",
+                            templateBase + "Op", code, optArgs)
+        header_output += eval(templateBase + "Declare").subst(iop)
+        decoder_output += eval(templateBase + "Constructor").subst(iop)
+        exec_output += BasicExecute.subst(iop)
+
+    def buildDataXRegInst(mnem, regOps, code, optArgs = [],
+                          overrideOpClass=None):
+        global header_output, decoder_output, exec_output
+        templateBase = "DataX%dReg" % regOps
+        classNamePrefix = mnem[0].upper() + mnem[1:]
+        if overrideOpClass:
+            iop = InstObjParams(mnem, classNamePrefix + "64",
+                                templateBase + "Op",
+                                { 'code': code, 'op_class': overrideOpClass},
+                                optArgs)
+        else:
+            iop = InstObjParams(mnem, classNamePrefix + "64",
+                                templateBase + "Op", code, optArgs)
+        header_output += eval(templateBase + "Declare").subst(iop)
+        decoder_output += eval(templateBase + "Constructor").subst(iop)
+        exec_output += BasicExecute.subst(iop)
+
+    buildDataXRegInst("madd", 3, "Dest64 = Op164 + Op264 * Op364",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("msub", 3, "Dest64 = Op164 - Op264 * Op364",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("smaddl", 3,
+        "XDest = XOp1 + sext<32>(WOp2) * sext<32>(WOp3)",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("smsubl", 3,
+        "XDest = XOp1 - sext<32>(WOp2) * sext<32>(WOp3)",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("smulh", 2, '''
+        uint64_t op1H = (int32_t)(XOp1 >> 32);
+        uint64_t op1L = (uint32_t)XOp1;
+        uint64_t op2H = (int32_t)(XOp2 >> 32);
+        uint64_t op2L = (uint32_t)XOp2;
+        uint64_t mid1 = ((op1L * op2L) >> 32) + op1H * op2L;
+        uint64_t mid2 = op1L * op2H;
+        uint64_t result = ((uint64_t)(uint32_t)mid1 + (uint32_t)mid2) >> 32;
+        result += shiftReg64(mid1, 32, ASR, intWidth);
+        result += shiftReg64(mid2, 32, ASR, intWidth);
+        XDest = result + op1H * op2H;
+    ''', overrideOpClass="IntMultOp")
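+    # smulh (and umulh below) compute the high 64 bits of the 128-bit
+    # product by schoolbook decomposition: x*y = (xH*yH << 64) +
+    # ((xH*yL + xL*yH) << 32) + xL*yL; the shifted mid1/mid2 terms fold
+    # the low-half carries into the high word.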
+    buildDataXRegInst("umaddl", 3, "XDest = XOp1 + WOp2 * WOp3",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("umsubl", 3, "XDest = XOp1 - WOp2 * WOp3",
+        overrideOpClass="IntMultOp")
+    buildDataXRegInst("umulh", 2, '''
+        uint64_t op1H = (uint32_t)(XOp1 >> 32);
+        uint64_t op1L = (uint32_t)XOp1;
+        uint64_t op2H = (uint32_t)(XOp2 >> 32);
+        uint64_t op2L = (uint32_t)XOp2;
+        uint64_t mid1 = ((op1L * op2L) >> 32) + op1H * op2L;
+        uint64_t mid2 = op1L * op2H;
+        uint64_t result = ((uint64_t)(uint32_t)mid1 + (uint32_t)mid2) >> 32;
+        result += mid1 >> 32;
+        result += mid2 >> 32;
+        XDest = result + op1H * op2H;
+    ''', overrideOpClass="IntMultOp")
+
+    buildDataXRegInst("asrv", 2,
+        "Dest64 = shiftReg64(Op164, Op264, ASR, intWidth)")
+    buildDataXRegInst("lslv", 2,
+        "Dest64 = shiftReg64(Op164, Op264, LSL, intWidth)")
+    buildDataXRegInst("lsrv", 2,
+        "Dest64 = shiftReg64(Op164, Op264, LSR, intWidth)")
+    buildDataXRegInst("rorv", 2,
+        "Dest64 = shiftReg64(Op164, Op264, ROR, intWidth)")
+    buildDataXRegInst("sdiv", 2, '''
+        int64_t op1 = Op164;
+        int64_t op2 = Op264;
+        if (intWidth == 32) {
+            op1 = sext<32>(op1);
+            op2 = sext<32>(op2);
+        }
+        Dest64 = op2 == -1 ? -op1 : op2 ? op1 / op2 : 0;
+    ''', overrideOpClass="IntDivOp")
+    buildDataXRegInst("udiv", 2, "Dest64 = Op264 ? Op164 / Op264 : 0",
+        overrideOpClass="IntDivOp")
+
+    buildDataXRegInst("cls", 1, '''
+        uint64_t op1 = Op164;
+        if (bits(op1, intWidth - 1))
+            op1 ^= mask(intWidth);
+        Dest64 = (op1 == 0) ? intWidth - 1 : (intWidth - 2 - findMsbSet(op1));
+    ''')
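+    # Example: a 32-bit CLS of 0xFFFF0000 inverts the value (sign bit
+    # set) to 0x0000FFFF, findMsbSet gives 15, and Dest64 = 32 - 2 - 15 =
+    # 15 leading sign bits, not counting the sign bit itself.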
+    buildDataXRegInst("clz", 1, '''
+        Dest64 = (Op164 == 0) ? intWidth : (intWidth - 1 - findMsbSet(Op164));
+    ''')
+    buildDataXRegInst("rbit", 1, '''
+        uint64_t result = Op164;
+        uint64_t lBit = 1ULL << (intWidth - 1);
+        uint64_t rBit = 1ULL;
+        while (lBit > rBit) {
+            uint64_t maskBits = lBit | rBit;
+            uint64_t testBits = result & maskBits;
+            // If these bits are different, swap them by toggling them.
+            if (testBits && testBits != maskBits)
+                result ^= maskBits;
+            lBit >>= 1; rBit <<= 1;
+        }
+        Dest64 = result;
+    ''')
+    buildDataXRegInst("rev", 1, '''
+        if (intWidth == 32)
+            Dest64 = betole<uint32_t>(Op164);
+        else
+            Dest64 = betole<uint64_t>(Op164);
+    ''')
+    buildDataXRegInst("rev16", 1, '''
+        unsigned count = intWidth / 16;
+        uint64_t result = 0;
+        for (unsigned i = 0; i < count; i++) {
+            uint16_t hw = Op164 >> (i * 16);
+            result |= (uint64_t)betole<uint16_t>(hw) << (i * 16);
+        }
+        Dest64 = result;
+    ''')
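+    # Example: a 64-bit REV16 of 0x1122334455667788 byte-swaps each
+    # 16-bit halfword in place, giving 0x2211443366558877.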
+    buildDataXRegInst("rev32", 1, '''
+        unsigned count = intWidth / 32;
+        uint64_t result = 0;
+        for (unsigned i = 0; i < count; i++) {
+            uint32_t hw = Op164 >> (i * 32);
+            result |= (uint64_t)betole<uint32_t>(hw) << (i * 32);
+        }
+        Dest64 = result;
+    ''')
+
+    msrMrs64EnabledCheckCode = '''
+        // Check for read/write access right
+        if (!can%sAArch64SysReg(flat_idx, Scr64, cpsr, xc->tcBase())) {
+            if (flat_idx == MISCREG_DAIF ||
+                flat_idx == MISCREG_DC_ZVA_Xt ||
+                flat_idx == MISCREG_DC_CVAC_Xt ||
+                flat_idx == MISCREG_DC_CIVAC_Xt
+                )
+                return new UndefinedInstruction(machInst, 0, EC_TRAPPED_MSR_MRS_64);
+            return new UndefinedInstruction(machInst, false, mnemonic);
+        }
+
+        // Check for traps to supervisor (FP/SIMD regs)
+        if (el <= EL1 && msrMrs64TrapToSup(flat_idx, el, Cpacr64))
+            return new SupervisorTrap(machInst, 0x1E00000, EC_TRAPPED_SIMD_FP);
+
+        bool is_vfp_neon = false;
+
+        // Check for traps to hypervisor
+        if ((ArmSystem::haveVirtualization(xc->tcBase()) && el <= EL2) &&
+            msrMrs64TrapToHyp(flat_idx, %s, CptrEl264, Hcr64, &is_vfp_neon)) {
+            return new HypervisorTrap(machInst, is_vfp_neon ? 0x1E00000 : imm,
+                is_vfp_neon ? EC_TRAPPED_SIMD_FP : EC_TRAPPED_MSR_MRS_64);
+        }
+
+        // Check for traps to secure monitor
+        if ((ArmSystem::haveSecurity(xc->tcBase()) && el <= EL3) &&
+            msrMrs64TrapToMon(flat_idx, CptrEl364, el, &is_vfp_neon)) {
+            return new SecureMonitorTrap(machInst,
+                is_vfp_neon ? 0x1E00000 : imm,
+                is_vfp_neon ? EC_TRAPPED_SIMD_FP : EC_TRAPPED_MSR_MRS_64);
+        }
+    '''
+
+    buildDataXImmInst("mrs", '''
+        MiscRegIndex flat_idx = (MiscRegIndex) xc->tcBase()->
+            flattenMiscIndex(op1);
+        CPSR cpsr = Cpsr;
+        ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsr.el;
+        %s
+        XDest = MiscOp1_ud;
+    ''' % (msrMrs64EnabledCheckCode % ('Read', 'true'),),
+        ["IsSerializeBefore"])
+
+    buildDataXRegInst("mrsNZCV", 1, '''
+        CPSR cpsr = 0;
+        cpsr.nz = CondCodesNZ;
+        cpsr.c = CondCodesC;
+        cpsr.v = CondCodesV;
+        XDest = cpsr;
+    ''')
+
+    buildDataXImmInst("msr", '''
+        MiscRegIndex flat_idx = (MiscRegIndex) xc->tcBase()->
+            flattenMiscIndex(dest);
+        CPSR cpsr = Cpsr;
+        ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsr.el;
+        %s
+        MiscDest_ud = XOp1;
+    ''' % (msrMrs64EnabledCheckCode % ('Write', 'false'),),
+        ["IsSerializeAfter", "IsNonSpeculative"])
+
+    buildDataXRegInst("msrNZCV", 1, '''
+        CPSR cpsr = XOp1;
+        CondCodesNZ = cpsr.nz;
+        CondCodesC = cpsr.c;
+        CondCodesV = cpsr.v;
+    ''')
+
+    msrdczva_ea_code = '''
+        MiscRegIndex flat_idx = (MiscRegIndex) xc->tcBase()->flattenMiscIndex(dest);
+        CPSR cpsr = Cpsr;
+        ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsr.el;
+    '''
+
+    msrdczva_ea_code += msrMrs64EnabledCheckCode % ('Write', 'false')
+    msrdczva_ea_code += '''
+        Request::Flags memAccessFlags =
+            Request::CACHE_BLOCK_ZERO | ArmISA::TLB::MustBeOne;
+        EA = XBase;
+        assert(!(Dczid & 0x10));
+        uint64_t op_size = power(2, Dczid + 2);
+        EA &= ~(op_size - 1);
+    '''
+
+    msrDCZVAIop = InstObjParams("dczva", "Dczva", "SysDC64",
+                { "ea_code" : msrdczva_ea_code,
+                  "memacc_code" : ";", "use_uops" : 0,
+                  "op_wb" : ";", "fa_code" : ";"}, ['IsStore', 'IsMemRef']);
+    header_output += DCStore64Declare.subst(msrDCZVAIop);
+    decoder_output += DCStore64Constructor.subst(msrDCZVAIop);
+    exec_output += DCStore64Execute.subst(msrDCZVAIop);
+    exec_output += DCStore64InitiateAcc.subst(msrDCZVAIop);
+    exec_output += Store64CompleteAcc.subst(msrDCZVAIop);
+
+
+
+    buildDataXImmInst("msrSP", '''
+        if (!canWriteAArch64SysReg(
+                (MiscRegIndex) xc->tcBase()->flattenMiscIndex(dest),
+                Scr64, Cpsr, xc->tcBase())) {
+            return new UndefinedInstruction(machInst, false, mnemonic);
+        }
+        MiscDest_ud = imm;
+    ''', optArgs = ["IsSerializeAfter", "IsNonSpeculative"])
+
+    buildDataXImmInst("msrDAIFSet", '''
+        if (!canWriteAArch64SysReg(
+                (MiscRegIndex) xc->tcBase()->flattenMiscIndex(dest),
+                Scr64, Cpsr, xc->tcBase())) {
+            return new UndefinedInstruction(machInst, 0, EC_TRAPPED_MSR_MRS_64);
+        }
+        CPSR cpsr = Cpsr;
+        cpsr.daif = cpsr.daif | imm;
+        Cpsr = cpsr;
+    ''', optArgs = ["IsSerializeAfter", "IsNonSpeculative"])
+
+    buildDataXImmInst("msrDAIFClr", '''
+        if (!canWriteAArch64SysReg(
+                (MiscRegIndex) xc->tcBase()->flattenMiscIndex(dest),
+                Scr64, Cpsr, xc->tcBase())) {
+            return new UndefinedInstruction(machInst, 0, EC_TRAPPED_MSR_MRS_64);
+        }
+        CPSR cpsr = Cpsr;
+        cpsr.daif = cpsr.daif & ~imm;
+        Cpsr = cpsr;
+    ''', optArgs = ["IsSerializeAfter", "IsNonSpeculative"])
+
+    def buildDataXCompInst(mnem, instType, suffix, code):
+        global header_output, decoder_output, exec_output
+        templateBase = "DataXCond%s" % instType
+        iop = InstObjParams(mnem, mnem.capitalize() + suffix + "64",
+                            templateBase + "Op", code)
+        header_output += eval(templateBase + "Declare").subst(iop)
+        decoder_output += eval(templateBase + "Constructor").subst(iop)
+        exec_output += BasicExecute.subst(iop)
+
+    def buildDataXCondImmInst(mnem, code):
+        buildDataXCompInst(mnem, "CompImm", "Imm", code)
+    def buildDataXCondRegInst(mnem, code):
+        buildDataXCompInst(mnem, "CompReg", "Reg", code)
+    def buildDataXCondSelInst(mnem, code):
+        buildDataXCompInst(mnem, "Sel", "", code)
+
+    def condCompCode(flagType, op, imm):
+        ccCode = createCcCode64(carryCode64[flagType], overflowCode64[flagType])
+        opDecl = "uint64_t secOp M5_VAR_USED = imm;"
+        if not imm:
+            opDecl = "uint64_t secOp M5_VAR_USED = Op264;"
+        return opDecl + '''
+            if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode)) {
+                uint64_t resTemp = Op164 ''' + op + ''' secOp;
+        ''' + ccCode + '''
+            } else {
+                CondCodesNZ = (defCc >> 2) & 0x3;
+                CondCodesC = (defCc >> 1) & 0x1;
+                CondCodesV = defCc & 0x1;
+            }
+        '''
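+
+    # Example: CCMP x0, x1, #0b0100, EQ sets NZCV from x0 - x1 when EQ
+    # holds and otherwise loads the defCc immediate (here N=0, Z=1, C=0,
+    # V=0).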
+
+    buildDataXCondImmInst("ccmn", condCompCode("add", "+", True))
+    buildDataXCondImmInst("ccmp", condCompCode("sub", "-", True))
+    buildDataXCondRegInst("ccmn", condCompCode("add", "+", False))
+    buildDataXCondRegInst("ccmp", condCompCode("sub", "-", False))
+
+    condSelCode = '''
+        if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode)) {
+            Dest64 = Op164;
+        } else {
+            Dest64 = %(altVal)s;
+        }
+    '''
+    buildDataXCondSelInst("csel", condSelCode % {"altVal" : "Op264"})
+    buildDataXCondSelInst("csinc", condSelCode % {"altVal" : "Op264 + 1"})
+    buildDataXCondSelInst("csinv", condSelCode % {"altVal" : "~Op264"})
+    buildDataXCondSelInst("csneg", condSelCode % {"altVal" : "-Op264"})
+}};
diff --git a/src/arch/arm/isa/insts/div.isa b/src/arch/arm/isa/insts/div.isa
index 1ff6ef9..0896ea9 100644
--- a/src/arch/arm/isa/insts/div.isa
+++ b/src/arch/arm/isa/insts/div.isa
@@ -40,12 +40,6 @@
 let {{
     sdivCode = '''
     if (Op2_sw == 0) {
-        if (((SCTLR)Sctlr).dz) {
-            if (FullSystem)
-                return new UndefinedInstruction;
-            else
-                return new UndefinedInstruction(false, mnemonic);
-        }
         Dest_sw = 0;
     } else if (Op1_sw == INT_MIN && Op2_sw == -1) {
         Dest_sw = INT_MIN;
@@ -63,12 +57,6 @@
 
     udivCode = '''
     if (Op2_uw == 0) {
-        if (((SCTLR)Sctlr).dz) {
-            if (FullSystem)
-                return new UndefinedInstruction;
-            else
-                return new UndefinedInstruction(false, mnemonic);
-        }
         Dest_uw = 0;
     } else {
         Dest_uw = Op1_uw / Op2_uw;
diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa
index b701995..60f030c 100644
--- a/src/arch/arm/isa/insts/fp.isa
+++ b/src/arch/arm/isa/insts/fp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -191,14 +191,17 @@
     decoder_output = ""
     exec_output = ""
 
-    vmsrIop = InstObjParams("vmsr", "Vmsr", "FpRegRegOp",
-                            { "code": vmsrEnabledCheckCode + \
-                                      "MiscDest = Op1;",
+    vmsrCode = vmsrEnabledCheckCode + '''
+    MiscDest = Op1;
+    '''
+
+    vmsrIop = InstObjParams("vmsr", "Vmsr", "FpRegRegImmOp",
+                            { "code": vmsrCode,
                               "predicate_test": predicateTest,
                               "op_class": "SimdFloatMiscOp" },
                              ["IsSerializeAfter","IsNonSpeculative"])
-    header_output += FpRegRegOpDeclare.subst(vmsrIop);
-    decoder_output += FpRegRegOpConstructor.subst(vmsrIop);
+    header_output += FpRegRegImmOpDeclare.subst(vmsrIop);
+    decoder_output += FpRegRegImmOpConstructor.subst(vmsrIop);
     exec_output += PredOpExecute.subst(vmsrIop);
 
     vmsrFpscrCode = vmsrEnabledCheckCode + '''
@@ -215,14 +218,36 @@
     decoder_output += FpRegRegOpConstructor.subst(vmsrFpscrIop);
     exec_output += PredOpExecute.subst(vmsrFpscrIop);
 
-    vmrsIop = InstObjParams("vmrs", "Vmrs", "FpRegRegOp",
-                            { "code": vmrsEnabledCheckCode + \
-                                    "Dest = MiscOp1;",
+    vmrsCode = vmrsEnabledCheckCode + '''
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr;
+    if (!inSecureState(scr, cpsr) && (cpsr.mode != MODE_HYP)) {
+        HCR hcr = Hcr;
+        bool hypTrap = false;
+        switch(xc->tcBase()->flattenMiscIndex(op1)) {
+          case MISCREG_FPSID:
+            hypTrap = hcr.tid0;
+            break;
+          case MISCREG_MVFR0:
+          case MISCREG_MVFR1:
+            hypTrap = hcr.tid3;
+            break;
+        }
+        if (hypTrap) {
+            return new HypervisorTrap(machInst, imm,
+                EC_TRAPPED_CP10_MRC_VMRS);
+        }
+    }
+    Dest = MiscOp1;
+    '''
+
+    vmrsIop = InstObjParams("vmrs", "Vmrs", "FpRegRegImmOp",
+                            { "code": vmrsCode,
                               "predicate_test": predicateTest,
                               "op_class": "SimdFloatMiscOp" },
                             ["IsSerializeBefore"])
-    header_output += FpRegRegOpDeclare.subst(vmrsIop);
-    decoder_output += FpRegRegOpConstructor.subst(vmrsIop);
+    header_output += FpRegRegImmOpDeclare.subst(vmrsIop);
+    decoder_output += FpRegRegImmOpConstructor.subst(vmrsIop);
     exec_output += PredOpExecute.subst(vmrsIop);
 
     vmrsFpscrIop = InstObjParams("vmrs", "VmrsFpscr", "FpRegRegOp",
@@ -323,7 +348,7 @@
     decoder_output  += FpRegRegOpConstructor.subst(vmovRegQIop);
     exec_output += PredOpExecute.subst(vmovRegQIop);
 
-    vmovCoreRegBCode = vfpEnabledCheckCode + '''
+    vmovCoreRegBCode = simdEnabledCheckCode + '''
         FpDest_uw = insertBits(FpDest_uw, imm * 8 + 7, imm * 8, Op1_ub);
     '''
     vmovCoreRegBIop = InstObjParams("vmov", "VmovCoreRegB", "FpRegRegImmOp",
@@ -334,7 +359,7 @@
     decoder_output  += FpRegRegImmOpConstructor.subst(vmovCoreRegBIop);
     exec_output += PredOpExecute.subst(vmovCoreRegBIop);
 
-    vmovCoreRegHCode = vfpEnabledCheckCode + '''
+    vmovCoreRegHCode = simdEnabledCheckCode + '''
         FpDest_uw = insertBits(FpDest_uw, imm * 16 + 15, imm * 16, Op1_uh);
     '''
     vmovCoreRegHIop = InstObjParams("vmov", "VmovCoreRegH", "FpRegRegImmOp",
@@ -453,6 +478,17 @@
     singleCode = singleSimpleCode + '''
         FpscrExc = fpscr;
     '''
+    singleTernOp = vfpEnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        float cOp1 = FpOp1;
+        float cOp2 = FpOp2;
+        float cOp3 = FpDestP0;
+        FpDestP0   = ternaryOp(fpscr, %(palam)s, %(op)s,
+                               fpscr.fz, fpscr.dn, fpscr.rMode);
+        finishVfp(fpscr, state, fpscr.fz);
+        FpscrExc = fpscr;
+    '''
     singleBinOp = "binaryOp(fpscr, FpOp1, FpOp2," + \
                 "%(func)s, fpscr.fz, fpscr.dn, fpscr.rMode)"
     singleUnaryOp = "unaryOp(fpscr, FpOp1, %(func)s, fpscr.fz, fpscr.rMode)"
@@ -463,6 +499,19 @@
         FpDestP1_uw = dblHi(dest);
         FpscrExc = fpscr;
     '''
+    doubleTernOp = vfpEnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        double cOp1  = dbl(FpOp1P0_uw, FpOp1P1_uw);
+        double cOp2  = dbl(FpOp2P0_uw, FpOp2P1_uw);
+        double cOp3  = dbl(FpDestP0_uw, FpDestP1_uw);
+        double cDest = ternaryOp(fpscr, %(palam)s, %(op)s,
+                                 fpscr.fz, fpscr.dn, fpscr.rMode);
+        FpDestP0_uw  = dblLow(cDest);
+        FpDestP1_uw  = dblHi(cDest);
+        finishVfp(fpscr, state, fpscr.fz);
+        FpscrExc = fpscr;
+    '''
     doubleBinOp = '''
         binaryOp(fpscr, dbl(FpOp1P0_uw, FpOp1P1_uw),
                         dbl(FpOp2P0_uw, FpOp2P1_uw),
@@ -473,6 +522,37 @@
                 fpscr.fz, fpscr.rMode)
     '''
 
+    def buildTernaryFpOp(Name, base, opClass, singleOp, doubleOp, paramStr):
+        global header_output, decoder_output, exec_output
+
+        code = singleTernOp % { "op": singleOp, "palam": paramStr }
+        sIop = InstObjParams(Name.lower() + "s", Name + "S", base,
+                { "code": code,
+                  "predicate_test": predicateTest,
+                  "op_class": opClass }, [])
+        code = doubleTernOp % { "op": doubleOp, "palam": paramStr }
+        dIop = InstObjParams(Name.lower() + "d", Name + "D", base,
+                { "code": code,
+                  "predicate_test": predicateTest,
+                  "op_class": opClass }, [])
+
+        declareTempl     = eval(base + "Declare");
+        constructorTempl = eval(base + "Constructor");
+
+        for iop in sIop, dIop:
+            header_output  += declareTempl.subst(iop)
+            decoder_output += constructorTempl.subst(iop)
+            exec_output    += PredOpExecute.subst(iop)
+
+    buildTernaryFpOp("Vfma",  "FpRegRegRegOp", "SimdFloatMultAccOp",
+                     "fpMulAdd<float>", "fpMulAdd<double>", " cOp1, cOp2,  cOp3" )
+    buildTernaryFpOp("Vfms",  "FpRegRegRegOp", "SimdFloatMultAccOp",
+                     "fpMulAdd<float>", "fpMulAdd<double>", "-cOp1, cOp2,  cOp3" )
+    buildTernaryFpOp("Vfnma", "FpRegRegRegOp", "SimdFloatMultAccOp",
+                     "fpMulAdd<float>", "fpMulAdd<double>", "-cOp1, cOp2, -cOp3" )
+    buildTernaryFpOp("Vfnms", "FpRegRegRegOp", "SimdFloatMultAccOp",
+                     "fpMulAdd<float>", "fpMulAdd<double>", " cOp1, cOp2, -cOp3" )
+
     def buildBinFpOp(name, Name, base, opClass, singleOp, doubleOp):
         global header_output, decoder_output, exec_output
 
@@ -830,7 +910,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         vfpFlushToZero(fpscr, FpOp1);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_uw = vfpFpSToFixed(FpOp1, false, false, 0, false);
+        FpDest_uw = vfpFpToFixed<float>(FpOp1, false, 32, 0, false);
         __asm__ __volatile__("" :: "m" (FpDest_uw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -849,7 +929,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t result = vfpFpDToFixed(cOp1, false, false, 0, false);
+        uint64_t result = vfpFpToFixed<double>(cOp1, false, 32, 0, false);
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
@@ -868,7 +948,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         vfpFlushToZero(fpscr, FpOp1);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_sw = vfpFpSToFixed(FpOp1, true, false, 0, false);
+        FpDest_sw = vfpFpToFixed<float>(FpOp1, true, 32, 0, false);
         __asm__ __volatile__("" :: "m" (FpDest_sw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -887,7 +967,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        int64_t result = vfpFpDToFixed(cOp1, true, false, 0, false);
+        int64_t result = vfpFpToFixed<double>(cOp1, true, 32, 0, false);
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
@@ -907,7 +987,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_uw = vfpFpSToFixed(FpOp1, false, false, 0);
+        FpDest_uw = vfpFpToFixed<float>(FpOp1, false, 32, 0);
         __asm__ __volatile__("" :: "m" (FpDest_uw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -927,7 +1007,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t result = vfpFpDToFixed(cOp1, false, false, 0);
+        uint64_t result = vfpFpToFixed<double>(cOp1, false, 32, 0);
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
@@ -947,7 +1027,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_sw = vfpFpSToFixed(FpOp1, true, false, 0);
+        FpDest_sw = vfpFpToFixed<float>(FpOp1, true, 32, 0);
         __asm__ __volatile__("" :: "m" (FpDest_sw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -967,7 +1047,7 @@
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        int64_t result = vfpFpDToFixed(cOp1, true, false, 0);
+        int64_t result = vfpFpToFixed<double>(cOp1, true, 32, 0);
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
@@ -1333,7 +1413,7 @@
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_sw = vfpFpSToFixed(FpOp1, true, false, imm);
+        FpDest_sw = vfpFpToFixed<float>(FpOp1, true, 32, imm);
         __asm__ __volatile__("" :: "m" (FpDest_sw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1352,7 +1432,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t mid = vfpFpDToFixed(cOp1, true, false, imm);
+        uint64_t mid = vfpFpToFixed<double>(cOp1, true, 32, imm);
         __asm__ __volatile__("" :: "m" (mid));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = mid;
@@ -1372,7 +1452,7 @@
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_uw = vfpFpSToFixed(FpOp1, false, false, imm);
+        FpDest_uw = vfpFpToFixed<float>(FpOp1, false, 32, imm);
         __asm__ __volatile__("" :: "m" (FpDest_uw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1391,7 +1471,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t mid = vfpFpDToFixed(cOp1, false, false, imm);
+        uint64_t mid = vfpFpToFixed<double>(cOp1, false, 32, imm);
         __asm__ __volatile__("" :: "m" (mid));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = mid;
@@ -1410,7 +1490,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1_sw) : "m" (FpOp1_sw));
-        FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_sw, false, imm);
+        FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_sw, 32, imm);
         __asm__ __volatile__("" :: "m" (FpDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1428,7 +1508,7 @@
         uint64_t mid = ((uint64_t)FpOp1P0_uw | ((uint64_t)FpOp1P1_uw << 32));
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (mid) : "m" (mid));
-        double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm);
+        double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, 32, imm);
         __asm__ __volatile__("" :: "m" (cDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = dblLow(cDest);
@@ -1447,7 +1527,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1_uw) : "m" (FpOp1_uw));
-        FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_uw, false, imm);
+        FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_uw, 32, imm);
         __asm__ __volatile__("" :: "m" (FpDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1465,7 +1545,7 @@
         uint64_t mid = ((uint64_t)FpOp1P0_uw | ((uint64_t)FpOp1P1_uw << 32));
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (mid) : "m" (mid));
-        double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm);
+        double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, 32, imm);
         __asm__ __volatile__("" :: "m" (cDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = dblLow(cDest);
@@ -1485,7 +1565,7 @@
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_sh = vfpFpSToFixed(FpOp1, true, true, imm);
+        FpDest_sh = vfpFpToFixed<float>(FpOp1, true, 16, imm);
         __asm__ __volatile__("" :: "m" (FpDest_sh));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1505,7 +1585,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t result = vfpFpDToFixed(cOp1, true, true, imm);
+        uint64_t result = vfpFpToFixed<double>(cOp1, true, 16, imm);
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
@@ -1526,7 +1606,7 @@
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_uh = vfpFpSToFixed(FpOp1, false, true, imm);
+        FpDest_uh = vfpFpToFixed<float>(FpOp1, false, 16, imm);
         __asm__ __volatile__("" :: "m" (FpDest_uh));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1546,7 +1626,7 @@
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t mid = vfpFpDToFixed(cOp1, false, true, imm);
+        uint64_t mid = vfpFpToFixed<double>(cOp1, false, 16, imm);
         __asm__ __volatile__("" :: "m" (mid));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = mid;
@@ -1566,7 +1646,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1_sh) : "m" (FpOp1_sh));
-        FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_sh, true, imm);
+        FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_sh, 16, imm);
         __asm__ __volatile__("" :: "m" (FpDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1585,7 +1665,7 @@
         uint64_t mid = ((uint64_t)FpOp1P0_uw | ((uint64_t)FpOp1P1_uw << 32));
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (mid) : "m" (mid));
-        double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm);
+        double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, 16, imm);
         __asm__ __volatile__("" :: "m" (cDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = dblLow(cDest);
@@ -1605,7 +1685,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (FpOp1_uh) : "m" (FpOp1_uh));
-        FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_uh, true, imm);
+        FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1_uh, 16, imm);
         __asm__ __volatile__("" :: "m" (FpDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
@@ -1624,7 +1704,7 @@
         uint64_t mid = ((uint64_t)FpOp1P0_uw | ((uint64_t)FpOp1P1_uw << 32));
         VfpSavedState state = prepFpState(fpscr.rMode);
         __asm__ __volatile__("" : "=m" (mid) : "m" (mid));
-        double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm);
+        double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, 16, imm);
         __asm__ __volatile__("" :: "m" (cDest));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = dblLow(cDest);
diff --git a/src/arch/arm/isa/insts/fp64.isa b/src/arch/arm/isa/insts/fp64.isa
new file mode 100644
index 0000000..95dec50
--- /dev/null
+++ b/src/arch/arm/isa/insts/fp64.isa
@@ -0,0 +1,811 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2012-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Thomas Grocutt
+//          Edmund Grimley Evans
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    fmovImmSCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = bits(imm, 31, 0);
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovImmSIop = InstObjParams("fmov", "FmovImmS", "FpRegImmOp",
+                                { "code": fmovImmSCode,
+                                  "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegImmOpDeclare.subst(fmovImmSIop);
+    decoder_output += FpRegImmOpConstructor.subst(fmovImmSIop);
+    exec_output    += BasicExecute.subst(fmovImmSIop);
+
+    fmovImmDCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = bits(imm, 31, 0);
+        AA64FpDestP1_uw = bits(imm, 63, 32);
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovImmDIop = InstObjParams("fmov", "FmovImmD", "FpRegImmOp",
+                                { "code": fmovImmDCode,
+                                  "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegImmOpDeclare.subst(fmovImmDIop);
+    decoder_output += AA64FpRegImmOpConstructor.subst(fmovImmDIop);
+    exec_output    += BasicExecute.subst(fmovImmDIop);
+
+    fmovRegSCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = AA64FpOp1P0_uw;
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovRegSIop = InstObjParams("fmov", "FmovRegS", "FpRegRegOp",
+                                { "code": fmovRegSCode,
+                                  "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovRegSIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovRegSIop);
+    exec_output    += BasicExecute.subst(fmovRegSIop);
+
+    fmovRegDCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = AA64FpOp1P0_uw;
+        AA64FpDestP1_uw = AA64FpOp1P1_uw;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovRegDIop = InstObjParams("fmov", "FmovRegD", "FpRegRegOp",
+                                { "code": fmovRegDCode,
+                                  "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovRegDIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovRegDIop);
+    exec_output    += BasicExecute.subst(fmovRegDIop);
+
+    fmovCoreRegWCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = WOp1_uw;
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovCoreRegWIop = InstObjParams("fmov", "FmovCoreRegW", "FpRegRegOp",
+                                    { "code": fmovCoreRegWCode,
+                                      "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovCoreRegWIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovCoreRegWIop);
+    exec_output    += BasicExecute.subst(fmovCoreRegWIop);
+
+    fmovCoreRegXCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP0_uw = XOp1_ud;
+        AA64FpDestP1_uw = XOp1_ud >> 32;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+    '''
+    fmovCoreRegXIop = InstObjParams("fmov", "FmovCoreRegX", "FpRegRegOp",
+                                    { "code": fmovCoreRegXCode,
+                                      "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovCoreRegXIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovCoreRegXIop);
+    exec_output    += BasicExecute.subst(fmovCoreRegXIop);
+
+    fmovUCoreRegXCode = vfp64EnabledCheckCode + '''
+        AA64FpDestP2_uw = XOp1_ud;
+        AA64FpDestP3_uw = XOp1_ud >> 32;
+    '''
+    fmovUCoreRegXIop = InstObjParams("fmov", "FmovUCoreRegX", "FpRegRegOp",
+                                    { "code": fmovUCoreRegXCode,
+                                      "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovUCoreRegXIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovUCoreRegXIop);
+    exec_output    += BasicExecute.subst(fmovUCoreRegXIop);
+
+    fmovRegCoreWCode = vfp64EnabledCheckCode + '''
+        WDest = AA64FpOp1P0_uw;
+    '''
+    fmovRegCoreWIop = InstObjParams("fmov", "FmovRegCoreW", "FpRegRegOp",
+                                     { "code": fmovRegCoreWCode,
+                                       "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovRegCoreWIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovRegCoreWIop);
+    exec_output    += BasicExecute.subst(fmovRegCoreWIop);
+
+    fmovRegCoreXCode = vfp64EnabledCheckCode + '''
+        XDest = ( ((uint64_t) AA64FpOp1P1_uw) << 32) | AA64FpOp1P0_uw;
+    '''
+    fmovRegCoreXIop = InstObjParams("fmov", "FmovRegCoreX", "FpRegRegOp",
+                                     { "code": fmovRegCoreXCode,
+                                       "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovRegCoreXIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovRegCoreXIop);
+    exec_output    += BasicExecute.subst(fmovRegCoreXIop);
+
+    fmovURegCoreXCode = vfp64EnabledCheckCode + '''
+        XDest = ( ((uint64_t) AA64FpOp1P3_uw) << 32) | AA64FpOp1P2_uw;
+    '''
+    fmovURegCoreXIop = InstObjParams("fmov", "FmovURegCoreX", "FpRegRegOp",
+                                    { "code":     fmovURegCoreXCode,
+                                      "op_class": "SimdFloatMiscOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fmovURegCoreXIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fmovURegCoreXIop);
+    exec_output    += BasicExecute.subst(fmovURegCoreXIop);
+}};
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    singleIntConvCode = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint32_t cOp1  = AA64FpOp1P0_uw;
+        uint32_t cDest = %(op)s;
+        AA64FpDestP0_uw = cDest;
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+
+    singleIntConvCode2 = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint32_t cOp1  = AA64FpOp1P0_uw;
+        uint32_t cOp2  = AA64FpOp2P0_uw;
+        uint32_t cDest = %(op)s;
+        AA64FpDestP0_uw = cDest;
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+
+    singleBinOp = "binaryOp(fpscr, AA64FpOp1P0, AA64FpOp2P0," + \
+                "%(func)s, fpscr.fz, fpscr.dn, fpscr.rMode)"
+    singleUnaryOp = "unaryOp(fpscr, AA64FpOp1P0, %(func)s, fpscr.fz, fpscr.rMode)"
+
+    doubleIntConvCode = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint64_t cOp1  = ((uint64_t) AA64FpOp1P1_uw) << 32 | AA64FpOp1P0_uw;
+        uint64_t cDest = %(op)s;
+        AA64FpDestP0_uw = cDest & 0xFFFFFFFF;
+        AA64FpDestP1_uw = cDest >> 32;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+
+    doubleIntConvCode2 = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint64_t cOp1  = ((uint64_t) AA64FpOp1P1_uw) << 32 | AA64FpOp1P0_uw;
+        uint64_t cOp2  = ((uint64_t) AA64FpOp2P1_uw) << 32 | AA64FpOp2P0_uw;
+        uint64_t cDest = %(op)s;
+        AA64FpDestP0_uw = cDest & 0xFFFFFFFF;
+        AA64FpDestP1_uw = cDest >> 32;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+
+    doubleBinOp = '''
+        binaryOp(fpscr, dbl(AA64FpOp1P0_uw, AA64FpOp1P1_uw),
+                        dbl(AA64FpOp2P0_uw, AA64FpOp2P1_uw),
+                        %(func)s, fpscr.fz, fpscr.dn, fpscr.rMode);
+    '''
+    doubleUnaryOp = '''
+        unaryOp(fpscr, dbl(AA64FpOp1P0_uw, AA64FpOp1P1_uw), %(func)s,
+                fpscr.fz, fpscr.rMode)
+    '''
+
+    def buildTernaryFpOp(name, opClass, sOp, dOp):
+        global header_output, decoder_output, exec_output
+        for isDouble in True, False:
+            code = vfp64EnabledCheckCode + '''
+                FPSCR fpscr = (FPSCR) FpscrExc;
+            '''
+            if isDouble:
+                code += '''
+                    uint64_t cOp1 = AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32;
+                    uint64_t cOp2 = AA64FpOp2P0_uw | (uint64_t)AA64FpOp2P1_uw << 32;
+                    uint64_t cOp3 = AA64FpOp3P0_uw | (uint64_t)AA64FpOp3P1_uw << 32;
+                    uint64_t cDest;
+                ''' "cDest = " + dOp + ";" + '''
+                    AA64FpDestP0_uw = cDest;
+                    AA64FpDestP1_uw = cDest >> 32;
+                '''
+            else:
+                code += '''
+                    uint32_t cOp1 = AA64FpOp1P0_uw;
+                    uint32_t cOp2 = AA64FpOp2P0_uw;
+                    uint32_t cOp3 = AA64FpOp3P0_uw;
+                    uint32_t cDest;
+                ''' "cDest = " + sOp + ";" + '''
+                    AA64FpDestP0_uw = cDest;
+                    AA64FpDestP1_uw = 0;
+                '''
+            code += '''
+                AA64FpDestP2_uw = 0;
+                AA64FpDestP3_uw = 0;
+                FpscrExc = fpscr;
+            '''
+
+            iop = InstObjParams(name.lower(), name + ("D" if isDouble else "S"),
+                                "FpRegRegRegRegOp",
+                                { "code": code, "op_class": opClass }, [])
+
+            header_output  += AA64FpRegRegRegRegOpDeclare.subst(iop)
+            decoder_output += AA64FpRegRegRegRegOpConstructor.subst(iop)
+            exec_output    += BasicExecute.subst(iop)
+
+    buildTernaryFpOp("FMAdd", "SimdFloatMultAccOp",
+                     "fplibMulAdd<uint32_t>(cOp3, cOp1, cOp2, fpscr)",
+                     "fplibMulAdd<uint64_t>(cOp3, cOp1, cOp2, fpscr)" )
+    buildTernaryFpOp("FMSub", "SimdFloatMultAccOp",
+                     "fplibMulAdd<uint32_t>(cOp3, fplibNeg<uint32_t>(cOp1), cOp2, fpscr)",
+                     "fplibMulAdd<uint64_t>(cOp3, fplibNeg<uint64_t>(cOp1), cOp2, fpscr)" )
+    buildTernaryFpOp("FNMAdd", "SimdFloatMultAccOp",
+                     "fplibMulAdd<uint32_t>(fplibNeg<uint32_t>(cOp3), fplibNeg<uint32_t>(cOp1), cOp2, fpscr)",
+                     "fplibMulAdd<uint64_t>(fplibNeg<uint64_t>(cOp3), fplibNeg<uint64_t>(cOp1), cOp2, fpscr)" )
+    buildTernaryFpOp("FNMSub", "SimdFloatMultAccOp",
+                     "fplibMulAdd<uint32_t>(fplibNeg<uint32_t>(cOp3), cOp1, cOp2, fpscr)",
+                     "fplibMulAdd<uint64_t>(fplibNeg<uint64_t>(cOp3), cOp1, cOp2, fpscr)" )
+
+    def buildBinFpOp(name, Name, base, opClass, singleOp, doubleOp):
+        global header_output, decoder_output, exec_output
+
+        code = singleIntConvCode2 % { "op": singleOp }
+        sIop = InstObjParams(name, Name + "S", base,
+                { "code": code,
+                  "op_class": opClass }, [])
+
+        code = doubleIntConvCode2 % { "op": doubleOp }
+        dIop = InstObjParams(name, Name + "D", base,
+                { "code": code,
+                  "op_class": opClass }, [])
+
+        declareTempl     = eval(         base + "Declare");
+        constructorTempl = eval("AA64" + base + "Constructor");
+
+        for iop in sIop, dIop:
+            header_output  += declareTempl.subst(iop)
+            decoder_output += constructorTempl.subst(iop)
+            exec_output    += BasicExecute.subst(iop)
+
+    buildBinFpOp("fadd", "FAdd", "FpRegRegRegOp", "SimdFloatAddOp",
+                 "fplibAdd<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibAdd<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fsub", "FSub", "FpRegRegRegOp", "SimdFloatAddOp",
+                 "fplibSub<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibSub<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fdiv", "FDiv", "FpRegRegRegOp", "SimdFloatDivOp",
+                 "fplibDiv<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibDiv<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fmul", "FMul", "FpRegRegRegOp", "SimdFloatMultOp",
+                 "fplibMul<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibMul<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fnmul", "FNMul", "FpRegRegRegOp", "SimdFloatMultOp",
+                 "fplibNeg<uint32_t>(fplibMul<uint32_t>(cOp1, cOp2, fpscr))",
+                 "fplibNeg<uint64_t>(fplibMul<uint64_t>(cOp1, cOp2, fpscr))")
+    buildBinFpOp("fmin", "FMin", "FpRegRegRegOp", "SimdFloatCmpOp",
+                 "fplibMin<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibMin<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fmax", "FMax", "FpRegRegRegOp", "SimdFloatCmpOp",
+                 "fplibMax<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibMax<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fminnm", "FMinNM", "FpRegRegRegOp", "SimdFloatCmpOp",
+                 "fplibMinNum<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibMinNum<uint64_t>(cOp1, cOp2, fpscr)")
+    buildBinFpOp("fmaxnm", "FMaxNM", "FpRegRegRegOp", "SimdFloatCmpOp",
+                 "fplibMaxNum<uint32_t>(cOp1, cOp2, fpscr)",
+                 "fplibMaxNum<uint64_t>(cOp1, cOp2, fpscr)")
+
+    def buildUnaryFpOp(name, Name, base, opClass, singleOp, doubleOp = None):
+        if doubleOp is None:
+            doubleOp = singleOp
+        global header_output, decoder_output, exec_output
+
+        code = singleIntConvCode % { "op": singleOp }
+        sIop = InstObjParams(name, Name + "S", base,
+                { "code": code,
+                  "op_class": opClass }, [])
+        code = doubleIntConvCode % { "op": doubleOp }
+        dIop = InstObjParams(name, Name + "D", base,
+                { "code": code,
+                  "op_class": opClass }, [])
+
+        declareTempl     = eval(         base + "Declare");
+        constructorTempl = eval("AA64" + base + "Constructor");
+
+        for iop in sIop, dIop:
+            header_output  += declareTempl.subst(iop)
+            decoder_output += constructorTempl.subst(iop)
+            exec_output    += BasicExecute.subst(iop)
+
+    buildUnaryFpOp("fsqrt", "FSqrt", "FpRegRegOp", "SimdFloatSqrtOp",
+                   "fplibSqrt<uint32_t>(cOp1, fpscr)", "fplibSqrt<uint64_t>(cOp1, fpscr)")
+
+    def buildSimpleUnaryFpOp(name, Name, base, opClass, singleOp,
+                             doubleOp = None, isIntConv = True):
+        if doubleOp is None:
+            doubleOp = singleOp
+        global header_output, decoder_output, exec_output
+
+        if isIntConv:
+            sCode = singleIntConvCode
+            dCode = doubleIntConvCode
+        else:
+            sCode = singleCode
+            dCode = doubleCode
+
+        for code, op, suffix in [[sCode, singleOp, "S"],
+                                 [dCode, doubleOp, "D"]]:
+            iop = InstObjParams(name, Name + suffix, base,
+                { "code": code % { "op": op },
+                  "op_class": opClass }, [])
+
+            declareTempl     = eval(         base + "Declare");
+            constructorTempl = eval("AA64" + base + "Constructor");
+
+            header_output  += declareTempl.subst(iop)
+            decoder_output += constructorTempl.subst(iop)
+            exec_output    += BasicExecute.subst(iop)
+
+    buildSimpleUnaryFpOp("fneg", "FNeg", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibNeg<uint32_t>(cOp1)", "fplibNeg<uint64_t>(cOp1)")
+    buildSimpleUnaryFpOp("fabs", "FAbs", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibAbs<uint32_t>(cOp1)", "fplibAbs<uint64_t>(cOp1)")
+    buildSimpleUnaryFpOp("frintn", "FRIntN", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPRounding_TIEEVEN, false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPRounding_TIEEVEN, false, fpscr)")
+    buildSimpleUnaryFpOp("frintp", "FRIntP", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPRounding_POSINF, false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPRounding_POSINF, false, fpscr)")
+    buildSimpleUnaryFpOp("frintm", "FRIntM", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPRounding_NEGINF, false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPRounding_NEGINF, false, fpscr)")
+    buildSimpleUnaryFpOp("frintz", "FRIntZ", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPRounding_ZERO, false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPRounding_ZERO, false, fpscr)")
+    buildSimpleUnaryFpOp("frinta", "FRIntA", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPRounding_TIEAWAY, false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPRounding_TIEAWAY, false, fpscr)")
+    buildSimpleUnaryFpOp("frinti", "FRIntI", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPCRRounding(fpscr), false, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPCRRounding(fpscr), false, fpscr)")
+    buildSimpleUnaryFpOp("frintx", "FRIntX", "FpRegRegOp", "SimdFloatMiscOp",
+                         "fplibRoundInt<uint32_t>(cOp1, FPCRRounding(fpscr), true, fpscr)",
+                         "fplibRoundInt<uint64_t>(cOp1, FPCRRounding(fpscr), true, fpscr)")
+}};
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    # Creates the integer to floating point instructions, including variants for
+    # signed/unsigned, float/double, etc
+    for regL, regOpL, width in [["W", "w", 32],
+                                ["X", "d", 64]]:
+        for isDouble in True, False:
+            for us, usCode in [["U", "uint%d_t cSrc = %sOp1_u%s;" %(width, regL, regOpL)],
+                               ["S", "int%d_t  cSrc = %sOp1_u%s;" %(width, regL, regOpL)]]:
+                fcvtIntFpDCode = vfp64EnabledCheckCode + '''
+                    FPSCR fpscr = (FPSCR) FpscrExc;
+                    %s
+                ''' %(usCode)
+
+                if isDouble:
+                    fcvtIntFpDCode += '''
+                        uint64_t cDest = fplibFixedToFP<uint64_t>(cSrc, 0,
+                            %s, FPCRRounding(fpscr), fpscr);
+                        AA64FpDestP0_uw = cDest;
+                        AA64FpDestP1_uw = cDest >> 32;
+                    ''' % ("true" if us == "U" else "false")
+                else:
+                    fcvtIntFpDCode += '''
+                        uint32_t cDest = fplibFixedToFP<uint32_t>(cSrc, 0,
+                            %s, FPCRRounding(fpscr), fpscr);
+                        AA64FpDestP0_uw = cDest;
+                        AA64FpDestP1_uw = 0;
+                    ''' % ("true" if us == "U" else "false")
+                fcvtIntFpDCode += '''
+                    AA64FpDestP2_uw = 0;
+                    AA64FpDestP3_uw = 0;
+                    FpscrExc = fpscr;
+                '''
+
+                instName = "Fcvt%s%sIntFp%s" %(regL, us, "D" if isDouble else "S")
+                mnem     = "%scvtf" %(us.lower())
+                fcvtIntFpDIop = InstObjParams(mnem, instName, "FpRegRegOp",
+                                                { "code": fcvtIntFpDCode,
+                                                  "op_class": "SimdFloatCvtOp" }, [])
+                header_output  += FpRegRegOpDeclare.subst(fcvtIntFpDIop);
+                decoder_output += AA64FpRegRegOpConstructor.subst(fcvtIntFpDIop);
+                exec_output    += BasicExecute.subst(fcvtIntFpDIop);
+
+    # Generates the floating point to integer conversion instructions in various
+    # variants, eg signed/unsigned
+    def buildFpCvtIntOp(isDouble, isSigned, isXReg):
+        global header_output, decoder_output, exec_output
+
+        for rmode, roundingMode in [["N", "FPRounding_TIEEVEN"],
+                                    ["P", "FPRounding_POSINF"],
+                                    ["M", "FPRounding_NEGINF"],
+                                    ["Z", "FPRounding_ZERO"],
+                                    ["A", "FPRounding_TIEAWAY"]]:
+            fcvtFpIntCode = vfp64EnabledCheckCode + '''
+                FPSCR fpscr = (FPSCR) FpscrExc;'''
+            if isDouble:
+                fcvtFpIntCode += '''
+                uint64_t cOp1 = AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32;
+                '''
+            else:
+                fcvtFpIntCode += "uint32_t cOp1 = AA64FpOp1P0_uw;"
+
+            fcvtFpIntCode += '''
+                %sDest = fplibFPToFixed<uint%s_t, uint%s_t>(cOp1, 0, %s, %s, fpscr);
+                FpscrExc = fpscr;
+            ''' %("X"      if isXReg   else "W",
+                  "64"     if isDouble else "32",
+                  "64"     if isXReg   else "32",
+                  "false"  if isSigned else "true",
+                  roundingMode)
+
+            instName = "FcvtFp%sInt%s%s%s" %("S" if isSigned else "U",
+                                             "X" if isXReg   else "W",
+                                             "D" if isDouble else "S", rmode)
+            mnem     = "fcvt%s%s" %(rmode, "s" if isSigned else "u")
+            fcvtFpIntIop = InstObjParams(mnem, instName, "FpRegRegOp",
+                                        { "code": fcvtFpIntCode,
+                                        "op_class": "SimdFloatCvtOp" }, [])
+            header_output  += FpRegRegOpDeclare.subst(fcvtFpIntIop);
+            decoder_output += FpRegRegOpConstructor.subst(fcvtFpIntIop);
+            exec_output    += BasicExecute.subst(fcvtFpIntIop);
+
+    # Now actually do the building with the different variants
+    for isDouble in True, False:
+       for isSigned in True, False:
+           for isXReg in True, False:
+             buildFpCvtIntOp(isDouble, isSigned, isXReg)
+
+    fcvtFpSFpDCode = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint64_t cDest = fplibConvert<uint32_t, uint64_t>(AA64FpOp1P0_uw,
+            FPCRRounding(fpscr), fpscr);
+        AA64FpDestP0_uw = cDest;
+        AA64FpDestP1_uw = cDest >> 32;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+    fcvtFpSFpDIop = InstObjParams("fcvt", "FCvtFpSFpD", "FpRegRegOp",
+                                     { "code": fcvtFpSFpDCode,
+                                       "op_class": "SimdFloatCvtOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fcvtFpSFpDIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fcvtFpSFpDIop);
+    exec_output    += BasicExecute.subst(fcvtFpSFpDIop);
+
+    fcvtFpDFpSCode = vfp64EnabledCheckCode + '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        uint64_t cOp1 = AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32;
+        AA64FpDestP0_uw = fplibConvert<uint64_t, uint32_t>(cOp1,
+            FPCRRounding(fpscr), fpscr);
+        AA64FpDestP1_uw = 0;
+        AA64FpDestP2_uw = 0;
+        AA64FpDestP3_uw = 0;
+        FpscrExc = fpscr;
+    '''
+    fcvtFpDFpSIop = InstObjParams("fcvt", "FcvtFpDFpS", "FpRegRegOp",
+                                 {"code":     fcvtFpDFpSCode,
+                                  "op_class": "SimdFloatCvtOp" }, [])
+    header_output  += FpRegRegOpDeclare.subst(fcvtFpDFpSIop);
+    decoder_output += AA64FpRegRegOpConstructor.subst(fcvtFpDFpSIop);
+    exec_output    += BasicExecute.subst(fcvtFpDFpSIop);
+
+    # Half precision to single or double precision conversion
+    for isDouble in True, False:
+        code = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            %s cDest = fplibConvert<uint16_t, uint%s_t>(AA64FpOp1P0_uw,
+                FPCRRounding(fpscr), fpscr);
+        ''' % ("uint64_t" if isDouble else "uint32_t",
+               "64" if isDouble else "32")
+        if isDouble:
+            code += '''
+                AA64FpDestP0_uw = cDest;
+                AA64FpDestP1_uw = cDest >> 32;
+            '''
+        else:
+            code += '''
+                AA64FpDestP0_uw = cDest;
+                AA64FpDestP1_uw = 0;
+            '''
+        code += '''
+            AA64FpDestP2_uw = 0;
+            AA64FpDestP3_uw = 0;
+            FpscrExc = fpscr;
+        '''
+
+        instName = "FcvtFpHFp%s" %("D" if isDouble else "S")
+        fcvtFpHFpIop = InstObjParams("fcvt", instName, "FpRegRegOp",
+                                     { "code": code,
+                                       "op_class": "SimdFloatCvtOp" }, [])
+        header_output  += FpRegRegOpDeclare.subst(fcvtFpHFpIop);
+        decoder_output += AA64FpRegRegOpConstructor.subst(fcvtFpHFpIop);
+        exec_output    += BasicExecute.subst(fcvtFpHFpIop);
+
+    # single or double precision to Half precision conversion
+    for isDouble in True, False:
+        code = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            %s;
+            AA64FpDestP0_uw = fplibConvert<uint%s_t, uint16_t>(cOp1,
+                FPCRRounding(fpscr), fpscr);
+            AA64FpDestP1_uw = 0;
+            AA64FpDestP2_uw = 0;
+            AA64FpDestP3_uw = 0;
+            FpscrExc = fpscr;
+        ''' % ("uint64_t cOp1 = AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32"
+               if isDouble else "uint32_t cOp1 = AA64FpOp1P0_uw",
+               "64" if isDouble else "32")
+
+        instName = "FcvtFp%sFpH" %("D" if isDouble else "S")
+        fcvtFpFpHIop = InstObjParams("fcvt", instName, "FpRegRegOp",
+                                     { "code": code,
+                                       "op_class": "SimdFloatCvtOp" }, [])
+        header_output  += FpRegRegOpDeclare.subst(fcvtFpFpHIop);
+        decoder_output += AA64FpRegRegOpConstructor.subst(fcvtFpFpHIop);
+        exec_output    += BasicExecute.subst(fcvtFpFpHIop);
+
+    # Build the various versions of the floating point compare instructions
+    def buildFCmpOp(isQuiet, isDouble, isImm):
+        global header_output, decoder_output, exec_output
+
+        fcmpCode = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            %s cOp1 = %s;
+        ''' % ("uint64_t" if isDouble else "uint32_t",
+               "AA64FpDestP0_uw | (uint64_t)AA64FpDestP1_uw << 32"
+               if isDouble else "AA64FpDestP0_uw")
+        if isImm:
+            fcmpCode += '''
+                %s cOp2 = imm;
+            ''' % ("uint64_t" if isDouble else "uint32_t")
+        else:
+            fcmpCode += '''
+                %s cOp2  = %s;
+            ''' % ("uint64_t" if isDouble else "uint32_t",
+                   "AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32"
+                   if isDouble else "AA64FpOp1P0_uw")
+        fcmpCode += '''
+            int cc = fplibCompare<uint%s_t>(cOp1, cOp2, %s, fpscr);
+            CondCodesNZ = cc >> 2 & 3;
+            CondCodesC = cc >> 1 & 1;
+            CondCodesV = cc & 1;
+            FpCondCodes = fpscr & FpCondCodesMask;
+            FpscrExc    = fpscr;
+        ''' % ("64" if isDouble else "32", "false" if isQuiet else "true")
+
+        typeName = "Imm" if isImm else "Reg"
+        instName = "FCmp%s%s%s" %(""  if isQuiet  else "E", typeName,
+                                  "D" if isDouble else "S")
+        fcmpIop = InstObjParams("fcmp%s" %(""  if isQuiet else "e"), instName,
+                                "FpReg%sOp" %(typeName),
+                               {"code":     fcmpCode,
+                                "op_class": "SimdFloatCmpOp"}, [])
+
+        declareTemp     = eval("FpReg%sOpDeclare"         %(typeName));
+        constructorTemp = eval("AA64FpReg%sOpConstructor" %(typeName));
+        header_output  += declareTemp.subst(fcmpIop);
+        decoder_output += constructorTemp.subst(fcmpIop);
+        exec_output    += BasicExecute.subst(fcmpIop);
+
+    for isQuiet in True, False:
+        for isDouble in True, False:
+            for isImm in True, False:
+                buildFCmpOp(isQuiet, isDouble, isImm)
+
+    # Build the various versions of the conditional floating point compare
+    # instructions
+    def buildFCCmpOp(isQuiet, isDouble):
+        global header_output, decoder_output, exec_output
+
+        fccmpCode = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode)) {
+                %s cOp1 = %s;
+                %s cOp2 = %s;
+                int cc = fplibCompare<uint%s_t>(cOp1, cOp2, %s, fpscr);
+                CondCodesNZ = cc >> 2 & 3;
+                CondCodesC = cc >> 1 & 1;
+                CondCodesV = cc & 1;
+            } else {
+                CondCodesNZ = (defCc >> 2) & 0x3;
+                CondCodesC  = (defCc >> 1) & 0x1;
+                CondCodesV  = defCc & 0x1;
+            }
+            FpCondCodes = fpscr & FpCondCodesMask;
+            FpscrExc    = fpscr;
+        ''' % ("uint64_t" if isDouble else "uint32_t",
+               "AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32"
+               if isDouble else "AA64FpOp1P0_uw",
+               "uint64_t" if isDouble else "uint32_t",
+               "AA64FpOp2P0_uw | (uint64_t)AA64FpOp2P1_uw << 32"
+               if isDouble else "AA64FpOp2P0_uw",
+               "64" if isDouble else "32", "false" if isQuiet else "true")
+
+        instName = "FCCmp%sReg%s" %(""  if isQuiet  else "E",
+                                    "D" if isDouble else "S")
+        fccmpIop = InstObjParams("fccmp%s" %(""  if isQuiet  else "e"),
+                                 instName, "FpCondCompRegOp",
+                                {"code":           fccmpCode,
+                                 "op_class":       "SimdFloatCmpOp"}, [])
+        header_output  += DataXCondCompRegDeclare.subst(fccmpIop);
+        decoder_output += DataXCondCompRegConstructor.subst(fccmpIop);
+        exec_output    += BasicExecute.subst(fccmpIop);
+
+    for isQuiet in True, False:
+        for isDouble in True, False:
+            buildFCCmpOp(isQuiet, isDouble)
+
+}};
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    # Generates the variants of the floating to fixed point instructions
+    def buildFpCvtFixedOp(isSigned, isDouble, isXReg):
+        global header_output, decoder_output, exec_output
+
+        fcvtFpFixedCode = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+        '''
+        if isDouble:
+            fcvtFpFixedCode += '''
+                uint64_t cOp1 = AA64FpOp1P0_uw | (uint64_t)AA64FpOp1P1_uw << 32;
+            '''
+        else:
+            fcvtFpFixedCode += "uint32_t cOp1 = AA64FpOp1P0_uw;"
+        fcvtFpFixedCode += '''
+            %sDest = fplibFPToFixed<uint%s_t, uint%s_t>(cOp1, 64 - imm, %s,
+                FPRounding_ZERO, fpscr);
+            FpscrExc = fpscr;
+        ''' %("X"      if isXReg   else "W",
+              "64"     if isDouble else "32",
+              "64"     if isXReg   else "32",
+              "false"  if isSigned else "true")
+
+        instName = "FcvtFp%sFixed%s%s" %("S" if isSigned else "U",
+                                         "D" if isDouble else "S",
+                                         "X" if isXReg   else "W")
+        mnem = "fcvtz%s" %("s" if isSigned else "u")
+        fcvtFpFixedIop = InstObjParams(mnem, instName, "FpRegRegImmOp",
+                                       { "code": fcvtFpFixedCode,
+                                         "op_class": "SimdFloatCvtOp" }, [])
+        header_output  += FpRegRegImmOpDeclare.subst(fcvtFpFixedIop);
+        decoder_output += AA64FpRegRegImmOpConstructor.subst(fcvtFpFixedIop);
+        exec_output    += BasicExecute.subst(fcvtFpFixedIop);
+
+    # Generates the variants of the fixed to floating point instructions
+    def buildFixedCvtFpOp(isSigned, isDouble, isXReg):
+        global header_output, decoder_output, exec_output
+
+        srcRegType = "X" if isXReg   else "W"
+        fcvtFixedFpCode = vfp64EnabledCheckCode + '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            %s result = fplibFixedToFP<uint%s_t>((%s%s_t)%sOp1, 64 - imm,
+                %s, FPCRRounding(fpscr), fpscr);
+        ''' %("uint64_t" if isDouble else "uint32_t",
+              "64" if isDouble else "32",
+              "int" if isSigned else "uint", "64" if isXReg else "32",
+              srcRegType,
+              "false" if isSigned else "true")
+        if isDouble:
+            fcvtFixedFpCode += '''
+                AA64FpDestP0_uw = result;
+                AA64FpDestP1_uw = result >> 32;
+            '''
+        else:
+            fcvtFixedFpCode += '''
+                AA64FpDestP0_uw = result;
+                AA64FpDestP1_uw = 0;
+            '''
+        fcvtFixedFpCode += '''
+            AA64FpDestP2_uw = 0;
+            AA64FpDestP3_uw = 0;
+            FpscrExc = fpscr;
+        '''
+
+        instName = "Fcvt%sFixedFp%s%s" %("S" if isSigned else "U",
+                                         "D" if isDouble else "S",
+                                         srcRegType)
+        mnem = "%scvtf" %("s" if isSigned else "u")
+        fcvtFixedFpIop = InstObjParams(mnem, instName, "FpRegRegImmOp",
+                                       { "code":     fcvtFixedFpCode,
+                                         "op_class": "SimdFloatCvtOp" }, [])
+        header_output  += FpRegRegImmOpDeclare.subst(fcvtFixedFpIop);
+        decoder_output += FpRegRegImmOpConstructor.subst(fcvtFixedFpIop);
+        exec_output    += BasicExecute.subst(fcvtFixedFpIop);
+
+    # loop over the variants building the instructions for each
+    for isXReg in True, False:
+        for isDouble in True, False:
+            for isSigned in True, False:
+                buildFpCvtFixedOp(isSigned, isDouble, isXReg)
+                buildFixedCvtFpOp(isSigned, isDouble, isXReg)
+}};
+
+let {{
+
+    header_output  = ""
+    decoder_output = ""
+    exec_output    = ""
+
+    for isDouble in True, False:
+        code = '''
+            if (testPredicate(CondCodesNZ, CondCodesC, CondCodesV, condCode)) {
+                AA64FpDestP0_uw = AA64FpOp1P0_uw;
+        '''
+        if isDouble:
+            code += '''
+                    AA64FpDestP1_uw = AA64FpOp1P1_uw;
+                } else {
+                    AA64FpDestP0_uw = AA64FpOp2P0_uw;
+                    AA64FpDestP1_uw = AA64FpOp2P1_uw;
+                }
+            '''
+        else:
+            code += '''
+                } else {
+                    AA64FpDestP0_uw = AA64FpOp2P0_uw;
+                }
+                AA64FpDestP1_uw = 0;
+            '''
+        code += '''
+            AA64FpDestP2_uw = 0;
+            AA64FpDestP3_uw = 0;
+        '''
+
+        iop = InstObjParams("fcsel", "FCSel%s" %("D" if isDouble else "S"),
+                            "FpCondSelOp", code)
+        header_output  += DataXCondSelDeclare.subst(iop)
+        decoder_output += DataXCondSelConstructor.subst(iop)
+        exec_output    += BasicExecute.subst(iop)
+}};
diff --git a/src/arch/arm/isa/insts/insts.isa b/src/arch/arm/isa/insts/insts.isa
index c01e87d..9d90f77 100644
--- a/src/arch/arm/isa/insts/insts.isa
+++ b/src/arch/arm/isa/insts/insts.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -37,6 +37,9 @@
 //
 // Authors: Gabe Black
 
+//AArch64 instructions
+##include "aarch64.isa"
+
 //Basic forms of various templates
 ##include "basic.isa"
 
@@ -46,8 +49,15 @@
 //Loads of a single item
 ##include "ldr.isa"
 
+//Loads of a single item, AArch64
+##include "ldr64.isa"
+
 //Miscellaneous instructions that don't fit elsewhere
 ##include "misc.isa"
+##include "misc64.isa"
+
+//Stores of a single item, AArch64
+##include "str64.isa"
 
 //Stores of a single item
 ##include "str.isa"
@@ -61,8 +71,12 @@
 //Data processing instructions
 ##include "data.isa"
 
+//AArch64 data processing instructions
+##include "data64.isa"
+
 //Branches
 ##include "branch.isa"
+##include "branch64.isa"
 
 //Multiply
 ##include "mult.isa"
@@ -72,9 +86,14 @@
 
 //VFP
 ##include "fp.isa"
+##include "fp64.isa"
 
 //Neon
 ##include "neon.isa"
 
+//AArch64 Neon
+##include "neon64.isa"
+##include "neon64_mem.isa"
+
 //m5 Pseudo-ops
 ##include "m5ops.isa"
diff --git a/src/arch/arm/isa/insts/ldr.isa b/src/arch/arm/isa/insts/ldr.isa
index f599fa4..6bfe401 100644
--- a/src/arch/arm/isa/insts/ldr.isa
+++ b/src/arch/arm/isa/insts/ldr.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -38,6 +38,7 @@
 // Authors: Gabe Black
 
 let {{
+    import math
 
     header_output = ""
     decoder_output = ""
@@ -78,7 +79,8 @@
              newDecoder,
              newExec) = self.fillTemplates(self.name, self.Name, codeBlobs,
                                            self.memFlags, instFlags, base,
-                                           wbDecl, pcDecl, self.rasPop)
+                                           wbDecl, pcDecl, self.rasPop,
+                                           self.size, self.sign)
 
             header_output += newHeader
             decoder_output += newDecoder
@@ -160,7 +162,7 @@
                                       self.size, self.sign, self.user)
 
             # Add memory request flags where necessary
-            self.memFlags.append("%d" % (self.size - 1))
+            self.memFlags.append("%d" % int(math.log(self.size, 2)))
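+            # The flag value now encodes log2(size) for alignment checking
+            # rather than the old size - 1 byte mask.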
             if self.user:
                 self.memFlags.append("ArmISA::TLB::UserMode")
 
diff --git a/src/arch/arm/isa/insts/ldr64.isa b/src/arch/arm/isa/insts/ldr64.isa
new file mode 100644
index 0000000..78460f6
--- /dev/null
+++ b/src/arch/arm/isa/insts/ldr64.isa
@@ -0,0 +1,446 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    class LoadInst64(LoadStoreInst):
+        execBase = 'Load64'
+        micro = False
+
+        def __init__(self, mnem, Name, size=4, sign=False, user=False,
+                     literal=False, flavor="normal", top=False):
+            super(LoadInst64, self).__init__()
+
+            self.name = mnem
+            self.Name = Name
+            self.size = size
+            self.sign = sign
+            self.user = user
+            self.literal = literal
+            self.flavor = flavor
+            self.top = top
+
+            self.memFlags = ["ArmISA::TLB::MustBeOne"]
+            self.instFlags = []
+            self.codeBlobs = {"postacc_code" : ""}
+
+            # Add memory request flags where necessary
+            if self.user:
+                self.memFlags.append("ArmISA::TLB::UserMode")
+
+            if self.flavor == "dprefetch":
+                self.memFlags.append("Request::PREFETCH")
+                self.instFlags = ['IsDataPrefetch']
+            elif self.flavor == "iprefetch":
+                self.memFlags.append("Request::PREFETCH")
+                self.instFlags = ['IsInstPrefetch']
+            if self.micro:
+                self.instFlags.append("IsMicroop")
+
+            if self.flavor in ("acexp", "exp"):
+                # For exclusive pair ops the alignment check is based on
+                # the total size
+                self.memFlags.append("%d" % int(math.log(self.size, 2) + 1))
+            elif not (self.size == 16 and self.top):
+                # Only the first microop should perform alignment checking.
+                self.memFlags.append("%d" % int(math.log(self.size, 2)))
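+            # e.g. an 8-byte exclusive-pair access appends "4"
+            # (log2(8) + 1, i.e. 16-byte alignment for the pair), while a
+            # plain 8-byte load appends "3" (log2(8)).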
+
+            if self.flavor not in ("acquire", "acex", "exclusive",
+                                   "acexp", "exp"):
+                self.memFlags.append("ArmISA::TLB::AllowUnaligned")
+
+            if self.flavor in ("acquire", "acex", "acexp"):
+                self.instFlags.extend(["IsMemBarrier",
+                                       "IsWriteBarrier",
+                                       "IsReadBarrier"])
+            if self.flavor in ("acex", "exclusive", "exp", "acexp"):
+                self.memFlags.append("Request::LLSC")
+
+        def buildEACode(self):
+            # Address computation code
+            eaCode = ""
+            if self.flavor == "fp":
+                eaCode += vfp64EnabledCheckCode
+
+            if self.literal:
+                eaCode += "EA = RawPC"
+            else:
+                eaCode += SPAlignmentCheckCode + "EA = XBase"
+
+            if self.size == 16:
+                if self.top:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 0 : 8)"
+                else:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 8 : 0)"
+            if not self.post:
+                eaCode += self.offset
+            eaCode += ";"
+
+            self.codeBlobs["ea_code"] = eaCode
+
+        def emitHelper(self, base='Memory64', wbDecl=None):
+            global header_output, decoder_output, exec_output
+
+            # If this is a microop itself, don't allow anything that would
+            # require further microcoding.
+            if self.micro:
+                assert not wbDecl
+
+            fa_code = None
+            if not self.micro and self.flavor in ("normal", "widen", "acquire"):
+                fa_code = '''
+                    fault->annotate(ArmFault::SAS, %s);
+                    fault->annotate(ArmFault::SSE, %s);
+                    fault->annotate(ArmFault::SRT, dest);
+                    fault->annotate(ArmFault::SF, %s);
+                    fault->annotate(ArmFault::AR, %s);
+                ''' % ("0" if self.size == 1 else
+                       "1" if self.size == 2 else
+                       "2" if self.size == 4 else "3",
+                       "true" if self.sign else "false",
+                       "true" if (self.size == 8 or
+                                  self.flavor == "widen") else "false",
+                       "true" if self.flavor == "acquire" else "false")
+
+            (newHeader, newDecoder, newExec) = \
+                self.fillTemplates(self.name, self.Name, self.codeBlobs,
+                                   self.memFlags, self.instFlags,
+                                   base, wbDecl, faCode=fa_code)
+
+            header_output += newHeader
+            decoder_output += newDecoder
+            exec_output += newExec
+
+    class LoadImmInst64(LoadInst64):
+        def __init__(self, *args, **kargs):
+            super(LoadImmInst64, self).__init__(*args, **kargs)
+            self.offset = " + imm"
+
+            self.wbDecl = "MicroAddXiUop(machInst, base, base, imm);"
+
+    class LoadRegInst64(LoadInst64):
+        def __init__(self, *args, **kargs):
+            super(LoadRegInst64, self).__init__(*args, **kargs)
+            self.offset = " + extendReg64(XOffset, type, shiftAmt, 64)"
+
+            self.wbDecl = \
+                "MicroAddXERegUop(machInst, base, base, " + \
+                "                 offset, type, shiftAmt);"
+
+    class LoadRawRegInst64(LoadInst64):
+        def __init__(self, *args, **kargs):
+            super(LoadRawRegInst64, self).__init__(*args, **kargs)
+            self.offset = ""
+
+    class LoadSingle64(LoadInst64):
+        def emit(self):
+            self.buildEACode()
+
+            # Code that actually handles the access
+            if self.flavor in ("dprefetch", "iprefetch"):
+                accCode = 'uint64_t temp M5_VAR_USED = Mem%s;'
+            elif self.flavor == "fp":
+                if self.size in (1, 2, 4):
+                    accCode = '''
+                        AA64FpDestP0_uw = cSwap(Mem%s,
+                                                isBigEndian64(xc->tcBase()));
+                        AA64FpDestP1_uw = 0;
+                        AA64FpDestP2_uw = 0;
+                        AA64FpDestP3_uw = 0;
+                    '''
+                elif self.size == 8 or (self.size == 16 and not self.top):
+                    accCode = '''
+                        uint64_t data = cSwap(Mem%s,
+                                              isBigEndian64(xc->tcBase()));
+                        AA64FpDestP0_uw = (uint32_t)data;
+                        AA64FpDestP1_uw = (data >> 32);
+                    '''
+                    # Only zero out the other half if this isn't part of a
+                    # pair of 8-byte loads implementing a 16-byte load.
+                    if self.size == 8:
+                        accCode += '''
+                            AA64FpDestP2_uw = 0;
+                            AA64FpDestP3_uw = 0;
+                        '''
+                elif self.size == 16 and self.top:
+                    accCode = '''
+                        uint64_t data = cSwap(Mem%s,
+                                              isBigEndian64(xc->tcBase()));
+                        AA64FpDestP2_uw = (uint32_t)data;
+                        AA64FpDestP3_uw = (data >> 32);
+                    '''
+            elif self.flavor == "widen" or self.size == 8:
+                accCode = "XDest = cSwap(Mem%s, isBigEndian64(xc->tcBase()));"
+            else:
+                accCode = "WDest = cSwap(Mem%s, isBigEndian64(xc->tcBase()));"
+            if self.size == 16:
+                accCode = accCode % buildMemSuffix(self.sign, 8)
+            else:
+                accCode = accCode % buildMemSuffix(self.sign, self.size)
+
+            self.codeBlobs["memacc_code"] = accCode
+
+            # Push it out to the output files
+            wbDecl = None
+            if self.writeback and not self.micro:
+                wbDecl = self.wbDecl
+            self.emitHelper(self.base, wbDecl)
+
+    class LoadDouble64(LoadInst64):
+        def emit(self):
+            self.buildEACode()
+
+            # Code that actually handles the access
+            if self.flavor == "fp":
+                accCode = '''
+                    uint64_t data = cSwap(Mem_ud, isBigEndian64(xc->tcBase()));
+                    AA64FpDestP0_uw = (uint32_t)data;
+                    AA64FpDestP1_uw = 0;
+                    AA64FpDestP2_uw = 0;
+                    AA64FpDestP3_uw = 0;
+                    AA64FpDest2P0_uw = (data >> 32);
+                    AA64FpDest2P1_uw = 0;
+                    AA64FpDest2P2_uw = 0;
+                    AA64FpDest2P3_uw = 0;
+                '''
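+                # The single 8-byte access spans the register pair: the low
+                # 32 bits go to AA64FpDest, the high 32 bits to AA64FpDest2.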
+            else:
+                if self.sign:
+                    if self.size == 4:
+                        accCode = '''
+                            uint64_t data = cSwap(Mem_ud,
+                                                  isBigEndian64(xc->tcBase()));
+                            XDest = sext<32>((uint32_t)data);
+                            XDest2 = sext<32>(data >> 32);
+                        '''
+                    elif self.size == 8:
+                        accCode = '''
+                            XDest = sext<64>(Mem_tud.a);
+                            XDest2 = sext<64>(Mem_tud.b);
+                        '''
+                else:
+                    if self.size == 4:
+                        accCode = '''
+                            uint64_t data = cSwap(Mem_ud,
+                                                  isBigEndian64(xc->tcBase()));
+                            XDest = (uint32_t)data;
+                            XDest2 = data >> 32;
+                        '''
+                    elif self.size == 8:
+                        accCode = '''
+                            XDest = Mem_tud.a;
+                            XDest2 = Mem_tud.b;
+                        '''
+            self.codeBlobs["memacc_code"] = accCode
+
+            # Push it out to the output files
+            wbDecl = None
+            if self.writeback and not self.micro:
+                wbDecl = self.wbDecl
+            self.emitHelper(self.base, wbDecl)
+
+    class LoadImm64(LoadImmInst64, LoadSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryImm64'
+        writeback = False
+        post = False
+
+    class LoadPre64(LoadImmInst64, LoadSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryPreIndex64'
+        writeback = True
+        post = False
+
+    class LoadPost64(LoadImmInst64, LoadSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryPostIndex64'
+        writeback = True
+        post = True
+
+    class LoadReg64(LoadRegInst64, LoadSingle64):
+        decConstBase = 'LoadStoreReg64'
+        base = 'ArmISA::MemoryReg64'
+        writeback = False
+        post = False
+
+    class LoadRaw64(LoadRawRegInst64, LoadSingle64):
+        decConstBase = 'LoadStoreRaw64'
+        base = 'ArmISA::MemoryRaw64'
+        writeback = False
+        post = False
+
+    class LoadEx64(LoadRawRegInst64, LoadSingle64):
+        decConstBase = 'LoadStoreEx64'
+        base = 'ArmISA::MemoryEx64'
+        writeback = False
+        post = False
+
+    class LoadLit64(LoadImmInst64, LoadSingle64):
+        decConstBase = 'LoadStoreLit64'
+        base = 'ArmISA::MemoryLiteral64'
+        writeback = False
+        post = False
+
+    def buildLoads64(mnem, NameBase, size, sign, flavor="normal"):
+        LoadImm64(mnem, NameBase + "_IMM", size, sign, flavor=flavor).emit()
+        LoadPre64(mnem, NameBase + "_PRE", size, sign, flavor=flavor).emit()
+        LoadPost64(mnem, NameBase + "_POST", size, sign, flavor=flavor).emit()
+        LoadReg64(mnem, NameBase + "_REG", size, sign, flavor=flavor).emit()
+
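+    # Each buildLoads64 call below emits four addressing variants of one
+    # load: immediate, pre-indexed, post-indexed and register offset.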
+    buildLoads64("ldrb", "LDRB64", 1, False)
+    buildLoads64("ldrsb", "LDRSBW64", 1, True)
+    buildLoads64("ldrsb", "LDRSBX64", 1, True, flavor="widen")
+    buildLoads64("ldrh", "LDRH64", 2, False)
+    buildLoads64("ldrsh", "LDRSHW64", 2, True)
+    buildLoads64("ldrsh", "LDRSHX64", 2, True, flavor="widen")
+    buildLoads64("ldrsw", "LDRSW64", 4, True, flavor="widen")
+    buildLoads64("ldr", "LDRW64", 4, False)
+    buildLoads64("ldr", "LDRX64", 8, False)
+    buildLoads64("ldr", "LDRBFP64", 1, False, flavor="fp")
+    buildLoads64("ldr", "LDRHFP64", 2, False, flavor="fp")
+    buildLoads64("ldr", "LDRSFP64", 4, False, flavor="fp")
+    buildLoads64("ldr", "LDRDFP64", 8, False, flavor="fp")
+
+    LoadImm64("prfm", "PRFM64_IMM", 8, flavor="dprefetch").emit()
+    LoadReg64("prfm", "PRFM64_REG", 8, flavor="dprefetch").emit()
+    LoadLit64("prfm", "PRFM64_LIT", 8, literal=True, flavor="dprefetch").emit()
+    LoadImm64("prfum", "PRFUM64_IMM", 8, flavor="dprefetch").emit()
+
+    LoadImm64("ldurb", "LDURB64_IMM", 1, False).emit()
+    LoadImm64("ldursb", "LDURSBW64_IMM", 1, True).emit()
+    LoadImm64("ldursb", "LDURSBX64_IMM", 1, True, flavor="widen").emit()
+    LoadImm64("ldurh", "LDURH64_IMM", 2, False).emit()
+    LoadImm64("ldursh", "LDURSHW64_IMM", 2, True).emit()
+    LoadImm64("ldursh", "LDURSHX64_IMM", 2, True, flavor="widen").emit()
+    LoadImm64("ldursw", "LDURSW64_IMM", 4, True, flavor="widen").emit()
+    LoadImm64("ldur", "LDURW64_IMM", 4, False).emit()
+    LoadImm64("ldur", "LDURX64_IMM", 8, False).emit()
+    LoadImm64("ldur", "LDURBFP64_IMM", 1, flavor="fp").emit()
+    LoadImm64("ldur", "LDURHFP64_IMM", 2, flavor="fp").emit()
+    LoadImm64("ldur", "LDURSFP64_IMM", 4, flavor="fp").emit()
+    LoadImm64("ldur", "LDURDFP64_IMM", 8, flavor="fp").emit()
+
+    LoadImm64("ldtrb", "LDTRB64_IMM", 1, False, True).emit()
+    LoadImm64("ldtrsb", "LDTRSBW64_IMM", 1, True, True).emit()
+    LoadImm64("ldtrsb", "LDTRSBX64_IMM", 1, True, True, flavor="widen").emit()
+    LoadImm64("ldtrh", "LDTRH64_IMM", 2, False, True).emit()
+    LoadImm64("ldtrsh", "LDTRSHW64_IMM", 2, True, True).emit()
+    LoadImm64("ldtrsh", "LDTRSHX64_IMM", 2, True, True, flavor="widen").emit()
+    LoadImm64("ldtrsw", "LDTRSW64_IMM", 4, True, flavor="widen").emit()
+    LoadImm64("ldtr", "LDTRW64_IMM", 4, False, True).emit()
+    LoadImm64("ldtr", "LDTRX64_IMM", 8, False, True).emit()
+
+    LoadLit64("ldrsw", "LDRSWL64_LIT", 4, True, \
+              literal=True, flavor="widen").emit()
+    LoadLit64("ldr", "LDRWL64_LIT", 4, False, literal=True).emit()
+    LoadLit64("ldr", "LDRXL64_LIT", 8, False, literal=True).emit()
+    LoadLit64("ldr", "LDRSFP64_LIT", 4, literal=True, flavor="fp").emit()
+    LoadLit64("ldr", "LDRDFP64_LIT", 8, literal=True, flavor="fp").emit()
+
+    LoadRaw64("ldar", "LDARX64", 8, flavor="acquire").emit()
+    LoadRaw64("ldar", "LDARW64", 4, flavor="acquire").emit()
+    LoadRaw64("ldarh", "LDARH64", 2, flavor="acquire").emit()
+    LoadRaw64("ldarb", "LDARB64", 1, flavor="acquire").emit()
+
+    LoadEx64("ldaxr", "LDAXRX64", 8, flavor="acex").emit()
+    LoadEx64("ldaxr", "LDAXRW64", 4, flavor="acex").emit()
+    LoadEx64("ldaxrh", "LDAXRH64", 2, flavor="acex").emit()
+    LoadEx64("ldaxrb", "LDAXRB64", 1, flavor="acex").emit()
+
+    LoadEx64("ldxr", "LDXRX64", 8, flavor="exclusive").emit()
+    LoadEx64("ldxr", "LDXRW64", 4, flavor="exclusive").emit()
+    LoadEx64("ldxrh", "LDXRH64", 2, flavor="exclusive").emit()
+    LoadEx64("ldxrb", "LDXRB64", 1, flavor="exclusive").emit()
+
+    class LoadImmU64(LoadImm64):
+        decConstBase = 'LoadStoreImmU64'
+        micro = True
+
+    class LoadImmDU64(LoadImmInst64, LoadDouble64):
+        decConstBase = 'LoadStoreImmDU64'
+        base = 'ArmISA::MemoryDImm64'
+        micro = True
+        post = False
+        writeback = False
+
+    class LoadImmDouble64(LoadImmInst64, LoadDouble64):
+        decConstBase = 'LoadStoreImmDU64'
+        base = 'ArmISA::MemoryDImm64'
+        micro = False
+        post = False
+        writeback = False
+
+    class LoadRegU64(LoadReg64):
+        decConstBase = 'LoadStoreRegU64'
+        micro = True
+
+    class LoadLitU64(LoadLit64):
+        decConstBase = 'LoadStoreLitU64'
+        micro = True
+
+    LoadImmDouble64("ldaxp", "LDAXPW64", 4, flavor="acexp").emit()
+    LoadImmDouble64("ldaxp", "LDAXPX64", 8, flavor="acexp").emit()
+    LoadImmDouble64("ldxp", "LDXPW64", 4, flavor="exp").emit()
+    LoadImmDouble64("ldxp", "LDXPX64", 8, flavor="exp").emit()
+
+    LoadImmU64("ldrxi_uop", "MicroLdrXImmUop", 8).emit()
+    LoadRegU64("ldrxr_uop", "MicroLdrXRegUop", 8).emit()
+    LoadLitU64("ldrxl_uop", "MicroLdrXLitUop", 8, literal=True).emit()
+    LoadImmU64("ldrfpxi_uop", "MicroLdrFpXImmUop", 8, flavor="fp").emit()
+    LoadRegU64("ldrfpxr_uop", "MicroLdrFpXRegUop", 8, flavor="fp").emit()
+    LoadLitU64("ldrfpxl_uop", "MicroLdrFpXLitUop", 8, literal=True,
+               flavor="fp").emit()
+    LoadImmU64("ldrqbfpxi_uop", "MicroLdrQBFpXImmUop",
+               16, flavor="fp", top = False).emit()
+    LoadRegU64("ldrqbfpxr_uop", "MicroLdrQBFpXRegUop",
+               16, flavor="fp", top = False).emit()
+    LoadLitU64("ldrqbfpxl_uop", "MicroLdrQBFpXLitUop",
+               16, literal=True, flavor="fp", top = False).emit()
+    LoadImmU64("ldrqtfpxi_uop", "MicroLdrQTFpXImmUop",
+               16, flavor="fp", top = True).emit()
+    LoadRegU64("ldrqtfpxr_uop", "MicroLdrQTFpXRegUop",
+               16, flavor="fp", top = True).emit()
+    LoadLitU64("ldrqtfpxl_uop", "MicroLdrQTFpXLitUop",
+               16, literal=True, flavor="fp", top = True).emit()
+    LoadImmDU64("ldrduxi_uop", "MicroLdrDUXImmUop", 4, sign=False).emit()
+    LoadImmDU64("ldrdsxi_uop", "MicroLdrDSXImmUop", 4, sign=True).emit()
+    LoadImmDU64("ldrdfpxi_uop", "MicroLdrDFpXImmUop", 4, flavor="fp").emit()
+}};
diff --git a/src/arch/arm/isa/insts/m5ops.isa b/src/arch/arm/isa/insts/m5ops.isa
index 06ed34a..928d1be 100644
--- a/src/arch/arm/isa/insts/m5ops.isa
+++ b/src/arch/arm/isa/insts/m5ops.isa
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2012-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -58,6 +58,7 @@
     armCode = '''
     PseudoInst::arm(xc->tcBase());
     '''
+
     armIop = InstObjParams("arm", "Arm", "PredOp",
                            { "code": armCode,
                              "predicate_test": predicateTest },
@@ -69,6 +70,7 @@
     quiesceCode = '''
     PseudoInst::quiesce(xc->tcBase());
     '''
+
     quiesceIop = InstObjParams("quiesce", "Quiesce", "PredOp",
                            { "code": quiesceCode,
                              "predicate_test": predicateTest },
@@ -81,6 +83,10 @@
     PseudoInst::quiesceNs(xc->tcBase(), join32to64(R1, R0));
     '''
 
+    quiesceNsCode64 = '''
+    PseudoInst::quiesceNs(xc->tcBase(), X0);
+    '''
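+    # In AArch64 the 64-bit argument arrives whole in X0 (X0/X1 for two
+    # arguments), so no join32to64(R1, R0) reassembly is needed.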
+
     quiesceNsIop = InstObjParams("quiesceNs", "QuiesceNs", "PredOp",
                            { "code": quiesceNsCode,
                              "predicate_test": predicateTest },
@@ -89,10 +95,22 @@
     decoder_output += BasicConstructor.subst(quiesceNsIop)
     exec_output += QuiescePredOpExecute.subst(quiesceNsIop)
 
+    quiesceNsIop = InstObjParams("quiesceNs", "QuiesceNs64", "PredOp",
+                           { "code": quiesceNsCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsQuiesce"])
+    header_output += BasicDeclare.subst(quiesceNsIop)
+    decoder_output += BasicConstructor.subst(quiesceNsIop)
+    exec_output += QuiescePredOpExecute.subst(quiesceNsIop)
+
     quiesceCyclesCode = '''
     PseudoInst::quiesceCycles(xc->tcBase(), join32to64(R1, R0));
     '''
 
+    quiesceCyclesCode64 = '''
+    PseudoInst::quiesceCycles(xc->tcBase(), X0);
+    '''
+
     quiesceCyclesIop = InstObjParams("quiesceCycles", "QuiesceCycles", "PredOp",
                            { "code": quiesceCyclesCode,
                              "predicate_test": predicateTest },
@@ -101,12 +119,23 @@
     decoder_output += BasicConstructor.subst(quiesceCyclesIop)
     exec_output += QuiescePredOpExecute.subst(quiesceCyclesIop)
 
+    quiesceCyclesIop = InstObjParams("quiesceCycles", "QuiesceCycles64", "PredOp",
+                           { "code": quiesceCyclesCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsQuiesce", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(quiesceCyclesIop)
+    decoder_output += BasicConstructor.subst(quiesceCyclesIop)
+    exec_output += QuiescePredOpExecute.subst(quiesceCyclesIop)
+
     quiesceTimeCode = '''
     uint64_t qt_val = PseudoInst::quiesceTime(xc->tcBase());
     R0 = bits(qt_val, 31, 0);
     R1 = bits(qt_val, 63, 32);
     '''
 
+    quiesceTimeCode64 = '''
+    X0 = PseudoInst::quiesceTime(xc->tcBase());
+    '''
     quiesceTimeIop = InstObjParams("quiesceTime", "QuiesceTime", "PredOp",
                            { "code": quiesceTimeCode,
                              "predicate_test": predicateTest },
@@ -115,12 +144,23 @@
     decoder_output += BasicConstructor.subst(quiesceTimeIop)
     exec_output += PredOpExecute.subst(quiesceTimeIop)
 
+    quiesceTimeIop = InstObjParams("quiesceTime", "QuiesceTime64", "PredOp",
+                           { "code": quiesceTimeCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(quiesceTimeIop)
+    decoder_output += BasicConstructor.subst(quiesceTimeIop)
+    exec_output += PredOpExecute.subst(quiesceTimeIop)
+
     rpnsCode = '''
     uint64_t rpns_val = PseudoInst::rpns(xc->tcBase());
     R0 = bits(rpns_val, 31, 0);
     R1 = bits(rpns_val, 63, 32);
     '''
 
+    rpnsCode64 = '''
+    X0 = PseudoInst::rpns(xc->tcBase());
+    '''
     rpnsIop = InstObjParams("rpns", "Rpns", "PredOp",
                            { "code": rpnsCode,
                              "predicate_test": predicateTest },
@@ -129,10 +169,22 @@
     decoder_output += BasicConstructor.subst(rpnsIop)
     exec_output += PredOpExecute.subst(rpnsIop)
 
+    rpnsIop = InstObjParams("rpns", "Rpns64", "PredOp",
+                           { "code": rpnsCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(rpnsIop)
+    decoder_output += BasicConstructor.subst(rpnsIop)
+    exec_output += PredOpExecute.subst(rpnsIop)
+
     wakeCpuCode = '''
     PseudoInst::wakeCPU(xc->tcBase(), join32to64(R1,R0));
     '''
 
+    wakeCpuCode64 = '''
+    PseudoInst::wakeCPU(xc->tcBase(), X0);
+    '''
+
     wakeCPUIop = InstObjParams("wakeCPU", "WakeCPU", "PredOp",
                    { "code": wakeCpuCode,
                      "predicate_test": predicateTest },
@@ -141,6 +193,14 @@
     decoder_output += BasicConstructor.subst(wakeCPUIop)
     exec_output += PredOpExecute.subst(wakeCPUIop)
 
+    wakeCPUIop = InstObjParams("wakeCPU", "WakeCPU64", "PredOp",
+                   { "code": wakeCpuCode64,
+                     "predicate_test": predicateTest },
+                     ["IsNonSpeculative", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(wakeCPUIop)
+    decoder_output += BasicConstructor.subst(wakeCPUIop)
+    exec_output += PredOpExecute.subst(wakeCPUIop)
+
     deprecated_ivlbIop = InstObjParams("deprecated_ivlb", "Deprecated_ivlb", "PredOp",
                            { "code": '''warn_once("Obsolete M5 ivlb instruction encountered.\\n");''',
                              "predicate_test": predicateTest })
@@ -171,6 +231,11 @@
     m5exit_code = '''
         PseudoInst::m5exit(xc->tcBase(), join32to64(R1, R0));
     '''
+
+    m5exit_code64 = '''
+        PseudoInst::m5exit(xc->tcBase(), X0);
+    '''
+
     m5exitIop = InstObjParams("m5exit", "M5exit", "PredOp",
                                    { "code": m5exit_code,
                                      "predicate_test": predicateTest },
@@ -190,6 +255,14 @@
     decoder_output += BasicConstructor.subst(m5failIop)
     exec_output += PredOpExecute.subst(m5failIop)
 
+    m5exitIop = InstObjParams("m5exit", "M5exit64", "PredOp",
+                                   { "code": m5exit_code64,
+                                     "predicate_test": predicateTest },
+                                     ["No_OpClass", "IsNonSpeculative"])
+    header_output += BasicDeclare.subst(m5exitIop)
+    decoder_output += BasicConstructor.subst(m5exitIop)
+    exec_output += PredOpExecute.subst(m5exitIop)
+
     loadsymbolCode = '''
     PseudoInst::loadsymbol(xc->tcBase());
     '''
@@ -208,6 +281,10 @@
     R1 = bits(ip_val, 63, 32);
     '''
 
+    initparamCode64 = '''
+    X0 = PseudoInst::initParam(xc->tcBase());
+    '''
+
     initparamIop = InstObjParams("initparam", "Initparam", "PredOp",
                            { "code": initparamCode,
                              "predicate_test": predicateTest },
@@ -216,10 +293,21 @@
     decoder_output += BasicConstructor.subst(initparamIop)
     exec_output += PredOpExecute.subst(initparamIop)
 
+    initparamIop = InstObjParams("initparam", "Initparam64", "PredOp",
+                           { "code": initparamCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(initparamIop)
+    decoder_output += BasicConstructor.subst(initparamIop)
+    exec_output += PredOpExecute.subst(initparamIop)
+
     resetstats_code = '''
     PseudoInst::resetstats(xc->tcBase(), join32to64(R1, R0), join32to64(R3, R2));
     '''
 
+    resetstats_code64 = '''
+    PseudoInst::resetstats(xc->tcBase(), X0, X1);
+    '''
     resetstatsIop = InstObjParams("resetstats", "Resetstats", "PredOp",
                            { "code": resetstats_code,
                              "predicate_test": predicateTest },
@@ -228,9 +316,22 @@
     decoder_output += BasicConstructor.subst(resetstatsIop)
     exec_output += PredOpExecute.subst(resetstatsIop)
 
+    resetstatsIop = InstObjParams("resetstats", "Resetstats64", "PredOp",
+                           { "code": resetstats_code64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(resetstatsIop)
+    decoder_output += BasicConstructor.subst(resetstatsIop)
+    exec_output += PredOpExecute.subst(resetstatsIop)
+
     dumpstats_code = '''
     PseudoInst::dumpstats(xc->tcBase(), join32to64(R1, R0), join32to64(R3, R2));
     '''
+
+    dumpstats_code64 = '''
+    PseudoInst::dumpstats(xc->tcBase(), X0, X1);
+    '''
+
     dumpstatsIop = InstObjParams("dumpstats", "Dumpstats", "PredOp",
                            { "code": dumpstats_code,
                              "predicate_test": predicateTest },
@@ -239,9 +340,22 @@
     decoder_output += BasicConstructor.subst(dumpstatsIop)
     exec_output += PredOpExecute.subst(dumpstatsIop)
 
+    dumpstatsIop = InstObjParams("dumpstats", "Dumpstats64", "PredOp",
+                           { "code": dumpstats_code64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(dumpstatsIop)
+    decoder_output += BasicConstructor.subst(dumpstatsIop)
+    exec_output += PredOpExecute.subst(dumpstatsIop)
+
     dumpresetstats_code = '''
     PseudoInst::dumpresetstats(xc->tcBase(), join32to64(R1, R0), join32to64(R3, R2));
     '''
+
+    dumpresetstats_code64 = '''
+    PseudoInst::dumpresetstats(xc->tcBase(), X0, X1);
+    '''
+
     dumpresetstatsIop = InstObjParams("dumpresetstats", "Dumpresetstats", "PredOp",
                            { "code": dumpresetstats_code,
                              "predicate_test": predicateTest },
@@ -250,9 +364,22 @@
     decoder_output += BasicConstructor.subst(dumpresetstatsIop)
     exec_output += PredOpExecute.subst(dumpresetstatsIop)
 
+    dumpresetstatsIop = InstObjParams("dumpresetstats", "Dumpresetstats64", "PredOp",
+                           { "code": dumpresetstats_code64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(dumpresetstatsIop)
+    decoder_output += BasicConstructor.subst(dumpresetstatsIop)
+    exec_output += PredOpExecute.subst(dumpresetstatsIop)
+
     m5checkpoint_code = '''
     PseudoInst::m5checkpoint(xc->tcBase(), join32to64(R1, R0), join32to64(R3, R2));
     '''
+
+    m5checkpoint_code64 = '''
+    PseudoInst::m5checkpoint(xc->tcBase(), X0, X1);
+    '''
+
     m5checkpointIop = InstObjParams("m5checkpoint", "M5checkpoint", "PredOp",
                            { "code": m5checkpoint_code,
                              "predicate_test": predicateTest },
@@ -261,11 +388,27 @@
     decoder_output += BasicConstructor.subst(m5checkpointIop)
     exec_output += PredOpExecute.subst(m5checkpointIop)
 
+    m5checkpointIop = InstObjParams("m5checkpoint", "M5checkpoint64", "PredOp",
+                           { "code": m5checkpoint_code64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(m5checkpointIop)
+    decoder_output += BasicConstructor.subst(m5checkpointIop)
+    exec_output += PredOpExecute.subst(m5checkpointIop)
+
     m5readfileCode = '''
     int n = 4;
     uint64_t offset = getArgument(xc->tcBase(), n, sizeof(uint64_t), false);
     R0 = PseudoInst::readfile(xc->tcBase(), R0, join32to64(R3,R2), offset);
     '''
+
+    m5readfileCode64 = '''
+    int n = 4;
+    uint64_t offset = getArgument(xc->tcBase(), n, sizeof(uint64_t), false);
+    n = 6;
+    X0 = PseudoInst::readfile(xc->tcBase(), (uint32_t)X0, X1, offset);
+    '''
+
     m5readfileIop = InstObjParams("m5readfile", "M5readfile", "PredOp",
                            { "code": m5readfileCode,
                              "predicate_test": predicateTest },
@@ -274,6 +417,14 @@
     decoder_output += BasicConstructor.subst(m5readfileIop)
     exec_output += PredOpExecute.subst(m5readfileIop)
 
+    m5readfileIop = InstObjParams("m5readfile", "M5readfile64", "PredOp",
+                           { "code": m5readfileCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(m5readfileIop)
+    decoder_output += BasicConstructor.subst(m5readfileIop)
+    exec_output += PredOpExecute.subst(m5readfileIop)
+
     m5writefileCode = '''
     int n = 4;
     uint64_t offset = getArgument(xc->tcBase(), n, sizeof(uint64_t), false);
@@ -282,6 +433,16 @@
     R0 = PseudoInst::writefile(xc->tcBase(), R0, join32to64(R3,R2), offset,
                                 filenameAddr);
     '''
+
+    m5writefileCode64 = '''
+    int n = 4;
+    uint64_t offset = getArgument(xc->tcBase(), n, sizeof(uint64_t), false);
+    n = 6;
+    Addr filenameAddr = getArgument(xc->tcBase(), n, sizeof(Addr), false);
+    X0 = PseudoInst::writefile(xc->tcBase(), (uint32_t)X0, X1, offset,
+                                filenameAddr);
+    '''
+
     m5writefileIop = InstObjParams("m5writefile", "M5writefile", "PredOp",
                            { "code": m5writefileCode,
                              "predicate_test": predicateTest },
@@ -290,6 +451,14 @@
     decoder_output += BasicConstructor.subst(m5writefileIop)
     exec_output += PredOpExecute.subst(m5writefileIop)
 
+    m5writefileIop = InstObjParams("m5writefile", "M5writefile64", "PredOp",
+                           { "code": m5writefileCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(m5writefileIop)
+    decoder_output += BasicConstructor.subst(m5writefileIop)
+    exec_output += PredOpExecute.subst(m5writefileIop)
+
     m5breakIop = InstObjParams("m5break", "M5break", "PredOp",
                            { "code": "PseudoInst::debugbreak(xc->tcBase());",
                              "predicate_test": predicateTest },
@@ -309,6 +478,9 @@
     m5addsymbolCode = '''
     PseudoInst::addsymbol(xc->tcBase(), join32to64(R1, R0), R2);
     '''
+    m5addsymbolCode64 = '''
+    PseudoInst::addsymbol(xc->tcBase(), X0, (uint32_t)X1);
+    '''
     m5addsymbolIop = InstObjParams("m5addsymbol", "M5addsymbol", "PredOp",
                            { "code": m5addsymbolCode,
                              "predicate_test": predicateTest },
@@ -317,8 +489,17 @@
     decoder_output += BasicConstructor.subst(m5addsymbolIop)
     exec_output += PredOpExecute.subst(m5addsymbolIop)
 
+    m5addsymbolIop = InstObjParams("m5addsymbol", "M5addsymbol64", "PredOp",
+                           { "code": m5addsymbolCode64,
+                             "predicate_test": predicateTest },
+                             ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(m5addsymbolIop)
+    decoder_output += BasicConstructor.subst(m5addsymbolIop)
+    exec_output += PredOpExecute.subst(m5addsymbolIop)
+
     m5panicCode = '''panic("M5 panic instruction called at pc=%#x.",
                      xc->pcState().pc());'''
+
     m5panicIop = InstObjParams("m5panic", "M5panic", "PredOp",
                      { "code": m5panicCode,
                        "predicate_test": predicateTest },
@@ -332,6 +513,13 @@
                           join32to64(R1, R0),
                           join32to64(R3, R2)
                       );'''
+
+    m5workbeginCode64 = '''PseudoInst::workbegin(
+                          xc->tcBase(),
+                          X0,
+                          X1
+                      );'''
+
     m5workbeginIop = InstObjParams("m5workbegin", "M5workbegin", "PredOp",
                      { "code": m5workbeginCode,
                        "predicate_test": predicateTest },
@@ -340,11 +528,26 @@
     decoder_output += BasicConstructor.subst(m5workbeginIop)
     exec_output += PredOpExecute.subst(m5workbeginIop)
 
+    m5workbeginIop = InstObjParams("m5workbegin", "M5workbegin64", "PredOp",
+                     { "code": m5workbeginCode64,
+                       "predicate_test": predicateTest },
+                       ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(m5workbeginIop)
+    decoder_output += BasicConstructor.subst(m5workbeginIop)
+    exec_output += PredOpExecute.subst(m5workbeginIop)
+
     m5workendCode = '''PseudoInst::workend(
                         xc->tcBase(),
                         join32to64(R1, R0),
                         join32to64(R3, R2)
                     );'''
+
+    m5workendCode64 = '''PseudoInst::workend(
+                        xc->tcBase(),
+                        X0,
+                        X1
+                    );'''
+
     m5workendIop = InstObjParams("m5workend", "M5workend", "PredOp",
                      { "code": m5workendCode,
                        "predicate_test": predicateTest },
@@ -353,4 +556,11 @@
     decoder_output += BasicConstructor.subst(m5workendIop)
     exec_output += PredOpExecute.subst(m5workendIop)
 
+    m5workendIop = InstObjParams("m5workend", "M5workend64", "PredOp",
+                     { "code": m5workendCode64,
+                       "predicate_test": predicateTest },
+                       ["IsNonSpeculative"])
+    header_output += BasicDeclare.subst(m5workendIop)
+    decoder_output += BasicConstructor.subst(m5workendIop)
+    exec_output += PredOpExecute.subst(m5workendIop)
 }};
diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa
index db36a3f..f164595 100644
--- a/src/arch/arm/isa/insts/macromem.isa
+++ b/src/arch/arm/isa/insts/macromem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -91,7 +91,8 @@
         SCTLR sctlr = Sctlr;
 
         CPSR new_cpsr =
-            cpsrWriteByInstr(old_cpsr, Spsr, 0xF, true, sctlr.nmfi);
+            cpsrWriteByInstr(old_cpsr, Spsr, Scr, Nsacr, 0xF, true,
+                             sctlr.nmfi, xc->tcBase());
         Cpsr = ~CondCodesMask & new_cpsr;
         CondCodesNZ = new_cpsr.nz;
         CondCodesC = new_cpsr.c;
@@ -158,8 +159,8 @@
 
     header_output = decoder_output = exec_output = ''
 
-    loadIops = (microLdrUopIop, microLdrRetUopIop,
-                microLdrFpUopIop, microLdrDBFpUopIop, microLdrDTFpUopIop)
+    loadIops = (microLdrUopIop, microLdrRetUopIop, microLdrFpUopIop,
+                microLdrDBFpUopIop, microLdrDTFpUopIop)
     storeIops = (microStrUopIop, microStrFpUopIop,
                  microStrDBFpUopIop, microStrDTFpUopIop)
     for iop in loadIops + storeIops:
@@ -178,7 +179,7 @@
 let {{
     exec_output = header_output = ''
 
-    eaCode = 'EA = URa + imm;'
+    eaCode = 'EA = XURa + imm;'
 
     for size in (1, 2, 3, 4, 6, 8, 12, 16):
         # Set up the memory access.
@@ -592,6 +593,26 @@
         URa = URb + shift_rm_imm(URc, shiftAmt, shiftType, OptShiftRmCondCodesC);
     '''
 
+    microAddXiUopIop = InstObjParams('addxi_uop', 'MicroAddXiUop',
+                                     'MicroIntImmXOp',
+                                     'XURa = XURb + imm;',
+                                     ['IsMicroop'])
+
+    microAddXiSpAlignUopIop = InstObjParams('addxi_uop', 'MicroAddXiSpAlignUop',
+                                            'MicroIntImmXOp', '''
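+        // Fault if the base register is SP, SP is not 16-byte aligned
+        // (any of bits 3:0 set) and SP alignment checking is enabled.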
+        if (isSP((IntRegIndex) urb) && bits(XURb, 3, 0) &&
+            SPAlignmentCheckEnabled(xc->tcBase())) {
+            return new SPAlignmentFault();
+        }
+        XURa = XURb + imm;
+    ''', ['IsMicroop'])
+
+    microAddXERegUopIop = InstObjParams('addxr_uop', 'MicroAddXERegUop',
+                                        'MicroIntRegXOp',
+                                        'XURa = XURb + '
+                                        'extendReg64(XURc, type, shiftAmt, 64);',
+                                        ['IsMicroop'])
+
     microAddUopIop = InstObjParams('add_uop', 'MicroAddUop',
                                    'MicroIntRegOp',
                                    {'code': microAddUopCode,
@@ -604,6 +625,11 @@
                                      'predicate_test': predicateTest},
                                     ['IsMicroop'])
 
+    microSubXiUopIop = InstObjParams('subxi_uop', 'MicroSubXiUop',
+                                     'MicroIntImmXOp',
+                                     'XURa = XURb - imm;',
+                                     ['IsMicroop'])
+
     microSubUopCode = '''
         URa = URb - shift_rm_imm(URc, shiftAmt, shiftType, OptShiftRmCondCodesC);
     '''
@@ -631,8 +657,8 @@
                     SCTLR sctlr = Sctlr;
                     pNPC = URa;
                     CPSR new_cpsr =
-                    cpsrWriteByInstr(cpsrOrCondCodes, URb,
-                                     0xF, true, sctlr.nmfi);
+                    cpsrWriteByInstr(cpsrOrCondCodes, URb, Scr, Nsacr,
+                                     0xF, true, sctlr.nmfi, xc->tcBase());
                     Cpsr = ~CondCodesMask & new_cpsr;
                     NextThumb = new_cpsr.t;
                     NextJazelle = new_cpsr.j;
@@ -651,25 +677,37 @@
                                          ['IsMicroop'])
 
     header_output = MicroIntImmDeclare.subst(microAddiUopIop) + \
+                    MicroIntImmDeclare.subst(microAddXiUopIop) + \
+                    MicroIntImmDeclare.subst(microAddXiSpAlignUopIop) + \
                     MicroIntImmDeclare.subst(microSubiUopIop) + \
+                    MicroIntImmDeclare.subst(microSubXiUopIop) + \
                     MicroIntRegDeclare.subst(microAddUopIop) + \
                     MicroIntRegDeclare.subst(microSubUopIop) + \
+                    MicroIntXERegDeclare.subst(microAddXERegUopIop) + \
                     MicroIntMovDeclare.subst(microUopRegMovIop) + \
                     MicroIntMovDeclare.subst(microUopRegMovRetIop) + \
                     MicroSetPCCPSRDeclare.subst(microUopSetPCCPSRIop)
 
     decoder_output = MicroIntImmConstructor.subst(microAddiUopIop) + \
+                     MicroIntImmXConstructor.subst(microAddXiUopIop) + \
+                     MicroIntImmXConstructor.subst(microAddXiSpAlignUopIop) + \
                      MicroIntImmConstructor.subst(microSubiUopIop) + \
+                     MicroIntImmXConstructor.subst(microSubXiUopIop) + \
                      MicroIntRegConstructor.subst(microAddUopIop) + \
                      MicroIntRegConstructor.subst(microSubUopIop) + \
+                     MicroIntXERegConstructor.subst(microAddXERegUopIop) + \
                      MicroIntMovConstructor.subst(microUopRegMovIop) + \
                      MicroIntMovConstructor.subst(microUopRegMovRetIop) + \
                      MicroSetPCCPSRConstructor.subst(microUopSetPCCPSRIop)
 
     exec_output = PredOpExecute.subst(microAddiUopIop) + \
+                  BasicExecute.subst(microAddXiUopIop) + \
+                  BasicExecute.subst(microAddXiSpAlignUopIop) + \
                   PredOpExecute.subst(microSubiUopIop) + \
+                  BasicExecute.subst(microSubXiUopIop) + \
                   PredOpExecute.subst(microAddUopIop) + \
                   PredOpExecute.subst(microSubUopIop) + \
+                  BasicExecute.subst(microAddXERegUopIop) + \
                   PredOpExecute.subst(microUopRegMovIop) + \
                   PredOpExecute.subst(microUopRegMovRetIop) + \
                   PredOpExecute.subst(microUopSetPCCPSRIop)
@@ -681,6 +719,25 @@
     header_output = MacroMemDeclare.subst(iop)
     decoder_output = MacroMemConstructor.subst(iop)
 
+    iop = InstObjParams("ldpstp", "LdpStp", 'PairMemOp', "", [])
+    header_output += PairMemDeclare.subst(iop)
+    decoder_output += PairMemConstructor.subst(iop)
+
+    iopImm = InstObjParams("bigfpmemimm", "BigFpMemImm", "BigFpMemImmOp", "")
+    iopPre = InstObjParams("bigfpmempre", "BigFpMemPre", "BigFpMemPreOp", "")
+    iopPost = InstObjParams("bigfpmempost", "BigFpMemPost", "BigFpMemPostOp", "")
+    for iop in (iopImm, iopPre, iopPost):
+        header_output += BigFpMemImmDeclare.subst(iop)
+        decoder_output += BigFpMemImmConstructor.subst(iop)
+
+    iop = InstObjParams("bigfpmemreg", "BigFpMemReg", "BigFpMemRegOp", "")
+    header_output += BigFpMemRegDeclare.subst(iop)
+    decoder_output += BigFpMemRegConstructor.subst(iop)
+
+    iop = InstObjParams("bigfpmemlit", "BigFpMemLit", "BigFpMemLitOp", "")
+    header_output += BigFpMemLitDeclare.subst(iop)
+    decoder_output += BigFpMemLitConstructor.subst(iop)
+
     iop = InstObjParams("vldmult", "VldMult", 'VldMultOp', "", [])
     header_output += VMemMultDeclare.subst(iop)
     decoder_output += VMemMultConstructor.subst(iop)
diff --git a/src/arch/arm/isa/insts/mem.isa b/src/arch/arm/isa/insts/mem.isa
index c39f1b1..aed6bab 100644
--- a/src/arch/arm/isa/insts/mem.isa
+++ b/src/arch/arm/isa/insts/mem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -48,8 +48,8 @@
             self.constructTemplate = eval(self.decConstBase + 'Constructor')
 
         def fillTemplates(self, name, Name, codeBlobs, memFlags, instFlags,
-                          base = 'Memory', wbDecl = None, pcDecl = None,
-                          rasPop = False):
+                          base='Memory', wbDecl=None, pcDecl=None,
+                          rasPop=False, size=4, sign=False, faCode=None):
             # Make sure flags are in lists (convert to lists if not).
             memFlags = makeList(memFlags)
             instFlags = makeList(instFlags)
@@ -63,6 +63,22 @@
 
             codeBlobs["ea_code"] = eaCode
 
+            if faCode:
+                # For AArch64 the fa_code snippet is passed in fully assembled
+                codeBlobs["fa_code"] = faCode
+            elif wbDecl == None:
+                codeBlobs["fa_code"] = '''
+                    if (dest != INTREG_PC) {
+                        fault->annotate(ArmFault::SAS, %s);
+                        fault->annotate(ArmFault::SSE, %s);
+                        fault->annotate(ArmFault::SRT, dest);
+                    }
+                ''' %("0"    if size == 1 else
+                      "1"    if size == 2 else "2",
+                      "true" if sign      else "false")
+            else:
+                codeBlobs["fa_code"] = ''
+
             macroName = Name
             instFlagsCopy = list(instFlags)
             codeBlobsCopy = dict(codeBlobs)
@@ -108,6 +124,7 @@
                                       "use_uops" : use_uops,
                                       "use_pc" : use_pc,
                                       "use_wb" : use_wb,
+                                      "fa_code" : '',
                                       "is_ras_pop" : is_ras_pop },
                                     ['IsMacroop'])
                 header_output += self.declareTemplate.subst(iop)
@@ -176,8 +193,13 @@
         return Name
 
     def buildMemSuffix(sign, size):
-        if size == 4:
-            memSuffix = ''
+        if size == 8:
+            memSuffix = '_ud'
+        elif size == 4:
+            if sign:
+                memSuffix = '_sw'
+            else:
+                memSuffix = '_uw'
         elif size == 2:
             if sign:
                 memSuffix = '_sh'
diff --git a/src/arch/arm/isa/insts/misc.isa b/src/arch/arm/isa/insts/misc.isa
index b8425a2..678a125 100644
--- a/src/arch/arm/isa/insts/misc.isa
+++ b/src/arch/arm/isa/insts/misc.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2012 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -40,21 +40,102 @@
 let {{
 
     svcCode = '''
-    if (FullSystem) {
-        fault = new SupervisorCall;
-    } else {
-        fault = new SupervisorCall(machInst);
-    }
+    fault = new SupervisorCall(machInst, imm);
     '''
 
-    svcIop = InstObjParams("svc", "Svc", "PredOp",
+    svcIop = InstObjParams("svc", "Svc", "ImmOp",
                            { "code": svcCode,
                              "predicate_test": predicateTest },
                            ["IsSyscall", "IsNonSpeculative", "IsSerializeAfter"])
-    header_output = BasicDeclare.subst(svcIop)
-    decoder_output = BasicConstructor.subst(svcIop)
+    header_output = ImmOpDeclare.subst(svcIop)
+    decoder_output = ImmOpConstructor.subst(svcIop)
     exec_output = PredOpExecute.subst(svcIop)
 
+    smcCode = '''
+    HCR  hcr  = Hcr;
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr;
+
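+    // SMC is undefined in user mode. From non-secure EL1 it traps to Hyp
+    // when HCR.TSC is set; otherwise it is disabled when SCR.SCD is set.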
+    if ((cpsr.mode != MODE_USER) && FullSystem) {
+        if (ArmSystem::haveVirtualization(xc->tcBase()) &&
+            !inSecureState(scr, cpsr) && (cpsr.mode != MODE_HYP) && hcr.tsc) {
+            fault = new HypervisorTrap(machInst, 0, EC_SMC_TO_HYP);
+        } else {
+            if (scr.scd) {
+                fault = disabledFault();
+            } else {
+                fault = new SecureMonitorCall(machInst);
+            }
+        }
+    } else {
+        fault = disabledFault();
+    }
+    '''
+
+    smcIop = InstObjParams("smc", "Smc", "PredOp",
+                           { "code": smcCode,
+                             "predicate_test": predicateTest },
+                           ["IsNonSpeculative", "IsSerializeAfter"])
+    header_output += BasicDeclare.subst(smcIop)
+    decoder_output += BasicConstructor.subst(smcIop)
+    exec_output += PredOpExecute.subst(smcIop)
+
+    hvcCode = '''
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr;
+
+    // Filter out the various cases where this instruction isn't defined
+    if (!FullSystem || !ArmSystem::haveVirtualization(xc->tcBase()) ||
+        (cpsr.mode == MODE_USER) ||
+        (ArmSystem::haveSecurity(xc->tcBase()) && (!scr.ns || !scr.hce))) {
+        fault = disabledFault();
+    } else {
+        fault = new HypervisorCall(machInst, imm);
+    }
+    '''
+
+    hvcIop = InstObjParams("hvc", "Hvc", "ImmOp",
+                           { "code": hvcCode,
+                             "predicate_test": predicateTest },
+                           ["IsNonSpeculative", "IsSerializeAfter"])
+    header_output += ImmOpDeclare.subst(hvcIop)
+    decoder_output += ImmOpConstructor.subst(hvcIop)
+    exec_output += PredOpExecute.subst(hvcIop)
+
+    eretCode = '''
+        SCTLR sctlr   = Sctlr;
+        CPSR old_cpsr = Cpsr;
+        old_cpsr.nz   = CondCodesNZ;
+        old_cpsr.c    = CondCodesC;
+        old_cpsr.v    = CondCodesV;
+        old_cpsr.ge   = CondCodesGE;
+
+        CPSR new_cpsr = cpsrWriteByInstr(old_cpsr, Spsr, Scr, Nsacr, 0xF,
+                            true, sctlr.nmfi, xc->tcBase());
+        Cpsr        = ~CondCodesMask & new_cpsr;
+        CondCodesNZ = new_cpsr.nz;
+        CondCodesC  = new_cpsr.c;
+        CondCodesV  = new_cpsr.v;
+        CondCodesGE = new_cpsr.ge;
+
+        NextThumb   = (new_cpsr).t;
+        NextJazelle = (new_cpsr).j;
+        NextItState = (((new_cpsr).it2 << 2) & 0xFC)
+                      | ((new_cpsr).it1 & 0x3);
+
+        NPC = (old_cpsr.mode == MODE_HYP) ? ElrHyp : LR;
+    '''
+
+    eretIop = InstObjParams("eret", "Eret", "PredOp",
+                           { "code": eretCode,
+                             "predicate_test": predicateTest },
+                           ["IsNonSpeculative", "IsSerializeAfter"])
+    header_output += BasicDeclare.subst(eretIop)
+    decoder_output += BasicConstructor.subst(eretIop)
+    exec_output += PredOpExecute.subst(eretIop)
 }};
 
 let {{
@@ -87,6 +168,59 @@
     decoder_output += MrsConstructor.subst(mrsSpsrIop)
     exec_output += PredOpExecute.subst(mrsSpsrIop)
 
+    mrsBankedRegCode = '''
+        bool isIntReg;
+        int  regIdx;
+
+        if (decodeMrsMsrBankedReg(byteMask, r, isIntReg, regIdx, Cpsr, Scr, Nsacr)) {
+            if (isIntReg) {
+                Dest = DecodedBankedIntReg;
+            } else {
+                Dest = xc->readMiscReg(regIdx);
+            }
+        } else {
+            return new UndefinedInstruction(machInst, false, mnemonic);
+        }
+    '''
+    mrsBankedRegIop = InstObjParams("mrs", "MrsBankedReg", "MrsOp",
+                                    { "code": mrsBankedRegCode,
+                                      "predicate_test": predicateTest },
+                                    ["IsSerializeBefore"])
+    header_output += MrsBankedRegDeclare.subst(mrsBankedRegIop)
+    decoder_output += MrsBankedRegConstructor.subst(mrsBankedRegIop)
+    exec_output += PredOpExecute.subst(mrsBankedRegIop)
+
+    msrBankedRegCode = '''
+        bool isIntReg;
+        int  regIdx;
+
+        if (decodeMrsMsrBankedReg(byteMask, r, isIntReg, regIdx, Cpsr, Scr, Nsacr)) {
+            if (isIntReg) {
+                // This is a bit nasty: you might expect
+                // DecodedBankedIntReg to be written only when the
+                // conditions of the if statements above hold, but the
+                // generated C code writes it unconditionally. That is
+                // still safe because the operand lookup (in operands.isa)
+                // resolves DecodedBankedIntReg to INTREG_DUMMY when it is
+                // not a valid integer register, redirecting the write to
+                // a register we don't care about.
+                DecodedBankedIntReg = Op1;
+            } else {
+                xc->setMiscReg(regIdx, Op1);
+            }
+        } else {
+            return new UndefinedInstruction(machInst, false, mnemonic);
+        }
+    '''
+    msrBankedRegIop = InstObjParams("msr", "MsrBankedReg", "MsrRegOp",
+                                    { "code": msrBankedRegCode,
+                                      "predicate_test": predicateTest },
+                                    ["IsSerializeAfter"])
+    header_output += MsrBankedRegDeclare.subst(msrBankedRegIop)
+    decoder_output += MsrBankedRegConstructor.subst(msrBankedRegIop)
+    exec_output += PredOpExecute.subst(msrBankedRegIop)
+
     msrCpsrRegCode = '''
         SCTLR sctlr = Sctlr;
         CPSR old_cpsr = Cpsr;
@@ -96,7 +230,8 @@
         old_cpsr.ge = CondCodesGE;
 
         CPSR new_cpsr =
-            cpsrWriteByInstr(old_cpsr, Op1, byteMask, false, sctlr.nmfi);
+            cpsrWriteByInstr(old_cpsr, Op1, Scr, Nsacr, byteMask, false,
+                             sctlr.nmfi, xc->tcBase());
         Cpsr = ~CondCodesMask & new_cpsr;
         CondCodesNZ = new_cpsr.nz;
         CondCodesC = new_cpsr.c;
@@ -128,7 +263,8 @@
         old_cpsr.v = CondCodesV;
         old_cpsr.ge = CondCodesGE;
         CPSR new_cpsr =
-            cpsrWriteByInstr(old_cpsr, imm, byteMask, false, sctlr.nmfi);
+            cpsrWriteByInstr(old_cpsr, imm, Scr, Nsacr, byteMask, false,
+                             sctlr.nmfi, xc->tcBase());
         Cpsr = ~CondCodesMask & new_cpsr;
         CondCodesNZ = new_cpsr.nz;
         CondCodesC = new_cpsr.c;
@@ -488,12 +624,10 @@
     decoder_output += BasicConstructor.subst(bkptIop)
     exec_output += BasicExecute.subst(bkptIop)
 
-    nopIop = InstObjParams("nop", "NopInst", "PredOp", \
-            { "code" : "", "predicate_test" : predicateTest },
-            ['IsNop'])
+    nopIop = InstObjParams("nop", "NopInst", "ArmStaticInst", "", ['IsNop'])
     header_output += BasicDeclare.subst(nopIop)
-    decoder_output += BasicConstructor.subst(nopIop)
-    exec_output += PredOpExecute.subst(nopIop)
+    decoder_output += BasicConstructor64.subst(nopIop)
+    exec_output += BasicExecute.subst(nopIop)
 
     yieldIop = InstObjParams("yield", "YieldInst", "PredOp", \
             { "code" : "", "predicate_test" : predicateTest })
@@ -502,14 +636,31 @@
     exec_output += PredOpExecute.subst(yieldIop)
 
     wfeCode = '''
-    // WFE Sleeps if SevMailbox==0 and no unmasked interrupts are pending
+    HCR  hcr  = Hcr;
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr64;
+    SCTLR sctlr = Sctlr;
+
+    // WFE Sleeps if SevMailbox==0 and no unmasked interrupts are pending,
+    ThreadContext *tc = xc->tcBase();
     if (SevMailbox == 1) {
         SevMailbox = 0;
-        PseudoInst::quiesceSkip(xc->tcBase());
-    } else if (xc->tcBase()->getCpuPtr()->getInterruptController()->checkInterrupts(xc->tcBase())) {
-        PseudoInst::quiesceSkip(xc->tcBase());
+        PseudoInst::quiesceSkip(tc);
+    } else if (tc->getCpuPtr()->getInterruptController()->checkInterrupts(tc)) {
+        PseudoInst::quiesceSkip(tc);
+    } else if (cpsr.el == EL0 && !sctlr.ntwe) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new SupervisorTrap(machInst, 0x1E00001, EC_TRAPPED_WFI_WFE);
+    } else if (ArmSystem::haveVirtualization(tc) &&
+               !inSecureState(scr, cpsr) && (cpsr.mode != MODE_HYP) &&
+               hcr.twe) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new HypervisorTrap(machInst, 0x1E00001, EC_TRAPPED_WFI_WFE);
+    } else if (ArmSystem::haveSecurity(tc) && cpsr.el != EL3 && scr.twe) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new SecureMonitorTrap(machInst, 0x1E00001, EC_TRAPPED_WFI_WFE);
     } else {
-        PseudoInst::quiesce(xc->tcBase());
+        PseudoInst::quiesce(tc);
     }
     '''
     wfePredFixUpCode = '''
@@ -528,12 +679,30 @@
     exec_output += QuiescePredOpExecuteWithFixup.subst(wfeIop)
 
     wfiCode = '''
+    HCR  hcr  = Hcr;
+    CPSR cpsr = Cpsr;
+    SCR  scr  = Scr64;
+    SCTLR sctlr = Sctlr;
+
     // WFI doesn't sleep if interrupts are pending (masked or not)
-    if (xc->tcBase()->getCpuPtr()->getInterruptController()->checkRaw()) {
-        PseudoInst::quiesceSkip(xc->tcBase());
+    ThreadContext *tc = xc->tcBase();
+    if (tc->getCpuPtr()->getInterruptController()->checkWfiWake(hcr, cpsr,
+                                                                scr)) {
+        PseudoInst::quiesceSkip(tc);
+    } else if (cpsr.el == EL0 && !sctlr.ntwi) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new SupervisorTrap(machInst, 0x1E00000, EC_TRAPPED_WFI_WFE);
+    } else if (ArmSystem::haveVirtualization(tc) && hcr.twi &&
+               (cpsr.mode != MODE_HYP) && !inSecureState(scr, cpsr)) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new HypervisorTrap(machInst, 0x1E00000, EC_TRAPPED_WFI_WFE);
+    } else if (ArmSystem::haveSecurity(tc) && cpsr.el != EL3 && scr.twi) {
+        PseudoInst::quiesceSkip(tc);
+        fault = new SecureMonitorTrap(machInst, 0x1E00000, EC_TRAPPED_WFI_WFE);
     } else {
-        PseudoInst::quiesce(xc->tcBase());
+        PseudoInst::quiesce(tc);
     }
+    tc->getCpuPtr()->clearInterrupt(INT_ABT, 0);
     '''
     wfiIop = InstObjParams("wfi", "WfiInst", "PredOp", \
             { "code" : wfiCode, "predicate_test" : predicateTest },
@@ -564,6 +733,16 @@
     decoder_output += BasicConstructor.subst(sevIop)
     exec_output += PredOpExecute.subst(sevIop)
 
+    sevlCode = '''
+    SevMailbox = 1;
+    '''
+    sevlIop = InstObjParams("sevl", "SevlInst", "PredOp", \
+            { "code" : sevlCode, "predicate_test" : predicateTest },
+            ["IsNonSpeculative", "IsSquashAfter", "IsUnverifiable"])
+    header_output += BasicDeclare.subst(sevlIop)
+    decoder_output += BasicConstructor.subst(sevlIop)
+    exec_output += BasicExecute.subst(sevlIop)
+
     itIop = InstObjParams("it", "ItInst", "PredOp", \
             { "code" : ";",
               "predicate_test" : predicateTest }, [])
@@ -571,10 +750,7 @@
     decoder_output += BasicConstructor.subst(itIop)
     exec_output += PredOpExecute.subst(itIop)
     unknownCode = '''
-    if (FullSystem)
-        return new UndefinedInstruction;
-    else
-        return new UndefinedInstruction(machInst, true);
+    return new UndefinedInstruction(machInst, true);
     '''
     unknownIop = InstObjParams("unknown", "Unknown", "UnknownOp", \
                                { "code": unknownCode,
@@ -626,108 +802,152 @@
     exec_output += PredOpExecute.subst(bfiIop)
 
     mrc14code = '''
-    CPSR cpsr = Cpsr;
-    if (cpsr.mode == MODE_USER) {
-        if (FullSystem)
-            return new UndefinedInstruction;
-        else
-            return new UndefinedInstruction(false, mnemonic);
+    MiscRegIndex miscReg = (MiscRegIndex) xc->tcBase()->flattenMiscIndex(op1);
+    if (!canReadCoprocReg(miscReg, Scr, Cpsr, xc->tcBase())) {
+            return new UndefinedInstruction(machInst, false, mnemonic);
+    }
+    if (mcrMrc14TrapToHyp((const MiscRegIndex) op1, Hcr, Cpsr, Scr, Hdcr,
+                          Hstr, Hcptr, imm)) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP14_MCR_MRC);
     }
     Dest = MiscOp1;
     '''
 
-    mrc14Iop = InstObjParams("mrc", "Mrc14", "RegRegOp",
+    mrc14Iop = InstObjParams("mrc", "Mrc14", "RegRegImmOp",
                              { "code": mrc14code,
                                "predicate_test": predicateTest }, [])
-    header_output += RegRegOpDeclare.subst(mrc14Iop)
-    decoder_output += RegRegOpConstructor.subst(mrc14Iop)
+    header_output += RegRegImmOpDeclare.subst(mrc14Iop)
+    decoder_output += RegRegImmOpConstructor.subst(mrc14Iop)
     exec_output += PredOpExecute.subst(mrc14Iop)
 
 
     mcr14code = '''
-    CPSR cpsr = Cpsr;
-    if (cpsr.mode == MODE_USER) {
-        if (FullSystem)
-            return new UndefinedInstruction;
-        else
-            return new UndefinedInstruction(false, mnemonic);
+    MiscRegIndex miscReg = (MiscRegIndex) xc->tcBase()->flattenMiscIndex(dest);
+    if (!canWriteCoprocReg(miscReg, Scr, Cpsr, xc->tcBase())) {
+        return new UndefinedInstruction(machInst, false, mnemonic);
+    }
+    if (mcrMrc14TrapToHyp(miscReg, Hcr, Cpsr, Scr, Hdcr,
+                          Hstr, Hcptr, imm)) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP14_MCR_MRC);
     }
     MiscDest = Op1;
     '''
-    mcr14Iop = InstObjParams("mcr", "Mcr14", "RegRegOp",
+    mcr14Iop = InstObjParams("mcr", "Mcr14", "RegRegImmOp",
                              { "code": mcr14code,
                                "predicate_test": predicateTest },
                                ["IsSerializeAfter","IsNonSpeculative"])
-    header_output += RegRegOpDeclare.subst(mcr14Iop)
-    decoder_output += RegRegOpConstructor.subst(mcr14Iop)
+    header_output += RegRegImmOpDeclare.subst(mcr14Iop)
+    decoder_output += RegRegImmOpConstructor.subst(mcr14Iop)
     exec_output += PredOpExecute.subst(mcr14Iop)
 
-    mrc14UserIop = InstObjParams("mrc", "Mrc14User", "RegRegOp",
-                                 { "code": "Dest = MiscOp1;",
-                                   "predicate_test": predicateTest }, [])
-    header_output += RegRegOpDeclare.subst(mrc14UserIop)
-    decoder_output += RegRegOpConstructor.subst(mrc14UserIop)
-    exec_output += PredOpExecute.subst(mrc14UserIop)
-
-    mcr14UserIop = InstObjParams("mcr", "Mcr14User", "RegRegOp",
-                                 { "code": "MiscDest = Op1",
-                                   "predicate_test": predicateTest },
-                                   ["IsSerializeAfter","IsNonSpeculative"])
-    header_output += RegRegOpDeclare.subst(mcr14UserIop)
-    decoder_output += RegRegOpConstructor.subst(mcr14UserIop)
-    exec_output += PredOpExecute.subst(mcr14UserIop)
-
     mrc15code = '''
-    CPSR cpsr = Cpsr;
-    if (cpsr.mode == MODE_USER) {
-        if (FullSystem)
-            return new UndefinedInstruction;
-        else
-            return new UndefinedInstruction(false, mnemonic);
+    int preFlatOp1 = flattenMiscRegNsBanked(op1, xc->tcBase());
+    MiscRegIndex miscReg = (MiscRegIndex)
+                           xc->tcBase()->flattenMiscIndex(preFlatOp1);
+    bool hypTrap = mcrMrc15TrapToHyp(miscReg, Hcr, Cpsr, Scr, Hdcr, Hstr,
+                                     Hcptr, imm);
+    bool canRead = canReadCoprocReg(miscReg, Scr, Cpsr, xc->tcBase());
+
+    // if we're in non secure PL1 mode then we can trap regargless of whether
+    // the register is accessable, in other modes we trap if only if the register
+    // IS accessable.
+    if (!canRead & !(hypTrap & !inUserMode(Cpsr) & !inSecureState(Scr, Cpsr))) {
+        return new UndefinedInstruction(machInst, false, mnemonic);
     }
-    Dest = MiscOp1;
+    if (hypTrap) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP15_MCR_MRC);
+    }
+    Dest = MiscNsBankedOp1;
     '''
 
-    mrc15Iop = InstObjParams("mrc", "Mrc15", "RegRegOp",
+    mrc15Iop = InstObjParams("mrc", "Mrc15", "RegRegImmOp",
                              { "code": mrc15code,
                                "predicate_test": predicateTest }, [])
-    header_output += RegRegOpDeclare.subst(mrc15Iop)
-    decoder_output += RegRegOpConstructor.subst(mrc15Iop)
+    header_output += RegRegImmOpDeclare.subst(mrc15Iop)
+    decoder_output += RegRegImmOpConstructor.subst(mrc15Iop)
     exec_output += PredOpExecute.subst(mrc15Iop)
 
 
     mcr15code = '''
-    CPSR cpsr = Cpsr;
-    if (cpsr.mode == MODE_USER) {
-        if (FullSystem)
-            return new UndefinedInstruction;
-        else
-            return new UndefinedInstruction(false, mnemonic);
+    int preFlatDest = flattenMiscRegNsBanked(dest, xc->tcBase());
+    MiscRegIndex miscReg = (MiscRegIndex)
+                       xc->tcBase()->flattenMiscIndex(preFlatDest);
+    bool hypTrap  = mcrMrc15TrapToHyp(miscReg, Hcr, Cpsr, Scr, Hdcr, Hstr,
+                                      Hcptr, imm);
+    bool canWrite = canWriteCoprocReg(miscReg, Scr, Cpsr, xc->tcBase());
+
+    // if we're in non secure PL1 mode then we can trap regargless of whether
+    // the register is accessable, in other modes we trap if only if the register
+    // IS accessable.
+    if (!canWrite & !(hypTrap & !inUserMode(Cpsr) & !inSecureState(Scr, Cpsr))) {
+        return new UndefinedInstruction(machInst, false, mnemonic);
     }
-    MiscDest = Op1;
+    if (hypTrap) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP15_MCR_MRC);
+    }
+    MiscNsBankedDest = Op1;
     '''
-    mcr15Iop = InstObjParams("mcr", "Mcr15", "RegRegOp",
+    mcr15Iop = InstObjParams("mcr", "Mcr15", "RegRegImmOp",
                              { "code": mcr15code,
                                "predicate_test": predicateTest },
                                ["IsSerializeAfter","IsNonSpeculative"])
-    header_output += RegRegOpDeclare.subst(mcr15Iop)
-    decoder_output += RegRegOpConstructor.subst(mcr15Iop)
+    header_output += RegRegImmOpDeclare.subst(mcr15Iop)
+    decoder_output += RegRegImmOpConstructor.subst(mcr15Iop)
     exec_output += PredOpExecute.subst(mcr15Iop)
 
-    mrc15UserIop = InstObjParams("mrc", "Mrc15User", "RegRegOp",
-                                 { "code": "Dest = MiscOp1;",
-                                   "predicate_test": predicateTest }, [])
-    header_output += RegRegOpDeclare.subst(mrc15UserIop)
-    decoder_output += RegRegOpConstructor.subst(mrc15UserIop)
-    exec_output += PredOpExecute.subst(mrc15UserIop)
 
-    mcr15UserIop = InstObjParams("mcr", "Mcr15User", "RegRegOp",
-                                 { "code": "MiscDest = Op1",
-                                   "predicate_test": predicateTest },
-                                   ["IsSerializeAfter","IsNonSpeculative"])
-    header_output += RegRegOpDeclare.subst(mcr15UserIop)
-    decoder_output += RegRegOpConstructor.subst(mcr15UserIop)
-    exec_output += PredOpExecute.subst(mcr15UserIop)
+    mrrc15code = '''
+    int preFlatOp1 = flattenMiscRegNsBanked(op1, xc->tcBase());
+    MiscRegIndex miscReg = (MiscRegIndex)
+                           xc->tcBase()->flattenMiscIndex(preFlatOp1);
+    bool hypTrap = mcrrMrrc15TrapToHyp(miscReg, Cpsr, Scr, Hstr, Hcr, imm);
+    bool canRead = canReadCoprocReg(miscReg, Scr, Cpsr, xc->tcBase());
+
+    // if we're in non secure PL1 mode then we can trap regargless of whether
+    // the register is accessable, in other modes we trap if only if the register
+    // IS accessable.
+    if (!canRead & !(hypTrap & !inUserMode(Cpsr) & !inSecureState(Scr, Cpsr))) {
+        return new UndefinedInstruction(machInst, false, mnemonic);
+    }
+    if (hypTrap) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP15_MCRR_MRRC);
+    }
+    Dest = bits(MiscNsBankedOp164, 63, 32);
+    Dest2 = bits(MiscNsBankedOp164, 31, 0);
+    '''
+    mrrc15Iop = InstObjParams("mrrc", "Mrrc15", "MrrcOp",
+                              { "code": mrrc15code,
+                                "predicate_test": predicateTest }, [])
+    header_output += MrrcOpDeclare.subst(mrrc15Iop)
+    decoder_output += MrrcOpConstructor.subst(mrrc15Iop)
+    exec_output += PredOpExecute.subst(mrrc15Iop)
+
+
+    mcrr15code = '''
+    int preFlatDest = flattenMiscRegNsBanked(dest, xc->tcBase());
+    MiscRegIndex miscReg = (MiscRegIndex)
+                           xc->tcBase()->flattenMiscIndex(preFlatDest);
+    bool hypTrap  = mcrrMrrc15TrapToHyp(miscReg, Cpsr, Scr, Hstr, Hcr, imm);
+    bool canWrite = canWriteCoprocReg(miscReg, Scr, Cpsr, xc->tcBase());
+
+    // if we're in non secure PL1 mode then we can trap regargless of whether
+    // the register is accessable, in other modes we trap if only if the register
+    // IS accessable.
+    if (!canWrite & !(hypTrap & !inUserMode(Cpsr) & !inSecureState(Scr, Cpsr))) {
+        return new UndefinedInstruction(machInst, false, mnemonic);
+    }
+    if (hypTrap) {
+        return new HypervisorTrap(machInst, imm, EC_TRAPPED_CP15_MCRR_MRRC);
+    }
+    MiscNsBankedDest64 = ((uint64_t) Op1 << 32) | Op2;
+    '''
+    mcrr15Iop = InstObjParams("mcrr", "Mcrr15", "McrrOp",
+                              { "code": mcrr15code,
+                                "predicate_test": predicateTest }, [])
+    header_output += McrrOpDeclare.subst(mcrr15Iop)
+    decoder_output += McrrOpConstructor.subst(mcrr15Iop)
+    exec_output += PredOpExecute.subst(mcrr15Iop)
+
 
     enterxCode = '''
         NextThumb = true;
@@ -775,35 +995,53 @@
     exec_output += PredOpExecute.subst(clrexIop)
 
     isbCode = '''
+        // If the barrier is due to a CP15 access check for hyp traps
+        if ((imm != 0) && mcrMrc15TrapToHyp(MISCREG_CP15ISB, Hcr, Cpsr, Scr,
+            Hdcr, Hstr, Hcptr, imm)) {
+            return new HypervisorTrap(machInst, imm,
+                EC_TRAPPED_CP15_MCR_MRC);
+        }
         fault = new FlushPipe;
     '''
-    isbIop = InstObjParams("isb", "Isb", "PredOp",
+    isbIop = InstObjParams("isb", "Isb", "ImmOp",
                              {"code": isbCode,
                                "predicate_test": predicateTest},
                                 ['IsSerializeAfter'])
-    header_output += BasicDeclare.subst(isbIop)
-    decoder_output += BasicConstructor.subst(isbIop)
+    header_output += ImmOpDeclare.subst(isbIop)
+    decoder_output += ImmOpConstructor.subst(isbIop)
     exec_output += PredOpExecute.subst(isbIop)
 
     dsbCode = '''
+        // If the barrier is due to a CP15 access check for hyp traps
+        if ((imm != 0) && mcrMrc15TrapToHyp(MISCREG_CP15DSB, Hcr, Cpsr, Scr,
+            Hdcr, Hstr, Hcptr, imm)) {
+            return new HypervisorTrap(machInst, imm,
+                EC_TRAPPED_CP15_MCR_MRC);
+        }
         fault = new FlushPipe;
     '''
-    dsbIop = InstObjParams("dsb", "Dsb", "PredOp",
+    dsbIop = InstObjParams("dsb", "Dsb", "ImmOp",
                              {"code": dsbCode,
                                "predicate_test": predicateTest},
                               ['IsMemBarrier', 'IsSerializeAfter'])
-    header_output += BasicDeclare.subst(dsbIop)
-    decoder_output += BasicConstructor.subst(dsbIop)
+    header_output += ImmOpDeclare.subst(dsbIop)
+    decoder_output += ImmOpConstructor.subst(dsbIop)
     exec_output += PredOpExecute.subst(dsbIop)
 
     dmbCode = '''
+        // If the barrier is due to a CP15 access check for hyp traps
+        if ((imm != 0) && mcrMrc15TrapToHyp(MISCREG_CP15DMB, Hcr, Cpsr, Scr,
+            Hdcr, Hstr, Hcptr, imm)) {
+            return new HypervisorTrap(machInst, imm,
+                EC_TRAPPED_CP15_MCR_MRC);
+        }
     '''
-    dmbIop = InstObjParams("dmb", "Dmb", "PredOp",
+    dmbIop = InstObjParams("dmb", "Dmb", "ImmOp",
                              {"code": dmbCode,
                                "predicate_test": predicateTest},
                                ['IsMemBarrier'])
-    header_output += BasicDeclare.subst(dmbIop)
-    decoder_output += BasicConstructor.subst(dmbIop)
+    header_output += ImmOpDeclare.subst(dmbIop)
+    decoder_output += ImmOpConstructor.subst(dmbIop)
     exec_output += PredOpExecute.subst(dmbIop)
 
     dbgCode = '''
diff --git a/src/arch/arm/isa/insts/misc64.isa b/src/arch/arm/isa/insts/misc64.isa
new file mode 100644
index 0000000..6ebbcc2
--- /dev/null
+++ b/src/arch/arm/isa/insts/misc64.isa
@@ -0,0 +1,147 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
+    svcCode = '''
+    fault = new SupervisorCall(machInst, bits(machInst, 20, 5));
+    '''
+
+    svcIop = InstObjParams("svc", "Svc64", "ArmStaticInst",
+                           svcCode, ["IsSyscall", "IsNonSpeculative",
+                                     "IsSerializeAfter"])
+    header_output = BasicDeclare.subst(svcIop)
+    decoder_output = BasicConstructor64.subst(svcIop)
+    exec_output = BasicExecute.subst(svcIop)
+
+    # @todo: extend to take into account Virtualization.
+    smcCode = '''
+    SCR scr = Scr64;
+    CPSR cpsr = Cpsr;
+
+    if (!ArmSystem::haveSecurity(xc->tcBase()) || inUserMode(cpsr) || scr.smd) {
+        fault = disabledFault();
+    } else {
+        fault = new SecureMonitorCall(machInst);
+    }
+    '''
+
+    smcIop = InstObjParams("smc", "Smc64", "ArmStaticInst",
+                           smcCode, ["IsNonSpeculative", "IsSerializeAfter"])
+    header_output += BasicDeclare.subst(smcIop)
+    decoder_output += BasicConstructor64.subst(smcIop)
+    exec_output += BasicExecute.subst(smcIop)
+
+    def subst(templateBase, iop):
+        global header_output, decoder_output, exec_output
+        header_output += eval(templateBase + "Declare").subst(iop)
+        decoder_output += eval(templateBase + "Constructor").subst(iop)
+        exec_output += BasicExecute.subst(iop)
+
+    bfmMaskCode = '''
+    uint64_t bitMask;
+    int diff = imm2 - imm1;
+    if (imm1 <= imm2) {
+        bitMask = mask(diff + 1);
+    } else {
+        bitMask = mask(imm2 + 1);
+        bitMask = (bitMask >> imm1) | (bitMask << (intWidth - imm1));
+        diff += intWidth;
+    }
+    uint64_t topBits M5_VAR_USED = ~mask(diff+1);
+    uint64_t result = (Op164 >> imm1) | (Op164 << (intWidth - imm1));
+    result &= bitMask;
+    '''
+
+    bfmCode = bfmMaskCode + 'Dest64 = result | (Dest64 & ~bitMask);'
+    bfmIop = InstObjParams("bfm", "Bfm64", "RegRegImmImmOp64", bfmCode);
+    subst("RegRegImmImmOp64", bfmIop)
+
+    ubfmCode = bfmMaskCode + 'Dest64 = result;'
+    ubfmIop = InstObjParams("ubfm", "Ubfm64", "RegRegImmImmOp64", ubfmCode);
+    subst("RegRegImmImmOp64", ubfmIop)
+
+    sbfmCode = bfmMaskCode + \
+        'Dest64 = result | (bits(Op164, imm2) ? topBits : 0);'
+    sbfmIop = InstObjParams("sbfm", "Sbfm64", "RegRegImmImmOp64", sbfmCode);
+    subst("RegRegImmImmOp64", sbfmIop)
+
+    extrCode = '''
+        if (imm == 0) {
+            Dest64 = Op264;
+        } else {
+            Dest64 = (Op164 << (intWidth - imm)) | (Op264 >> imm);
+        }
+    '''
+    extrIop = InstObjParams("extr", "Extr64", "RegRegRegImmOp64", extrCode);
+    subst("RegRegRegImmOp64", extrIop);
+
+    unknownCode = '''
+            return new UndefinedInstruction(machInst, true);
+    '''
+    unknown64Iop = InstObjParams("unknown", "Unknown64", "UnknownOp64",
+                                 unknownCode)
+    header_output += BasicDeclare.subst(unknown64Iop)
+    decoder_output += BasicConstructor64.subst(unknown64Iop)
+    exec_output += BasicExecute.subst(unknown64Iop)
+
+    isbIop = InstObjParams("isb", "Isb64", "ArmStaticInst",
+                           "fault = new FlushPipe;", ['IsSerializeAfter'])
+    header_output += BasicDeclare.subst(isbIop)
+    decoder_output += BasicConstructor64.subst(isbIop)
+    exec_output += BasicExecute.subst(isbIop)
+
+    dsbIop = InstObjParams("dsb", "Dsb64", "ArmStaticInst",
+                           "fault = new FlushPipe;",
+                           ['IsMemBarrier', 'IsSerializeAfter'])
+    header_output += BasicDeclare.subst(dsbIop)
+    decoder_output += BasicConstructor64.subst(dsbIop)
+    exec_output += BasicExecute.subst(dsbIop)
+
+    dmbIop = InstObjParams("dmb", "Dmb64", "ArmStaticInst", "",
+                           ['IsMemBarrier'])
+    header_output += BasicDeclare.subst(dmbIop)
+    decoder_output += BasicConstructor64.subst(dmbIop)
+    exec_output += BasicExecute.subst(dmbIop)
+
+    clrexIop = InstObjParams("clrex", "Clrex64", "ArmStaticInst",
+                             "LLSCLock = 0;")
+    header_output += BasicDeclare.subst(clrexIop)
+    decoder_output += BasicConstructor64.subst(clrexIop)
+    exec_output += BasicExecute.subst(clrexIop)
+}};
diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa
index 876bb3b..ca5c303 100644
--- a/src/arch/arm/isa/insts/neon.isa
+++ b/src/arch/arm/isa/insts/neon.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -94,8 +94,8 @@
     template <template <typename T> class Base>
     StaticInstPtr
     decodeNeonUThreeUSReg(unsigned size,
-                         ExtMachInst machInst, IntRegIndex dest,
-                         IntRegIndex op1, IntRegIndex op2)
+                          ExtMachInst machInst, IntRegIndex dest,
+                          IntRegIndex op1, IntRegIndex op2)
     {
         switch (size) {
           case 0:
@@ -112,8 +112,8 @@
     template <template <typename T> class Base>
     StaticInstPtr
     decodeNeonSThreeUSReg(unsigned size,
-                         ExtMachInst machInst, IntRegIndex dest,
-                         IntRegIndex op1, IntRegIndex op2)
+                          ExtMachInst machInst, IntRegIndex dest,
+                          IntRegIndex op1, IntRegIndex op2)
     {
         switch (size) {
           case 0:
@@ -129,6 +129,38 @@
 
     template <template <typename T> class Base>
     StaticInstPtr
+    decodeNeonSThreeHAndWReg(unsigned size, ExtMachInst machInst,
+                             IntRegIndex dest, IntRegIndex op1,
+                             IntRegIndex op2)
+    {
+        switch (size) {
+          case 1:
+            return new Base<int16_t>(machInst, dest, op1, op2);
+          case 2:
+            return new Base<int32_t>(machInst, dest, op1, op2);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonSThreeImmHAndWReg(unsigned size, ExtMachInst machInst,
+                                IntRegIndex dest, IntRegIndex op1,
+                                IntRegIndex op2, uint64_t imm)
+    {
+        switch (size) {
+          case 1:
+            return new Base<int16_t>(machInst, dest, op1, op2, imm);
+          case 2:
+            return new Base<int32_t>(machInst, dest, op1, op2, imm);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
     decodeNeonUSThreeUSReg(bool notSigned, unsigned size,
                            ExtMachInst machInst, IntRegIndex dest,
                            IntRegIndex op1, IntRegIndex op2)
@@ -177,6 +209,38 @@
     template <template <typename T> class BaseD,
               template <typename T> class BaseQ>
     StaticInstPtr
+    decodeNeonSThreeXReg(bool q, unsigned size,
+                         ExtMachInst machInst, IntRegIndex dest,
+                         IntRegIndex op1, IntRegIndex op2)
+    {
+        if (q) {
+            return decodeNeonSThreeUReg<BaseQ>(
+                    size, machInst, dest, op1, op2);
+        } else {
+            return decodeNeonSThreeUSReg<BaseD>(
+                    size, machInst, dest, op1, op2);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUThreeXReg(bool q, unsigned size,
+                         ExtMachInst machInst, IntRegIndex dest,
+                         IntRegIndex op1, IntRegIndex op2)
+    {
+        if (q) {
+            return decodeNeonUThreeUReg<BaseQ>(
+                    size, machInst, dest, op1, op2);
+        } else {
+            return decodeNeonUThreeUSReg<BaseD>(
+                    size, machInst, dest, op1, op2);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
     decodeNeonUSThreeSReg(bool q, bool notSigned, unsigned size,
                           ExtMachInst machInst, IntRegIndex dest,
                           IntRegIndex op1, IntRegIndex op2)
@@ -241,6 +305,124 @@
     template <template <typename T> class BaseD,
               template <typename T> class BaseQ>
     StaticInstPtr
+    decodeNeonUThreeFpReg(bool q, unsigned size, ExtMachInst machInst,
+                          IntRegIndex dest, IntRegIndex op1, IntRegIndex op2)
+    {
+        if (q) {
+            if (size)
+                return new BaseQ<uint64_t>(machInst, dest, op1, op2);
+            else
+                return new BaseQ<uint32_t>(machInst, dest, op1, op2);
+        } else {
+            if (size)
+                return new Unknown(machInst);
+            else
+                return new BaseD<uint32_t>(machInst, dest, op1, op2);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUThreeScFpReg(bool size, ExtMachInst machInst,
+                            IntRegIndex dest, IntRegIndex op1, IntRegIndex op2)
+    {
+        if (size)
+            return new Base<uint64_t>(machInst, dest, op1, op2);
+        else
+            return new Base<uint32_t>(machInst, dest, op1, op2);
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUThreeImmScFpReg(bool size, ExtMachInst machInst,
+                               IntRegIndex dest, IntRegIndex op1,
+                               IntRegIndex op2, uint64_t imm)
+    {
+        if (size)
+            return new Base<uint64_t>(machInst, dest, op1, op2, imm);
+        else
+            return new Base<uint32_t>(machInst, dest, op1, op2, imm);
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUThreeImmHAndWReg(bool q, unsigned size, ExtMachInst machInst,
+                                IntRegIndex dest, IntRegIndex op1,
+                                IntRegIndex op2, uint64_t imm)
+    {
+        if (q) {
+            switch (size) {
+              case 1:
+                return new BaseQ<uint16_t>(machInst, dest, op1, op2, imm);
+              case 2:
+                return new BaseQ<uint32_t>(machInst, dest, op1, op2, imm);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 1:
+                return new BaseD<uint16_t>(machInst, dest, op1, op2, imm);
+              case 2:
+                return new BaseD<uint32_t>(machInst, dest, op1, op2, imm);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSThreeImmHAndWReg(bool q, unsigned size, ExtMachInst machInst,
+                                IntRegIndex dest, IntRegIndex op1,
+                                IntRegIndex op2, uint64_t imm)
+    {
+        if (q) {
+            switch (size) {
+              case 1:
+                return new BaseQ<int16_t>(machInst, dest, op1, op2, imm);
+              case 2:
+                return new BaseQ<int32_t>(machInst, dest, op1, op2, imm);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 1:
+                return new BaseD<int16_t>(machInst, dest, op1, op2, imm);
+              case 2:
+                return new BaseD<int32_t>(machInst, dest, op1, op2, imm);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUThreeImmFpReg(bool q, unsigned size, ExtMachInst machInst,
+                             IntRegIndex dest, IntRegIndex op1,
+                             IntRegIndex op2, uint64_t imm)
+    {
+        if (q) {
+            if (size)
+                return new BaseQ<uint64_t>(machInst, dest, op1, op2, imm);
+            else
+                return new BaseQ<uint32_t>(machInst, dest, op1, op2, imm);
+        } else {
+            if (size)
+                return new Unknown(machInst);
+            else
+                return new BaseD<uint32_t>(machInst, dest, op1, op2, imm);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
     decodeNeonUTwoShiftReg(bool q, unsigned size,
                            ExtMachInst machInst, IntRegIndex dest,
                            IntRegIndex op1, uint64_t imm)
@@ -345,6 +527,46 @@
         }
     }
 
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUTwoShiftUReg(unsigned size,
+                            ExtMachInst machInst, IntRegIndex dest,
+                            IntRegIndex op1, uint64_t imm)
+    {
+        switch (size) {
+          case 0:
+            return new Base<uint8_t>(machInst, dest, op1, imm);
+          case 1:
+            return new Base<uint16_t>(machInst, dest, op1, imm);
+          case 2:
+            return new Base<uint32_t>(machInst, dest, op1, imm);
+          case 3:
+            return new Base<uint64_t>(machInst, dest, op1, imm);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonSTwoShiftUReg(unsigned size,
+                            ExtMachInst machInst, IntRegIndex dest,
+                            IntRegIndex op1, uint64_t imm)
+    {
+        switch (size) {
+          case 0:
+            return new Base<int8_t>(machInst, dest, op1, imm);
+          case 1:
+            return new Base<int16_t>(machInst, dest, op1, imm);
+          case 2:
+            return new Base<int32_t>(machInst, dest, op1, imm);
+          case 3:
+            return new Base<int64_t>(machInst, dest, op1, imm);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
     template <template <typename T> class BaseD,
               template <typename T> class BaseQ>
     StaticInstPtr
@@ -411,6 +633,66 @@
         }
     }
 
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoShiftXReg(bool q, unsigned size, ExtMachInst machInst,
+                            IntRegIndex dest, IntRegIndex op1, uint64_t imm)
+    {
+        if (q) {
+            return decodeNeonUTwoShiftUReg<BaseQ>(
+                size, machInst, dest, op1, imm);
+        } else {
+            return decodeNeonUTwoShiftUSReg<BaseD>(
+                size, machInst, dest, op1, imm);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSTwoShiftXReg(bool q, unsigned size, ExtMachInst machInst,
+                            IntRegIndex dest, IntRegIndex op1, uint64_t imm)
+    {
+        if (q) {
+            return decodeNeonSTwoShiftUReg<BaseQ>(
+                size, machInst, dest, op1, imm);
+        } else {
+            return decodeNeonSTwoShiftUSReg<BaseD>(
+                size, machInst, dest, op1, imm);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUTwoShiftUFpReg(unsigned size, ExtMachInst machInst,
+                              IntRegIndex dest, IntRegIndex op1, uint64_t imm)
+    {
+        if (size)
+            return new Base<uint64_t>(machInst, dest, op1, imm);
+        else
+            return new Base<uint32_t>(machInst, dest, op1, imm);
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoShiftFpReg(bool q, unsigned size, ExtMachInst machInst,
+                             IntRegIndex dest, IntRegIndex op1, uint64_t imm)
+    {
+        if (q) {
+            if (size)
+                return new BaseQ<uint64_t>(machInst, dest, op1, imm);
+            else
+                return new BaseQ<uint32_t>(machInst, dest, op1, imm);
+        } else {
+            if (size)
+                return new Unknown(machInst);
+            else
+                return new BaseD<uint32_t>(machInst, dest, op1, imm);
+        }
+    }
+
     template <template <typename T> class Base>
     StaticInstPtr
     decodeNeonUTwoMiscUSReg(unsigned size,
@@ -451,8 +733,8 @@
               template <typename T> class BaseQ>
     StaticInstPtr
     decodeNeonUTwoMiscSReg(bool q, unsigned size,
-                          ExtMachInst machInst, IntRegIndex dest,
-                          IntRegIndex op1)
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
     {
         if (q) {
             return decodeNeonUTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
@@ -465,8 +747,8 @@
               template <typename T> class BaseQ>
     StaticInstPtr
     decodeNeonSTwoMiscSReg(bool q, unsigned size,
-                          ExtMachInst machInst, IntRegIndex dest,
-                          IntRegIndex op1)
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
     {
         if (q) {
             return decodeNeonSTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
@@ -498,8 +780,8 @@
     template <template <typename T> class Base>
     StaticInstPtr
     decodeNeonSTwoMiscUReg(unsigned size,
-                            ExtMachInst machInst, IntRegIndex dest,
-                            IntRegIndex op1)
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
     {
         switch (size) {
           case 0:
@@ -559,6 +841,221 @@
         }
     }
 
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoMiscXReg(bool q, unsigned size, ExtMachInst machInst,
+                           IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonUTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonUTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSTwoMiscXReg(bool q, unsigned size, ExtMachInst machInst,
+                           IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonSTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonSTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoMiscFpReg(bool q, unsigned size, ExtMachInst machInst,
+                            IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            if (size)
+                return new BaseQ<uint64_t>(machInst, dest, op1);
+            else
+                return new BaseQ<uint32_t>(machInst, dest, op1);
+        } else {
+            if (size)
+                return new Unknown(machInst);
+            else
+                return new BaseD<uint32_t>(machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoMiscPwiseScFpReg(unsigned size, ExtMachInst machInst,
+                                   IntRegIndex dest, IntRegIndex op1)
+    {
+        if (size)
+            return new BaseQ<uint64_t>(machInst, dest, op1);
+        else
+            return new BaseD<uint32_t>(machInst, dest, op1);
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUTwoMiscScFpReg(unsigned size, ExtMachInst machInst,
+                              IntRegIndex dest, IntRegIndex op1)
+    {
+        if (size)
+            return new Base<uint64_t>(machInst, dest, op1);
+        else
+            return new Base<uint32_t>(machInst, dest, op1);
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
+                              IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            switch (size) {
+              case 0x0:
+                return new BaseQ<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseQ<uint16_t>(machInst, dest, op1);
+              case 0x2:
+                return new BaseQ<uint32_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 0x0:
+                return new BaseD<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseD<uint16_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ,
+              template <typename T> class BaseBQ>
+    StaticInstPtr
+    decodeNeonUAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
+                              IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            switch (size) {
+              case 0x0:
+                return new BaseQ<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseQ<uint16_t>(machInst, dest, op1);
+              case 0x2:
+                return new BaseBQ<uint32_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 0x0:
+                return new BaseD<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseD<uint16_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
+                              IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            switch (size) {
+              case 0x0:
+                return new BaseQ<int8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseQ<int16_t>(machInst, dest, op1);
+              case 0x2:
+                return new BaseQ<int32_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 0x0:
+                return new BaseD<int8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseD<int16_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ,
+              template <typename T> class BaseBQ>
+    StaticInstPtr
+    decodeNeonUAcrossLanesLongReg(bool q, unsigned size, ExtMachInst machInst,
+                                  IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            switch (size) {
+              case 0x0:
+                return new BaseQ<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseQ<uint16_t>(machInst, dest, op1);
+              case 0x2:
+                return new BaseBQ<uint32_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 0x0:
+                return new BaseD<uint8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseD<uint16_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ,
+              template <typename T> class BaseBQ>
+    StaticInstPtr
+    decodeNeonSAcrossLanesLongReg(bool q, unsigned size, ExtMachInst machInst,
+                                  IntRegIndex dest, IntRegIndex op1)
+    {
+        if (q) {
+            switch (size) {
+              case 0x0:
+                return new BaseQ<int8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseQ<int16_t>(machInst, dest, op1);
+              case 0x2:
+                return new BaseBQ<int32_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        } else {
+            switch (size) {
+              case 0x0:
+                return new BaseD<int8_t>(machInst, dest, op1);
+              case 0x1:
+                return new BaseD<int16_t>(machInst, dest, op1);
+              default:
+                return new Unknown(machInst);
+            }
+        }
+    }
 }};
 
 output exec {{
@@ -872,10 +1369,7 @@
             readDestCode = 'destElem = gtoh(destReg.elements[i]);'
         eWalkCode += '''
         if (imm < 0 && imm >= eCount) {
-            if (FullSystem)
-                fault = new UndefinedInstruction;
-            else
-                fault = new UndefinedInstruction(false, mnemonic);
+            fault = new UndefinedInstruction(machInst, false, mnemonic);
         } else {
             for (unsigned i = 0; i < eCount; i++) {
                 Element srcElem1 = gtoh(srcReg1.elements[i]);
@@ -926,10 +1420,7 @@
             readDestCode = 'destElem = gtoh(destReg.elements[i]);'
         eWalkCode += '''
         if (imm < 0 && imm >= eCount) {
-            if (FullSystem)
-                fault = new UndefinedInstruction;
-            else
-                fault = new UndefinedInstruction(false, mnemonic);
+            fault = new UndefinedInstruction(machInst, false, mnemonic);
         } else {
             for (unsigned i = 0; i < eCount; i++) {
                 Element srcElem1 = gtoh(srcReg1.elements[i]);
@@ -978,10 +1469,7 @@
             readDestCode = 'destReg = destRegs[i];'
         eWalkCode += '''
         if (imm < 0 && imm >= eCount) {
-            if (FullSystem)
-                fault = new UndefinedInstruction;
-            else
-                fault = new UndefinedInstruction(false, mnemonic);
+            fault = new UndefinedInstruction(machInst, false, mnemonic);
         } else {
             for (unsigned i = 0; i < rCount; i++) {
                 FloatReg srcReg1 = srcRegs1[i];
@@ -2156,7 +2644,7 @@
         bool done;
         destReg = processNans(fpscr, done, true, srcReg1, srcReg2);
         if (!done) {
-            destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMaxS,
+            destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMax<float>,
                                true, true, VfpRoundNearest);
         } else if (flushToZero(srcReg1, srcReg2)) {
             fpscr.idc = 1;
@@ -2171,7 +2659,7 @@
         bool done;
         destReg = processNans(fpscr, done, true, srcReg1, srcReg2);
         if (!done) {
-            destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMinS,
+            destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMin<float>,
                                true, true, VfpRoundNearest);
         } else if (flushToZero(srcReg1, srcReg2)) {
             fpscr.idc = 1;
@@ -2234,6 +2722,24 @@
     threeEqualRegInstFp("vmla", "NVmlaDFp", "SimdFloatMultAccOp", ("float",), 2, vmlafpCode, True)
     threeEqualRegInstFp("vmla", "NVmlaQFp", "SimdFloatMultAccOp", ("float",), 4, vmlafpCode, True)
 
+    vfmafpCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        destReg = ternaryOp(fpscr, srcReg1, srcReg2, destReg, fpMulAdd<float>,
+                            true, true, VfpRoundNearest);
+        FpscrExc = fpscr;
+    '''
+    threeEqualRegInstFp("vfma", "NVfmaDFp", "SimdFloatMultAccOp", ("float",), 2, vfmafpCode, True)
+    threeEqualRegInstFp("vfma", "NVfmaQFp", "SimdFloatMultAccOp", ("float",), 4, vfmafpCode, True)
+
+    vfmsfpCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        destReg = ternaryOp(fpscr, -srcReg1, srcReg2, destReg, fpMulAdd<float>,
+                            true, true, VfpRoundNearest);
+        FpscrExc = fpscr;
+    '''
+    threeEqualRegInstFp("vfms", "NVfmsDFp", "SimdFloatMultAccOp", ("float",), 2, vfmsfpCode, True)
+    threeEqualRegInstFp("vfms", "NVfmsQFp", "SimdFloatMultAccOp", ("float",), 4, vfmsfpCode, True)
+
     vmlsfpCode = '''
         FPSCR fpscr = (FPSCR) FpscrExc;
         float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS,
@@ -2765,7 +3271,7 @@
             fpscr.idc = 1;
         VfpSavedState state = prepFpState(VfpRoundNearest);
         __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
-        destReg = vfpFpSToFixed(srcElem1, false, false, imm);
+        destReg = vfpFpToFixed<float>(srcElem1, false, 32, imm);
         __asm__ __volatile__("" :: "m" (destReg));
         finishVfp(fpscr, state, true);
         FpscrExc = fpscr;
@@ -2781,7 +3287,7 @@
             fpscr.idc = 1;
         VfpSavedState state = prepFpState(VfpRoundNearest);
         __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
-        destReg = vfpFpSToFixed(srcElem1, true, false, imm);
+        destReg = vfpFpToFixed<float>(srcElem1, true, 32, imm);
         __asm__ __volatile__("" :: "m" (destReg));
         finishVfp(fpscr, state, true);
         FpscrExc = fpscr;
@@ -2795,7 +3301,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(VfpRoundNearest);
         __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1));
-        destElem = vfpUFixedToFpS(true, true, srcReg1, false, imm);
+        destElem = vfpUFixedToFpS(true, true, srcReg1, 32, imm);
         __asm__ __volatile__("" :: "m" (destElem));
         finishVfp(fpscr, state, true);
         FpscrExc = fpscr;
@@ -2809,7 +3315,7 @@
         FPSCR fpscr = (FPSCR) FpscrExc;
         VfpSavedState state = prepFpState(VfpRoundNearest);
         __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1));
-        destElem = vfpSFixedToFpS(true, true, srcReg1, false, imm);
+        destElem = vfpSFixedToFpS(true, true, srcReg1, 32, imm);
         __asm__ __volatile__("" :: "m" (destElem));
         finishVfp(fpscr, state, true);
         FpscrExc = fpscr;
@@ -3296,10 +3802,7 @@
             } else {
                 index -= eCount;
                 if (index >= eCount) {
-                    if (FullSystem)
-                        fault = new UndefinedInstruction;
-                    else
-                        fault = new UndefinedInstruction(false, mnemonic);
+                    fault = new UndefinedInstruction(machInst, false, mnemonic);
                 } else {
                     destReg.elements[i] = srcReg2.elements[index];
                 }
diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa
new file mode 100644
index 0000000..e065761
--- /dev/null
+++ b/src/arch/arm/isa/insts/neon64.isa
@@ -0,0 +1,3355 @@
+// -*- mode: c++ -*-
+
+// Copyright (c) 2012-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Giacomo Gabrielli
+//          Mbou Eyole
+
+let {{
+
+    header_output = ""
+    exec_output = ""
+
+    # FP types (FP operations always work with unsigned representations)
+    floatTypes = ("uint32_t", "uint64_t")
+    smallFloatTypes = ("uint32_t",)
+
+    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
+                           readDest=False, pairwise=False, scalar=False,
+                           byElem=False):
+        assert (not pairwise) or ((not byElem) and (not scalar))
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1, destReg;
+        '''
+        if byElem:
+            # 2nd register operand has to be read fully
+            eWalkCode += '''
+        FullRegVect srcReg2;
+        '''
+        else:
+            eWalkCode += '''
+        RegVect srcReg2;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        if byElem:
+            # 2nd operand has to be read fully
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        if pairwise:
+            eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(2 * i < eCount ?
+                                    srcReg1.elements[2 * i] :
+                                    srcReg2.elements[2 * i - eCount]);
+            Element srcElem2 = gtoh(2 * i < eCount ?
+                                    srcReg1.elements[2 * i + 1] :
+                                    srcReg2.elements[2 * i + 1 - eCount]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        else:
+            scalarCheck = '''
+            if (i != 0) {
+                destReg.elements[i] = 0;
+                continue;
+            }
+            '''
+            eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            %(scalarCheck)s
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode,
+                "scalarCheck" : scalarCheck if scalar else "",
+                "src2Index" : "imm" if byElem else "i" }
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        if byElem:
+            header_output += NeonX2RegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonX2RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
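+    # For reference, the per-element walk threeEqualRegInstX emits for a
+    # plain (non-pairwise, non-scalar, non-byElem) variant is schematically:
+    #
+    #     for (unsigned i = 0; i < eCount; i++) {
+    #         Element srcElem1 = gtoh(srcReg1.elements[i]);
+    #         Element srcElem2 = gtoh(srcReg2.elements[i]);
+    #         Element destElem;
+    #         ... op, e.g. destElem = srcElem1 + srcElem2; ...
+    #         destReg.elements[i] = htog(destElem);
+    #     }
+    #
+    # with Element and eCount fixed by the type the templated execute method
+    # is instantiated with (readDest additionally preloads destElem).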
+    def threeUnequalRegInstX(name, Name, opClass, types, op,
+                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
+                             byElem=False, hi=False):
+        assert not (scalar and hi)
+        global header_output, exec_output
+        src1Cnt = src2Cnt = destCnt = 2
+        src1Prefix = src2Prefix = destPrefix = ''
+        if bigSrc1:
+            src1Cnt = 4
+            src1Prefix = 'Big'
+        if bigSrc2:
+            src2Cnt = 4
+            src2Prefix = 'Big'
+        if bigDest:
+            destCnt = 4
+            destPrefix = 'Big'
+        if byElem:
+            src2Prefix = 'Full'
+        eWalkCode = simd64EnabledCheckCode + '''
+        %sRegVect srcReg1;
+        %sRegVect srcReg2;
+        %sRegVect destReg;
+        ''' % (src1Prefix, src2Prefix, destPrefix)
+        srcReg1 = 0
+        if hi and not bigSrc1:  # long/widening operations
+            srcReg1 = 2
+        for reg in range(src1Cnt):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
+        ''' % { "reg" : reg, "srcReg1" : srcReg1 }
+            srcReg1 += 1
+        srcReg2 = 0
+        if (not byElem) and (hi and not bigSrc2):  # long/widening operations
+            srcReg2 = 2
+        for reg in range(src2Cnt):
+            eWalkCode += '''
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
+        ''' % { "reg" : reg, "srcReg2" : srcReg2 }
+            srcReg2 += 1
+        if byElem:
+            # 2nd operand has to be read fully
+            for reg in range(src2Cnt, 4):
+                eWalkCode += '''
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        if readDest:
+            for reg in range(destCnt):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        scalarCheck = '''
+            if (i != 0) {
+                destReg.elements[i] = 0;
+                continue;
+            }
+            '''
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            %(scalarCheck)s
+            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
+            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
+            %(destPrefix)sElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode,
+                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
+                "destPrefix" : destPrefix,
+                "scalarCheck" : scalarCheck if scalar else "",
+                "src2Index" : "imm" if byElem else "i" }
+        destReg = 0
+        if hi and not bigDest:
+            # narrowing operations
+            destReg = 2
+        for reg in range(destCnt):
+            eWalkCode += '''
+        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg, "destReg": destReg }
+            destReg += 1
+        if destCnt < 4 and not hi:  # zero upper half
+            for reg in range(destCnt, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "op_class": opClass }, [])
+        if byElem:
+            header_output += NeonX2RegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonX2RegOpDeclare.subst(iop)
+        exec_output += NeonXUnequalRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
+                            scalar=False, byElem=False, hi=False):
+        assert not byElem
+        threeUnequalRegInstX(name, Name, opClass, types, op,
+                             True, True, False, readDest, scalar, byElem, hi)
+
+    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
+                          scalar=False, byElem=False, hi=False):
+        threeUnequalRegInstX(name, Name, opClass, types, op,
+                             False, False, True, readDest, scalar, byElem, hi)
+
+    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
+                          scalar=False, byElem=False, hi=False):
+        assert not byElem
+        threeUnequalRegInstX(name, Name, opClass, types, op,
+                             True, False, True, readDest, scalar, byElem, hi)
+
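+    # Operand/result widths selected by the three wrappers above:
+    #     narrow: big op big     -> small  (e.g. addhn)
+    #     long:   small op small -> big    (e.g. saddl)
+    #     wide:   big op small   -> big    (e.g. saddw)
+    # The hi flag selects the "2" variants, which source and/or deposit the
+    # 64-bit half in the upper part of the 128-bit register.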
+    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
+                         readDest=False, scalar=False, byElem=False,
+                         hasImm=False, isDup=False):
+        global header_output, exec_output
+        assert (not isDup) or byElem
+        if byElem:
+            hasImm = True
+        if isDup:
+            eWalkCode = simd64EnabledCheckCode + '''
+        FullRegVect srcReg1;
+        RegVect destReg;
+        '''
+        else:
+            eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1, destReg;
+        '''
+        for reg in range(4 if isDup else rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        scalarCheck = '''
+            if (i != 0) {
+                destReg.elements[i] = 0;
+                continue;
+            }
+            '''
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            %(scalarCheck)s
+            unsigned j = i;
+            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[j] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode,
+                "scalarCheck" : scalarCheck if scalar else "",
+                "src1Index" : "imm" if byElem else "i" }
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        if hasImm:
+            header_output += NeonX1RegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
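+    # twoEqualRegInstX knobs: byElem indexes a single source element (imm)
+    # instead of walking the vector, isDup additionally reads the full
+    # 128-bit source so any lane can be replicated, and scalar zeroes every
+    # destination element but the first.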
+    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
+                        hi=False, hasImm=False):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1;
+        BigRegVect destReg;
+        '''
+        # for the hi ("2") variants the source is the upper half of the reg
+        srcPart = 0 if not hi else 2
+        for reg in range(2):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcPart)d_uw);
+        ''' % { "reg" : reg, "srcPart" : srcPart }
+            srcPart += 1
+        if readDest:
+            for reg in range(4):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            BigElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(4):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "op_class": opClass }, [])
+        if hasImm:
+            header_output += NeonX1RegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXUnequalRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
+                          scalar=False, hi=False, hasImm=False):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        BigRegVect srcReg1;
+        RegVect destReg;
+        '''
+        for reg in range(4):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        if readDest:
+            for reg in range(2):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        else:
+            eWalkCode += '''
+        destReg.elements[0] = 0;
+        '''
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        scalarCheck = '''
+            if (i != 0) {
+                destReg.elements[i] = 0;
+                continue;
+            }
+            '''
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            %(scalarCheck)s
+            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode,
+                "scalarCheck" : scalarCheck if scalar else "" }
+        destReg = 0 if not hi else 2
+        for reg in range(2):
+            eWalkCode += '''
+        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg, "destReg": destReg }
+            destReg += 1
+        if not hi:
+            for reg in range(2, 4):  # zero upper half
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "op_class": opClass }, [])
+        if hasImm:
+            header_output += NeonX1RegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXUnequalRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
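+    # In twoRegNarrowInstX the "2" (hi) variants deposit the 64-bit result in
+    # the upper half of the destination and leave the lower half untouched,
+    # while the base forms write the lower half and zero the upper one.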
+    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1, srcReg2, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += op
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX2RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX2RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def insFromVecElemInstX(name, Name, opClass, types, rCount):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        FullRegVect srcReg1;
+        RegVect destReg;
+        '''
+        for reg in range(4):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        for reg in range(rCount):
+            eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += '''
+        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
+        Element destElem = srcElem1;
+        destReg.elements[imm1] = htog(destElem);
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1Reg2ImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += '''
+        Element srcElem1 = gtoh(srcReg1.elements[0]);
+        Element srcElem2 = gtoh(srcReg1.elements[1]);
+        Element destElem;
+        %(op)s
+        destReg.elements[0] = htog(destElem);
+        ''' % { "op" : op }
+        destCnt = rCount / 2
+        for reg in range(destCnt):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        for reg in range(destCnt, 4):  # zero upper half
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
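+    # twoRegPairwiseScInstX reduces just the first pair of source elements to
+    # a single result in element 0; the rest of the destination is zeroed.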
+    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
+                          doubleDest=False, long=False):
+        global header_output, exec_output
+        destPrefix = "Big" if long else ""
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1;
+        %sRegVect destReg;
+        ''' % destPrefix
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += '''
+        destReg.regs[0] = 0;
+        %(destPrefix)sElement destElem = 0;
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            if (i == 0) {
+                destElem = srcElem1;
+            } else {
+                %(op)s
+            }
+        }
+        destReg.elements[0] = htog(destElem);
+        ''' % { "op" : op, "destPrefix" : destPrefix }
+        destCnt = 2 if doubleDest else 1
+        for reg in range(destCnt):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        for reg in range(destCnt, 4):  # zero upper half
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegOpDeclare.subst(iop)
+        if long:
+            exec_output += NeonXUnequalRegOpExecute.subst(iop)
+        else:
+            exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
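+    # twoRegAcrossInstX folds every source element into element 0 of the
+    # destination, seeding the accumulator with the first element;
+    # schematically: destElem = src[0], then op runs for i = 1..eCount-1.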
+    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
+                            readDest=False):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcRegs;
+        BigRegVect destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount / 2; i++) {
+            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
+            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
+            BigElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXUnequalRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect destReg;
+        '''
+        if readDest:
+            for reg in range(rCount):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataXImmOnlyOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect destReg;
+        for (unsigned i = 0; i < eCount; i++) {
+            destReg.elements[i] = htog((Element) %sOp1);
+        }
+        ''' % gprSpec
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
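+    # In dupGprInstX (and insFromGprInstX below) gprSpec is 'W' or 'X' and
+    # selects the 32- or 64-bit view of the general register operand in the
+    # generated code.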
+    def extInstX(name, Name, opClass, types, rCount, op):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect srcReg1, srcReg2, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += op
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX2RegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX2RegImmOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        RegVect destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        eWalkCode += '''
+        destReg.elements[imm] = htog((Element) %sOp1);
+        ''' % gprSpec
+        for reg in range(rCount):
+            eWalkCode += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX1RegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegImmOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
+    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
+                      signExt=False):
+        global header_output, exec_output
+        eWalkCode = simd64EnabledCheckCode + '''
+        FullRegVect srcReg;
+        '''
+        for reg in range(4):
+            eWalkCode += '''
+        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
+        ''' % { "reg" : reg }
+        if signExt:
+            eWalkCode += '''
+        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
+        ''' % gprSpec
+        else:
+            eWalkCode += '''
+        %sDest = srcReg.elements[imm];
+        ''' % gprSpec
+        iop = InstObjParams(name, Name,
+                            "DataX1RegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX1RegImmOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
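+    # In insToGprInstX, signExt sign-extends the selected element into the
+    # destination GPR; otherwise the plain assignment of the unsigned element
+    # zero-extends it.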
+    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
+        global header_output, decoder_output, exec_output
+        code = simd64EnabledCheckCode + '''
+        union
+        {
+            uint8_t bytes[64];
+            FloatRegBits regs[16];
+        } table;
+
+        union
+        {
+            uint8_t bytes[%(rCount)d * 4];
+            FloatRegBits regs[%(rCount)d];
+        } destReg, srcReg2;
+
+        const unsigned length = %(length)d;
+        const bool isTbl = %(isTbl)s;
+        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
+        for reg in range(rCount):
+            code += '''
+        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
+        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+        ''' % { "reg" : reg }
+        for reg in range(16):
+            if reg < length * 4:
+                code += '''
+        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
+        ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
+            else:
+                code += '''
+        table.regs[%(reg)d] = 0;
+        ''' % { "reg" : reg }
+        code += '''
+        for (unsigned i = 0; i < sizeof(destReg); i++) {
+            uint8_t index = srcReg2.bytes[i];
+            if (index < 16 * length) {
+                destReg.bytes[i] = table.bytes[index];
+            } else {
+                if (isTbl)
+                    destReg.bytes[i] = 0;
+                // else destReg.bytes[i] unchanged
+            }
+        }
+        '''
+        for reg in range(rCount):
+            code += '''
+        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        if rCount < 4:  # zero upper half
+            for reg in range(rCount, 4):
+                code += '''
+        AA64FpDestP%(reg)d_uw = 0;
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "DataX2RegOp",
+                            { "code": code,
+                              "r_count": rCount,
+                              "op_class": opClass }, [])
+        header_output += NeonX2RegOpDeclare.subst(iop)
+        exec_output += NeonXEqualRegOpExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonXExecDeclare.subst(substDict)
+
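+    # tbl/tbx: each byte of the second source indexes into a table formed by
+    # up to four concatenated source registers (16 * length bytes); an
+    # out-of-range index yields 0 for tbl and leaves the destination byte
+    # unchanged for tbx.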
+    # ABS
+    absCode = '''
+            if (srcElem1 < 0) {
+                destElem = -srcElem1;
+            } else {
+                destElem = srcElem1;
+            }
+    '''
+    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
+    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
+    # ADD
+    addCode = "destElem = srcElem1 + srcElem2;"
+    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
+    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
+    # ADDHN, ADDHN2
+    addhnCode = '''
+            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
+                        (sizeof(Element) * 8);
+    '''
+    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
+                        addhnCode)
+    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
+                        addhnCode, hi=True)
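+    # e.g. with 16-bit elements addhn keeps the top half of each 32-bit sum:
+    # 0x8000 + 0x8000 = 0x00010000 -> 0x0001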
+    # ADDP (scalar)
+    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
+                          addCode)
+    # ADDP (vector)
+    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       addCode, pairwise=True)
+    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
+                       addCode, pairwise=True)
+    # ADDV
+    # Note: SimdAddOp can be a bit optimistic here
+    addAcrossCode = "destElem += srcElem1;"
+    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
+                      2, addAcrossCode)
+    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
+                      addAcrossCode)
+    # AND
+    andCode = "destElem = srcElem1 & srcElem2;"
+    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
+    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
+    # BIC (immediate)
+    bicImmCode = "destElem &= ~imm;"
+    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
+                   bicImmCode, True)
+    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
+                   bicImmCode, True)
+    # BIC (register)
+    bicCode = "destElem = srcElem1 & ~srcElem2;"
+    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
+    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
+    # BIF
+    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
+    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
+                       True)
+    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
+                       True)
+    # BIT
+    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
+    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
+                       True)
+    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
+                       True)
+    # BSL
+    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
+    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
+                       True)
+    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
+                       True)
+    # CLS
+    clsCode = '''
+            unsigned count = 0;
+            if (srcElem1 < 0) {
+                srcElem1 <<= 1;
+                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
+                    count++;
+                    srcElem1 <<= 1;
+                }
+            } else {
+                srcElem1 <<= 1;
+                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
+                    count++;
+                    srcElem1 <<= 1;
+                }
+            }
+            destElem = count;
+    '''
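+    # cls counts the sign bits that follow the sign bit itself, hence the
+    # initial shift and the (width - 1) bound on the count.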
+    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
+    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
+    # CLZ
+    clzCode = '''
+            unsigned count = 0;
+            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
+                count++;
+                srcElem1 <<= 1;
+            }
+            destElem = count;
+    '''
+    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
+    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
+    # CMEQ (register)
+    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
+    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
+                       cmeqCode)
+    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
+                       cmeqCode)
+    # CMEQ (zero)
+    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
+    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
+                     cmeqZeroCode)
+    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
+                     cmeqZeroCode)
+    # CMGE (register)
+    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
+    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
+    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
+    # CMGE (zero)
+    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
+    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
+                     cmgeZeroCode)
+    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
+                     cmgeZeroCode)
+    # CMGT (register)
+    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
+    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
+    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
+    # CMGT (zero)
+    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
+    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
+                     cmgtZeroCode)
+    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
+                     cmgtZeroCode)
+    # CMHI (register)
+    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
+                       cmgtCode)
+    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
+                       cmgtCode)
+    # CMHS (register)
+    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
+                       cmgeCode)
+    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
+                       cmgeCode)
+    # CMLE (zero)
+    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
+    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
+                     cmleZeroCode)
+    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
+                     cmleZeroCode)
+    # CMLT (zero)
+    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
+    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
+                     cmltZeroCode)
+    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
+                     cmltZeroCode)
+    # CMTST (register)
+    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
+    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
+                       tstCode)
+    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
+                       tstCode)
+    # CNT
+    cntCode = '''
+            unsigned count = 0;
+            while (srcElem1 && count < sizeof(Element) * 8) {
+                count += srcElem1 & 0x1;
+                srcElem1 >>= 1;
+            }
+            destElem = count;
+    '''
+    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
+    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
+    # DUP (element)
+    dupCode = "destElem = srcElem1;"
+    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
+                     dupCode, isDup=True, byElem=True)
+    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
+                     dupCode, isDup=True, byElem=True)
+    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
+                     dupCode, isDup=True, byElem=True, scalar=True)
+    # DUP (general register)
+    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
+    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
+    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
+    # EOR
+    eorCode = "destElem = srcElem1 ^ srcElem2;"
+    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
+    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
+    # EXT
+    extCode = '''
+            for (unsigned i = 0; i < eCount; i++) {
+                unsigned index = i + imm;
+                if (index < eCount) {
+                    destReg.elements[i] = srcReg1.elements[index];
+                } else {
+                    index -= eCount;
+                    if (index >= eCount) {
+                        fault = new UndefinedInstruction(machInst, false, mnemonic);
+                    } else {
+                        destReg.elements[i] = srcReg2.elements[index];
+                    }
+                }
+            }
+    '''
+    extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
+    extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
+    # FABD
+    fpOp = '''
+            FPSCR fpscr = (FPSCR) FpscrExc;
+            destElem = %s;
+            FpscrExc = fpscr;
+    '''
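+    # All fplib-backed ops share this pattern: snapshot the FPSCR exception
+    # state, let the fplib routine accumulate flags into it, then write it
+    # back through FpscrExc.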
+    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
+    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
+                       fabdCode)
+    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
+                       fabdCode)
+    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
+                       fabdCode, scalar=True)
+    # FABS
+    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
+    twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
+                     fabsCode)
+    twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
+                     fabsCode)
+    # FACGE
+    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
+                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
+    facgeCode = fpCmpAbsOp % "GE"
+    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, facgeCode)
+    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
+                       facgeCode)
+    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
+                       facgeCode, scalar=True)
+    # FACGT
+    facgtCode = fpCmpAbsOp % "GT"
+    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, facgtCode)
+    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
+                       facgtCode)
+    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
+                       facgtCode, scalar=True)
+    # FADD
+    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
+    faddCode = fpBinOp % "Add"
+    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
+                       faddCode)
+    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
+                       faddCode)
+    # FADDP (scalar)
+    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
+                          ("uint32_t",), 2, faddCode)
+    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
+                          ("uint64_t",), 4, faddCode)
+    # FADDP (vector)
+    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
+                       2, faddCode, pairwise=True)
+    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
+                       faddCode, pairwise=True)
+    # FCMEQ (register)
+    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
+                      " -1 : 0")
+    fcmeqCode = fpCmpOp % "EQ"
+    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fcmeqCode)
+    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmeqCode)
+    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmeqCode, scalar=True)
+    # FCMEQ (zero)
+    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
+    fcmeqZeroCode = fpCmpZeroOp % "EQ"
+    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
+                     2, fcmeqZeroCode)
+    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmeqZeroCode)
+    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmeqZeroCode, scalar=True)
+    # FCMGE (register)
+    fcmgeCode = fpCmpOp % "GE"
+    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fcmgeCode)
+    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmgeCode)
+    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmgeCode, scalar=True)
+    # FCMGE (zero)
+    fcmgeZeroCode = fpCmpZeroOp % "GE"
+    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
+                     2, fcmgeZeroCode)
+    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmgeZeroCode)
+    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmgeZeroCode, scalar=True)
+    # FCMGT (register)
+    fcmgtCode = fpCmpOp % "GT"
+    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fcmgtCode)
+    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmgtCode)
+    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
+                       fcmgtCode, scalar=True)
+    # FCMGT (zero)
+    fcmgtZeroCode = fpCmpZeroOp % "GT"
+    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
+                     2, fcmgtZeroCode)
+    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmgtZeroCode)
+    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmgtZeroCode, scalar=True)
+    # FCMLE (zero)
+    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
+                             " -1 : 0")
+    fcmleZeroCode = fpCmpRevZeroOp % "GE"
+    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
+                     2, fcmleZeroCode)
+    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmleZeroCode)
+    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmleZeroCode, scalar=True)
+    # FCMLT (zero)
+    fcmltZeroCode = fpCmpRevZeroOp % "GT"
+    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
+                     2, fcmltZeroCode)
+    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmltZeroCode)
+    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
+                     fcmltZeroCode, scalar=True)
+    # FCVTAS
+    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
+                       "srcElem1, %s, %s, %s, fpscr)")
+    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
+    twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtasCode)
+    twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtasCode)
+    twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtasCode, scalar=True)
+    # FCVTAU
+    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
+    twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtauCode)
+    twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtauCode)
+    twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtauCode, scalar=True)
+    # FCVTL, FCVTL2
+    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
+                        "srcElem1, FPCRRounding(fpscr), fpscr)")
+    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
+                    fcvtlCode)
+    twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
+                    fcvtlCode, hi=True)
+    # FCVTMS
+    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
+    twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtmsCode)
+    twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtmsCode)
+    twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtmsCode, scalar=True)
+    # FCVTMU
+    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
+    twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtmuCode)
+    twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtmuCode)
+    twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtmuCode, scalar=True)
+    # FCVTN, FCVTN2
+    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
+                        "srcElem1, FPCRRounding(fpscr), fpscr)")
+    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
+                      ("uint16_t", "uint32_t"), fcvtnCode)
+    twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
+                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
+    # FCVTNS
+    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
+    twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtnsCode)
+    twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtnsCode)
+    twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtnsCode, scalar=True)
+    # FCVTNU
+    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
+    twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtnuCode)
+    twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtnuCode)
+    twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtnuCode, scalar=True)
+    # FCVTPS
+    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
+    twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtpsCode)
+    twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtpsCode)
+    twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtpsCode, scalar=True)
+    # FCVTPU
+    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
+    twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtpuCode)
+    twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtpuCode)
+    twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtpuCode, scalar=True)
+    # FCVTXN, FCVTXN2
+    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
+                         "srcElem1, FPRounding_ODD, fpscr)")
+    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
+                      fcvtxnCode)
+    twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
+                      fcvtxnCode, hi=True)
+    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
+                      fcvtxnCode, scalar=True)
+    # FCVTZS (fixed-point)
+    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
+    twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
+                     2, fcvtzsCode, hasImm=True)
+    twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzsCode, hasImm=True)
+    twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzsCode, hasImm=True, scalar=True)
+    # FCVTZS (integer)
+    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
+    twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
+                     2, fcvtzsIntCode)
+    twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzsIntCode)
+    twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzsIntCode, scalar=True)
+    # FCVTZU (fixed-point)
+    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
+    twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
+                     2, fcvtzuCode, hasImm=True)
+    twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzuCode, hasImm=True)
+    twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzuCode, hasImm=True, scalar=True)
+    # FCVTZU (integer)
+    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
+    twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
+                     fcvtzuIntCode)
+    twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzuIntCode)
+    twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
+                     fcvtzuIntCode, scalar=True)
+    # FDIV
+    fdivCode = fpBinOp % "Div"
+    threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
+                       fdivCode)
+    threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
+                       fdivCode)
+    # FMAX
+    fmaxCode = fpBinOp % "Max"
+    threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
+                       fmaxCode)
+    threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fmaxCode)
+    # FMAXNM
+    fmaxnmCode = fpBinOp % "MaxNum"
+    threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fmaxnmCode)
+    threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fmaxnmCode)
+    # FMAXNMP (scalar)
+    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
+                          ("uint32_t",), 2, fmaxnmCode)
+    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
+                          ("uint64_t",), 4, fmaxnmCode)
+    # FMAXNMP (vector)
+    threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
+                       smallFloatTypes, 2, fmaxnmCode, pairwise=True)
+    threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fmaxnmCode, pairwise=True)
+    # FMAXNMV
+    # Note: SimdFloatCmpOp can be a bit optimistic here
+    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
+    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
+    twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
+                      4, fmaxnmAcrossCode)
+    # FMAXP (scalar)
+    twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
+                          ("uint32_t",), 2, fmaxCode)
+    twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
+                          ("uint64_t",), 4, fmaxCode)
+    # FMAXP (vector)
+    threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fmaxCode, pairwise=True)
+    threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fmaxCode, pairwise=True)
+    # FMAXV
+    # Note: SimdFloatCmpOp can be a bit optimistic here
+    fmaxAcrossCode = fpAcrossOp % "Max"
+    twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
+                      fmaxAcrossCode)
+    # FMIN
+    fminCode = fpBinOp % "Min"
+    threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
+                       fminCode)
+    threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fminCode)
+    # FMINNM
+    fminnmCode = fpBinOp % "MinNum"
+    threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fminnmCode)
+    threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fminnmCode)
+    # FMINNMP (scalar)
+    twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
+                          ("uint32_t",), 2, fminnmCode)
+    twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
+                          ("uint64_t",), 4, fminnmCode)
+    # FMINNMP (vector)
+    threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
+                       smallFloatTypes, 2, fminnmCode, pairwise=True)
+    threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fminnmCode, pairwise=True)
+    # FMINNMV
+    # Note: SimdFloatCmpOp can be a bit optimistic here
+    fminnmAcrossCode = fpAcrossOp % "MinNum"
+    twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
+                      4, fminnmAcrossCode)
+    # FMINP (scalar)
+    twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
+                          ("uint32_t",), 2, fminCode)
+    twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
+                          ("uint64_t",), 4, fminCode)
+    # FMINP (vector)
+    threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
+                       2, fminCode, pairwise=True)
+    threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
+                       fminCode, pairwise=True)
+    # FMINV
+    # Note: SimdFloatCmpOp can be a bit optimistic here
+    fminAcrossCode = fpAcrossOp % "Min"
+    twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
+                      fminAcrossCode)
+    # FMLA (by element)
+    fmlaCode = fpOp % ("fplibMulAdd<Element>("
+                       "destElem, srcElem1, srcElem2, fpscr)")
+    threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
+                       smallFloatTypes, 2, fmlaCode, True, byElem=True)
+    threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
+                       4, fmlaCode, True, byElem=True)
+    threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
+                       4, fmlaCode, True, byElem=True, scalar=True)
+    # FMLA (vector)
+    threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
+                       2, fmlaCode, True)
+    threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
+                       fmlaCode, True)
+    # FMLS (by element)
+    fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
+                       " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
+    threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
+                       smallFloatTypes, 2, fmlsCode, True, byElem=True)
+    threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
+                       4, fmlsCode, True, byElem=True)
+    threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
+                       4, fmlsCode, True, byElem=True, scalar=True)
+    # FMLS (vector)
+    threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
+                       2, fmlsCode, True)
+    threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
+                       fmlsCode, True)
+    # FMOV
+    fmovCode = 'destElem = imm;'
+    oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
+                   fmovCode)
+    oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
+    # FMUL (by element)
+    fmulCode = fpBinOp % "Mul"
+    threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
+                       smallFloatTypes, 2, fmulCode, byElem=True)
+    threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
+                       fmulCode, byElem=True)
+    threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
+                       fmulCode, byElem=True, scalar=True)
+    # FMUL (vector)
+    threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
+                       fmulCode)
+    threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
+                       fmulCode)
+    # FMULX
+    fmulxCode = fpBinOp % "MulX"
+    threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
+                       2, fmulxCode)
+    threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
+                       fmulxCode)
+    threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
+                       fmulxCode, scalar=True)
+    # FMULX (by element)
+    threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
+                       smallFloatTypes, 2, fmulxCode, byElem=True)
+    threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
+                       4, fmulxCode, byElem=True)
+    threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
+                       4, fmulxCode, byElem=True, scalar=True)
+    # FNEG
+    fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
+    twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
+                     fnegCode)
+    twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
+                     fnegCode)
+    # FRECPE
+    frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
+    twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
+                     smallFloatTypes, 2, frecpeCode)
+    twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
+                     frecpeCode)
+    twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
+                     4, frecpeCode, scalar=True)
+    # FRECPS
+    frecpsCode = fpBinOp % "RecipStepFused"
+    threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
+                       smallFloatTypes, 2, frecpsCode)
+    threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
+                       4, frecpsCode)
+    threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
+                       4, frecpsCode, scalar=True)
+    # FRECPX
+    frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
+    twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
+                     frecpxCode, scalar=True)
+    # FRINTA
+    frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
+    frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
+    twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintaCode)
+    twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
+                     frintaCode)
+    # FRINTI
+    frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
+    twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintiCode)
+    twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
+                     frintiCode)
+    # FRINTM
+    frintmCode = frintCode % ("FPRounding_NEGINF", "false")
+    twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintmCode)
+    twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
+                     frintmCode)
+    # FRINTN
+    frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
+    twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintnCode)
+    twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
+                     frintnCode)
+    # FRINTP
+    frintpCode = frintCode % ("FPRounding_POSINF", "false")
+    twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintpCode)
+    twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
+                     frintpCode)
+    # FRINTX
+    frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
+    twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintxCode)
+    twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
+                     frintxCode)
+    # FRINTZ
+    frintzCode = frintCode % ("FPRounding_ZERO", "false")
+    twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
+                     frintzCode)
+    twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
+                     frintzCode)
+    # FRSQRTE
+    frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
+    twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
+                     smallFloatTypes, 2, frsqrteCode)
+    twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
+                     frsqrteCode)
+    twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
+                     frsqrteCode, scalar=True)
+    # FRSQRTS
+    frsqrtsCode = fpBinOp % "RSqrtStepFused"
+    threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
+                       smallFloatTypes, 2, frsqrtsCode)
+    threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
+                       4, frsqrtsCode)
+    threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
+                       4, frsqrtsCode, scalar=True)
+    # FSQRT
+    fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
+    twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
+                     fsqrtCode)
+    twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
+                     fsqrtCode)
+    # FSUB
+    fsubCode = fpBinOp % "Sub"
+    threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
+                       fsubCode)
+    threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
+                       fsubCode)
+    # INS (element)
+    insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
+    # INS (general register)
+    insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
+                    'W')
+    insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
+    # MLA (by element)
+    mlaCode = "destElem += srcElem1 * srcElem2;"
+    threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
+                       ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
+    threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
+                       ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
+    # MLA (vector)
+    threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
+                       mlaCode, True)
+    threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
+                       mlaCode, True)
+    # MLS (by element)
+    mlsCode = "destElem -= srcElem1 * srcElem2;"
+    threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
+                       ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
+    threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
+                       ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
+    # MLS (vector)
+    threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
+                       mlsCode, True)
+    threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
+                       mlsCode, True)
+    # MOV (element) -> alias to INS (element)
+    # MOV (from general) -> alias to INS (general register)
+    # MOV (scalar) -> alias to DUP (element)
+    # MOV (to general) -> alias to UMOV
+    # MOV (vector) -> alias to ORR (register)
+    # MOVI
+    movImmCode = "destElem = imm;"
+    oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
+                   movImmCode)
+    oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
+                   movImmCode)
+    # MUL (by element)
+    mulCode = "destElem = srcElem1 * srcElem2;"
+    threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
+                       ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
+    threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
+                       ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
+    # MUL (vector)
+    threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
+                       mulCode)
+    threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
+                       mulCode)
+    # MVN
+    mvnCode = "destElem = ~srcElem1;"
+    twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
+    twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
+    # MVNI
+    mvniCode = "destElem = ~imm;"
+    oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
+    oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
+    # NEG
+    negCode = "destElem = -srcElem1;"
+    twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
+    twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
+    # NOT -> alias to MVN
+    # ORN
+    ornCode = "destElem = srcElem1 | ~srcElem2;"
+    threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
+    threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
+    # ORR (immediate)
+    orrImmCode = "destElem |= imm;"
+    oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
+                   orrImmCode, True)
+    oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
+                   orrImmCode, True)
+    # ORR (register)
+    orrCode = "destElem = srcElem1 | srcElem2;"
+    threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
+    threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
+    # PMUL
+    pmulCode = '''
+            destElem = 0;
+            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
+                if (bits(srcElem2, j))
+                    destElem ^= srcElem1 << j;
+            }
+    '''
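+    # Polynomial (carry-less) multiply over GF(2): partial products are
+    # combined with XOR rather than addition, so e.g. 0b11 * 0b11 gives
+    # 0b101 (5), not 9.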
+    threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
+                       pmulCode)
+    threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
+                       pmulCode)
+    # PMULL, PMULL2
+    # Note: 64-bit PMULL is not available (it requires the Crypto Extension)
+    pmullCode = '''
+            destElem = 0;
+            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
+                if (bits(srcElem2, j))
+                    destElem ^= (BigElement)srcElem1 << j;
+            }
+    '''
+    threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
+    threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
+                      pmullCode, hi=True)
+    # RADDHN, RADDHN2
+    raddhnCode = '''
+            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
+                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
+                       (sizeof(Element) * 8);
+    '''
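+    # Rounding add and narrow to the high half, e.g. for uint8_t results:
+    # (0x1234 + 0x0100 + 0x80) >> 8 = 0x13.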
+    threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
+                        raddhnCode)
+    threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
+                        raddhnCode, hi=True)
+    # RBIT
+    rbitCode = '''
+            destElem = 0;
+            Element temp = srcElem1;
+            for (int i = 0; i < 8 * sizeof(Element); i++) {
+                destElem = destElem | ((temp & 0x1) <<
+                                       (8 * sizeof(Element) - 1 - i));
+                temp >>= 1;
+            }
+    '''
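+    # Reverses the bits within each byte, e.g. 0b00000001 -> 0b10000000.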
+    twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
+    twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
+    # REV16
+    rev16Code = '''
+            destElem = srcElem1;
+            unsigned groupSize = ((1 << 1) / sizeof(Element));
+            unsigned reverseMask = (groupSize - 1);
+            j = i ^ reverseMask;
+    '''
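+    # The destination lane index is the source index XORed with
+    # reverseMask, so with a group size of two the byte pairs (0,1),
+    # (2,3), ... swap places. REV32 and REV64 below use the same trick.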
+    twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
+                     rev16Code)
+    twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
+                     rev16Code)
+    # REV32
+    rev32Code = '''
+            destElem = srcElem1;
+            unsigned groupSize = ((1 << 2) / sizeof(Element));
+            unsigned reverseMask = (groupSize - 1);
+            j = i ^ reverseMask;
+    '''
+    twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
+                     2, rev32Code)
+    twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
+                     4, rev32Code)
+    # REV64
+    rev64Code = '''
+            destElem = srcElem1;
+            unsigned groupSize = ((1 << 3) / sizeof(Element));
+            unsigned reverseMask = (groupSize - 1);
+            j = i ^ reverseMask;
+    '''
+    twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
+                     rev64Code)
+    twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
+                     rev64Code)
+    # RSHRN, RSHRN2
+    rshrnCode = '''
+            if (imm > sizeof(srcElem1) * 8) {
+                destElem = 0;
+            } else if (imm) {
+                Element rBit = bits(srcElem1, imm - 1);
+                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
+            } else {
+                destElem = srcElem1;
+            }
+    '''
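+    # Rounding shift right, then truncate to the narrow type, e.g.
+    # uint16_t 0x00ff with imm 4: (0xff >> 4) + rBit(1) = 0x10.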
+    twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
+                      rshrnCode, hasImm=True)
+    twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
+                      rshrnCode, hasImm=True, hi=True)
+    # RSUBHN, RSUBHN2
+    rsubhnCode = '''
+            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
+                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
+                       (sizeof(Element) * 8);
+    '''
+    threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
+                        rsubhnCode)
+    threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
+                        rsubhnCode, hi=True)
+    # SABA
+    abaCode = '''
+            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
+                                                (srcElem2 - srcElem1);
+    '''
+    threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
+                       abaCode, True)
+    threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
+                       abaCode, True)
+    # SABAL, SABAL2
+    abalCode = '''
+            destElem += (srcElem1 > srcElem2) ?
+                ((BigElement)srcElem1 - (BigElement)srcElem2) :
+                ((BigElement)srcElem2 - (BigElement)srcElem1);
+    '''
+    threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
+                      abalCode, True)
+    threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
+                      abalCode, True, hi=True)
+    # SABD
+    abdCode = '''
+            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
+                                               (srcElem2 - srcElem1);
+    '''
+    threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
+                       abdCode)
+    threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
+                       abdCode)
+    # SABDL, SABDL2
+    abdlCode = '''
+            destElem = (srcElem1 > srcElem2) ?
+                ((BigElement)srcElem1 - (BigElement)srcElem2) :
+                ((BigElement)srcElem2 - (BigElement)srcElem1);
+    '''
+    threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
+                      abdlCode, True)
+    threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
+                      abdlCode, True, hi=True)
+    # SADALP
+    adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
+    twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
+                        adalpCode, True)
+    twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
+                        adalpCode, True)
+    # SADDL, SADDL2
+    addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
+    threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
+                      addlwCode)
+    threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
+                      addlwCode, hi=True)
+    # SADDLP
+    twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
+                        addlwCode)
+    twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
+                        addlwCode)
+    # SADDLV
+    # Note: SimdAddOp can be a bit optimistic here
+    addAcrossLongCode = "destElem += (BigElement)srcElem1;"
+    twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
+                      2, addAcrossLongCode, long=True)
+    twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
+                      4, addAcrossLongCode, long=True)
+    twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
+                      addAcrossLongCode, doubleDest=True, long=True)
+    # SADDW, SADDW2
+    threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
+                      addlwCode)
+    threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
+                      addlwCode, hi=True)
+    # SCVTF (fixed-point)
+    scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
+                             " false, FPCRRounding(fpscr), fpscr)")
+    twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
+                     scvtfFixedCode % 32, hasImm=True)
+    twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
+                     scvtfFixedCode % 32, hasImm=True)
+    twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
+                     scvtfFixedCode % 64, hasImm=True)
+    twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
+                     4, scvtfFixedCode % 32, hasImm=True, scalar=True)
+    twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
+                     scvtfFixedCode % 64, hasImm=True, scalar=True)
+    # SCVTF (integer)
+    scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
+                           " false, FPCRRounding(fpscr), fpscr)")
+    twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
+                     scvtfIntCode % 32)
+    twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
+                     scvtfIntCode % 32)
+    twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
+                     scvtfIntCode % 64)
+    twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
+                     scvtfIntCode % 32, scalar=True)
+    twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
+                     scvtfIntCode % 64, scalar=True)
+    # SHADD
+    haddCode = '''
+            Element carryBit =
+                (((unsigned)srcElem1 & 0x1) +
+                 ((unsigned)srcElem2 & 0x1)) >> 1;
+            // Use division instead of a shift to ensure the sign extension works
+            // right. The compiler will figure out if it can be a shift. Mask the
+            // inputs so they get truncated correctly.
+            destElem = (((srcElem1 & ~(Element)1) / 2) +
+                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
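+            // E.g. for int8_t -5 and 3: (-6 / 2) + (2 / 2) = -2, plus
+            // carryBit 1 gives -1, matching (-5 + 3) >> 1.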
+    '''
+    threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
+                       haddCode)
+    threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
+                       haddCode)
+    # SHL
+    shlCode = '''
+            if (imm >= sizeof(Element) * 8)
+                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
+            else
+                destElem = srcElem1 << imm;
+    '''
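+    # An out-of-range count is split into two shifts so the result is 0
+    # without a full-width shift, which would be undefined behaviour.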
+    twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
+                     hasImm=True)
+    twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
+                     hasImm=True)
+    # SHLL, SHLL2
+    shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
+    twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
+    twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
+                    hi=True)
+    # SHRN, SHRN2
+    shrnCode = '''
+            if (imm >= sizeof(srcElem1) * 8) {
+                destElem = 0;
+            } else {
+                destElem = srcElem1 >> imm;
+            }
+    '''
+    twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
+                      shrnCode, hasImm=True)
+    twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
+                      shrnCode, hasImm=True, hi=True)
+    # SHSUB
+    hsubCode = '''
+            Element borrowBit =
+                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
+            // Use division instead of a shift to ensure the sign extension works
+            // right. The compiler will figure out if it can be a shift. Mask the
+            // inputs so they get truncated correctly.
+            destElem = (((srcElem1 & ~(Element)1) / 2) -
+                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
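+            // E.g. for int8_t 2 and 1: borrowBit is 1, so
+            // (2 / 2) - (0 / 2) - 1 = 0, matching (2 - 1) >> 1.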
+    '''
+    threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
+                       hsubCode)
+    threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
+                       hsubCode)
+    # SLI
+    sliCode = '''
+            // An out-of-range shift count leaves the destination unchanged.
+            if (imm < sizeof(Element) * 8)
+                destElem = (srcElem1 << imm) | (destElem & mask(imm));
+    '''
+    twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
+                     True, hasImm=True)
+    twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
+                     True, hasImm=True)
+    # SMAX
+    maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
+    threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
+                       maxCode)
+    threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
+                       maxCode)
+    # SMAXP
+    threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
+                       maxCode, pairwise=True)
+    threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
+                       maxCode, pairwise=True)
+    # SMAXV
+    maxAcrossCode = '''
+            if (i == 0 || srcElem1 > destElem)
+                destElem = srcElem1;
+    '''
+    twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
+                      2, maxAcrossCode)
+    twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
+                      maxAcrossCode)
+    # SMIN
+    minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
+    threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
+                       minCode)
+    threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
+                       minCode)
+    # SMINP
+    threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
+                       minCode, pairwise=True)
+    threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
+                       minCode, pairwise=True)
+    # SMINV
+    minAcrossCode = '''
+            if (i == 0 || srcElem1 < destElem)
+                destElem = srcElem1;
+    '''
+    twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
+                      2, minAcrossCode)
+    twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
+                      minAcrossCode)
+    # SMLAL, SMLAL2 (by element)
+    mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
+    threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), mlalCode, True, byElem=True)
+    threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), mlalCode, True, byElem=True,
+                      hi=True)
+    # SMLAL, SMLAL2 (vector)
+    threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
+                      mlalCode, True)
+    threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
+                      mlalCode, True, hi=True)
+    # SMLSL, SMLSL2 (by element)
+    mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
+    threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
+                      mlslCode, True, byElem=True)
+    threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
+                      smallSignedTypes, mlslCode, True, byElem=True, hi=True)
+    # SMLSL, SMLSL2 (vector)
+    threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
+                      mlslCode, True)
+    threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
+                      mlslCode, True, hi=True)
+    # SMOV
+    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
+                  'W', True)
+    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
+                  True)
+    # SMULL, SMULL2 (by element)
+    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
+    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
+                      mullCode, byElem=True)
+    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
+                      mullCode, byElem=True, hi=True)
+    # SMULL, SMULL2 (vector)
+    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
+                      mullCode)
+    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
+                      mullCode, hi=True)
+    # SQABS
+    sqabsCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
+            fpscr.qc = 1;
+            destElem = ~srcElem1;
+        } else if (srcElem1 < 0) {
+            destElem = -srcElem1;
+        } else {
+            destElem = srcElem1;
+        }
+        FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
+                     sqabsCode)
+    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
+                     sqabsCode)
+    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
+                     sqabsCode, scalar=True)
+    # SQADD
+    sqaddCode = '''
+            destElem = srcElem1 + srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            bool negDest = (destElem < 0);
+            bool negSrc1 = (srcElem1 < 0);
+            bool negSrc2 = (srcElem2 < 0);
+            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
+                destElem = (Element)1 << (sizeof(Element) * 8 - 1);
+                if (negDest)
+                    destElem -= 1;
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
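+    # Signed overflow occurred iff both sources have the same sign and the
+    # result's sign differs, e.g. int8_t 100 + 100 saturates to 127 and
+    # sets the sticky QC flag.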
+    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
+                       sqaddCode)
+    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
+                       sqaddCode)
+    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
+                       sqaddCode, scalar=True)
+    # SQDMLAL, SQDMLAL2 (by element)
+    qdmlalCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
+        Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
+        Element halfNeg = maxNeg / 2;
+        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
+            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
+            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
+            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
+            fpscr.qc = 1;
+        }
+        bool negPreDest = ltz(destElem);
+        destElem += midElem;
+        bool negDest = ltz(destElem);
+        bool negMid = ltz(midElem);
+        if (negPreDest == negMid && negMid != negDest) {
+            destElem = mask(sizeof(BigElement) * 8 - 1);
+            if (negPreDest)
+                destElem = ~destElem;
+            fpscr.qc = 1;
+        }
+        FpscrQc = fpscr;
+    '''
+    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
+    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
+                      hi=True)
+    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
+                      scalar=True)
+    # SQDMLAL, SQDMLAL2 (vector)
+    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True)
+    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
+    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
+    # SQDMLSL, SQDMLSL2 (by element)
+    qdmlslCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
+        Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
+        Element halfNeg = maxNeg / 2;
+        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
+            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
+            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
+            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
+            fpscr.qc = 1;
+        }
+        bool negPreDest = ltz(destElem);
+        destElem -= midElem;
+        bool negDest = ltz(destElem);
+        bool posMid = ltz((BigElement)-midElem);
+        if (negPreDest == posMid && posMid != negDest) {
+            destElem = mask(sizeof(BigElement) * 8 - 1);
+            if (negPreDest)
+                destElem = ~destElem;
+            fpscr.qc = 1;
+        }
+        FpscrQc = fpscr;
+    '''
+    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
+    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
+                      hi=True)
+    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
+                      scalar=True)
+    # SQDMLSL, SQDMLSL2 (vector)
+    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True)
+    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
+    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
+                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
+    # SQDMULH (by element)
+    sqdmulhCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
+                       (sizeof(Element) * 8);
+            if (srcElem1 == srcElem2 &&
+                    srcElem1 == (Element)((Element)1 <<
+                        (sizeof(Element) * 8 - 1))) {
+                destElem = ~srcElem1;
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
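+    # Doubling multiply returning the high half (Q15/Q31 multiply), e.g.
+    # for int16_t: (2 * 0x4000 * 0x4000) >> 16 = 0x2000, i.e. 0.5 * 0.5 =
+    # 0.25; the only overflowing case, maxNeg * maxNeg, saturates above.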
+    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
+    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
+    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
+                       scalar=True)
+    # SQDMULH (vector)
+    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 2, sqdmulhCode)
+    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqdmulhCode)
+    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
+    # SQDMULL, SQDMULL2 (by element)
+    qdmullCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
+        if (srcElem1 == srcElem2 &&
+                srcElem1 == (Element)((Element)1 <<
+                    (Element)(sizeof(Element) * 8 - 1))) {
+            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
+            fpscr.qc = 1;
+        }
+        FpscrQc = fpscr;
+    '''
+    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
+    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
+                      hi=True)
+    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
+                      scalar=True)
+    # SQDMULL, SQDMULL2 (vector)
+    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True)
+    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
+    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
+                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
+    # SQNEG
+    sqnegCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
+            fpscr.qc = 1;
+            destElem = ~srcElem1;
+        } else {
+            destElem = -srcElem1;
+        }
+        FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
+                     sqnegCode)
+    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
+                     sqnegCode)
+    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
+                     sqnegCode, scalar=True)
+    # SQRDMULH (by element)
+    sqrdmulhCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
+                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
+                       (sizeof(Element) * 8);
+            Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
+            Element halfNeg = maxNeg / 2;
+            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
+                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
+                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
+                if (destElem < 0) {
+                    destElem = mask(sizeof(Element) * 8 - 1);
+                } else {
+                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
+                }
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
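+    # As SQDMULH, but a rounding constant of half an output LSB is added
+    # before the high half is taken.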
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
+                       scalar=True)
+    # SQRDMULH (vector)
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
+    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
+                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
+    # SQRSHL
+    sqrshlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                Element rBit = 0;
+                if (shiftAmt <= sizeof(Element) * 8)
+                    rBit = bits(srcElem1, shiftAmt - 1);
+                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
+                    rBit = 1;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+                // Make sure the right shift sign-extends when it should.
+                if (srcElem1 < 0 && destElem >= 0) {
+                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
+                                                 1 - shiftAmt));
+                }
+                destElem += rBit;
+            } else if (shiftAmt > 0) {
+                bool sat = false;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    if (srcElem1 != 0)
+                        sat = true;
+                    else
+                        destElem = 0;
+                } else {
+                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
+                                sizeof(Element) * 8 - 1 - shiftAmt) !=
+                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
+                        sat = true;
+                    } else {
+                        destElem = srcElem1 << shiftAmt;
+                    }
+                }
+                if (sat) {
+                    fpscr.qc = 1;
+                    destElem = mask(sizeof(Element) * 8 - 1);
+                    if (srcElem1 < 0)
+                        destElem = ~destElem;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
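+    # The shift count is the signed low byte of srcElem2: negative counts
+    # shift right with rounding, positive counts shift left and saturate
+    # on overflow, zero passes the source through.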
+    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
+                       sqrshlCode)
+    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
+                       sqrshlCode)
+    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
+                       sqrshlCode, scalar=True)
+    # SQRSHRN, SQRSHRN2
+    sqrshrnCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm > sizeof(srcElem1) * 8) {
+                if (srcElem1 != 0 && srcElem1 != -1)
+                    fpscr.qc = 1;
+                destElem = 0;
+            } else if (imm) {
+                BigElement mid = (srcElem1 >> (imm - 1));
+                uint64_t rBit = mid & 0x1;
+                mid >>= 1;
+                mid |= -(mid & ((BigElement)1 <<
+                            (sizeof(BigElement) * 8 - 1 - imm)));
+                mid += rBit;
+                if (mid != (Element)mid) {
+                    destElem = mask(sizeof(Element) * 8 - 1);
+                    if (srcElem1 < 0)
+                        destElem = ~destElem;
+                    fpscr.qc = 1;
+                } else {
+                    destElem = mid;
+                }
+            } else {
+                if (srcElem1 != (Element)srcElem1) {
+                    destElem = mask(sizeof(Element) * 8 - 1);
+                    if (srcElem1 < 0)
+                        destElem = ~destElem;
+                    fpscr.qc = 1;
+                } else {
+                    destElem = srcElem1;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
+                      sqrshrnCode, hasImm=True)
+    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
+                      sqrshrnCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
+                      sqrshrnCode, hasImm=True, scalar=True)
+    # SQRSHRUN, SQRSHRUN2
+    sqrshrunCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm > sizeof(srcElem1) * 8) {
+                if (srcElem1 != 0)
+                    fpscr.qc = 1;
+                destElem = 0;
+            } else if (imm) {
+                BigElement mid = (srcElem1 >> (imm - 1));
+                uint64_t rBit = mid & 0x1;
+                mid >>= 1;
+                mid |= -(mid & ((BigElement)1 <<
+                                (sizeof(BigElement) * 8 - 1 - imm)));
+                mid += rBit;
+                if (bits(mid, sizeof(BigElement) * 8 - 1,
+                              sizeof(Element) * 8) != 0) {
+                    if (srcElem1 < 0) {
+                        destElem = 0;
+                    } else {
+                        destElem = mask(sizeof(Element) * 8);
+                    }
+                    fpscr.qc = 1;
+                } else {
+                    destElem = mid;
+                }
+            } else {
+                if (srcElem1 < 0) {
+                    fpscr.qc = 1;
+                    destElem = 0;
+                } else {
+                    destElem = srcElem1;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
+                      sqrshrunCode, hasImm=True)
+    twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
+                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
+                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
+    # SQSHL (immediate)
+    sqshlImmCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm >= sizeof(Element) * 8) {
+                if (srcElem1 != 0) {
+                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
+                    if (srcElem1 > 0)
+                        destElem = ~destElem;
+                    fpscr.qc = 1;
+                } else {
+                    destElem = 0;
+                }
+            } else if (imm) {
+                destElem = (srcElem1 << imm);
+                uint64_t topBits = bits((uint64_t)srcElem1,
+                                        sizeof(Element) * 8 - 1,
+                                        sizeof(Element) * 8 - 1 - imm);
+                if (topBits != 0 && topBits != mask(imm + 1)) {
+                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
+                    if (srcElem1 > 0)
+                        destElem = ~destElem;
+                    fpscr.qc = 1;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
+                     sqshlImmCode, hasImm=True)
+    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
+                     sqshlImmCode, hasImm=True)
+    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
+                     sqshlImmCode, hasImm=True, scalar=True)
+    # SQSHL (register)
+    sqshlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+                // Make sure the right shift sign-extends when it should.
+                if (srcElem1 < 0 && destElem >= 0) {
+                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
+                                                 1 - shiftAmt));
+                }
+            } else if (shiftAmt > 0) {
+                bool sat = false;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    if (srcElem1 != 0)
+                        sat = true;
+                    else
+                        destElem = 0;
+                } else {
+                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
+                                sizeof(Element) * 8 - 1 - shiftAmt) !=
+                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
+                        sat = true;
+                    } else {
+                        destElem = srcElem1 << shiftAmt;
+                    }
+                }
+                if (sat) {
+                    fpscr.qc = 1;
+                    destElem = mask(sizeof(Element) * 8 - 1);
+                    if (srcElem1 < 0)
+                        destElem = ~destElem;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
+                       sqshlCode)
+    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
+                       sqshlCode)
+    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
+                       sqshlCode, scalar=True)
+    # SQSHLU
+    sqshluCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm >= sizeof(Element) * 8) {
+                if (srcElem1 < 0) {
+                    destElem = 0;
+                    fpscr.qc = 1;
+                } else if (srcElem1 > 0) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = 0;
+                }
+            } else if (imm) {
+                destElem = (srcElem1 << imm);
+                uint64_t topBits = bits((uint64_t)srcElem1,
+                                        sizeof(Element) * 8 - 1,
+                                        sizeof(Element) * 8 - imm);
+                if (srcElem1 < 0) {
+                    destElem = 0;
+                    fpscr.qc = 1;
+                } else if (topBits != 0) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                }
+            } else {
+                if (srcElem1 < 0) {
+                    fpscr.qc = 1;
+                    destElem = 0;
+                } else {
+                    destElem = srcElem1;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
+                     sqshluCode, hasImm=True)
+    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
+                     sqshluCode, hasImm=True)
+    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
+                     sqshluCode, hasImm=True, scalar=True)
+    # SQSHRN, SQSHRN2
+    sqshrnCode = '''
+        FPSCR fpscr = (FPSCR) FpscrQc;
+        if (imm > sizeof(srcElem1) * 8) {
+            if (srcElem1 != 0 && srcElem1 != -1)
+                fpscr.qc = 1;
+            destElem = 0;
+        } else if (imm) {
+            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
+            mid |= -(mid & ((BigElement)1 <<
+                        (sizeof(BigElement) * 8 - 1 - imm)));
+            if (mid != (Element)mid) {
+                destElem = mask(sizeof(Element) * 8 - 1);
+                if (srcElem1 < 0)
+                    destElem = ~destElem;
+                fpscr.qc = 1;
+            } else {
+                destElem = mid;
+            }
+        } else {
+            destElem = srcElem1;
+        }
+        FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
+                      sqshrnCode, hasImm=True)
+    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
+                      sqshrnCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
+                      sqshrnCode, hasImm=True, scalar=True)
+    # SQSHRUN, SQSHRUN2
+    sqshrunCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm > sizeof(srcElem1) * 8) {
+                if (srcElem1 != 0)
+                    fpscr.qc = 1;
+                destElem = 0;
+            } else if (imm) {
+                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
+                if (bits(mid, sizeof(BigElement) * 8 - 1,
+                              sizeof(Element) * 8) != 0) {
+                    if (srcElem1 < 0) {
+                        destElem = 0;
+                    } else {
+                        destElem = mask(sizeof(Element) * 8);
+                    }
+                    fpscr.qc = 1;
+                } else {
+                    destElem = mid;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
+                      sqshrunCode, hasImm=True)
+    twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
+                      sqshrunCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
+                      sqshrunCode, hasImm=True, scalar=True)
+    # SQSUB
+    sqsubCode = '''
+            destElem = srcElem1 - srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            bool negDest = (destElem < 0);
+            bool negSrc1 = (srcElem1 < 0);
+            bool posSrc2 = (srcElem2 >= 0);
+            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
+                destElem = (Element)1 << (sizeof(Element) * 8 - 1);
+                if (negDest)
+                    destElem -= 1;
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
+                       sqsubCode)
+    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
+                       sqsubCode)
+    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
+                       sqsubCode, scalar=True)
+    # SQXTN, SQXTN2
+    sqxtnCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            destElem = srcElem1;
+            if ((BigElement)destElem != srcElem1) {
+                fpscr.qc = 1;
+                destElem = mask(sizeof(Element) * 8 - 1);
+                if (srcElem1 < 0)
+                    destElem = ~destElem;
+            }
+            FpscrQc = fpscr;
+    '''
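+    # Saturating narrow, e.g. int16_t 0x1234 does not fit in int8_t, so
+    # the result saturates to 0x7f and QC is set.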
+    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
+                      sqxtnCode)
+    twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
+                      sqxtnCode, hi=True)
+    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
+                      sqxtnCode, scalar=True)
+    # SQXTUN, SQXTUN2
+    sqxtunCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            destElem = srcElem1;
+            if (srcElem1 < 0 ||
+                    ((BigElement)destElem & mask(sizeof(Element) * 8)) !=
+                    srcElem1) {
+                fpscr.qc = 1;
+                destElem = mask(sizeof(Element) * 8);
+                if (srcElem1 < 0)
+                    destElem = ~destElem;
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
+                      sqxtunCode)
+    twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
+                      sqxtunCode, hi=True)
+    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
+                      sqxtunCode, scalar=True)
+    # SRHADD
+    rhaddCode = '''
+            Element carryBit =
+                (((unsigned)srcElem1 & 0x1) +
+                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
+            // Use division instead of a shift to ensure the sign extension works
+            // right. The compiler will figure out if it can be a shift. Mask the
+            // inputs so they get truncated correctly.
+            destElem = (((srcElem1 & ~(Element)1) / 2) +
+                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
+    '''
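+    # Illustrative reference model (not used by the generator): the
+    # carry-bit/division trick above is equivalent to the rounding halving
+    # add floor((a + b + 1) / 2), e.g. _rhadd(7, 8) == 8 and
+    # _rhadd(-3, -4) == -3.
+    def _rhadd(a, b):
+        return (a + b + 1) >> 1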
+    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
+                       rhaddCode)
+    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
+                       rhaddCode)
+    # SRI
+    sriCode = '''
+            // A shift by the full element width inserts no source bits,
+            // leaving the destination unchanged.
+            if (imm < sizeof(Element) * 8)
+                destElem = (srcElem1 >> imm) |
+                    (destElem & ~mask(sizeof(Element) * 8 - imm));
+    '''
+    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
+                     True, hasImm=True)
+    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
+                     True, hasImm=True)
+    # SRSHL
+    rshlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                Element rBit = 0;
+                if (shiftAmt <= sizeof(Element) * 8)
+                    rBit = bits(srcElem1, shiftAmt - 1);
+                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
+                    rBit = 1;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+                // Make sure the right shift sign-extends when it should.
+                if (ltz(srcElem1) && !ltz(destElem)) {
+                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
+                                                 1 - shiftAmt));
+                }
+                destElem += rBit;
+            } else if (shiftAmt > 0) {
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    destElem = 0;
+                } else {
+                    destElem = srcElem1 << shiftAmt;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+    '''
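+    # Illustrative reference model (not used by the generator): a negative
+    # shift operand turns SRSHL into a rounding right shift, e.g.
+    # _srshl(-5, -1) == -2 and _srshl(3, 2) == 12 (out-of-range shift
+    # amounts omitted for brevity).
+    def _srshl(x, s):
+        if s >= 0:
+            return x << s
+        return ((x >> (-s - 1)) + 1) >> 1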
+    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
+                       rshlCode)
+    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
+                       rshlCode)
+    # SRSHR
+    rshrCode = '''
+            if (imm > sizeof(srcElem1) * 8) {
+                destElem = 0;
+            } else if (imm) {
+                Element rBit = bits(srcElem1, imm - 1);
+                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
+            } else {
+                destElem = srcElem1;
+            }
+    '''
+    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
+                     rshrCode, hasImm=True)
+    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
+                     rshrCode, hasImm=True)
+    # SRSRA
+    rsraCode = '''
+            if (imm > sizeof(srcElem1) * 8) {
+                destElem += 0;  // all bits shifted out; rounds to zero
+            } else if (imm) {
+                Element rBit = bits(srcElem1, imm - 1);
+                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
+            } else {
+                destElem += srcElem1;
+            }
+    '''
+    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
+                     rsraCode, True, hasImm=True)
+    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
+                     rsraCode, True, hasImm=True)
+    # SSHL
+    shlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+                // Make sure the right shift sign-extends when it should.
+                if (ltz(srcElem1) && !ltz(destElem)) {
+                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
+                                                 1 - shiftAmt));
+                }
+            } else {
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    destElem = 0;
+                } else {
+                    destElem = srcElem1 << shiftAmt;
+                }
+            }
+    '''
+    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
+                       shlCode)
+    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
+                       shlCode)
+    # SSHLL, SSHLL2
+    shllCode = '''
+            if (imm >= sizeof(destElem) * 8) {
+                destElem = 0;
+            } else {
+                destElem = (BigElement)srcElem1 << imm;
+            }
+    '''
+    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
+                    shllCode, hasImm=True)
+    twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
+                    shllCode, hasImm=True, hi=True)
+    # SSHR
+    shrCode = '''
+            if (imm >= sizeof(srcElem1) * 8) {
+                if (ltz(srcElem1))
+                    destElem = -1;
+                else
+                    destElem = 0;
+            } else {
+                destElem = srcElem1 >> imm;
+            }
+    '''
+    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
+                     hasImm=True)
+    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
+                     hasImm=True)
+    # SSRA
+    sraCode = '''
+            Element mid;
+            if (imm >= sizeof(srcElem1) * 8) {
+                mid = ltz(srcElem1) ? -1 : 0;
+            } else {
+                mid = srcElem1 >> imm;
+                if (ltz(srcElem1) && !ltz(mid)) {
+                    mid |= -(mid & ((Element)1 <<
+                                    (sizeof(Element) * 8 - 1 - imm)));
+                }
+            }
+            destElem += mid;
+    '''
+    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
+                     True, hasImm=True)
+    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
+                     True, hasImm=True)
+    # SSUBL
+    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
+    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
+                      sublwCode)
+    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
+                      sublwCode, hi=True)
+    # SSUBW
+    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
+                      sublwCode)
+    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
+                      sublwCode, hi=True)
+    # SUB
+    subCode = "destElem = srcElem1 - srcElem2;"
+    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
+    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
+    # SUBHN, SUBHN2
+    subhnCode = '''
+            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
+                        (sizeof(Element) * 8);
+    '''
+    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
+                        subhnCode)
+    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
+                        subhnCode, hi=True)
+    # SUQADD
+    suqaddCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            Element tmp = destElem + srcElem1;
+            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
+                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
+                        tmp < srcElem1 || tmp < destElem) {
+                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
+                    fpscr.qc = 1;
+                } else {
+                    destElem = tmp;
+                }
+            } else {
+                Element absDestElem = (~destElem) + 1;
+                if (absDestElem < srcElem1) {
+                    // Still check for positive saturation; negative
+                    // saturation is impossible here.
+                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
+                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
+                        fpscr.qc = 1;
+                    } else {
+                        destElem = tmp;
+                    }
+                } else {
+                    destElem = tmp;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
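+    # Illustrative reference model (not used by the generator): SUQADD
+    # adds an unsigned source to a signed accumulator, saturating to the
+    # signed range; only positive saturation can occur, e.g.
+    # _suqadd(100, 100, 8) -> (127, True).
+    def _suqadd(d, s, bits):
+        hi = (1 << (bits - 1)) - 1
+        r = d + s
+        return (min(r, hi), r > hi)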
+    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
+                     suqaddCode, True)
+    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
+                     suqaddCode, True)
+    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
+                     suqaddCode, True, scalar=True)
+    # SXTL -> alias to SSHLL
+    # TBL
+    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
+    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
+    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
+    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
+    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
+    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
+    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
+    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
+    # TBX
+    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
+    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
+    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
+    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
+    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
+    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
+    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
+    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
+    # TRN1
+    trnCode = '''
+        unsigned part = %s;
+        for (unsigned i = 0; i < eCount / 2; i++) {
+            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
+            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
+        }
+    '''
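+    # For example (illustrative), with 4 elements per register, TRN1 of
+    # [a0 a1 a2 a3] and [b0 b1 b2 b3] gives [a0 b0 a2 b2], while TRN2
+    # (part == 1) gives [a1 b1 a3 b3].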
+    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          trnCode % "0")
+    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
+                          trnCode % "0")
+    # TRN2
+    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          trnCode % "1")
+    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
+                          trnCode % "1")
+    # UABA
+    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
+                       abaCode, True)
+    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
+                       abaCode, True)
+    # UABAL, UABAL2
+    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
+                      abalCode, True)
+    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
+                      abalCode, True, hi=True)
+    # UABD
+    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       abdCode)
+    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
+                       abdCode)
+    # UABDL, UABDL2
+    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
+                      abdlCode, True)
+    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
+                      abdlCode, True, hi=True)
+    # UADALP
+    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
+                        2, adalpCode, True)
+    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
+                        4, adalpCode, True)
+    # UADDL, UADDL2
+    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
+                      addlwCode)
+    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
+                      addlwCode, hi=True)
+    # UADDLP
+    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
+                        2, addlwCode)
+    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
+                        4, addlwCode)
+    # UADDLV
+    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
+                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
+    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
+                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
+    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
+                      addAcrossLongCode, doubleDest=True, long=True)
+    # UADDW
+    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
+                      addlwCode)
+    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
+                      addlwCode, hi=True)
+    # UCVTF (fixed-point)
+    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
+                             " FPCRRounding(fpscr), fpscr)")
+    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
+                     ucvtfFixedCode, hasImm=True)
+    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
+                     ucvtfFixedCode, hasImm=True)
+    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
+                     ucvtfFixedCode, hasImm=True, scalar=True)
+    # UCVTF (integer)
+    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
+                           " FPCRRounding(fpscr), fpscr)")
+    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
+                     ucvtfIntCode)
+    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
+                     ucvtfIntCode)
+    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
+                     ucvtfIntCode, scalar=True)
+    # UHADD
+    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       haddCode)
+    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
+                       haddCode)
+    # UHSUB
+    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       hsubCode)
+    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
+                       hsubCode)
+    # UMAX
+    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
+                       maxCode)
+    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                       maxCode)
+    # UMAXP
+    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
+                       maxCode, pairwise=True)
+    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                       maxCode, pairwise=True)
+    # UMAXV
+    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
+                      2, maxAcrossCode)
+    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                      maxAcrossCode)
+    # UMIN
+    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
+                       minCode)
+    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                       minCode)
+    # UMINP
+    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
+                       minCode, pairwise=True)
+    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                       minCode, pairwise=True)
+    # UMINV
+    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
+                      2, minAcrossCode)
+    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
+                      minAcrossCode)
+    # UMLAL (by element)
+    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
+                      smallUnsignedTypes, mlalCode, True, byElem=True)
+    threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
+                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
+    # UMLAL (vector)
+    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
+                      mlalCode, True)
+    threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
+                      mlalCode, True, hi=True)
+    # UMLSL (by element)
+    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
+                      smallUnsignedTypes, mlslCode, True, byElem=True)
+    threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
+                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
+    # UMLSL (vector)
+    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
+                      mlslCode, True)
+    threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
+                      mlslCode, True, hi=True)
+    # UMOV
+    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
+    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
+    # UMULL, UMULL2 (by element)
+    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
+                      mullCode, byElem=True)
+    threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
+                      mullCode, byElem=True, hi=True)
+    # UMULL, UMULL2 (vector)
+    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
+                      mullCode)
+    threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
+                      mullCode, hi=True)
+    # UQADD
+    uqaddCode = '''
+            destElem = srcElem1 + srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (destElem < srcElem1 || destElem < srcElem2) {
+                destElem = (Element)(-1);
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       uqaddCode)
+    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
+                       uqaddCode)
+    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
+                       uqaddCode, scalar=True)
+    # UQRSHL
+    uqrshlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                Element rBit = 0;
+                if (shiftAmt <= sizeof(Element) * 8)
+                    rBit = bits(srcElem1, shiftAmt - 1);
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+                destElem += rBit;
+            } else {
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    if (srcElem1 != 0) {
+                        destElem = mask(sizeof(Element) * 8);
+                        fpscr.qc = 1;
+                    } else {
+                        destElem = 0;
+                    }
+                } else {
+                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
+                                sizeof(Element) * 8 - shiftAmt)) {
+                        destElem = mask(sizeof(Element) * 8);
+                        fpscr.qc = 1;
+                    } else {
+                        destElem = srcElem1 << shiftAmt;
+                    }
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
+                       2, uqrshlCode)
+    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
+                       uqrshlCode)
+    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
+                       uqrshlCode, scalar=True)
+    # UQRSHRN
+    uqrshrnCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm > sizeof(srcElem1) * 8) {
+                if (srcElem1 != 0)
+                    fpscr.qc = 1;
+                destElem = 0;
+            } else if (imm) {
+                BigElement mid = (srcElem1 >> (imm - 1));
+                uint64_t rBit = mid & 0x1;
+                mid >>= 1;
+                mid += rBit;
+                if (mid != (Element)mid) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = mid;
+                }
+            } else {
+                if (srcElem1 != (Element)srcElem1) {
+                    // Unsigned saturation clamps to all ones.
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = srcElem1;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
+                      uqrshrnCode, hasImm=True)
+    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
+                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
+                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
+                      scalar=True)
+    # UQSHL (immediate)
+    uqshlImmCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm >= sizeof(Element) * 8) {
+                if (srcElem1 != 0) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = 0;
+                }
+            } else if (imm) {
+                destElem = (srcElem1 << imm);
+                uint64_t topBits = bits((uint64_t)srcElem1,
+                                        sizeof(Element) * 8 - 1,
+                                        sizeof(Element) * 8 - imm);
+                if (topBits != 0) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
+                     uqshlImmCode, hasImm=True)
+    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
+                     uqshlImmCode, hasImm=True)
+    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
+                     uqshlImmCode, hasImm=True, scalar=True)
+    # UQSHL (register)
+    uqshlCode = '''
+            int16_t shiftAmt = (int8_t)srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (shiftAmt < 0) {
+                shiftAmt = -shiftAmt;
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    shiftAmt = sizeof(Element) * 8 - 1;
+                    destElem = 0;
+                } else {
+                    destElem = (srcElem1 >> shiftAmt);
+                }
+            } else if (shiftAmt > 0) {
+                if (shiftAmt >= sizeof(Element) * 8) {
+                    if (srcElem1 != 0) {
+                        destElem = mask(sizeof(Element) * 8);
+                        fpscr.qc = 1;
+                    } else {
+                        destElem = 0;
+                    }
+                } else {
+                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
+                                sizeof(Element) * 8 - shiftAmt)) {
+                        destElem = mask(sizeof(Element) * 8);
+                        fpscr.qc = 1;
+                    } else {
+                        destElem = srcElem1 << shiftAmt;
+                    }
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
+                       uqshlCode)
+    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
+                       uqshlCode)
+    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
+                       uqshlCode, scalar=True)
+    # UQSHRN, UQSHRN2
+    uqshrnCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (imm > sizeof(srcElem1) * 8) {
+                if (srcElem1 != 0)
+                    fpscr.qc = 1;
+                destElem = 0;
+            } else if (imm) {
+                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
+                if (mid != (Element)mid) {
+                    destElem = mask(sizeof(Element) * 8);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = mid;
+                }
+            } else {
+                destElem = srcElem1;
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
+                      uqshrnCode, hasImm=True)
+    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
+                      uqshrnCode, hasImm=True, hi=True)
+    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
+                      uqshrnCode, hasImm=True, scalar=True)
+    # UQSUB
+    uqsubCode = '''
+            destElem = srcElem1 - srcElem2;
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            if (destElem > srcElem1) {
+                destElem = 0;
+                fpscr.qc = 1;
+            }
+            FpscrQc = fpscr;
+    '''
+    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
+                       uqsubCode)
+    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
+                       uqsubCode)
+    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
+                       uqsubCode, scalar=True)
+    # UQXTN
+    uqxtnCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            destElem = srcElem1;
+            if ((BigElement)destElem != srcElem1) {
+                fpscr.qc = 1;
+                destElem = mask(sizeof(Element) * 8);
+            }
+            FpscrQc = fpscr;
+    '''
+    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
+                      uqxtnCode)
+    twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
+                      uqxtnCode, hi=True)
+    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
+                      uqxtnCode, scalar=True)
+    # URECPE
+    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
+    twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
+                     urecpeCode)
+    twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
+                     urecpeCode)
+    # URHADD
+    threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
+                       2, rhaddCode)
+    threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
+                       4, rhaddCode)
+    # URSHL
+    threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
+                       rshlCode)
+    threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
+                       rshlCode)
+    # URSHR
+    twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
+                     rshrCode, hasImm=True)
+    twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
+                     rshrCode, hasImm=True)
+    # URSQRTE
+    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
+    twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
+                     ursqrteCode)
+    twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
+                     ursqrteCode)
+    # URSRA
+    twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
+                     rsraCode, True, hasImm=True)
+    twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
+                     rsraCode, True, hasImm=True)
+    # USHL
+    threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
+                       shlCode)
+    threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
+                       shlCode)
+    # USHLL, USHLL2
+    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
+                    shllCode, hasImm=True)
+    twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
+                    shllCode, hi=True, hasImm=True)
+    # USHR
+    twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
+                     shrCode, hasImm=True)
+    twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
+                     shrCode, hasImm=True)
+    # USQADD
+    usqaddCode = '''
+            FPSCR fpscr = (FPSCR) FpscrQc;
+            Element tmp = destElem + srcElem1;
+            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
+                if (tmp < srcElem1 || tmp < destElem) {
+                    destElem = (Element)(-1);
+                    fpscr.qc = 1;
+                } else {
+                    destElem = tmp;
+                }
+            } else {
+                Element absSrcElem1 = (~srcElem1) + 1;
+                if (absSrcElem1 > destElem) {
+                    destElem = 0;
+                    fpscr.qc = 1;
+                } else {
+                    destElem = tmp;
+                }
+            }
+            FpscrQc = fpscr;
+    '''
+    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
+                     usqaddCode, True)
+    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
+                     usqaddCode, True)
+    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
+                     usqaddCode, True, scalar=True)
+    # USRA
+    twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
+                     sraCode, True, hasImm=True)
+    twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
+                     sraCode, True, hasImm=True)
+    # USUBL
+    threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
+                      sublwCode)
+    threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
+                      sublwCode, hi=True)
+    # USUBW
+    threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
+                      sublwCode)
+    threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
+                      sublwCode, hi=True)
+    # UXTL -> alias to USHLL
+    # UZP1
+    uzpCode = '''
+        unsigned part = %s;
+        for (unsigned i = 0; i < eCount / 2; i++) {
+            destReg.elements[i] = srcReg1.elements[2 * i + part];
+            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
+        }
+    '''
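+    # For example (illustrative), with 4 elements per register, UZP1 of
+    # [a0 a1 a2 a3] and [b0 b1 b2 b3] gives the even-indexed elements
+    # [a0 a2 b0 b2], while UZP2 (part == 1) gives [a1 a3 b1 b3].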
+    threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          uzpCode % "0")
+    threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
+                          uzpCode % "0")
+    # UZP2
+    threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          uzpCode % "1")
+    threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
+                          uzpCode % "1")
+    # XTN, XTN2
+    xtnCode = "destElem = srcElem1;"
+    twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
+    twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
+                      xtnCode, hi=True)
+    # ZIP1
+    zipCode = '''
+        unsigned base = %s;
+        for (unsigned i = 0; i < eCount / 2; i++) {
+            destReg.elements[2 * i] = srcReg1.elements[base + i];
+            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
+        }
+    '''
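+    # For example (illustrative), with 4 elements per register, ZIP1 of
+    # [a0 a1 a2 a3] and [b0 b1 b2 b3] gives [a0 b0 a1 b1], while ZIP2
+    # (base == eCount / 2) gives [a2 b2 a3 b3].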
+    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          zipCode % "0")
+    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
+                          zipCode % "0")
+    # ZIP2
+    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
+                          zipCode % "eCount / 2")
+    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
+                          zipCode % "eCount / 2")
+
+}};
diff --git a/src/arch/arm/isa/insts/neon64_mem.isa b/src/arch/arm/isa/insts/neon64_mem.isa
new file mode 100644
index 0000000..32a37f8
--- /dev/null
+++ b/src/arch/arm/isa/insts/neon64_mem.isa
@@ -0,0 +1,471 @@
+// -*- mode: c++ -*-
+
+// Copyright (c) 2012-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Mbou Eyole
+//          Giacomo Gabrielli
+
+let {{
+
+    header_output = ''
+    decoder_output = ''
+    exec_output = ''
+
+    def mkMemAccMicroOp(name):
+        global header_output, decoder_output, exec_output
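+        # The SP-alignment check below requires a 16-byte-aligned stack
+        # pointer (XURa bits [3:0] clear) when the base register is SP and
+        # checking is enabled.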
+        SPAlignmentCheckCodeNeon = '''
+            if (baseIsSP && bits(XURa, 3, 0) &&
+                SPAlignmentCheckEnabled(xc->tcBase())) {
+                return new SPAlignmentFault();
+            }
+        '''
+        eaCode = SPAlignmentCheckCodeNeon + '''
+            EA = XURa + imm;
+        '''
+        memDecl = '''
+            const int MaxNumBytes = 16;
+            union MemUnion {
+                uint8_t bytes[MaxNumBytes];
+                uint32_t floatRegBits[MaxNumBytes / 4];
+            };
+        '''
+
+        # Do endian conversion for all the elements
+        convCode = '''
+            VReg x = {0, 0};
+
+            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
+                (XReg) memUnion.floatRegBits[0];
+            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
+                (XReg) memUnion.floatRegBits[2];
+
+            const unsigned eCount = 16 / (1 << eSize);
+
+            if (isBigEndian64(xc->tcBase())) {
+                for (unsigned i = 0; i < eCount; i++) {
+                    switch (eSize) {
+                      case 0x3:  // 64-bit
+                        writeVecElem(&x, (XReg) gtobe(
+                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      case 0x2:  // 32-bit
+                        writeVecElem(&x, (XReg) gtobe(
+                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      case 0x1:  // 16-bit
+                        writeVecElem(&x, (XReg) gtobe(
+                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      default:  // 8-bit
+                        break;  // Nothing to do here
+                    }
+                }
+            } else {
+                for (unsigned i = 0; i < eCount; i++) {
+                    switch (eSize) {
+                      case 0x3:  // 64-bit
+                        writeVecElem(&x, (XReg) gtole(
+                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      case 0x2:  // 32-bit
+                        writeVecElem(&x, (XReg) gtole(
+                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      case 0x1:  // 16-bit
+                        writeVecElem(&x, (XReg) gtole(
+                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
+                        break;
+                      default:  // 8-bit
+                        break;  // Nothing to do here
+                    }
+                }
+            }
+
+            memUnion.floatRegBits[0] = (uint32_t) x.lo;
+            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
+            memUnion.floatRegBits[2] = (uint32_t) x.hi;
+            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
+        '''
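+        # Illustrative reference model (not used by the generator): the
+        # conversion above swaps bytes within each element, never across
+        # elements. For example, with 4-byte elements:
+        #   _swap_elems(bytes.fromhex("00010203aabbccdd"), 4).hex()
+        #       == "03020100ddccbbaa"
+        def _swap_elems(buf, esize):
+            return b"".join(buf[i:i + esize][::-1]
+                            for i in range(0, len(buf), esize))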
+
+        # Offload everything into registers
+        regSetCode = ''
+        for reg in range(4):
+            regSetCode += '''
+            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
+            ''' % { 'reg' : reg }
+
+        # Pull everything in from registers
+        regGetCode = ''
+        for reg in range(4):
+            regGetCode += '''
+            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
+            ''' % { 'reg' : reg }
+
+        loadMemAccCode = convCode + regSetCode
+        storeMemAccCode = regGetCode + convCode
+
+        loadIop = InstObjParams(name + 'ld',
+                'MicroNeonLoad64',
+                'MicroNeonMemOp',
+            {   'mem_decl' : memDecl,
+                'memacc_code' : loadMemAccCode,
+                'ea_code' : simd64EnabledCheckCode + eaCode,
+            },
+            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
+        storeIop = InstObjParams(name + 'st',
+                'MicroNeonStore64',
+                'MicroNeonMemOp',
+            {   'mem_decl' : memDecl,
+                'memacc_code' : storeMemAccCode,
+                'ea_code' : simd64EnabledCheckCode + eaCode,
+            },
+            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
+
+        exec_output += NeonLoadExecute64.subst(loadIop) + \
+            NeonLoadInitiateAcc64.subst(loadIop) + \
+            NeonLoadCompleteAcc64.subst(loadIop) + \
+            NeonStoreExecute64.subst(storeIop) + \
+            NeonStoreInitiateAcc64.subst(storeIop) + \
+            NeonStoreCompleteAcc64.subst(storeIop)
+        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
+            MicroNeonMemDeclare64.subst(storeIop)
+
+    def mkMarshalMicroOp(name, Name):
+        global header_output, decoder_output, exec_output
+
+        getInputCodeOp1L = ''
+        for v in range(4):
+            for p in range(4):
+                getInputCodeOp1L += '''
+            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
+                         %(p)d, 0x2);
+            ''' % { 'v' : v, 'p' : p }
+
+        getInputCodeOp1S = ''
+        for v in range(4):
+            for p in range(4):
+                getInputCodeOp1S += '''
+            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
+                         %(p)d, 0x2);
+            ''' % { 'v' : v, 'p' : p }
+
+        if name == 'deint_neon_uop':
+
+            eCode = '''
+                VReg input[4];  // input data from scratch area
+                VReg output[2];  // output data to arch. SIMD regs
+                VReg temp;
+                temp.lo = 0;
+                temp.hi = 0;
+            '''
+            for p in range(4):
+                eCode += '''
+                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
+                ''' % { 'p' : p }
+            eCode += getInputCodeOp1L
+
+            # Note that numRegs is not always the same as numStructElems; in
+            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
+            # 1, 2, 3 or 4
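+            # Illustrative helper (not used by the generator): for
+            # numStructElems > 1, element i of de-interleaved register
+            # structElemNo is read from this scratch byte offset. E.g. LD2
+            # with 16-bit elements: register 0 gathers bytes 0, 4, 8, ...
+            # and register 1 gathers bytes 2, 6, 10, ...
+            def _deint_pos(structElemNo, i, numStructElems, eSizeBytes):
+                return numStructElems * eSizeBytes * i + \
+                    structElemNo * eSizeBytes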
+
+            eCode += '''
+                output[0].lo = 0;
+                output[0].hi = 0;
+                output[1].lo = 0;
+                output[1].hi = 0;
+
+                int eCount = dataSize / (8 << eSize);
+                int eSizeBytes = 1 << eSize;  // element size in bytes
+                int numBytes = step * dataSize / 4;
+                int totNumBytes = numRegs * dataSize / 8;
+
+                int structElemNo, pos, a, b;
+                XReg data;
+
+                for (int r = 0; r < 2; ++r) {
+                    for (int i = 0; i < eCount; ++i) {
+                        if (numBytes < totNumBytes) {
+                            structElemNo = r + (step * 2);
+                            if (numStructElems == 1) {
+                                pos = (eSizeBytes * i) +
+                                    (eCount * structElemNo * eSizeBytes);
+                            } else {
+                                pos = (numStructElems * eSizeBytes * i) +
+                                    (structElemNo * eSizeBytes);
+                            }
+                            a = pos / 16;
+                            b = (pos % 16) / eSizeBytes;
+                            data = (XReg) readVecElem(input[a], (XReg) b,
+                                                      eSize);
+                            writeVecElem(&output[r], data, i, eSize);
+                            numBytes += eSizeBytes;
+                        }
+                    }
+                }
+            '''
+            for p in range(4):
+                eCode += '''
+                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
+                    %(p)d, 0x2);
+                ''' % { 'p' : p }
+            eCode += '''
+                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
+            '''
+            for p in range(4):
+                eCode += '''
+                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
+                        output[1], %(p)d, 0x2);
+                ''' % { 'p' : p }
+            eCode += '''
+                } else {
+            '''
+            for p in range(4):
+                eCode += '''
+                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
+                        %(p)d, 0x2);
+                ''' % { 'p' : p }
+            eCode += '''
+                }
+            '''
+
+            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
+                                { 'code' : eCode }, ['IsMicroop'])
+            header_output += MicroNeonMixDeclare64.subst(iop)
+            exec_output += MicroNeonMixExecute64.subst(iop)
+
+        elif name == 'int_neon_uop':
+
+            eCode = '''
+                VReg input[4];  // input data from arch. SIMD regs
+                VReg output[2];  // output data to scratch area
+            '''
+
+            eCode += getInputCodeOp1S
+
+            # Note that numRegs is not always the same as numStructElems; in
+            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
+            # 1, 2, 3 or 4
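+            # Illustrative helper (not used by the generator): the inverse
+            # of the de-interleave mapping; given a scratch byte offset it
+            # returns (element i, register j) for numStructElems > 1.
+            def _int_src(pos, numStructElems, eSizeBytes):
+                i = pos // (numStructElems * eSizeBytes)
+                j = (pos % (numStructElems * eSizeBytes)) // eSizeBytes
+                return (i, j)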
+
+            eCode += '''
+                int eCount = dataSize / (8 << eSize);
+                int eSizeBytes = 1 << eSize;
+                int totNumBytes = numRegs * dataSize / 8;
+                int numOutputElems = 128 / (8 << eSize);
+                int stepOffset = step * 32;
+
+                for (int i = 0; i < 2; ++i) {
+                    output[i].lo = 0;
+                    output[i].hi = 0;
+                }
+
+                int r = 0, k = 0, i, j;
+                XReg data;
+
+                for (int pos = stepOffset; pos < 32 + stepOffset;
+                        pos += eSizeBytes) {
+                    if (pos < totNumBytes) {
+                        if (numStructElems == 1) {
+                            i = (pos / eSizeBytes) % eCount;
+                            j = pos / (eCount * eSizeBytes);
+                        } else {
+                            i = pos / (numStructElems * eSizeBytes);
+                            j = (pos % (numStructElems * eSizeBytes)) /
+                                eSizeBytes;
+                        }
+                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
+                        writeVecElem(&output[r], data, k, eSize);
+                        k++;
+                        if (k == numOutputElems) {
+                            k = 0;
+                            ++r;
+                        }
+                    }
+                }
+                '''
+            for v in range(2):
+                for p in range(4):
+                    eCode += '''
+                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
+                    output[%(v)d], %(p)d, 0x2);
+                ''' % { 'v': v, 'p': p}
+
+            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
+                                { 'code' : eCode }, ['IsMicroop'])
+            header_output += MicroNeonMixDeclare64.subst(iop)
+            exec_output += MicroNeonMixExecute64.subst(iop)
+
+        elif name == 'unpack_neon_uop':
+
+            eCode = '''
+                VReg input[4];  // input data from scratch area
+                VReg output[2];  // output data to arch. SIMD regs
+            '''
+
+            eCode += getInputCodeOp1L
+
+            # Fill the output regs with the destination register data first;
+            # elements of the output register outside the indexed lanes are
+            # left untouched.
+            for v in range(2):
+                for p in range(4):
+                    eCode += '''
+                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
+                             %(p)d, 0x2);
+                ''' % { 'v': v, 'p': p}
+            eCode += '''
+                int eCount = dataSize / (8 << eSize);
+                int eCount128 = 128 / (8 << eSize);
+                int eSizeBytes = 1 << eSize;
+                int totNumBytes = numStructElems * eSizeBytes;
+                int numInputElems = eCount128;
+                int stepOffset = step * 2 * eSizeBytes;
+                int stepLimit = 2 * eSizeBytes;
+
+                int r = 0, i, j;
+                XReg data;
+
+                for (int pos = stepOffset; pos < stepLimit + stepOffset;
+                        pos += eSizeBytes) {
+                    if (pos < totNumBytes) {
+                        r = pos / eSizeBytes;
+                        j = r / numInputElems;
+                        i = r % numInputElems;
+                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
+
+                        if (replicate) {
+                            for (int i = 0; i < eCount128; ++i) {
+                                if (i < eCount) {
+                                    writeVecElem(&output[r % 2], data, i,
+                                                 eSize);
+                                } else {  // zero extend if necessary
+                                    writeVecElem(&output[r % 2], (XReg) 0, i,
+                                                 eSize);
+                                }
+                            }
+                        } else {
+                            writeVecElem(&output[r % 2], data, lane, eSize);
+                        }
+                    }
+                }
+            '''
+            for v in range(2):
+                for p in range(4):
+                    eCode += '''
+                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
+                    output[%(v)d], %(p)d, 0x2);
+                ''' % { 'v' : v, 'p' : p }
+
+            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
+                                { 'code' : eCode }, ['IsMicroop'])
+            header_output += MicroNeonMixLaneDeclare64.subst(iop)
+            exec_output += MicroNeonMixExecute64.subst(iop)
+
+        elif name == 'pack_neon_uop':
+
+            eCode = '''
+                VReg input[4];  // input data from arch. SIMD regs
+                VReg output[2];  // output data to scratch area
+            '''
+
+            eCode += getInputCodeOp1S
+
+            eCode += '''
+                int eSizeBytes = 1 << eSize;
+                int numOutputElems = 128 / (8 << eSize);
+                int totNumBytes = numStructElems * eSizeBytes;
+                int stepOffset = step * 32;
+                int stepLimit = 32;
+
+                int r = 0, i, j;
+                XReg data;
+
+                for (int i = 0; i < 2; ++i) {
+                    output[i].lo = 0;
+                    output[i].hi = 0;
+                }
+
+                for (int pos = stepOffset; pos < stepLimit + stepOffset;
+                        pos += eSizeBytes) {
+                    if (pos < totNumBytes) {
+                        r = pos / 16;
+                        j = pos / eSizeBytes;
+                        i = (pos / eSizeBytes) % numOutputElems;
+                        data = (XReg) readVecElem(input[j], lane, eSize);
+                        writeVecElem(&output[r % 2], data, i, eSize);
+                    }
+                }
+            '''
+
+            for v in range(2):
+                for p in range(4):
+                    eCode += '''
+                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
+                    output[%(v)d], %(p)d, 0x2);
+                ''' % { 'v' : v, 'p' : p }
+
+            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
+                                { 'code' : eCode }, ['IsMicroop'])
+            header_output += MicroNeonMixLaneDeclare64.subst(iop)
+            exec_output += MicroNeonMixExecute64.subst(iop)
+
+    # Generate instructions
+    mkMemAccMicroOp('mem_neon_uop')
+    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64')
+    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64')
+    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
+    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
+
+}};
+
+let {{
+
+    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
+    header_output += VMemMultDeclare64.subst(iop)
+    decoder_output += VMemMultConstructor64.subst(iop)
+
+    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
+    header_output += VMemMultDeclare64.subst(iop)
+    decoder_output += VMemMultConstructor64.subst(iop)
+
+    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
+    header_output += VMemSingleDeclare64.subst(iop)
+    decoder_output += VMemSingleConstructor64.subst(iop)
+
+    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
+    header_output += VMemSingleDeclare64.subst(iop)
+    decoder_output += VMemSingleConstructor64.subst(iop)
+
+}};
diff --git a/src/arch/arm/isa/insts/str.isa b/src/arch/arm/isa/insts/str.isa
index 8084605..3f59569 100644
--- a/src/arch/arm/isa/insts/str.isa
+++ b/src/arch/arm/isa/insts/str.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -38,6 +38,7 @@
 // Authors: Gabe Black
 
 let {{
+    import math
 
     header_output = ""
     decoder_output = ""
@@ -77,7 +78,9 @@
             (newHeader,
              newDecoder,
              newExec) = self.fillTemplates(self.name, self.Name, codeBlobs,
-                                           self.memFlags, self.instFlags, base, wbDecl)
+                                           self.memFlags, self.instFlags,
+                                           base, wbDecl, None, False,
+                                           self.size, self.sign)
 
             header_output += newHeader
             decoder_output += newDecoder
@@ -171,7 +174,7 @@
                                       self.size, self.sign, self.user)
 
             # Add memory request flags where necessary
-            self.memFlags.append("%d" % (self.size - 1))
+            self.memFlags.append("%d" % int(math.log(self.size, 2)))
             if self.user:
                 self.memFlags.append("ArmISA::TLB::UserMode")
 
diff --git a/src/arch/arm/isa/insts/str64.isa b/src/arch/arm/isa/insts/str64.isa
new file mode 100644
index 0000000..c15dca1
--- /dev/null
+++ b/src/arch/arm/isa/insts/str64.isa
@@ -0,0 +1,372 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
+
+    header_output = ""
+    decoder_output = ""
+    exec_output = ""
+
+    class StoreInst64(LoadStoreInst):
+        execBase = 'Store64'
+        micro = False
+
+        def __init__(self, mnem, Name, size=4, user=False, flavor="normal",
+                     top = False):
+            super(StoreInst64, self).__init__()
+
+            self.name = mnem
+            self.Name = Name
+            self.size = size
+            self.user = user
+            self.flavor = flavor
+            self.top = top
+
+            self.memFlags = ["ArmISA::TLB::MustBeOne"]
+            self.instFlags = []
+            self.codeBlobs = { "postacc_code" : "" }
+
+            # Add memory request flags where necessary
+            if self.user:
+                self.memFlags.append("ArmISA::TLB::UserMode")
+
+            if self.flavor in ("relexp", "exp"):
+                # For exclusive pair ops, the alignment check is based on
+                # the total size of the pair.
+                self.memFlags.append("%d" % int(math.log(self.size, 2) + 1))
+            elif not (self.size == 16 and self.top):
+                # Only the first microop should perform alignment checking.
+                self.memFlags.append("%d" % int(math.log(self.size, 2)))
+
+            if self.flavor not in ("release", "relex", "exclusive",
+                                   "relexp", "exp"):
+                self.memFlags.append("ArmISA::TLB::AllowUnaligned")
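+            # Exclusive and acquire/release accesses must be naturally
+            # aligned in ARMv8, so AllowUnaligned is deliberately omitted
+            # for those flavors.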
+
+            if self.micro:
+                self.instFlags.append("IsMicroop")
+
+            if self.flavor in ("release", "relex", "relexp"):
+                self.instFlags.extend(["IsMemBarrier",
+                                       "IsWriteBarrier",
+                                       "IsReadBarrier"])
+            if self.flavor in ("relex", "exclusive", "exp", "relexp"):
+                self.instFlags.append("IsStoreConditional")
+                self.memFlags.append("Request::LLSC")
+
+        def emitHelper(self, base = 'Memory64', wbDecl = None):
+            global header_output, decoder_output, exec_output
+
+            # If this is a microop itself, don't allow anything that would
+            # require further microcoding.
+            if self.micro:
+                assert not wbDecl
+
+            fa_code = None
+            if not self.micro and self.flavor in ("normal", "release"):
+                fa_code = '''
+                    fault->annotate(ArmFault::SAS, %s);
+                    fault->annotate(ArmFault::SSE, false);
+                    fault->annotate(ArmFault::SRT, dest);
+                    fault->annotate(ArmFault::SF, %s);
+                    fault->annotate(ArmFault::AR, %s);
+                ''' % ("0" if self.size == 1 else
+                       "1" if self.size == 2 else
+                       "2" if self.size == 4 else "3",
+                       "true" if self.size == 8 else "false",
+                       "true" if self.flavor == "release" else "false")
+
+            (newHeader, newDecoder, newExec) = \
+                self.fillTemplates(self.name, self.Name, self.codeBlobs,
+                                   self.memFlags, self.instFlags,
+                                   base, wbDecl, faCode=fa_code)
+
+            header_output += newHeader
+            decoder_output += newDecoder
+            exec_output += newExec
+
+        def buildEACode(self):
+            # Address computation
+            eaCode = ""
+            if self.flavor == "fp":
+                eaCode += vfp64EnabledCheckCode
+
+            eaCode += SPAlignmentCheckCode + "EA = XBase"
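+            # A 128-bit store is split into two 64-bit microops; the top
+            # half lives 8 bytes above the bottom half, with the halves
+            # swapped on big-endian targets.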
+            if self.size == 16:
+                if self.top:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 0 : 8)"
+                else:
+                    eaCode += " + (isBigEndian64(xc->tcBase()) ? 8 : 0)"
+            if not self.post:
+                eaCode += self.offset
+            eaCode += ";"
+
+            self.codeBlobs["ea_code"] = eaCode
+
+
+    class StoreImmInst64(StoreInst64):
+        def __init__(self, *args, **kargs):
+            super(StoreImmInst64, self).__init__(*args, **kargs)
+            self.offset = "+ imm"
+
+            self.wbDecl = "MicroAddXiUop(machInst, base, base, imm);"
+
+    class StoreRegInst64(StoreInst64):
+        def __init__(self, *args, **kargs):
+            super(StoreRegInst64, self).__init__(*args, **kargs)
+            self.offset = "+ extendReg64(XOffset, type, shiftAmt, 64)"
+
+            self.wbDecl = \
+                "MicroAddXERegUop(machInst, base, base, " + \
+                "                 offset, type, shiftAmt);"
+
+    class StoreRawRegInst64(StoreInst64):
+        def __init__(self, *args, **kargs):
+            super(StoreRawRegInst64, self).__init__(*args, **kargs)
+            self.offset = ""
+
+    class StoreSingle64(StoreInst64):
+        def emit(self):
+            self.buildEACode()
+
+            # Code that actually handles the access
+            if self.flavor == "fp":
+                if self.size in (1, 2, 4):
+                    accCode = '''
+                        Mem%(suffix)s =
+                            cSwap(AA64FpDestP0%(suffix)s, isBigEndian64(xc->tcBase()));
+                    '''
+                elif self.size == 8 or (self.size == 16 and not self.top):
+                    accCode = '''
+                        uint64_t data = AA64FpDestP1_uw;
+                        data = (data << 32) | AA64FpDestP0_uw;
+                        Mem%(suffix)s = cSwap(data, isBigEndian64(xc->tcBase()));
+                    '''
+                elif self.size == 16 and self.top:
+                    accCode = '''
+                        uint64_t data = AA64FpDestP3_uw;
+                        data = (data << 32) | AA64FpDestP2_uw;
+                        Mem%(suffix)s = cSwap(data, isBigEndian64(xc->tcBase()));
+                    '''
+            else:
+                accCode = \
+                    'Mem%(suffix)s = cSwap(XDest%(suffix)s, isBigEndian64(xc->tcBase()));'
+            if self.size == 16:
+                accCode = accCode % \
+                    { "suffix" : buildMemSuffix(False, 8) }
+            else:
+                accCode = accCode % \
+                    { "suffix" : buildMemSuffix(False, self.size) }
+
+            self.codeBlobs["memacc_code"] = accCode
+
+            if self.flavor in ("relex", "exclusive"):
+                self.instFlags.append("IsStoreConditional")
+                self.memFlags.append("Request::LLSC")
+
+            # Push it out to the output files
+            wbDecl = None
+            if self.writeback and not self.micro:
+                wbDecl = self.wbDecl
+            self.emitHelper(self.base, wbDecl)
+
+    class StoreDouble64(StoreInst64):
+        def emit(self):
+            self.buildEACode()
+
+            # Code that actually handles the access
+            if self.flavor == "fp":
+                accCode = '''
+                    uint64_t data = AA64FpDest2P0_uw;
+                    data = (data << 32) | AA64FpDestP0_uw;
+                    Mem_ud = cSwap(data, isBigEndian64(xc->tcBase()));
+                '''
+            else:
+                if self.size == 4:
+                    accCode = '''
+                        uint64_t data = XDest2_uw;
+                        data = (data << 32) | XDest_uw;
+                        Mem_ud = cSwap(data, isBigEndian64(xc->tcBase()));
+                    '''
+                elif self.size == 8:
+                    accCode = '''
+                        // This temporary needs to be here so that the parser
+                        // will correctly identify this instruction as a store.
+                        Twin64_t temp;
+                        temp.a = XDest_ud;
+                        temp.b = XDest2_ud;
+                        Mem_tud = temp;
+                    '''
+            self.codeBlobs["memacc_code"] = accCode
+
+            # Push it out to the output files
+            wbDecl = None
+            if self.writeback and not self.micro:
+                wbDecl = self.wbDecl
+            self.emitHelper(self.base, wbDecl)
+
+    class StoreImm64(StoreImmInst64, StoreSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryImm64'
+        writeback = False
+        post = False
+
+    class StorePre64(StoreImmInst64, StoreSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryPreIndex64'
+        writeback = True
+        post = False
+
+    class StorePost64(StoreImmInst64, StoreSingle64):
+        decConstBase = 'LoadStoreImm64'
+        base = 'ArmISA::MemoryPostIndex64'
+        writeback = True
+        post = True
+
+    class StoreReg64(StoreRegInst64, StoreSingle64):
+        decConstBase = 'LoadStoreReg64'
+        base = 'ArmISA::MemoryReg64'
+        writeback = False
+        post = False
+
+    class StoreRaw64(StoreRawRegInst64, StoreSingle64):
+        decConstBase = 'LoadStoreRaw64'
+        base = 'ArmISA::MemoryRaw64'
+        writeback = False
+        post = False
+
+    class StoreEx64(StoreRawRegInst64, StoreSingle64):
+        decConstBase = 'LoadStoreEx64'
+        base = 'ArmISA::MemoryEx64'
+        writeback = False
+        post = False
+        execBase = 'StoreEx64'
+        def __init__(self, *args, **kargs):
+            super(StoreEx64, self).__init__(*args, **kargs)
+            self.codeBlobs["postacc_code"] = "XResult = !writeResult;"
+
+    def buildStores64(mnem, NameBase, size, flavor="normal"):
+        StoreImm64(mnem, NameBase + "_IMM", size, flavor=flavor).emit()
+        StorePre64(mnem, NameBase + "_PRE", size, flavor=flavor).emit()
+        StorePost64(mnem, NameBase + "_POST", size, flavor=flavor).emit()
+        StoreReg64(mnem, NameBase + "_REG", size, flavor=flavor).emit()
+
+    buildStores64("strb", "STRB64", 1)
+    buildStores64("strh", "STRH64", 2)
+    buildStores64("str", "STRW64", 4)
+    buildStores64("str", "STRX64", 8)
+    buildStores64("str", "STRBFP64", 1, flavor="fp")
+    buildStores64("str", "STRHFP64", 2, flavor="fp")
+    buildStores64("str", "STRSFP64", 4, flavor="fp")
+    buildStores64("str", "STRDFP64", 8, flavor="fp")
+
+    StoreImm64("sturb", "STURB64_IMM", 1).emit()
+    StoreImm64("sturh", "STURH64_IMM", 2).emit()
+    StoreImm64("stur", "STURW64_IMM", 4).emit()
+    StoreImm64("stur", "STURX64_IMM", 8).emit()
+    StoreImm64("stur", "STURBFP64_IMM", 1, flavor="fp").emit()
+    StoreImm64("stur", "STURHFP64_IMM", 2, flavor="fp").emit()
+    StoreImm64("stur", "STURSFP64_IMM", 4, flavor="fp").emit()
+    StoreImm64("stur", "STURDFP64_IMM", 8, flavor="fp").emit()
+
+    StoreImm64("sttrb", "STTRB64_IMM", 1, user=True).emit()
+    StoreImm64("sttrh", "STTRH64_IMM", 2, user=True).emit()
+    StoreImm64("sttr", "STTRW64_IMM", 4, user=True).emit()
+    StoreImm64("sttr", "STTRX64_IMM", 8, user=True).emit()
+
+    StoreRaw64("stlr", "STLRX64", 8, flavor="release").emit()
+    StoreRaw64("stlr", "STLRW64", 4, flavor="release").emit()
+    StoreRaw64("stlrh", "STLRH64", 2, flavor="release").emit()
+    StoreRaw64("stlrb", "STLRB64", 1, flavor="release").emit()
+
+    StoreEx64("stlxr", "STLXRX64", 8, flavor="relex").emit()
+    StoreEx64("stlxr", "STLXRW64", 4, flavor="relex").emit()
+    StoreEx64("stlxrh", "STLXRH64", 2, flavor="relex").emit()
+    StoreEx64("stlxrb", "STLXRB64", 1, flavor="relex").emit()
+
+    StoreEx64("stxr", "STXRX64", 8, flavor="exclusive").emit()
+    StoreEx64("stxr", "STXRW64", 4, flavor="exclusive").emit()
+    StoreEx64("stxrh", "STXRH64", 2, flavor="exclusive").emit()
+    StoreEx64("stxrb", "STXRB64", 1, flavor="exclusive").emit()
+
+    class StoreImmU64(StoreImm64):
+        decConstBase = 'LoadStoreImmU64'
+        micro = True
+
+    class StoreImmDU64(StoreImmInst64, StoreDouble64):
+        decConstBase = 'LoadStoreImmDU64'
+        base = 'ArmISA::MemoryDImm64'
+        micro = True
+        post = False
+        writeback = False
+
+    class StoreImmDEx64(StoreImmInst64, StoreDouble64):
+        execBase = 'StoreEx64'
+        decConstBase = 'StoreImmDEx64'
+        base = 'ArmISA::MemoryDImmEx64'
+        micro = False
+        post = False
+        writeback = False
+        def __init__(self, *args, **kargs):
+            super(StoreImmDEx64, self).__init__(*args, **kargs)
+            self.codeBlobs["postacc_code"] = "XResult = !writeResult;"
+
+    class StoreRegU64(StoreReg64):
+        decConstBase = 'LoadStoreRegU64'
+        micro = True
+
+    StoreImmDEx64("stlxp", "STLXPW64", 4, flavor="relexp").emit()
+    StoreImmDEx64("stlxp", "STLXPX64", 8, flavor="relexp").emit()
+    StoreImmDEx64("stxp", "STXPW64", 4, flavor="exp").emit()
+    StoreImmDEx64("stxp", "STXPX64", 8, flavor="exp").emit()
+
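+    # Microop variants (the *U64 decConstBases) are emitted here for use
+    # by macroops such as load/store pair; the QB/QT flavors store the
+    # bottom and top 64 bits of a 128-bit FP register respectively.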
+    StoreImmU64("strxi_uop", "MicroStrXImmUop", 8).emit()
+    StoreRegU64("strxr_uop", "MicroStrXRegUop", 8).emit()
+    StoreImmU64("strfpxi_uop", "MicroStrFpXImmUop", 8, flavor="fp").emit()
+    StoreRegU64("strfpxr_uop", "MicroStrFpXRegUop", 8, flavor="fp").emit()
+    StoreImmU64("strqbfpxi_uop", "MicroStrQBFpXImmUop",
+                16, flavor="fp", top=False).emit()
+    StoreRegU64("strqbfpxr_uop", "MicroStrQBFpXRegUop",
+                16, flavor="fp", top=False).emit()
+    StoreImmU64("strqtfpxi_uop", "MicroStrQTFpXImmUop",
+                16, flavor="fp", top=True).emit()
+    StoreRegU64("strqtfpxr_uop", "MicroStrQTFpXRegUop",
+                16, flavor="fp", top=True).emit()
+    StoreImmDU64("strdxi_uop", "MicroStrDXImmUop", 4).emit()
+    StoreImmDU64("strdfpxi_uop", "MicroStrDFpXImmUop", 4, flavor="fp").emit()
+
+}};
diff --git a/src/arch/arm/isa/insts/swap.isa b/src/arch/arm/isa/insts/swap.isa
index b42a1c4..f2ceed2 100644
--- a/src/arch/arm/isa/insts/swap.isa
+++ b/src/arch/arm/isa/insts/swap.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -73,10 +73,7 @@
 
     swpPreAccCode = '''
         if (!((SCTLR)Sctlr).sw) {
-            if (FullSystem)
-                return new UndefinedInstruction;
-            else
-                return new UndefinedInstruction(false, mnemonic);
+            return new UndefinedInstruction(machInst, false, mnemonic);
         }
     '''
 
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index 64deef0..7a12133 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -1,5 +1,5 @@
 // -*- mode:c++ -*-
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -80,6 +80,31 @@
             xc->%(func)s(this, %(op_idx)s, %(final_val)s);
         }
     '''
+    aarch64Read = '''
+        ((xc->%(func)s(this, %(op_idx)s)) & mask(intWidth))
+    '''
+    aarch64Write = '''
+        xc->%(func)s(this, %(op_idx)s, (%(final_val)s) & mask(intWidth))
+    '''
+    aarchX64Read = '''
+        ((xc->%(func)s(this, %(op_idx)s)) & mask(aarch64 ? 64 : 32))
+    '''
+    aarchX64Write = '''
+        xc->%(func)s(this, %(op_idx)s, (%(final_val)s) & mask(aarch64 ? 64 : 32))
+    '''
+    aarchW64Read = '''
+        ((xc->%(func)s(this, %(op_idx)s)) & mask(32))
+    '''
+    aarchW64Write = '''
+        xc->%(func)s(this, %(op_idx)s, (%(final_val)s) & mask(32))
+    '''
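+    # The X/W accessors above mask values to the operand width, so a
+    # 32-bit (W) write zero-extends into the 64-bit backing register.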
+    cntrlNsBankedWrite = '''
+        xc->setMiscReg(flattenMiscRegNsBanked(dest, xc->tcBase()), %(final_val)s)
+    '''
+
+    cntrlNsBankedRead = '''
+        xc->readMiscReg(flattenMiscRegNsBanked(op1, xc->tcBase()))
+    '''
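+    # Accesses to NS-banked control registers are flattened to the secure
+    # or non-secure instance at run time via flattenMiscRegNsBanked.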
 
     #PCState operands need to have a sorting index (the number at the end)
     #less than all the integer registers which might update the PC. That way
@@ -99,6 +124,18 @@
         return ('IntReg', 'uw', idx, 'IsInteger', srtNormal,
                 maybePCRead, maybePCWrite)
 
+    def intReg64(idx):
+        return ('IntReg', 'ud', idx, 'IsInteger', srtNormal,
+                aarch64Read, aarch64Write)
+
+    def intRegX64(idx, id = srtNormal):
+        return ('IntReg', 'ud', idx, 'IsInteger', id,
+                aarchX64Read, aarchX64Write)
+
+    def intRegW64(idx, id = srtNormal):
+        return ('IntReg', 'ud', idx, 'IsInteger', id,
+                aarchW64Read, aarchW64Write)
+
     def intRegNPC(idx):
         return ('IntReg', 'uw', idx, 'IsInteger', srtNormal)
 
@@ -120,26 +157,49 @@
     def cntrlReg(idx, id = srtNormal, type = 'uw'):
         return ('ControlReg', type, idx, None, id)
 
+    def cntrlNsBankedReg(idx, id = srtNormal, type = 'uw'):
+        return ('ControlReg', type, idx, (None, None, 'IsControl'), id,
+                cntrlNsBankedRead, cntrlNsBankedWrite)
+
+    def cntrlNsBankedReg64(idx, id = srtNormal, type = 'ud'):
+        return ('ControlReg', type, idx, (None, None, 'IsControl'), id,
+                cntrlNsBankedRead, cntrlNsBankedWrite)
+
     def cntrlRegNC(idx, id = srtNormal, type = 'uw'):
         return ('ControlReg', type, idx, None, id)
 
     def pcStateReg(idx, id):
-        return ('PCState', 'uw', idx, (None, None, 'IsControl'), id)
+        return ('PCState', 'ud', idx, (None, None, 'IsControl'), id)
 }};
 
 def operands {{
     #Abstracted integer reg operands
     'Dest': intReg('dest'),
+    'Dest64': intReg64('dest'),
+    'XDest': intRegX64('dest'),
+    'WDest': intRegW64('dest'),
     'IWDest': intRegIWPC('dest'),
     'AIWDest': intRegAIWPC('dest'),
     'Dest2': intReg('dest2'),
+    'XDest2': intRegX64('dest2'),
+    'FDest2': floatReg('dest2'),
     'Result': intReg('result'),
+    'XResult': intRegX64('result'),
+    'XBase': intRegX64('base', id = srtBase),
     'Base': intRegAPC('base', id = srtBase),
+    'XOffset': intRegX64('offset'),
     'Index': intReg('index'),
     'Shift': intReg('shift'),
     'Op1': intReg('op1'),
     'Op2': intReg('op2'),
     'Op3': intReg('op3'),
+    'Op164': intReg64('op1'),
+    'Op264': intReg64('op2'),
+    'Op364': intReg64('op3'),
+    'XOp1': intRegX64('op1'),
+    'XOp2': intRegX64('op2'),
+    'XOp3': intRegX64('op3'),
+    'WOp1': intRegW64('op1'),
+    'WOp2': intRegW64('op2'),
+    'WOp3': intRegW64('op3'),
     'Reg0': intReg('reg0'),
     'Reg1': intReg('reg1'),
     'Reg2': intReg('reg2'),
@@ -147,13 +207,19 @@
 
     #Fixed index integer reg operands
     'SpMode': intRegNPC('intRegInMode((OperatingMode)regMode, INTREG_SP)'),
+    'DecodedBankedIntReg': intRegNPC('decodeMrsMsrBankedIntRegIndex(byteMask, r)'),
     'LR': intRegNPC('INTREG_LR'),
+    'XLR': intRegX64('INTREG_X30'),
     'R7': intRegNPC('7'),
     # First four arguments are passed in registers
     'R0': intRegNPC('0'),
     'R1': intRegNPC('1'),
     'R2': intRegNPC('2'),
     'R3': intRegNPC('3'),
+    'X0': intRegX64('0'),
+    'X1': intRegX64('1'),
+    'X2': intRegX64('2'),
+    'X3': intRegX64('3'),
 
     #Pseudo integer condition code registers
     'CondCodesNZ': intRegCC('INTREG_CONDCODES_NZ'),
@@ -230,9 +296,95 @@
     'FpOp2P2': floatReg('(op2 + 2)'),
     'FpOp2P3': floatReg('(op2 + 3)'),
 
+    # Create AArch64 unpacked view of the FP registers
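+    # Each 128-bit vector register is backed by four consecutive 32-bit
+    # float registers, so chunk P<n> of register <r> is at physical index
+    # (r * 4) + n.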
+    'AA64FpOp1P0':   floatReg('((op1 * 4) + 0)'),
+    'AA64FpOp1P1':   floatReg('((op1 * 4) + 1)'),
+    'AA64FpOp1P2':   floatReg('((op1 * 4) + 2)'),
+    'AA64FpOp1P3':   floatReg('((op1 * 4) + 3)'),
+    'AA64FpOp2P0':   floatReg('((op2 * 4) + 0)'),
+    'AA64FpOp2P1':   floatReg('((op2 * 4) + 1)'),
+    'AA64FpOp2P2':   floatReg('((op2 * 4) + 2)'),
+    'AA64FpOp2P3':   floatReg('((op2 * 4) + 3)'),
+    'AA64FpOp3P0':   floatReg('((op3 * 4) + 0)'),
+    'AA64FpOp3P1':   floatReg('((op3 * 4) + 1)'),
+    'AA64FpOp3P2':   floatReg('((op3 * 4) + 2)'),
+    'AA64FpOp3P3':   floatReg('((op3 * 4) + 3)'),
+    'AA64FpDestP0':  floatReg('((dest * 4) + 0)'),
+    'AA64FpDestP1':  floatReg('((dest * 4) + 1)'),
+    'AA64FpDestP2':  floatReg('((dest * 4) + 2)'),
+    'AA64FpDestP3':  floatReg('((dest * 4) + 3)'),
+    'AA64FpDest2P0': floatReg('((dest2 * 4) + 0)'),
+    'AA64FpDest2P1': floatReg('((dest2 * 4) + 1)'),
+    'AA64FpDest2P2': floatReg('((dest2 * 4) + 2)'),
+    'AA64FpDest2P3': floatReg('((dest2 * 4) + 3)'),
+
+    'AA64FpOp1P0V0':   floatReg('((((op1+0)) * 4) + 0)'),
+    'AA64FpOp1P1V0':   floatReg('((((op1+0)) * 4) + 1)'),
+    'AA64FpOp1P2V0':   floatReg('((((op1+0)) * 4) + 2)'),
+    'AA64FpOp1P3V0':   floatReg('((((op1+0)) * 4) + 3)'),
+
+    'AA64FpOp1P0V1':   floatReg('((((op1+1)) * 4) + 0)'),
+    'AA64FpOp1P1V1':   floatReg('((((op1+1)) * 4) + 1)'),
+    'AA64FpOp1P2V1':   floatReg('((((op1+1)) * 4) + 2)'),
+    'AA64FpOp1P3V1':   floatReg('((((op1+1)) * 4) + 3)'),
+
+    'AA64FpOp1P0V2':   floatReg('((((op1+2)) * 4) + 0)'),
+    'AA64FpOp1P1V2':   floatReg('((((op1+2)) * 4) + 1)'),
+    'AA64FpOp1P2V2':   floatReg('((((op1+2)) * 4) + 2)'),
+    'AA64FpOp1P3V2':   floatReg('((((op1+2)) * 4) + 3)'),
+
+    'AA64FpOp1P0V3':   floatReg('((((op1+3)) * 4) + 0)'),
+    'AA64FpOp1P1V3':   floatReg('((((op1+3)) * 4) + 1)'),
+    'AA64FpOp1P2V3':   floatReg('((((op1+3)) * 4) + 2)'),
+    'AA64FpOp1P3V3':   floatReg('((((op1+3)) * 4) + 3)'),
+
+    'AA64FpOp1P0V0S':   floatReg('((((op1+0)%32) * 4) + 0)'),
+    'AA64FpOp1P1V0S':   floatReg('((((op1+0)%32) * 4) + 1)'),
+    'AA64FpOp1P2V0S':   floatReg('((((op1+0)%32) * 4) + 2)'),
+    'AA64FpOp1P3V0S':   floatReg('((((op1+0)%32) * 4) + 3)'),
+
+    'AA64FpOp1P0V1S':   floatReg('((((op1+1)%32) * 4) + 0)'),
+    'AA64FpOp1P1V1S':   floatReg('((((op1+1)%32) * 4) + 1)'),
+    'AA64FpOp1P2V1S':   floatReg('((((op1+1)%32) * 4) + 2)'),
+    'AA64FpOp1P3V1S':   floatReg('((((op1+1)%32) * 4) + 3)'),
+
+    'AA64FpOp1P0V2S':   floatReg('((((op1+2)%32) * 4) + 0)'),
+    'AA64FpOp1P1V2S':   floatReg('((((op1+2)%32) * 4) + 1)'),
+    'AA64FpOp1P2V2S':   floatReg('((((op1+2)%32) * 4) + 2)'),
+    'AA64FpOp1P3V2S':   floatReg('((((op1+2)%32) * 4) + 3)'),
+
+    'AA64FpOp1P0V3S':   floatReg('((((op1+3)%32) * 4) + 0)'),
+    'AA64FpOp1P1V3S':   floatReg('((((op1+3)%32) * 4) + 1)'),
+    'AA64FpOp1P2V3S':   floatReg('((((op1+3)%32) * 4) + 2)'),
+    'AA64FpOp1P3V3S':   floatReg('((((op1+3)%32) * 4) + 3)'),
+
+    'AA64FpDestP0V0':   floatReg('((((dest+0)) * 4) + 0)'),
+    'AA64FpDestP1V0':   floatReg('((((dest+0)) * 4) + 1)'),
+    'AA64FpDestP2V0':   floatReg('((((dest+0)) * 4) + 2)'),
+    'AA64FpDestP3V0':   floatReg('((((dest+0)) * 4) + 3)'),
+
+    'AA64FpDestP0V1':   floatReg('((((dest+1)) * 4) + 0)'),
+    'AA64FpDestP1V1':   floatReg('((((dest+1)) * 4) + 1)'),
+    'AA64FpDestP2V1':   floatReg('((((dest+1)) * 4) + 2)'),
+    'AA64FpDestP3V1':   floatReg('((((dest+1)) * 4) + 3)'),
+
+    'AA64FpDestP0V0L':   floatReg('((((dest+0)%32) * 4) + 0)'),
+    'AA64FpDestP1V0L':   floatReg('((((dest+0)%32) * 4) + 1)'),
+    'AA64FpDestP2V0L':   floatReg('((((dest+0)%32) * 4) + 2)'),
+    'AA64FpDestP3V0L':   floatReg('((((dest+0)%32) * 4) + 3)'),
+
+    'AA64FpDestP0V1L':   floatReg('((((dest+1)%32) * 4) + 0)'),
+    'AA64FpDestP1V1L':   floatReg('((((dest+1)%32) * 4) + 1)'),
+    'AA64FpDestP2V1L':   floatReg('((((dest+1)%32) * 4) + 2)'),
+    'AA64FpDestP3V1L':   floatReg('((((dest+1)%32) * 4) + 3)'),
+
     #Abstracted control reg operands
     'MiscDest': cntrlReg('dest'),
     'MiscOp1': cntrlReg('op1'),
+    'MiscNsBankedDest': cntrlNsBankedReg('dest'),
+    'MiscNsBankedOp1': cntrlNsBankedReg('op1'),
+    'MiscNsBankedDest64': cntrlNsBankedReg64('dest'),
+    'MiscNsBankedOp164': cntrlNsBankedReg64('op1'),
 
     #Fixed index control regs
     'Cpsr': cntrlReg('MISCREG_CPSR', srtCpsr),
@@ -244,22 +396,41 @@
     'FpscrQc': cntrlRegNC('MISCREG_FPSCR_QC'),
     'FpscrExc': cntrlRegNC('MISCREG_FPSCR_EXC'),
     'Cpacr': cntrlReg('MISCREG_CPACR'),
+    'Cpacr64': cntrlReg('MISCREG_CPACR_EL1'),
     'Fpexc': cntrlRegNC('MISCREG_FPEXC'),
+    'Nsacr': cntrlReg('MISCREG_NSACR'),
+    'ElrHyp': cntrlRegNC('MISCREG_ELR_HYP'),
+    'Hcr': cntrlReg('MISCREG_HCR'),
+    'Hcr64': cntrlReg('MISCREG_HCR_EL2'),
+    'Hdcr': cntrlReg('MISCREG_HDCR'),
+    'Hcptr': cntrlReg('MISCREG_HCPTR'),
+    'CptrEl264': cntrlReg('MISCREG_CPTR_EL2'),
+    'CptrEl364': cntrlReg('MISCREG_CPTR_EL3'),
+    'Hstr': cntrlReg('MISCREG_HSTR'),
+    'Scr': cntrlReg('MISCREG_SCR'),
+    'Scr64': cntrlReg('MISCREG_SCR_EL3'),
     'Sctlr': cntrlRegNC('MISCREG_SCTLR'),
     'SevMailbox': cntrlRegNC('MISCREG_SEV_MAILBOX'),
     'LLSCLock': cntrlRegNC('MISCREG_LOCKFLAG'),
+    'Dczid' : cntrlRegNC('MISCREG_DCZID_EL0'),
 
     #Register fields for microops
     'URa' : intReg('ura'),
+    'XURa' : intRegX64('ura'),
+    'WURa' : intRegW64('ura'),
     'IWRa' : intRegIWPC('ura'),
     'Fa' : floatReg('ura'),
+    'FaP1' : floatReg('ura + 1'),
     'URb' : intReg('urb'),
+    'XURb' : intRegX64('urb'),
     'URc' : intReg('urc'),
+    'XURc' : intRegX64('urc'),
 
     #Memory Operand
     'Mem': ('Mem', 'uw', None, ('IsMemRef', 'IsLoad', 'IsStore'), srtNormal),
 
     #PCState fields
+    'RawPC': pcStateReg('pc', srtPC),
     'PC': pcStateReg('instPC', srtPC),
     'NPC': pcStateReg('instNPC', srtPC),
     'pNPC': pcStateReg('instNPC', srtEPC),
diff --git a/src/arch/arm/isa/templates/basic.isa b/src/arch/arm/isa/templates/basic.isa
index b3878b8..de4506e 100644
--- a/src/arch/arm/isa/templates/basic.isa
+++ b/src/arch/arm/isa/templates/basic.isa
@@ -1,5 +1,17 @@
 // -*- mode:c++ -*-
 
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
 // Copyright (c) 2007-2008 The Florida State University
 // All rights reserved.
 //
@@ -60,6 +72,13 @@
         }
 }};
 
+def template BasicConstructor64 {{
+        inline %(class_name)s::%(class_name)s(ExtMachInst machInst)
+            : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+        {
+            %(constructor)s;
+        }
+}};
+
 
 // Basic instruction class execute method template.
 def template BasicExecute {{
diff --git a/src/arch/arm/isa/templates/branch64.isa b/src/arch/arm/isa/templates/branch64.isa
new file mode 100644
index 0000000..84b3e6a
--- /dev/null
+++ b/src/arch/arm/isa/templates/branch64.isa
@@ -0,0 +1,141 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
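+// Declare/Constructor template pairs for the AArch64 branch formats. The
+// Declare templates emit the class definition and the Constructor
+// templates the matching out-of-line constructor; the %(...)s fields are
+// substituted from each instruction's InstObjParams.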
+def template BranchImm64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, int64_t _imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template BranchImm64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          int64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _imm)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template BranchImmCond64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, int64_t _imm,
+                       ConditionCode _condCode);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template BranchImmCond64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          int64_t _imm,
+                                          ConditionCode _condCode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _imm, _condCode)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template BranchReg64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template BranchReg64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _op1)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template BranchImmReg64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst,
+                       int64_t _imm, IntRegIndex _op1);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template BranchImmReg64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          int64_t _imm,
+                                          IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _imm, _op1)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template BranchImmImmReg64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, int64_t _imm1, int64_t _imm2,
+                       IntRegIndex _op1);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template BranchImmImmReg64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          int64_t _imm1, int64_t _imm2,
+                                          IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _imm1, _imm2, _op1)
+    {
+        %(constructor)s;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/data64.isa b/src/arch/arm/isa/templates/data64.isa
new file mode 100644
index 0000000..b6f7ce8
--- /dev/null
+++ b/src/arch/arm/isa/templates/data64.isa
@@ -0,0 +1,279 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
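+// Templates for the AArch64 data-processing formats: immediate, shifted
+// register, extended register, one to three source registers, conditional
+// compare (immediate and register), and conditional select.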
+def template DataXImmDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                IntRegIndex _op1, uint64_t _imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXImmConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataXSRegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                IntRegIndex _op1, IntRegIndex _op2,
+                int32_t _shiftAmt, ArmShiftType _shiftType);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXSRegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          int32_t _shiftAmt,
+                                          ArmShiftType _shiftType)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _shiftAmt, _shiftType)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataXERegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                IntRegIndex _op1, IntRegIndex _op2,
+                ArmExtendType _extendType, int32_t _shiftAmt);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXERegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          ArmExtendType _extendType,
+                                          int32_t _shiftAmt)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _extendType, _shiftAmt)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataX1RegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _op1);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataX1RegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest, _op1)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataX2RegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _op1, IntRegIndex _op2);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataX2RegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataX2RegImmDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _op1, IntRegIndex _op2, uint64_t _imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataX2RegImmConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _imm)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataX3RegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _op1, IntRegIndex _op2, IntRegIndex _op3);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataX3RegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          IntRegIndex _op3)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _op3)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataXCondCompImmDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1,
+                       uint64_t _imm, ConditionCode _condCode, uint8_t _defCc);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXCondCompImmConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _op1,
+                                          uint64_t _imm,
+                                          ConditionCode _condCode,
+                                          uint8_t _defCc)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _op1, _imm, _condCode, _defCc)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataXCondCompRegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1,
+                       IntRegIndex _op2, ConditionCode _condCode,
+                       uint8_t _defCc);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXCondCompRegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          ConditionCode _condCode,
+                                          uint8_t _defCc)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _op1, _op2, _condCode, _defCc)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template DataXCondSelDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _op1, IntRegIndex _op2,
+                       ConditionCode _condCode);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template DataXCondSelConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          ConditionCode _condCode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _condCode)
+    {
+        %(constructor)s;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/macromem.isa b/src/arch/arm/isa/templates/macromem.isa
index 195204a..4650906 100644
--- a/src/arch/arm/isa/templates/macromem.isa
+++ b/src/arch/arm/isa/templates/macromem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -338,6 +338,18 @@
     }
 }};
 
+def template MicroIntImmXConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                   RegIndex _ura,
+                                   RegIndex _urb,
+                                   int32_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _ura, _urb, _imm)
+    {
+        %(constructor)s;
+    }
+}};
+
 def template MicroIntRegDeclare {{
     class %(class_name)s : public %(base_class)s
     {
@@ -349,6 +361,28 @@
     };
 }};
 
+def template MicroIntXERegConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                   RegIndex _ura, RegIndex _urb, RegIndex _urc,
+                                   ArmExtendType _type, uint32_t _shiftAmt)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _ura, _urb, _urc, _type, _shiftAmt)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template MicroIntXERegDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst,
+                       RegIndex _ura, RegIndex _urb, RegIndex _urc,
+                       ArmExtendType _type, uint32_t _shiftAmt);
+        %(BasicExecDeclare)s
+    };
+}};
+
 def template MicroIntRegConstructor {{
     %(class_name)s::%(class_name)s(ExtMachInst machInst,
                                    RegIndex _ura, RegIndex _urb, RegIndex _urc,
@@ -402,6 +436,96 @@
 
 }};
 
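+// Templates for the "big" (128-bit capable) FP load/store macroops with
+// immediate, register, and literal addressing; these are always broken
+// into microops, so their top-level execute methods panic
+// (BasicExecPanic) rather than run directly.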
+def template BigFpMemImmDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(const char *mnemonic, ExtMachInst machInst,
+                   bool load, IntRegIndex dest, IntRegIndex base, int64_t imm);
+    %(BasicExecPanic)s
+};
+}};
+
+def template BigFpMemImmConstructor {{
+%(class_name)s::%(class_name)s(const char *mnemonic, ExtMachInst machInst,
+        bool load, IntRegIndex dest, IntRegIndex base, int64_t imm)
+    : %(base_class)s(mnemonic, machInst, %(op_class)s, load, dest, base, imm)
+{
+    %(constructor)s;
+}
+}};
+
+def template BigFpMemRegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(const char *mnemonic, ExtMachInst machInst,
+                   bool load, IntRegIndex dest, IntRegIndex base,
+                   IntRegIndex offset, ArmExtendType type, int64_t imm);
+    %(BasicExecPanic)s
+};
+}};
+
+def template BigFpMemRegConstructor {{
+%(class_name)s::%(class_name)s(const char *mnemonic, ExtMachInst machInst,
+        bool load, IntRegIndex dest, IntRegIndex base,
+        IntRegIndex offset, ArmExtendType type, int64_t imm)
+    : %(base_class)s(mnemonic, machInst, %(op_class)s, load, dest, base,
+                     offset, type, imm)
+{
+    %(constructor)s;
+}
+}};
+
+def template BigFpMemLitDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(const char *mnemonic, ExtMachInst machInst,
+                   IntRegIndex dest, int64_t imm);
+    %(BasicExecPanic)s
+};
+}};
+
+def template BigFpMemLitConstructor {{
+%(class_name)s::%(class_name)s(const char *mnemonic, ExtMachInst machInst,
+        IntRegIndex dest, int64_t imm)
+    : %(base_class)s(mnemonic, machInst, %(op_class)s, dest, imm)
+{
+    %(constructor)s;
+}
+}};
+
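+// Template for the load/store pair macroop; the boolean arguments select
+// FP vs. integer, load vs. store, no-allocate hint, sign extension,
+// exclusive, and acquire/release behavior.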
+def template PairMemDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+    public:
+        // Constructor
+        %(class_name)s(const char *mnemonic, ExtMachInst machInst,
+                uint32_t size, bool fp, bool load, bool noAlloc, bool signExt,
+                bool exclusive, bool acrel, uint32_t imm,
+                AddrMode mode, IntRegIndex rn, IntRegIndex rt,
+                IntRegIndex rt2);
+        %(BasicExecPanic)s
+};
+}};
+
+def template PairMemConstructor {{
+%(class_name)s::%(class_name)s(const char *mnemonic, ExtMachInst machInst,
+        uint32_t size, bool fp, bool load, bool noAlloc, bool signExt,
+        bool exclusive, bool acrel, uint32_t imm, AddrMode mode,
+        IntRegIndex rn, IntRegIndex rt, IntRegIndex rt2)
+    : %(base_class)s(mnemonic, machInst, %(op_class)s, size,
+                     fp, load, noAlloc, signExt, exclusive, acrel,
+                     imm, mode, rn, rt, rt2)
+{
+    %(constructor)s;
+}
+}};
+
 def template VMemMultDeclare {{
 class %(class_name)s : public %(base_class)s
 {
diff --git a/src/arch/arm/isa/templates/mem.isa b/src/arch/arm/isa/templates/mem.isa
index 871378f..7682c27 100644
--- a/src/arch/arm/isa/templates/mem.isa
+++ b/src/arch/arm/isa/templates/mem.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010, 2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -697,6 +697,11 @@
         %(InitiateAccDeclare)s
 
         %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
     };
 }};
 
@@ -763,6 +768,11 @@
         %(InitiateAccDeclare)s
 
         %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
     };
 }};
 
@@ -808,6 +818,11 @@
         %(InitiateAccDeclare)s
 
         %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
     };
 }};
 
@@ -828,6 +843,11 @@
         %(InitiateAccDeclare)s
 
         %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
     };
 }};
 
diff --git a/src/arch/arm/isa/templates/mem64.isa b/src/arch/arm/isa/templates/mem64.isa
new file mode 100644
index 0000000..87dcba9
--- /dev/null
+++ b/src/arch/arm/isa/templates/mem64.isa
@@ -0,0 +1,686 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+let {{
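+    # When the base register is the stack pointer and SP alignment
+    # checking is enabled, the v8 architecture requires SP to be 16-byte
+    # aligned, hence the test of bits 3:0 of XBase below.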
+    SPAlignmentCheckCode = '''
+        if (baseIsSP && bits(XBase, 3, 0) &&
+            SPAlignmentCheckEnabled(xc->tcBase())) {
+            return new SPAlignmentFault();
+        }
+    '''
+}};
+
+def template Load64Execute {{
+    Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
+                                  Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            fault = readMemAtomic(xc, traceData, EA, Mem, memAccessFlags);
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template Store64Execute {{
+    Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
+                                  Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = writeMemAtomic(xc, traceData, Mem, EA,
+                                   memAccessFlags, NULL);
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template Store64InitiateAcc {{
+    Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = writeMemTiming(xc, traceData, Mem, EA, memAccessFlags,
+                                   NULL);
+        }
+
+        return fault;
+    }
+}};
+
+def template StoreEx64Execute {{
+    Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
+                                  Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        uint64_t writeResult = 0;
+        if (fault == NoFault) {
+            fault = writeMemAtomic(xc, traceData, Mem, EA, memAccessFlags,
+                                   &writeResult);
+        }
+
+        if (fault == NoFault) {
+            %(postacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template StoreEx64InitiateAcc {{
+    Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = writeMemTiming(xc, traceData, Mem, EA, memAccessFlags,
+                                   NULL);
+        }
+
+        return fault;
+    }
+}};
+
+def template Load64InitiateAcc {{
+    Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            fault = readMemTiming(xc, traceData, EA, Mem, memAccessFlags);
+        }
+
+        return fault;
+    }
+}};
+
+def template Load64CompleteAcc {{
+    Fault %(class_name)s::completeAcc(PacketPtr pkt,
+                                      %(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        // ARM instructions will not have a pkt if the predicate is false
+        getMem(pkt, Mem, traceData);
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template Store64CompleteAcc {{
+    Fault %(class_name)s::completeAcc(PacketPtr pkt,
+                                      %(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+}};
+
+def template StoreEx64CompleteAcc {{
+    Fault %(class_name)s::completeAcc(PacketPtr pkt,
+                                      %(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        uint64_t writeResult = pkt->req->getExtraData();
+        %(postacc_code)s;
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
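+// The DCStore64 templates model data-cache maintenance operations that
+// behave like stores (e.g. DC ZVA); note the NULL data pointer and
+// explicit op_size in the writeMem calls below.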
+def template DCStore64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _base,
+                       IntRegIndex _dest, uint64_t _imm);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template DCStore64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst, IntRegIndex _base,
+                                   IntRegIndex _dest, uint64_t _imm)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 (IntRegIndex)_base, _dest, _imm)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+    }
+}};
+
+def template DCStore64Execute {{
+    Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
+                                  Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template DCStore64InitiateAcc {{
+    Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc,
+                                      Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = xc->writeMem(NULL, op_size, EA, memAccessFlags, NULL);
+        }
+
+        return fault;
+    }
+}};
+
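+// Declare templates for the AArch64 load/store addressing forms
+// (immediate, register, literal, exclusive). The U variants additionally
+// carry cache-allocation, exclusive and acquire/release hints (noAlloc,
+// exclusive, acrel).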
+def template LoadStoreImm64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _base, int64_t _imm);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreImmU64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _base, int64_t _imm,
+                bool noAlloc = false, bool exclusive = false,
+                bool acrel = false);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreImmDU64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _dest2, IntRegIndex _base,
+                int64_t _imm = 0, bool noAlloc = false, bool exclusive = false,
+                bool acrel = false);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template StoreImmDEx64Declare {{
+    /**
+     * Static instruction class for "%(mnemonic)s".
+     */
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _result, IntRegIndex _dest, IntRegIndex _dest2,
+                IntRegIndex _base, int64_t _imm = 0);
+
+        %(BasicExecDeclare)s
+
+        %(InitiateAccDeclare)s
+
+        %(CompleteAccDeclare)s
+    };
+}};
+
+def template LoadStoreReg64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _base, IntRegIndex _offset,
+                ArmExtendType _type, uint32_t _shiftAmt);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreRegU64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst,
+                IntRegIndex _dest, IntRegIndex _base, IntRegIndex _offset,
+                ArmExtendType _type, uint32_t _shiftAmt,
+                bool noAlloc = false, bool exclusive = false,
+                bool acrel = false);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreRaw64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _base);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreEx64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       IntRegIndex _base, IntRegIndex _result);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreLit64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest, int64_t _imm);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
+def template LoadStoreLitU64Declare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest, int64_t _imm,
+                bool noAlloc = false, bool exclusive = false,
+                bool acrel = false);
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+
+        virtual void
+        annotateFault(ArmFault *fault) {
+            %(fa_code)s
+        }
+    };
+}};
+
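+// When use_uops is set this is a writeback form: it is split into an
+// access micro-op (marked delayed-commit) and a base-register writeback
+// micro-op (marked as the last micro-op).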
+def template LoadStoreImm64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, int64_t _imm)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 (IntRegIndex)_dest, (IntRegIndex)_base, _imm)
+    {
+        %(constructor)s;
+#if %(use_uops)d
+        assert(numMicroops >= 2);
+        uops = new StaticInstPtr[numMicroops];
+        uops[0] = new %(acc_name)s(machInst, _dest, _base, _imm);
+        uops[0]->setDelayedCommit();
+        uops[1] = new %(wb_decl)s;
+        uops[1]->setLastMicroop();
+#endif
+    }
+}};
+
+def template LoadStoreImmU64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, int64_t _imm,
+            bool noAlloc, bool exclusive, bool acrel)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 _dest, _base, _imm)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+        setExcAcRel(exclusive, acrel);
+    }
+}};
+
+def template LoadStoreImmDU64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _dest2, IntRegIndex _base,
+            int64_t _imm, bool noAlloc, bool exclusive, bool acrel)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 _dest, _dest2, _base, _imm)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+        setExcAcRel(exclusive, acrel);
+    }
+}};
+
+def template StoreImmDEx64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _result, IntRegIndex _dest, IntRegIndex _dest2,
+            IntRegIndex _base, int64_t _imm)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 _result, _dest, _dest2, _base, _imm)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+    }
+}};
+
+def template LoadStoreReg64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _offset,
+            ArmExtendType _type, uint32_t _shiftAmt)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 _dest, _base, _offset, _type, _shiftAmt)
+    {
+        %(constructor)s;
+#if %(use_uops)d
+        assert(numMicroops >= 2);
+        uops = new StaticInstPtr[numMicroops];
+        uops[0] = new %(acc_name)s(machInst, _dest, _base, _offset,
+                                   _type, _shiftAmt);
+        uops[0]->setDelayedCommit();
+        uops[1] = new %(wb_decl)s;
+        uops[1]->setLastMicroop();
+#endif
+    }
+}};
+
+def template LoadStoreRegU64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _offset,
+            ArmExtendType _type, uint32_t _shiftAmt,
+            bool noAlloc, bool exclusive, bool acrel)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 _dest, _base, _offset, _type, _shiftAmt)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+        setExcAcRel(exclusive, acrel);
+    }
+}};
+
+def template LoadStoreRaw64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest, _base)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template LoadStoreEx64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, IntRegIndex _base, IntRegIndex _result)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                          _dest, _base, _result)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template LoadStoreLit64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, int64_t _imm)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 (IntRegIndex)_dest, _imm)
+    {
+        %(constructor)s;
+#if %(use_uops)d
+        assert(numMicroops >= 2);
+        uops = new StaticInstPtr[numMicroops];
+        uops[0] = new %(acc_name)s(machInst, _dest, _imm);
+        uops[0]->setDelayedCommit();
+        uops[1] = new %(wb_decl)s;
+        uops[1]->setLastMicroop();
+#endif
+    }
+}};
+
+def template LoadStoreLitU64Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, int64_t _imm,
+            bool noAlloc, bool exclusive, bool acrel)
+         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                 (IntRegIndex)_dest, _imm)
+    {
+        %(constructor)s;
+        assert(!%(use_uops)d);
+        setExcAcRel(exclusive, acrel);
+    }
+}};
diff --git a/src/arch/arm/isa/templates/misc.isa b/src/arch/arm/isa/templates/misc.isa
index 212897a..36db5b6 100644
--- a/src/arch/arm/isa/templates/misc.isa
+++ b/src/arch/arm/isa/templates/misc.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -62,6 +62,69 @@
     }
 }};
 
+def template MrsBankedRegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    uint8_t byteMask;
+    bool    r;
+
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _dest,
+                       uint8_t _sysM, bool _r);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template MrsBankedRegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          uint8_t     _sysM,
+                                          bool        _r)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest),
+          byteMask(_sysM), r(_r)
+    {
+        %(constructor)s;
+        // A predicated instruction must also read its destinations so the
+        // old values survive when the predicate is false.
+        if (!(condCode == COND_AL || condCode == COND_UC)) {
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+            }
+        }
+    }
+}};
+
+def template MsrBankedRegDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    bool r;
+
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1,
+                       uint8_t _sysM, bool _r);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template MsrBankedRegConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _op1,
+                                          uint8_t     _sysM,
+                                          bool        _r)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _op1, _sysM),
+          r(_r)
+    {
+        %(constructor)s;
+        if (!(condCode == COND_AL || condCode == COND_UC)) {
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+            }
+        }
+    }
+}};
+
 def template MsrRegDeclare {{
 class %(class_name)s : public %(base_class)s
 {
@@ -114,6 +177,66 @@
     }
 }};
 
+def template MrrcOpDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1,
+                       IntRegIndex _dest, IntRegIndex _dest2, uint32_t imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template MrrcOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex op1,
+                                          IntRegIndex dest,
+                                          IntRegIndex dest2,
+                                          uint32_t    imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, op1, dest,
+                         dest2, imm)
+    {
+        %(constructor)s;
+        if (!(condCode == COND_AL || condCode == COND_UC)) {
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+            }
+        }
+    }
+}};
+
+def template McrrOpDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, IntRegIndex _op1, IntRegIndex _op2,
+                       IntRegIndex _dest, uint32_t imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template McrrOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex op1,
+                                          IntRegIndex op2,
+                                          IntRegIndex dest,
+                                          uint32_t    imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, op1, op2,
+                         dest, imm)
+    {
+        %(constructor)s;
+        if (!(condCode == COND_AL || condCode == COND_UC)) {
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+            }
+        }
+    }
+}};
+
 def template ImmOpDeclare {{
 class %(class_name)s : public %(base_class)s
 {
@@ -310,6 +433,35 @@
     }
 }};
 
+def template RegImmImmOpDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst,
+                       IntRegIndex _dest, uint64_t _imm1, uint64_t _imm2);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template RegImmImmOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          uint64_t _imm1,
+                                          uint64_t _imm2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _imm1, _imm2)
+    {
+        %(constructor)s;
+        if (!(condCode == COND_AL || condCode == COND_UC)) {
+            for (int x = 0; x < _numDestRegs; x++) {
+                _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+            }
+        }
+    }
+}};
+
 def template RegRegImmImmOpDeclare {{
 class %(class_name)s : public %(base_class)s
 {
diff --git a/src/arch/arm/isa/templates/misc64.isa b/src/arch/arm/isa/templates/misc64.isa
new file mode 100644
index 0000000..09d3d44
--- /dev/null
+++ b/src/arch/arm/isa/templates/misc64.isa
@@ -0,0 +1,91 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2011 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+def template RegRegImmImmOp64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst,
+                       IntRegIndex _dest, IntRegIndex _op1,
+                       uint64_t _imm1, uint64_t _imm2);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template RegRegImmImmOp64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          uint64_t _imm1,
+                                          uint64_t _imm2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm1, _imm2)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template RegRegRegImmOp64Declare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst,
+                       IntRegIndex _dest, IntRegIndex _op1,
+                       IntRegIndex _op2, uint64_t _imm);
+        %(BasicExecDeclare)s
+};
+}};
+
+def template RegRegRegImmOp64Constructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _imm)
+    {
+        %(constructor)s;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/neon.isa b/src/arch/arm/isa/templates/neon.isa
index 573d245..ffa6b53 100644
--- a/src/arch/arm/isa/templates/neon.isa
+++ b/src/arch/arm/isa/templates/neon.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2012 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -39,8 +39,26 @@
 
 let {{
     simdEnabledCheckCode = '''
-        if (!neonEnabled(Cpacr, Cpsr, Fpexc))
-            return disabledFault();
+    {
+        uint32_t issEnCheck;
+        bool trapEnCheck;
+        uint32_t seq;
+        if (!vfpNeonEnabled(seq, Hcptr, Nsacr, Cpacr, Cpsr, issEnCheck,
+                            trapEnCheck, xc->tcBase(), Fpexc, true)) {
+            return disabledFault();
+        }
+        if (trapEnCheck) {
+            CPSR cpsrEnCheck = Cpsr;
+            if (cpsrEnCheck.mode == MODE_HYP) {
+                return new UndefinedInstruction(machInst, issEnCheck,
+                                                EC_TRAPPED_HCPTR);
+            } else {
+                if (!inSecureState(Scr, Cpsr)) {
+                    return new HypervisorTrap(machInst, issEnCheck,
+                                              EC_TRAPPED_HCPTR);
+                }
+            }
+        }
+    }
     '''
 }};
 
diff --git a/src/arch/arm/isa/templates/neon64.isa b/src/arch/arm/isa/templates/neon64.isa
new file mode 100644
index 0000000..d20e4e6
--- /dev/null
+++ b/src/arch/arm/isa/templates/neon64.isa
@@ -0,0 +1,527 @@
+// -*- mode: c++ -*-
+
+// Copyright (c) 2012-2013 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Mbou Eyole
+//          Giacomo Gabrielli
+
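+// AArch64 gates scalar FP and Advanced SIMD with the same enable/trap
+// controls, so the SIMD check simply reuses the VFP one.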
+let {{
+    simd64EnabledCheckCode = vfp64EnabledCheckCode
+}};
+
+def template NeonX2RegOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonX2RegImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                   uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonX1RegOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonX1RegImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonX1Reg2ImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm1,
+                   uint64_t _imm2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm1, _imm2)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonX1RegImmOnlyOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
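+// Emits an explicit instantiation of execute() for each element type the
+// decoder substitutes for %(targs)s.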
+def template NeonXExecDeclare {{
+    template
+    Fault %(class_name)s<%(targs)s>::execute(
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+}};
+
+def template NeonXEqualRegOpExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        const unsigned rCount = %(r_count)d;
+        const unsigned eCount = rCount * sizeof(FloatRegBits) / sizeof(Element);
+        const unsigned eCountFull = 4 * sizeof(FloatRegBits) / sizeof(Element);
+
+        union RegVect {
+            FloatRegBits regs[rCount];
+            Element elements[eCount];
+        };
+
+        union FullRegVect {
+            FloatRegBits regs[4];
+            Element elements[eCountFull];
+        };
+
+        %(code)s;
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
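+// For widening/narrowing ops the source and destination element sizes
+// differ by a factor of two, hence the additional BigElement views of the
+// registers.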
+def template NeonXUnequalRegOpExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        typedef typename bigger_type_t<Element>::type BigElement;
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        const unsigned rCount = %(r_count)d;
+        const unsigned eCount = rCount * sizeof(FloatRegBits) / sizeof(Element);
+        const unsigned eCountFull = 4 * sizeof(FloatRegBits) / sizeof(Element);
+
+        union RegVect {
+            FloatRegBits regs[rCount];
+            Element elements[eCount];
+            BigElement bigElements[eCount / 2];
+        };
+
+        union BigRegVect {
+            FloatRegBits regs[2 * rCount];
+            BigElement elements[eCount];
+        };
+
+        union FullRegVect {
+            FloatRegBits regs[4];
+            Element elements[eCountFull];
+        };
+
+        %(code)s;
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
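+// Memory micro-op used by the AArch64 SIMD load/store macro-ops. It
+// tracks the access size, the element size, and whether the base register
+// is SP (for SP alignment checking).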
+def template MicroNeonMemDeclare64 {{
+    class %(class_name)s : public %(base_class)s
+    {
+      protected:
+        // True if the base register is SP (used for SP alignment checking)
+        bool baseIsSP;
+        // Access size in bytes
+        uint8_t accSize;
+        // Vector element size (0 -> 8-bit, 1 -> 16-bit, 2 -> 32-bit,
+        // 3 -> 64-bit)
+        uint8_t eSize;
+
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _ura,
+                       uint32_t _imm, unsigned extraMemFlags, bool _baseIsSP,
+                       uint8_t _accSize, uint8_t _eSize)
+            : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest,
+                             _ura, _imm),
+            baseIsSP(_baseIsSP), accSize(_accSize), eSize(_eSize)
+        {
+            memAccessFlags |= extraMemFlags;
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+    };
+}};
+
+def template NeonLoadExecute64 {{
+    Fault %(class_name)s::execute(
+        %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        uint8_t *dataPtr = memUnion.bytes;
+
+        if (fault == NoFault) {
+            fault = xc->readMem(EA, dataPtr, accSize, memAccessFlags);
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template NeonLoadInitiateAcc64 {{
+    Fault %(class_name)s::initiateAcc(
+        %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        uint8_t *dataPtr = memUnion.bytes;
+
+        if (fault == NoFault) {
+            fault = xc->readMem(EA, dataPtr, accSize, memAccessFlags);
+        }
+
+        return fault;
+    }
+}};
+
+def template NeonLoadCompleteAcc64 {{
+    Fault %(class_name)s::completeAcc(
+        PacketPtr pkt, %(CPU_exec_context)s *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(mem_decl)s;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        MemUnion &memUnion = *(MemUnion *)pkt->getPtr<uint8_t>();
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template NeonStoreExecute64 {{
+    Fault %(class_name)s::execute(
+        %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        uint8_t *dataPtr = memUnion.bytes;
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = xc->writeMem(dataPtr, accSize, EA, memAccessFlags,
+                                 NULL);
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
+def template NeonStoreInitiateAcc64 {{
+    Fault %(class_name)s::initiateAcc(
+        %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            fault = xc->writeMem(memUnion.bytes, accSize, EA, memAccessFlags,
+                                 NULL);
+        }
+
+        return fault;
+    }
+}};
+
+def template NeonStoreCompleteAcc64 {{
+    Fault %(class_name)s::completeAcc(
+        PacketPtr pkt, %(CPU_exec_context)s *xc,
+        Trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+}};
+
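+// The VMem* templates declare the SIMD load/store macro-ops themselves;
+// BasicExecPanic makes their execute() panic, as only the constituent
+// micro-ops are ever executed.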
+def template VMemMultDeclare64 {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, RegIndex rn, RegIndex vd,
+                       RegIndex rm, uint8_t eSize, uint8_t dataSize,
+                       uint8_t numStructElems, uint8_t numRegs, bool wb);
+        %(BasicExecPanic)s
+    };
+}};
+
+def template VMemSingleDeclare64 {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        // Constructor
+        %(class_name)s(ExtMachInst machInst, RegIndex rn, RegIndex vd,
+                       RegIndex rm, uint8_t eSize, uint8_t dataSize,
+                       uint8_t numStructElems, uint8_t index, bool wb,
+                       bool replicate = false);
+        %(BasicExecPanic)s
+    };
+}};
+
+def template VMemMultConstructor64 {{
+    %(class_name)s::%(class_name)s(
+        ExtMachInst machInst, RegIndex rn, RegIndex vd, RegIndex rm,
+        uint8_t _eSize, uint8_t _dataSize, uint8_t _numStructElems,
+        uint8_t _numRegs, bool _wb) :
+            %(base_class)s(
+                "%(mnemonic)s", machInst, %(op_class)s, rn, vd, rm,
+                _eSize, _dataSize, _numStructElems, _numRegs, _wb)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template VMemSingleConstructor64 {{
+    %(class_name)s::%(class_name)s(
+        ExtMachInst machInst, RegIndex rn, RegIndex vd, RegIndex rm,
+        uint8_t _eSize, uint8_t _dataSize, uint8_t _numStructElems,
+        uint8_t _index, bool _wb, bool _replicate) :
+            %(base_class)s(
+                "%(mnemonic)s", machInst, %(op_class)s, rn, vd, rm,
+                _eSize, _dataSize, _numStructElems, _index, _wb,
+                _replicate)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template MicroNeonMixDeclare64 {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1,
+                       uint8_t _eSize, uint8_t _dataSize,
+                       uint8_t _numStructElems, uint8_t _numRegs,
+                       uint8_t _step) :
+            %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                           _dest, _op1, _eSize, _dataSize, _numStructElems,
+                           _numRegs, _step)
+        {
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+    };
+}};
+
+def template MicroNeonMixLaneDeclare64 {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1,
+                       uint8_t _eSize, uint8_t _dataSize,
+                       uint8_t _numStructElems, uint8_t _lane, uint8_t _step,
+                       bool _replicate = false) :
+            %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                           _dest, _op1, _eSize, _dataSize, _numStructElems,
+                           _lane, _step, _replicate)
+        {
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+    };
+}};
+
+def template MicroNeonMixExecute64 {{
+    Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        // resTemp is provided for the mix code; the self-assignment
+        // silences unused-variable warnings when it goes unused.
+        uint64_t resTemp = 0;
+        resTemp = resTemp;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        %(code)s;
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/templates.isa b/src/arch/arm/isa/templates/templates.isa
index 1481392..2263cdf 100644
--- a/src/arch/arm/isa/templates/templates.isa
+++ b/src/arch/arm/isa/templates/templates.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2011 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -40,26 +40,37 @@
 //Basic instruction templates
 ##include "basic.isa"
 
+//Templates for AArch64 data instructions.
+##include "data64.isa"
+
 //Templates for predicated instructions
 ##include "pred.isa"
 
 //Templates for memory instructions
 ##include "mem.isa"
 
+//Templates for AArch64 memory instructions
+##include "mem64.isa"
+
 //Miscellaneous instructions that don't fit elsewhere
 ##include "misc.isa"
+##include "misc64.isa"
 
 //Templates for microcoded memory instructions
 ##include "macromem.isa"
 
 //Templates for branches
 ##include "branch.isa"
+##include "branch64.isa"
 
 //Templates for multiplies
 ##include "mult.isa"
 
 //Templates for VFP instructions
 ##include "vfp.isa"
+##include "vfp64.isa"
 
 //Templates for Neon instructions
 ##include "neon.isa"
+##include "neon64.isa"
diff --git a/src/arch/arm/isa/templates/vfp.isa b/src/arch/arm/isa/templates/vfp.isa
index 90dd751..176b660 100644
--- a/src/arch/arm/isa/templates/vfp.isa
+++ b/src/arch/arm/isa/templates/vfp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010 ARM Limited
+// Copyright (c) 2010-2013 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -39,32 +39,117 @@
 
 let {{
     vfpEnabledCheckCode = '''
-        if (!vfpEnabled(Cpacr, Cpsr, Fpexc))
-            return disabledFault();
+        uint32_t issEnCheck;
+        bool trapEnCheck;
+        uint32_t seq;
+        if (!vfpNeonEnabled(seq, Hcptr, Nsacr, Cpacr, Cpsr, issEnCheck,
+                            trapEnCheck, xc->tcBase(), Fpexc)) {
+            return disabledFault();
+        }
+        if (trapEnCheck) {
+            CPSR cpsrEnCheck = Cpsr;
+            if (cpsrEnCheck.mode == MODE_HYP) {
+                return new UndefinedInstruction(machInst, issEnCheck,
+                                                EC_TRAPPED_HCPTR);
+            } else {
+                if (!inSecureState(Scr, Cpsr)) {
+                    return new HypervisorTrap(machInst, issEnCheck,
+                                              EC_TRAPPED_HCPTR);
+                }
+            }
+        }
+    '''
+
+    vfp64EnabledCheckCode = '''
+        CPSR cpsrEnCheck = Cpsr;
+        ExceptionLevel el = (ExceptionLevel) (uint8_t) cpsrEnCheck.el;
+        if (!vfpNeon64Enabled(Cpacr64, el))
+             return new SupervisorTrap(machInst, 0x1E00000,
+                                       EC_TRAPPED_SIMD_FP);
+
+        if (ArmSystem::haveVirtualization(xc->tcBase()) && el <= EL2) {
+            HCPTR cptrEnCheck = xc->tcBase()->readMiscReg(MISCREG_CPTR_EL2);
+            if (cptrEnCheck.tfp)
+                return new HypervisorTrap(machInst, 0x1E00000,
+                                          EC_TRAPPED_SIMD_FP);
+        }
+
+        if (ArmSystem::haveSecurity(xc->tcBase())) {
+            HCPTR cptrEnCheck = xc->tcBase()->readMiscReg(MISCREG_CPTR_EL3);
+            if (cptrEnCheck.tfp)
+                return new SecureMonitorTrap(machInst, 0x1E00000,
+                                             EC_TRAPPED_SIMD_FP);
+        }
     '''
 
     vmsrEnabledCheckCode = '''
-        if (!vfpEnabled(Cpacr, Cpsr))
+        uint32_t issEnCheck;
+        bool trapEnCheck;
+        uint32_t seq;
+        if (!vfpNeonEnabled(seq, Hcptr, Nsacr, Cpacr, Cpsr, issEnCheck,
+                            trapEnCheck, xc->tcBase()))
             if (dest != (int)MISCREG_FPEXC && dest != (int)MISCREG_FPSID)
-                return disabledFault();
+                {return disabledFault();}
         if (!inPrivilegedMode(Cpsr))
             if (dest != (int)MISCREG_FPSCR)
                 return disabledFault();
-
+        if (trapEnCheck) {
+            CPSR cpsrEnCheck = Cpsr;
+            if (cpsrEnCheck.mode == MODE_HYP) {
+                return new UndefinedInstruction(machInst, issEnCheck,
+                                                EC_TRAPPED_HCPTR);
+            } else {
+                if (!inSecureState(Scr, Cpsr)) {
+                    return new HypervisorTrap(machInst, issEnCheck,
+                                              EC_TRAPPED_HCPTR);
+                }
+            }
+        }
     '''
 
     vmrsEnabledCheckCode = '''
-        if (!vfpEnabled(Cpacr, Cpsr))
+        uint32_t issEnCheck;
+        bool trapEnCheck;
+        uint32_t seq;
+        if (!vfpNeonEnabled(seq, Hcptr, Nsacr, Cpacr, Cpsr, issEnCheck,
+                            trapEnCheck, xc->tcBase()))
             if (op1 != (int)MISCREG_FPEXC && op1 != (int)MISCREG_FPSID &&
                 op1 != (int)MISCREG_MVFR0 && op1 != (int)MISCREG_MVFR1)
-                return disabledFault();
+                {return disabledFault();}
         if (!inPrivilegedMode(Cpsr))
             if (op1 != (int)MISCREG_FPSCR)
                 return disabledFault();
+        if (trapEnCheck) {
+            CPSR cpsrEnCheck = Cpsr;
+            if (cpsrEnCheck.mode == MODE_HYP) {
+                return new UndefinedInstruction(machInst, issEnCheck,
+                                                EC_TRAPPED_HCPTR);
+            } else {
+                if (!inSecureState(Scr, Cpsr)) {
+                    return new HypervisorTrap(machInst, issEnCheck,
+                                              EC_TRAPPED_HCPTR);
+                }
+            }
+        }
     '''
     vmrsApsrEnabledCheckCode = '''
-        if (!vfpEnabled(Cpacr, Cpsr))
-                return disabledFault();
+        uint32_t issEnCheck;
+        bool trapEnCheck;
+        uint32_t seq;
+        if (!vfpNeonEnabled(seq, Hcptr, Nsacr, Cpacr, Cpsr, issEnCheck,
+                            trapEnCheck, xc->tcBase())) {
+            return disabledFault();
+        }
+        if (trapEnCheck) {
+            CPSR cpsrEnCheck = Cpsr;
+            if (cpsrEnCheck.mode == MODE_HYP) {
+                return new UndefinedInstruction(machInst, issEnCheck,
+                                                EC_TRAPPED_HCPTR);
+            } else {
+                if (!inSecureState(Scr, Cpsr)) {
+                    return new HypervisorTrap(machInst, issEnCheck,
+                                              EC_TRAPPED_HCPTR);
+                }
+            }
+        }
     '''
 }};
 
diff --git a/src/arch/arm/isa/templates/vfp64.isa b/src/arch/arm/isa/templates/vfp64.isa
new file mode 100644
index 0000000..518ceda
--- /dev/null
+++ b/src/arch/arm/isa/templates/vfp64.isa
@@ -0,0 +1,140 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2012 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Thomas Grocutt
+
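+// Note: as in the AArch32 VFP templates, these constructors also record
+// each destination register as a source.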
+def template AA64FpRegRegOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest, IntRegIndex _op1,
+                                          VfpMicroMode mode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                _dest, _op1, mode)
+    {
+        %(constructor)s;
+        for (int x = 0; x < _numDestRegs; x++) {
+            _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+        }
+    }
+}};
+
+def template AA64FpRegImmOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+            IntRegIndex _dest, uint64_t _imm, VfpMicroMode mode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                _dest, _imm, mode)
+    {
+        %(constructor)s;
+        for (int x = 0; x < _numDestRegs; x++) {
+            _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+        }
+    }
+}};
+
+def template AA64FpRegRegImmOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          uint64_t _imm,
+                                          VfpMicroMode mode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm, mode)
+    {
+        %(constructor)s;
+        for (int x = 0; x < _numDestRegs; x++) {
+            _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+        }
+    }
+}};
+
+def template AA64FpRegRegRegOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          VfpMicroMode mode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, mode)
+    {
+        %(constructor)s;
+        for (int x = 0; x < _numDestRegs; x++) {
+            _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+        }
+    }
+}};
+
+def template AA64FpRegRegRegRegOpDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                   IntRegIndex _op3, VfpMicroMode mode = VfpNotAMicroop);
+    %(BasicExecDeclare)s
+};
+}};
+
+def template AA64FpRegRegRegRegOpConstructor {{
+    inline %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                          IntRegIndex _dest,
+                                          IntRegIndex _op1,
+                                          IntRegIndex _op2,
+                                          IntRegIndex _op3,
+                                          VfpMicroMode mode)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _op3, mode)
+    {
+        %(constructor)s;
+        for (int x = 0; x < _numDestRegs; x++) {
+            _srcRegIdx[_numSrcRegs++] = _destRegIdx[x];
+        }
+    }
+}};