arch-arm: Add initial support for SVE gather/scatter loads/stores
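
This changeset adds decoder support and instruction/microop classes for
the SVE gather load and scatter store addressing forms: vector plus
immediate and scalar plus vector, with 32-bit or 64-bit, scaled or
unscaled, signed or unsigned offsets. Each gather/scatter macroop is
decomposed into one single-element transfer microop per vector element;
gather loads also emit a leading microop that copies the address vector
to an auxiliary register, so that loads whose destination overlaps the
address vector behave correctly. For example, with a 256-bit vector
length, a gather load such as

  ld1w {z0.s}, p0/z, [z1.s, #4]

expands into the copy microop plus eight single-element loads.
First-faulting (LDFF1*) variants are decoded but not yet implemented,
and fall back to Unknown64 for now.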

Change-Id: I891623015b47a39f61ed616f8896f32a7134c8e2
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13521
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/arch/arm/insts/sve_macromem.hh b/src/arch/arm/insts/sve_macromem.hh
new file mode 100644
index 0000000..a31af9b
--- /dev/null
+++ b/src/arch/arm/insts/sve_macromem.hh
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2018 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Giacomo Gabrielli
+ */
+
+#ifndef __ARCH_ARM_SVE_MACROMEM_HH__
+#define __ARCH_ARM_SVE_MACROMEM_HH__
+
+#include "arch/arm/generated/decoder.hh"
+#include "arch/arm/insts/pred_inst.hh"
+
+namespace ArmISA {
+
+template <typename RegElemType, typename MemElemType,
+          template <typename, typename> class MicroopType>
+class SveIndexedMemVI : public PredMacroOp
+{
+  protected:
+    IntRegIndex dest;
+    IntRegIndex gp;
+    IntRegIndex base;
+    uint64_t imm;
+
+  public:
+    SveIndexedMemVI(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+                    IntRegIndex _dest, IntRegIndex _gp, IntRegIndex _base,
+                    uint64_t _imm)
+        : PredMacroOp(mnem, machInst, __opClass),
+          dest(_dest), gp(_gp), base(_base), imm(_imm)
+    {
+        bool isLoad = (__opClass == MemReadOp);
+
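+        // machInst.sveLen holds the vector length in quadwords minus one
+        // (the ZCR_ELx.LEN format), hence (sveLen + 1) * 16 bytes per vector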
+        int num_elems = ((machInst.sveLen + 1) * 16) / sizeof(RegElemType);
+
+        numMicroops = num_elems;
+        if (isLoad) {
+            numMicroops++;
+        }
+
+        microOps = new StaticInstPtr[numMicroops];
+
+        StaticInstPtr *uop = microOps;
+
+        if (isLoad) {
+            // The first microop of a gather load copies the source vector
+            // register used for address calculation to an auxiliary register,
+            // with all subsequent microops reading from the latter.  This is
+            // needed to properly handle cases where the source vector
+            // register is the same as the destination register.
+            *uop = new ArmISAInst::SveGatherLoadCpySrcVecMicroop(
+                mnem, machInst, _base, this);
+            uop++;
+        }
+
+        for (int i = 0; i < num_elems; i++, uop++) {
+            *uop = new MicroopType<RegElemType, MemElemType>(
+                mnem, machInst, __opClass, _dest, _gp,
+                isLoad ? (IntRegIndex) VECREG_UREG0 : _base, _imm, i,
+                num_elems);
+        }
+
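+        // Mark the boundaries of the microop sequence, and flag all but the
+        // last microop for delayed commit so the sequence completes as a
+        // unit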
+        --uop;
+        (*uop)->setLastMicroop();
+        microOps[0]->setFirstMicroop();
+
+        for (StaticInstPtr *uop = microOps; !(*uop)->isLastMicroop(); uop++) {
+            (*uop)->setDelayedCommit();
+        }
+    }
+
+    Fault
+    execute(ExecContext *, Trace::InstRecord *) const
+    {
+        panic("Execute method called when it shouldn't!");
+        return NoFault;
+    }
+
+    std::string
+    generateDisassembly(Addr pc, const SymbolTable *symtab) const
+    {
+        // TODO: add suffix to transfer and base registers
+        std::stringstream ss;
+        printMnemonic(ss, "", false);
+        ccprintf(ss, "{");
+        printVecReg(ss, dest, true);
+        ccprintf(ss, "}, ");
+        printVecPredReg(ss, gp);
+        ccprintf(ss, "/z, [");
+        printVecReg(ss, base, true);
+        if (imm != 0) {
+            ccprintf(ss, ", #%d", imm * sizeof(MemElemType));
+        }
+        ccprintf(ss, "]");
+        return ss.str();
+    }
+};
+
+template <typename RegElemType, typename MemElemType,
+          template <typename, typename> class MicroopType>
+class SveIndexedMemSV : public PredMacroOp
+{
+  protected:
+    IntRegIndex dest;
+    IntRegIndex gp;
+    IntRegIndex base;
+    IntRegIndex offset;
+
+    bool offsetIs32;
+    bool offsetIsSigned;
+    bool offsetIsScaled;
+
+  public:
+    SveIndexedMemSV(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+                    IntRegIndex _dest, IntRegIndex _gp, IntRegIndex _base,
+                    IntRegIndex _offset, bool _offsetIs32,
+                    bool _offsetIsSigned, bool _offsetIsScaled)
+        : PredMacroOp(mnem, machInst, __opClass),
+          dest(_dest), gp(_gp), base(_base), offset(_offset),
+          offsetIs32(_offsetIs32), offsetIsSigned(_offsetIsSigned),
+          offsetIsScaled(_offsetIsScaled)
+    {
+        bool isLoad = (__opClass == MemReadOp);
+
+        int num_elems = ((machInst.sveLen + 1) * 16) / sizeof(RegElemType);
+
+        numMicroops = num_elems;
+        if (isLoad) {
+            numMicroops++;
+        }
+
+        microOps = new StaticInstPtr[numMicroops];
+
+        StaticInstPtr *uop = microOps;
+
+        if (isLoad) {
+            // The first microop of a gather load copies the source vector
+            // register used for address calculation to an auxiliary register,
+            // with all subsequent microops reading from the latter.  This is
+            // needed to properly handle cases where the source vector
+            // register is the same as the destination register.
+            *uop = new ArmISAInst::SveGatherLoadCpySrcVecMicroop(
+                mnem, machInst, _offset, this);
+            uop++;
+        }
+
+        for (int i = 0; i < num_elems; i++, uop++) {
+            *uop = new MicroopType<RegElemType, MemElemType>(
+                mnem, machInst, __opClass, _dest, _gp, _base,
+                isLoad ? (IntRegIndex) VECREG_UREG0 : _offset, _offsetIs32,
+                _offsetIsSigned, _offsetIsScaled, i, num_elems);
+        }
+
+        --uop;
+        (*uop)->setLastMicroop();
+        microOps[0]->setFirstMicroop();
+
+        for (StaticInstPtr *uop = microOps; !(*uop)->isLastMicroop(); uop++) {
+            (*uop)->setDelayedCommit();
+        }
+    }
+
+    Fault
+    execute(ExecContext *, Trace::InstRecord *) const
+    {
+        panic("Execute method called when it shouldn't!");
+        return NoFault;
+    }
+
+    std::string
+    generateDisassembly(Addr pc, const SymbolTable *symtab) const
+    {
+        // TODO: add suffix to transfer and base registers
+        std::stringstream ss;
+        printMnemonic(ss, "", false);
+        ccprintf(ss, "{");
+        printVecReg(ss, dest, true);
+        ccprintf(ss, "}, ");
+        printVecPredReg(ss, gp);
+        ccprintf(ss, "/z, [");
+        printIntReg(ss, base);
+        ccprintf(ss, ", ");
+        printVecReg(ss, offset, true);
+        ccprintf(ss, "]");
+        return ss.str();
+    }
+};
+
+}  // namespace ArmISA
+
+#endif  // __ARCH_ARM_SVE_MACROMEM_HH__
diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa
index e81ab3e..7b2d3af 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -2896,34 +2896,153 @@
     StaticInstPtr
     decodeSveMemGather32(ExtMachInst machInst)
     {
-        // TODO: for now only LDR and LD1R are implemented
-        if (bits(machInst, 22) && bits(machInst, 15)) {
-            IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
-            IntRegIndex rn = makeSP(
-                (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
-            uint64_t imm = bits(machInst, 21, 16);
-            IntRegIndex pg = (IntRegIndex) (uint8_t) bits(machInst, 12, 10);
-            uint8_t dtype = (bits(machInst, 24, 23) << 2) |
-                            bits(machInst, 14, 13);
-            return decodeSveContigLoadSIInsts<SveLoadAndRepl>(
-                    dtype, machInst, zt, pg, rn, imm, false, true);
-        } else if (bits(machInst, 24, 22) == 0x6 &&
-                   bits(machInst, 15, 13) == 0x0 &&
-                   bits(machInst, 4) == 0x0) {
-            IntRegIndex pt = (IntRegIndex) (uint8_t) bits(machInst, 3, 0);
-            IntRegIndex rn = makeSP(
-                (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
-            uint64_t imm = sext<9>((bits(machInst, 21, 16) << 3) |
-                                   bits(machInst, 12, 10));
-            return new SveLdrPred(machInst, pt, rn, imm);
-        } else if (bits(machInst, 24, 22) == 0x6 &&
-                   bits(machInst, 15, 13) == 0x2) {
-            IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
-            IntRegIndex rn = makeSP(
-                (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
-            uint64_t imm = sext<9>((bits(machInst, 21, 16) << 3) |
-                                   bits(machInst, 12, 10));
-            return new SveLdrVec(machInst, zt, rn, imm);
+        if (bits(machInst, 15)) {
+            if (bits(machInst, 22)) {
+                // SVE load and broadcast element
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                uint64_t imm = bits(machInst, 21, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                                 bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 2) |
+                                bits(machInst, 14, 13);
+                return decodeSveContigLoadSIInsts<SveLoadAndRepl>(
+                        dtype, machInst, zt, pg, rn, imm, false, true);
+            } else {
+                if (bits(machInst, 21)) {
+                    // SVE 32-bit gather load (vector plus immediate)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                                     bits(machInst, 4, 0);
+                    IntRegIndex zn = (IntRegIndex) (uint8_t)
+                                     bits(machInst, 9, 5);
+                    uint64_t imm = bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                                     bits(machInst, 12, 10);
+                    uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                    bits(machInst, 14);
+                    uint8_t ff = bits(machInst, 13);
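+                    // First-faulting gather loads (LDFF1*) are not
+                    // implemented yet and decode to Unknown64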
+                    if (ff) {
+                        return new Unknown64(machInst);
+                    }
+                    return decodeSveGatherLoadVIInsts(
+                        dtype, machInst, zt, pg, zn, imm, true, ff);
+                } else {
+                    uint8_t b14_13 = bits(machInst, 14, 13);
+                    if (b14_13 == 0x2 && bits(machInst, 4) == 0) {
+                        // TODO: SVE contiguous prefetch (scalar plus scalar)
+                        return new Unknown64(machInst);
+                    } else if (b14_13 == 0x3 && bits(machInst, 4) == 0) {
+                        // TODO: SVE 32-bit gather prefetch (vector plus
+                        // immediate)
+                        return new Unknown64(machInst);
+                    }
+                }
+            }
+        } else {
+            uint8_t b24_23 = bits(machInst, 24, 23);
+            if (b24_23 != 0x3 && bits(machInst, 21) == 0) {
+                // SVE 32-bit gather load (scalar plus 32-bit unscaled offsets)
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                IntRegIndex zm = (IntRegIndex) (uint8_t)
+                         bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                         bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                bits(machInst, 14);
+                uint8_t xs = bits(machInst, 22);
+                uint8_t ff = bits(machInst, 13);
+                if (ff) {
+                    return new Unknown64(machInst);
+                }
+                return decodeSveGatherLoadSVInsts(
+                        dtype, machInst, zt, pg, rn, zm,
+                        true, true, xs, false, ff);
+            }
+            switch (b24_23) {
+              case 0x0:
+                if (bits(machInst, 21) && bits(machInst, 4) == 0) {
+                    // TODO: SVE 32-bit gather prefetch (scalar plus 32-bit
+                    // scaled offsets)
+                    break;
+                }
+                break;
+              case 0x1:
+                if (bits(machInst, 21)) {
+                    // SVE 32-bit gather load halfwords (scalar plus 32-bit
+                    // scaled offsets)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                             bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    IntRegIndex zm = (IntRegIndex) (uint8_t)
+                             bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                             bits(machInst, 12, 10);
+                    uint8_t xs = bits(machInst, 22);
+                    uint8_t ff = bits(machInst, 13);
+                    if (ff) {
+                        return new Unknown64(machInst);
+                    }
+                    if (bits(machInst, 14)) {
+                        return new SveIndexedMemSV<uint32_t, uint16_t,
+                                                   SveGatherLoadSVMicroop>(
+                            "ld1", machInst, MemReadOp, zt, pg, rn, zm,
+                            true, xs, true);
+                    } else {
+                        return new SveIndexedMemSV<int32_t, int16_t,
+                                                   SveGatherLoadSVMicroop>(
+                            "ld1", machInst, MemReadOp, zt, pg, rn, zm,
+                            true, xs, true);
+                    }
+                }
+                break;
+              case 0x2:
+                if (bits(machInst, 21)) {
+                    // SVE 32-bit gather load words (scalar plus 32-bit scaled
+                    // offsets)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                             bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    IntRegIndex zm = (IntRegIndex) (uint8_t)
+                             bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                             bits(machInst, 12, 10);
+                    uint8_t xs = bits(machInst, 22);
+                    uint8_t ff = bits(machInst, 13);
+                    if (ff) {
+                        return new Unknown64(machInst);
+                    }
+                    return new SveIndexedMemSV<uint32_t, uint32_t,
+                                               SveGatherLoadSVMicroop>(
+                        "ld1", machInst, MemReadOp, zt, pg, rn, zm,
+                        true, xs, true);
+                }
+                break;
+              case 0x3:
+                if (bits(machInst, 22) == 0 && bits(machInst, 14, 13) == 0x0 &&
+                        bits(machInst, 4) == 0) {
+                    // SVE load predicate register
+                    IntRegIndex pt = (IntRegIndex) (uint8_t)
+                        bits(machInst, 3, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    uint64_t imm = sext<9>((bits(machInst, 21, 16) << 3) |
+                                           bits(machInst, 12, 10));
+                    return new SveLdrPred(machInst, pt, rn, imm);
+                } else if (bits(machInst, 22) == 0 &&
+                           bits(machInst, 14, 13) == 0x2) {
+                    // SVE load vector register
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                        bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    uint64_t imm = sext<9>((bits(machInst, 21, 16) << 3) |
+                                           bits(machInst, 12, 10));
+                    return new SveLdrVec(machInst, zt, rn, imm);
+                }
+                break;
+            }
         }
         return new Unknown64(machInst);
     }  // decodeSveMemGather32
@@ -3048,6 +3167,124 @@
     StaticInstPtr
     decodeSveMemGather64(ExtMachInst machInst)
     {
+        switch ((bits(machInst, 21) << 1) | bits(machInst, 15)) {
+          case 0x0:
+            {
+                // SVE 64-bit gather load (scalar plus unpacked 32-bit unscaled
+                // offsets)
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                IntRegIndex zm = (IntRegIndex) (uint8_t)
+                         bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                         bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                bits(machInst, 14);
+                uint8_t xs = bits(machInst, 22);
+                uint8_t ff = bits(machInst, 13);
+                if (ff) {
+                    return new Unknown64(machInst);
+                }
+                return decodeSveGatherLoadSVInsts(
+                        dtype, machInst, zt, pg, rn, zm,
+                        false, true, xs, false, ff);
+            }
+          case 0x1:
+            if (bits(machInst, 22)) {
+                // SVE 64-bit gather load (scalar plus 64-bit unscaled offsets)
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                IntRegIndex zm = (IntRegIndex) (uint8_t)
+                         bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                         bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                bits(machInst, 14);
+                uint8_t ff = bits(machInst, 13);
+                if (ff) {
+                    return new Unknown64(machInst);
+                }
+                return decodeSveGatherLoadSVInsts(
+                        dtype, machInst, zt, pg, rn, zm,
+                        false, false, false, false, ff);
+            } else {
+                if (bits(machInst, 14, 13) == 0x3 && bits(machInst, 4) == 0) {
+                    // TODO: SVE 64-bit gather prefetch (vector plus immediate)
+                    break;
+                }
+            }
+            break;
+          case 0x2:
+            if (bits(machInst, 24, 23) != 0x0) {
+                // SVE 64-bit gather load (scalar plus unpacked 32-bit scaled
+                // offsets)
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                IntRegIndex zm = (IntRegIndex) (uint8_t)
+                         bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                         bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                bits(machInst, 14);
+                uint8_t xs = bits(machInst, 22);
+                uint8_t ff = bits(machInst, 13);
+                if (ff) {
+                    return new Unknown64(machInst);
+                }
+                return decodeSveGatherLoadSVInsts(
+                        dtype, machInst, zt, pg, rn, zm,
+                        false, true, xs, true, ff);
+            } else if (bits(machInst, 4) == 0) {
+                // TODO: SVE 64-bit gather prefetch (scalar plus unpacked
+                // 32-bit scaled offsets)
+                return new Unknown64(machInst);
+            }
+            break;
+          case 0x3:
+            if (bits(machInst, 22) == 0) {
+                // SVE 64-bit gather load (vector plus immediate)
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex zn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+                uint64_t imm = bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                                 bits(machInst, 12, 10);
+                uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                bits(machInst, 14);
+                uint8_t ff = bits(machInst, 13);
+                if (ff) {
+                    return new Unknown64(machInst);
+                }
+                return decodeSveGatherLoadVIInsts(
+                    dtype, machInst, zt, pg, zn, imm, false, ff);
+            } else {
+                if (bits(machInst, 24, 23) != 0x0) {
+                    // SVE 64-bit gather load (scalar plus 64-bit scaled
+                    // offsets)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                             bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    IntRegIndex zm = (IntRegIndex) (uint8_t)
+                             bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                             bits(machInst, 12, 10);
+                    uint8_t dtype = (bits(machInst, 24, 23) << 1) |
+                                    bits(machInst, 14);
+                    uint8_t ff = bits(machInst, 13);
+                    if (ff) {
+                        return new Unknown64(machInst);
+                    }
+                    return decodeSveGatherLoadSVInsts(
+                            dtype, machInst, zt, pg, rn, zm,
+                            false, false, false, true, ff);
+                } else if (bits(machInst, 4) == 0) {
+                    // TODO: SVE 64-bit gather prefetch (scalar plus 64-bit
+                    // scaled offsets)
+                    break;
+                }
+            }
+            break;
+        }
         return new Unknown64(machInst);
     }  // decodeSveMemGather64
 
@@ -3087,36 +3324,12 @@
     }  // decodeSveContigNTStoreSS
 
     StaticInstPtr
-    decodeSveScatterStore64SV32U(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore64SV32U
-
-    StaticInstPtr
-    decodeSveScatterStore64SV64U(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore64SV64U
-
-    StaticInstPtr
     decodeSveContigNTStoreSI(ExtMachInst machInst)
     {
         return new Unknown64(machInst);
     }  // decodeSveContigNTStoreSI
 
     StaticInstPtr
-    decodeSveScatterStore64VI(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore64VI
-
-    StaticInstPtr
-    decodeSveScatterStore32SV32S(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore32SV32S
-
-    StaticInstPtr
     decodeSveStoreStructsSS(ExtMachInst machInst)
     {
         return new Unknown64(machInst);
@@ -3129,30 +3342,6 @@
     }  // decodeSveStoreStructsSI
 
     StaticInstPtr
-    decodeSveScatterStore32SV32U(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore32SV32U
-
-    StaticInstPtr
-    decodeSveScatterStore32VI(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore32VI
-
-    StaticInstPtr
-    decodeSveScatterStore64SV32S(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore64SV32S
-
-    StaticInstPtr
-    decodeSveScatterStore64SV64S(ExtMachInst machInst)
-    {
-        return new Unknown64(machInst);
-    }  // decodeSveScatterStore64SV64S
-
-    StaticInstPtr
     decodeSveMemStore(ExtMachInst machInst)
     {
         switch (bits(machInst, 15, 13)) {
@@ -3186,37 +3375,118 @@
             }
           case 0x4:
           case 0x6:
-            switch (bits(machInst, 22, 21)) {
-              case 0x0:
-                return decodeSveScatterStore64SV32U(machInst);
-              case 0x1:
-                if (bits(machInst, 24, 23) != 0x0) {
-                    return decodeSveScatterStore64SV32S(machInst);
+            {
+                IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+                IntRegIndex rn = makeSP(
+                    (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                IntRegIndex zm = (IntRegIndex) (uint8_t)
+                         bits(machInst, 20, 16);
+                IntRegIndex pg = (IntRegIndex) (uint8_t)
+                         bits(machInst, 12, 10);
+                uint8_t msz = bits(machInst, 24, 23);
+                uint8_t xs = bits(machInst, 22);
+
+                switch (bits(machInst, 22, 21)) {
+                  case 0x0:
+                    // SVE 64-bit scatter store (scalar plus unpacked 32-bit
+                    // unscaled offsets)
+                    return decodeSveScatterStoreSVInsts(
+                            msz, machInst, zt, pg, rn, zm,
+                            false, true, xs, false);
+                  case 0x1:
+                    if (bits(machInst, 24, 23) != 0x0) {
+                        // SVE 64-bit scatter store (scalar plus unpacked
+                        // 32-bit scaled offsets)
+                        return decodeSveScatterStoreSVInsts(
+                                msz, machInst, zt, pg, rn, zm,
+                                false, true, xs, true);
+                    }
+                    break;
+                  case 0x2:
+                    if (bits(machInst, 24, 23) != 0x3) {
+                        // SVE 32-bit scatter store (scalar plus 32-bit
+                        // unscaled offsets)
+                        return decodeSveScatterStoreSVInsts(
+                                msz, machInst, zt, pg, rn, zm,
+                                true, true, xs, false);
+                    }
+                    break;
+                  case 0x3:
+                    // SVE 32-bit scatter store (scalar plus 32-bit scaled
+                    // offsets)
+                    return decodeSveScatterStoreSVInsts(
+                            msz, machInst, zt, pg, rn, zm,
+                            true, true, xs, true);
                 }
-                break;
-              case 0x2:
-                if (bits(machInst, 24, 23) != 0x3) {
-                    return decodeSveScatterStore32SV32U(machInst);
-                }
-                break;
-              case 0x3:
-                return decodeSveScatterStore32SV32S(machInst);
             }
             break;
           case 0x5:
             switch (bits(machInst, 22, 21)) {
               case 0x0:
-                return decodeSveScatterStore64SV64U(machInst);
+                {
+                    // SVE 64-bit scatter store (scalar plus 64-bit unscaled
+                    // offsets)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                            bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    IntRegIndex zm = (IntRegIndex) (uint8_t)
+                            bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                            bits(machInst, 12, 10);
+                    uint8_t msz = bits(machInst, 24, 23);
+
+                    return decodeSveScatterStoreSVInsts(
+                            msz, machInst, zt, pg, rn, zm,
+                            false, false, false, false);
+                }
               case 0x1:
                 if (bits(machInst, 24, 23) != 0x0) {
-                    return decodeSveScatterStore64SV64S(machInst);
+                    // SVE 64-bit scatter store (scalar plus 64-bit scaled
+                    // offsets)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                            bits(machInst, 4, 0);
+                    IntRegIndex rn = makeSP(
+                        (IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+                    IntRegIndex zm = (IntRegIndex) (uint8_t)
+                            bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                            bits(machInst, 12, 10);
+                    uint8_t msz = bits(machInst, 24, 23);
+
+                    return decodeSveScatterStoreSVInsts(
+                            msz, machInst, zt, pg, rn, zm,
+                            false, false, false, true);
                 }
                 break;
               case 0x2:
-                return decodeSveScatterStore64VI(machInst);
+                {
+                    // SVE 64-bit scatter store (vector plus immediate)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                            bits(machInst, 4, 0);
+                    IntRegIndex zn = (IntRegIndex) (uint8_t)
+                            bits(machInst, 9, 5);
+                    uint64_t imm = bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                            bits(machInst, 12, 10);
+                    uint8_t msz = bits(machInst, 24, 23);
+
+                    return decodeSveScatterStoreVIInsts(
+                        msz, machInst, zt, pg, zn, imm, false);
+                }
               case 0x3:
                 if (bits(machInst, 24, 23) != 0x3) {
-                    return decodeSveScatterStore64VI(machInst);
+                    // SVE 32-bit scatter store (vector plus immediate)
+                    IntRegIndex zt = (IntRegIndex) (uint8_t)
+                            bits(machInst, 4, 0);
+                    IntRegIndex zn = (IntRegIndex) (uint8_t)
+                            bits(machInst, 9, 5);
+                    uint64_t imm = bits(machInst, 20, 16);
+                    IntRegIndex pg = (IntRegIndex) (uint8_t)
+                            bits(machInst, 12, 10);
+                    uint8_t msz = bits(machInst, 24, 23);
+
+                    return decodeSveScatterStoreVIInsts(
+                        msz, machInst, zt, pg, zn, imm, true);
                 }
                 break;
             }
diff --git a/src/arch/arm/isa/includes.isa b/src/arch/arm/isa/includes.isa
index 9aef8c6..f054bc8 100644
--- a/src/arch/arm/isa/includes.isa
+++ b/src/arch/arm/isa/includes.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010, 2012, 2017 ARM Limited
+// Copyright (c) 2010, 2012, 2017-2018 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -68,9 +68,10 @@
 #include "arch/arm/insts/sve_mem.hh"
 #include "arch/arm/insts/vfp.hh"
 #include "arch/arm/isa_traits.hh"
+#include "enums/DecoderFlavour.hh"
 #include "mem/packet.hh"
 #include "sim/faults.hh"
-#include "enums/DecoderFlavour.hh"
+
 }};
 
 output decoder {{
@@ -80,11 +81,12 @@
 
 #include "arch/arm/decoder.hh"
 #include "arch/arm/faults.hh"
+#include "arch/arm/insts/sve_macromem.hh"
 #include "arch/arm/intregs.hh"
 #include "arch/arm/isa_traits.hh"
 #include "arch/arm/utility.hh"
-#include "base/loader/symtab.hh"
 #include "base/cprintf.hh"
+#include "base/loader/symtab.hh"
 #include "cpu/thread_context.hh"
 
 using namespace ArmISA;
@@ -102,8 +104,10 @@
 #include "base/crc.hh"
 #include "cpu/base.hh"
 #include "sim/pseudo_inst.hh"
+
 #if defined(linux)
 #include <fenv.h>
 #endif
 
 #include "base/cp_annotate.hh"
diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa
index f4ca4c3..3102e80 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017 ARM Limited
+// Copyright (c) 2017-2018 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -204,6 +204,288 @@
 
 }};
 
+output decoder {{
+
+    StaticInstPtr
+    decodeSveGatherLoadVIInsts(uint8_t dtype, ExtMachInst machInst,
+                               IntRegIndex zt, IntRegIndex pg, IntRegIndex zn,
+                               uint64_t imm, bool esizeIs32,
+                               bool firstFaulting)
+    {
+        const char* mn = firstFaulting ? "ldff1" : "ld1";
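+        // dtype packs the memory element size (msz, insn bits 24:23) with
+        // the unsigned/signed extension bit (U, bit 14): dtype = msz << 1 | U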
+        switch (dtype) {
+          case 0x0:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<int32_t, int8_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<int64_t, int8_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x1:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint8_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint8_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x2:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<int32_t, int16_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<int64_t, int16_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x3:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint16_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint16_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x4:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemVI<int64_t, int32_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x5:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint32_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint32_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+          case 0x7:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint64_t,
+                                           SveGatherLoadVIMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+            }
+        }
+        return new Unknown64(machInst);
+    }
+
+    StaticInstPtr
+    decodeSveGatherLoadSVInsts(uint8_t dtype, ExtMachInst machInst,
+                               IntRegIndex zt, IntRegIndex pg, IntRegIndex rn,
+                               IntRegIndex zm, bool esizeIs32, bool offsetIs32,
+                               bool offsetIsSigned, bool offsetIsScaled,
+                               bool firstFaulting)
+    {
+        const char* mn = firstFaulting ? "ldff1" : "ld1";
+        switch (dtype) {
+          case 0x0:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<int32_t, int8_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<int64_t, int8_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x1:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint8_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint8_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x2:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<int32_t, int16_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<int64_t, int16_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x3:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint16_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint16_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x4:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemSV<int64_t, int32_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x5:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint32_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint32_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x7:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint64_t,
+                                           SveGatherLoadSVMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+        }
+        return new Unknown64(machInst);
+    }
+
+    StaticInstPtr
+    decodeSveScatterStoreVIInsts(uint8_t msz, ExtMachInst machInst,
+                                 IntRegIndex zt, IntRegIndex pg,
+                                 IntRegIndex zn, uint64_t imm,
+                                 bool esizeIs32)
+    {
+        const char* mn = "st1";
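+        // msz encodes the memory element size; doubleword stores are only
+        // valid with 64-bit vector elements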
+        switch (msz) {
+          case 0x0:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint8_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint8_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            }
+          case 0x1:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint16_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint16_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            }
+          case 0x2:
+            if (esizeIs32) {
+                return new SveIndexedMemVI<uint32_t, uint32_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint32_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            }
+          case 0x3:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemVI<uint64_t, uint64_t,
+                                           SveScatterStoreVIMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+            }
+        }
+        return new Unknown64(machInst);
+    }
+
+    StaticInstPtr
+    decodeSveScatterStoreSVInsts(uint8_t msz, ExtMachInst machInst,
+                                 IntRegIndex zt, IntRegIndex pg,
+                                 IntRegIndex rn, IntRegIndex zm,
+                                 bool esizeIs32, bool offsetIs32,
+                                 bool offsetIsSigned, bool offsetIsScaled)
+    {
+        const char* mn = "st1";
+        switch (msz) {
+          case 0x0:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint8_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint8_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x1:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint16_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint16_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x2:
+            if (esizeIs32) {
+                return new SveIndexedMemSV<uint32_t, uint32_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint32_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+          case 0x3:
+            if (esizeIs32) {
+                break;
+            } else {
+                return new SveIndexedMemSV<uint64_t, uint64_t,
+                                           SveScatterStoreSVMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, rn, zm,
+                    offsetIs32, offsetIsSigned, offsetIsScaled);
+            }
+        }
+        return new Unknown64(machInst);
+    }
+
+}};
+
+
 let {{
 
     header_output = ''
@@ -323,6 +605,31 @@
         ('uint64_t', 'uint64_t'),
     )
 
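+    # (vector element type, memory element type) combinations instantiated
+    # for the gather load transfer microops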
+    gatherLoadTplArgs = (
+        ('int32_t', 'int8_t'),
+        ('int64_t', 'int8_t'),
+        ('uint32_t', 'uint8_t'),
+        ('uint64_t', 'uint8_t'),
+        ('int32_t', 'int16_t'),
+        ('int64_t', 'int16_t'),
+        ('uint32_t', 'uint16_t'),
+        ('uint64_t', 'uint16_t'),
+        ('int64_t', 'int32_t'),
+        ('uint32_t', 'uint32_t'),
+        ('uint64_t', 'uint32_t'),
+        ('uint64_t', 'uint64_t'),
+    )
+
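+    # Same, for the scatter store transfer microops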
+    scatterStoreTplArgs = (
+        ('uint32_t', 'uint8_t'),
+        ('uint64_t', 'uint8_t'),
+        ('uint32_t', 'uint16_t'),
+        ('uint64_t', 'uint16_t'),
+        ('uint32_t', 'uint32_t'),
+        ('uint64_t', 'uint32_t'),
+        ('uint64_t', 'uint64_t'),
+    )
+
     # Generates definitions for SVE contiguous loads
     def emitSveContigMemInsts(offsetIsImm):
         global header_output, exec_output, decoders
@@ -437,9 +744,124 @@
                          'class_name': 'SveLoadAndRepl'}
             exec_output += SveContigMemExecDeclare.subst(substDict)
 
+    class IndexedAddrForm:
+        VEC_PLUS_IMM = 0
+        SCA_PLUS_VEC = 1
+
+    # Generates definitions for the transfer microops of SVE indexed memory
+    # operations (gather loads, scatter stores)
+    def emitSveIndexedMemMicroops(indexed_addr_form):
+        assert indexed_addr_form in (IndexedAddrForm.VEC_PLUS_IMM,
+                                     IndexedAddrForm.SCA_PLUS_VEC)
+        global header_output, exec_output, decoders
+        tplHeader = 'template <class RegElemType, class MemElemType>'
+        tplArgs = '<RegElemType, MemElemType>'
+        if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM:
+            eaCode = '''
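+        // Element address: base vector element plus the immediate offset,
+        // scaled by the memory element size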
+        EA = AA64FpBase_x[elemIndex] + imm * sizeof(MemElemType)'''
+        else:
+            eaCode = '''
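+        // The vector offset element may be truncated to 32 bits,
+        // sign-extended and/or scaled by the memory element size before
+        // being added to the scalar base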
+        uint64_t offset = AA64FpOffset_x[elemIndex];
+        if (offsetIs32) {
+            offset &= (1ULL << 32) - 1;
+        }
+        if (offsetIsSigned) {
+            offset = sext<32>(offset);
+        }
+        if (offsetIsScaled) {
+            offset *= sizeof(MemElemType);
+        }
+        EA = XBase + offset'''
+        loadMemAccCode = '''
+            if (GpOp_x[elemIndex]) {
+                AA64FpDest_x[elemIndex] = memData;
+            } else {
+                AA64FpDest_x[elemIndex] = 0;
+            }
+        '''
+        storeMemAccCode = '''
+            memData = AA64FpDest_x[elemIndex];
+        '''
+        predCheckCode = 'GpOp_x[elemIndex]'
+        loadIop = InstObjParams('ld1',
+            ('SveGatherLoadVIMicroop'
+             if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM
+             else 'SveGatherLoadSVMicroop'),
+            'MicroOp',
+            {'tpl_header': tplHeader,
+             'tpl_args': tplArgs,
+             'memacc_code': loadMemAccCode,
+             'ea_code' : sveEnabledCheckCode + eaCode,
+             'pred_check_code' : predCheckCode,
+             'fa_code' : ''},
+            ['IsMicroop', 'IsMemRef', 'IsLoad'])
+        storeIop = InstObjParams('st1',
+            ('SveScatterStoreVIMicroop'
+             if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM
+             else 'SveScatterStoreSVMicroop'),
+            'MicroOp',
+            {'tpl_header': tplHeader,
+             'tpl_args': tplArgs,
+             'memacc_code': storeMemAccCode,
+             'ea_code' : sveEnabledCheckCode + eaCode,
+             'pred_check_code' : predCheckCode,
+             'fa_code' : ''},
+            ['IsMicroop', 'IsMemRef', 'IsStore'])
+        if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM:
+            header_output += SveIndexedMemVIMicroopDeclare.subst(loadIop)
+            header_output += SveIndexedMemVIMicroopDeclare.subst(storeIop)
+        else:
+            header_output += SveIndexedMemSVMicroopDeclare.subst(loadIop)
+            header_output += SveIndexedMemSVMicroopDeclare.subst(storeIop)
+        exec_output += (
+            SveGatherLoadMicroopExecute.subst(loadIop) +
+            SveGatherLoadMicroopInitiateAcc.subst(loadIop) +
+            SveGatherLoadMicroopCompleteAcc.subst(loadIop) +
+            SveScatterStoreMicroopExecute.subst(storeIop) +
+            SveScatterStoreMicroopInitiateAcc.subst(storeIop) +
+            SveScatterStoreMicroopCompleteAcc.subst(storeIop))
+        for args in gatherLoadTplArgs:
+            substDict = {'tpl_args': '<%s>' % ', '.join(args),
+                         'class_name': (
+                             'SveGatherLoadVIMicroop'
+                             if indexed_addr_form == \
+                                 IndexedAddrForm.VEC_PLUS_IMM
+                             else 'SveGatherLoadSVMicroop')}
+            # TODO: this should become SveMemExecDeclare
+            exec_output += SveContigMemExecDeclare.subst(substDict)
+        for args in scatterStoreTplArgs:
+            substDict = {'tpl_args': '<%s>' % ', '.join(args),
+                         'class_name': (
+                             'SveScatterStoreVIMicroop'
+                             if indexed_addr_form == \
+                                 IndexedAddrForm.VEC_PLUS_IMM
+                             else 'SveScatterStoreSVMicroop')}
+            # TODO: this should become SveMemExecDeclare
+            exec_output += SveContigMemExecDeclare.subst(substDict)
+
+    # Generates definitions for the first microop of SVE gather loads, required
+    # to propagate the source vector register to the transfer microops
+    def emitSveGatherLoadCpySrcVecMicroop():
+        global header_output, exec_output, decoders
+        code = sveEnabledCheckCode + '''
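+        // Copy the source address vector, byte by byte, into the auxiliary
+        // register (AA64FpUreg0) read by the transfer microops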
+        unsigned eCount = ArmStaticInst::getCurSveVecLen<uint8_t>(
+                xc->tcBase());
+        for (unsigned i = 0; i < eCount; i++) {
+            AA64FpUreg0_ub[i] = AA64FpOp1_ub[i];
+        }'''
+        iop = InstObjParams('ld1',
+            'SveGatherLoadCpySrcVecMicroop',
+            'MicroOp',
+            {'code': code},
+            ['IsMicroop'])
+        header_output += SveGatherLoadCpySrcVecMicroopDeclare.subst(iop)
+        exec_output += SveGatherLoadCpySrcVecMicroopExecute.subst(iop)
+
     # LD1[S]{B,H,W,D} (scalar plus immediate)
+    # ST1[S]{B,H,W,D} (scalar plus immediate)
     emitSveContigMemInsts(True)
     # LD1[S]{B,H,W,D} (scalar plus scalar)
+    # ST1[S]{B,H,W,D} (scalar plus scalar)
     emitSveContigMemInsts(False)
 
     # LD1R[S]{B,H,W,D}
@@ -450,4 +872,14 @@
     # LDR (vector), STR (vector)
     emitSveMemFillSpill(False)
 
+    # LD1[S]{B,H,W,D} (vector plus immediate)
+    # ST1[S]{B,H,W,D} (vector plus immediate)
+    emitSveIndexedMemMicroops(IndexedAddrForm.VEC_PLUS_IMM)
+    # LD1[S]{B,H,W,D} (scalar plus vector)
+    # ST1[S]{B,H,W,D} (scalar plus vector)
+    emitSveIndexedMemMicroops(IndexedAddrForm.SCA_PLUS_VEC)
+
+    # Source vector copy microop for gather loads
+    emitSveGatherLoadCpySrcVecMicroop()
+
 }};
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index 0a0469a..a3b3857 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -1,5 +1,5 @@
 // -*- mode:c++ -*-
-// Copyright (c) 2010-2014, 2016 ARM Limited
+// Copyright (c) 2010-2014, 2016-2018 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -541,6 +541,39 @@
         'AA64FpDestMergeQ':  vectorRegElem('0', 'tud', zeroing = True)
     }),
 
+    'AA64FpBase': vectorReg('base',
+    {
+        'AA64FpBaseP0': vectorRegElem('0'),
+        'AA64FpBaseP1': vectorRegElem('1'),
+        'AA64FpBaseP2': vectorRegElem('2'),
+        'AA64FpBaseP3': vectorRegElem('3'),
+        'AA64FpBaseS':  vectorRegElem('0', 'sf', zeroing = True),
+        'AA64FpBaseD':  vectorRegElem('0', 'df', zeroing = True),
+        'AA64FpBaseQ':  vectorRegElem('0', 'tud', zeroing = True)
+    }),
+
+    'AA64FpOffset': vectorReg('offset',
+    {
+        'AA64FpOffsetP0': vectorRegElem('0'),
+        'AA64FpOffsetP1': vectorRegElem('1'),
+        'AA64FpOffsetP2': vectorRegElem('2'),
+        'AA64FpOffsetP3': vectorRegElem('3'),
+        'AA64FpOffsetS':  vectorRegElem('0', 'sf', zeroing = True),
+        'AA64FpOffsetD':  vectorRegElem('0', 'df', zeroing = True),
+        'AA64FpOffsetQ':  vectorRegElem('0', 'tud', zeroing = True)
+    }),
+
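+    # Auxiliary vector register (VECREG_UREG0), used by gather loads to
+    # hold a copy of the address vector (see sve_macromem.hh)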
+    'AA64FpUreg0': vectorReg('VECREG_UREG0',
+    {
+        'AA64FpUreg0P0': vectorRegElem('0'),
+        'AA64FpUreg0P1': vectorRegElem('1'),
+        'AA64FpUreg0P2': vectorRegElem('2'),
+        'AA64FpUreg0P3': vectorRegElem('3'),
+        'AA64FpUreg0S':  vectorRegElem('0', 'sf', zeroing = True),
+        'AA64FpUreg0D':  vectorRegElem('0', 'df', zeroing = True),
+        'AA64FpUreg0Q':  vectorRegElem('0', 'tud', zeroing = True)
+    }),
+
     # Predicate register operands
     'GpOp': vecPredReg('gp'),
     'POp1': vecPredReg('op1'),
diff --git a/src/arch/arm/isa/templates/sve_mem.isa b/src/arch/arm/isa/templates/sve_mem.isa
index 8471e44..2cdf2ff 100644
--- a/src/arch/arm/isa/templates/sve_mem.isa
+++ b/src/arch/arm/isa/templates/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017 ARM Limited
+// Copyright (c) 2017-2018 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -384,3 +384,342 @@
     }
 }};
 
+def template SveIndexedMemVIMicroopDeclare {{
+    %(tpl_header)s
+    class %(class_name)s : public %(base_class)s
+    {
+      protected:
+        typedef RegElemType TPElem;
+
+        IntRegIndex dest;
+        IntRegIndex gp;
+        IntRegIndex base;
+        uint64_t imm;
+
+        int elemIndex;
+        int numElems;
+
+        unsigned memAccessFlags;
+
+      public:
+        %(class_name)s(const char* mnem, ExtMachInst machInst,
+            OpClass __opClass, IntRegIndex _dest, IntRegIndex _gp,
+            IntRegIndex _base, uint64_t _imm, int _elemIndex, int _numElems)
+            : %(base_class)s(mnem, machInst, %(op_class)s),
+              dest(_dest), gp(_gp), base(_base), imm(_imm),
+              elemIndex(_elemIndex), numElems(_numElems),
+              memAccessFlags(ArmISA::TLB::AllowUnaligned |
+                             ArmISA::TLB::MustBeOne)
+        {
+            %(constructor)s;
+            if (_opClass == MemReadOp && elemIndex == 0) {
+                // The first micro-op is responsible for pinning the
+                // destination register
+                _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+            }
+        }
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const;
+        Fault completeAcc(PacketPtr, ExecContext *, Trace::InstRecord *) const;
+
+        virtual void
+        annotateFault(ArmFault *fault)
+        {
+            %(fa_code)s
+        }
+
+        std::string
+        generateDisassembly(Addr pc, const SymbolTable *symtab) const
+        {
+            // TODO: add suffix to transfer register
+            std::stringstream ss;
+            printMnemonic(ss, "", false);
+            ccprintf(ss, "{");
+            printVecReg(ss, dest, true);
+            ccprintf(ss, "}, ");
+            printVecPredReg(ss, gp);
+            if (_opClass == MemReadOp) {
+                ccprintf(ss, "/z");
+            }
+            ccprintf(ss, ", [");
+            printVecReg(ss, base, true);
+            if (imm != 0) {
+                ccprintf(ss, ", #%d", imm * sizeof(MemElemType));
+            }
+            ccprintf(ss, "] (uop elem %d tfer)", elemIndex);
+            return ss.str();
+        }
+    };
+}};
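+// Note on pinning: a gather load writes its destination one element per
+// microop, so the first transfer microop marks the destination with
+// numElems - 1 pinned writes; the partial writes then all target the
+// same physical register instead of each being renamed separately.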
+
+def template SveIndexedMemSVMicroopDeclare {{
+    %(tpl_header)s
+    class %(class_name)s : public %(base_class)s
+    {
+      protected:
+        typedef RegElemType TPElem;
+
+        IntRegIndex dest;
+        IntRegIndex gp;
+        IntRegIndex base;
+        IntRegIndex offset;
+
+        bool offsetIs32;
+        bool offsetIsSigned;
+        bool offsetIsScaled;
+
+        int elemIndex;
+        int numElems;
+
+        unsigned memAccessFlags;
+
+      public:
+        %(class_name)s(const char* mnem, ExtMachInst machInst,
+            OpClass __opClass, IntRegIndex _dest, IntRegIndex _gp,
+            IntRegIndex _base, IntRegIndex _offset, bool _offsetIs32,
+            bool _offsetIsSigned, bool _offsetIsScaled, int _elemIndex,
+            int _numElems)
+            : %(base_class)s(mnem, machInst, %(op_class)s),
+              dest(_dest), gp(_gp), base(_base), offset(_offset),
+              offsetIs32(_offsetIs32), offsetIsSigned(_offsetIsSigned),
+              offsetIsScaled(_offsetIsScaled), elemIndex(_elemIndex),
+              numElems(_numElems),
+              memAccessFlags(ArmISA::TLB::AllowUnaligned |
+                             ArmISA::TLB::MustBeOne)
+        {
+            %(constructor)s;
+            if (_opClass == MemReadOp && elemIndex == 0) {
+                // The first micro-op is responsible for pinning the
+                // destination register
+                _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+            }
+        }
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const;
+        Fault initiateAcc(ExecContext *, Trace::InstRecord *) const;
+        Fault completeAcc(PacketPtr, ExecContext *, Trace::InstRecord *) const;
+
+        virtual void
+        annotateFault(ArmFault *fault)
+        {
+            %(fa_code)s
+        }
+
+        std::string
+        generateDisassembly(Addr pc, const SymbolTable *symtab) const
+        {
+            // TODO: add suffix to transfer and base registers
+            std::stringstream ss;
+            printMnemonic(ss, "", false);
+            ccprintf(ss, "{");
+            printVecReg(ss, dest, true);
+            ccprintf(ss, "}, ");
+            printVecPredReg(ss, gp);
+            if (_opClass == MemReadOp) {
+                ccprintf(ss, "/z");
+            }
+            ccprintf(ss, ", [");
+            printIntReg(ss, base);
+            ccprintf(ss, ", ");
+            printVecReg(ss, offset, true);
+            ccprintf(ss, "] (uop elem %d tfer)", elemIndex);
+            return ss.str();
+        }
+    };
+}};
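+// For the scalar-plus-vector forms the effective address of element i
+// is, roughly:
+//   EA = Xn + extend(Zm[i]) * (offsetIsScaled ? sizeof(MemElemType) : 1)
+// where extend() keeps the low 32 bits when offsetIs32 and sign- or
+// zero-extends them according to offsetIsSigned; the actual computation
+// is supplied through the %(ea_code)s template parameter.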
+
+def template SveGatherLoadMicroopExecute {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::execute(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemElemType memData;
+
+        if (%(pred_check_code)s) {
+            fault = readMemAtomic(xc, traceData, EA, memData,
+                this->memAccessFlags);
+        }
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
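+// execute() above implements the atomic-mode access; timing-mode CPUs
+// instead use the initiateAcc()/completeAcc() pair below, splitting the
+// access into a request and a response phase.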
+
+def template SveGatherLoadMicroopInitiateAcc {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::initiateAcc(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemElemType memData;
+
+        if (%(pred_check_code)s) {
+            fault = initiateMemRead(xc, traceData, EA, memData,
+                this->memAccessFlags);
+        } else {
+            xc->setMemAccPredicate(false);
+        }
+
+        return fault;
+    }
+}};
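+// If the governing predicate bit for this element is clear, no request
+// is sent and setMemAccPredicate(false) tells the CPU model not to wait
+// for a memory response before retiring the microop.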
+
+def template SveGatherLoadMicroopCompleteAcc {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
+        ExecContext *xc, Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        MemElemType memData = 0;
+        if (%(pred_check_code)s) {
+            getMem(pkt, memData, traceData);
+        }
+
+        if (fault == NoFault) {
+            %(memacc_code)s;
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
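+// Gather loads use zeroing predication (hence the "/z" printed in the
+// disassembly): predicated-off destination elements are set to zero
+// rather than preserved.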
+
+def template SveScatterStoreMicroopExecute {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::execute(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemElemType memData;
+        %(memacc_code)s;
+
+        if (%(pred_check_code)s) {
+            fault = writeMemAtomic(xc, traceData, memData, EA,
+                                   this->memAccessFlags, NULL);
+        }
+
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
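+// Unlike the gather templates, %(memacc_code)s here extracts the element
+// to be stored from the source vector before the predicate check;
+// computing memData early has no side effects.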
+
+def template SveScatterStoreMicroopInitiateAcc {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::initiateAcc(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemElemType memData;
+        %(memacc_code)s;
+
+        if (%(pred_check_code)s) {
+            fault = writeMemTiming(xc, traceData, memData, EA,
+                                   this->memAccessFlags, NULL);
+        } else {
+            xc->setPredicate(false);
+        }
+
+        return fault;
+    }
+}};
+
+def template SveScatterStoreMicroopCompleteAcc {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
+        ExecContext *xc, Trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+}};
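+// A scatter store is effectively done once its write request has been
+// sent out in initiateAcc(), so completeAcc() has nothing left to do.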
+
+def template SveGatherLoadCpySrcVecMicroopDeclare {{
+    class SveGatherLoadCpySrcVecMicroop : public MicroOp
+    {
+      protected:
+        IntRegIndex op1;
+
+        StaticInst *macroOp;
+
+      public:
+        SveGatherLoadCpySrcVecMicroop(const char* mnem, ExtMachInst machInst,
+            IntRegIndex _op1, StaticInst *_macroOp)
+            : MicroOp(mnem, machInst, SimdAluOp), op1(_op1), macroOp(_macroOp)
+        {
+            %(constructor)s;
+        }
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const;
+
+        std::string
+        generateDisassembly(Addr pc, const SymbolTable *symtab) const
+        {
+            std::stringstream ss;
+            ccprintf(ss, "%s", macroOp->disassemble(pc, symtab));
+            ccprintf(ss, " (uop src vec cpy)");
+            return ss.str();
+        }
+    };
+}};
+
+def template SveGatherLoadCpySrcVecMicroopExecute {{
+    Fault SveGatherLoadCpySrcVecMicroop::execute(ExecContext *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        %(code)s;
+        if (fault == NoFault) {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
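+// The %(code)s parameter above is expected to be a plain vector copy,
+// essentially AA64FpUreg0 = AA64FpOp1, snapshotting the address vector
+// into the scratch register before any transfer microop can clobber it.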
diff --git a/src/arch/arm/registers.hh b/src/arch/arm/registers.hh
index 8ee48ed..8e6ce79 100644
--- a/src/arch/arm/registers.hh
+++ b/src/arch/arm/registers.hh
@@ -88,6 +88,7 @@
 const int NumIntRegs = NUM_INTREGS;
 const int NumFloatRegs = NumFloatV8ArchRegs + NumFloatSpecialRegs;
 const int NumVecRegs = NumVecV8ArchRegs + NumVecSpecialRegs;
+const int VECREG_UREG0 = 32;  // first non-architectural (scratch) vector reg
 const int NumVecPredRegs = 17;  // P0-P15, FFR
 const int PREDREG_FFR = 16;
 const int NumCCRegs = NUM_CCREGS;