arch-arm: Add first-/non-faulting load instructions

First-/non-faulting loads are part of Arm SVE.

Change-Id: I93dfd6d1d74791653927e99098ddb651150a8ef7
Signed-off-by: Gabor Dozsa <gabor.dozsa@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19177
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Giacomo Travaglini <giacomo.travaglini@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/arch/arm/faults.cc b/src/arch/arm/faults.cc
index 9437471..a03c917 100644
--- a/src/arch/arm/faults.cc
+++ b/src/arch/arm/faults.cc
@@ -1216,6 +1216,14 @@
          (source <  ArmFault::PermissionLL + 4));
 }
 
+template<class T>
+bool
+AbortFault<T>::getFaultVAddr(Addr &va) const
+{
+    va = (stage2 ? OVAddr : faultAddr);
+    return true;
+}
+
 ExceptionClass
 PrefetchAbort::ec(ThreadContext *tc) const
 {
@@ -1618,5 +1626,29 @@
 IllegalInstSetStateFault::IllegalInstSetStateFault()
 {}
 
+bool
+getFaultVAddr(Fault fault, Addr &va)
+{
+    auto arm_fault = dynamic_cast<ArmFault *>(fault.get());
+
+    if (arm_fault) {
+        return arm_fault->getFaultVAddr(va);
+    } else {
+        auto pgt_fault = dynamic_cast<GenericPageTableFault *>(fault.get());
+        if (pgt_fault) {
+            va = pgt_fault->getFaultVAddr();
+            return true;
+        }
+
+        auto align_fault = dynamic_cast<GenericAlignmentFault *>(fault.get());
+        if (align_fault) {
+            va = align_fault->getFaultVAddr();
+            return true;
+        }
+
+        // Return false since it's not an address-triggered exception
+        return false;
+    }
+}
 
 } // namespace ArmISA
diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh
index e04a0dc..d14983d 100644
--- a/src/arch/arm/faults.hh
+++ b/src/arch/arm/faults.hh
@@ -234,6 +234,8 @@
     virtual bool isStage2() const { return false; }
     virtual FSR getFsr(ThreadContext *tc) const { return 0; }
     virtual void setSyndrome(ThreadContext *tc, MiscRegIndex syndrome_reg);
+    virtual bool getFaultVAddr(Addr &va) const { return false; }
+
 };
 
 template<typename T>
@@ -435,6 +437,8 @@
         stage2(_stage2), s1ptw(false), tranMethod(_tranMethod)
     {}
 
+    bool getFaultVAddr(Addr &va) const override;
+
     void invoke(ThreadContext *tc, const StaticInstPtr &inst =
                 StaticInst::nullStaticInstPtr) override;
 
@@ -625,6 +629,18 @@
 template<> ArmFault::FaultVals ArmFaultVals<SoftwareBreakpoint>::vals;
 template<> ArmFault::FaultVals ArmFaultVals<ArmSev>::vals;
 
+/**
+ * Returns true if the fault passed as a first argument was triggered
+ * by a memory access, false otherwise.
+ * If true, the faulting virtual address is stored in the va argument.
+ *
+ * @param fault generated fault
+ * @param va function will modify this passed-by-reference parameter
+ *           with the correct faulting virtual address
+ * @return true if va contains a valid value, false otherwise
+ */
+bool getFaultVAddr(Fault fault, Addr &va);
+
 
 } // namespace ArmISA
 
diff --git a/src/arch/arm/insts/sve_macromem.hh b/src/arch/arm/insts/sve_macromem.hh
index a31af9b..b365dcb 100644
--- a/src/arch/arm/insts/sve_macromem.hh
+++ b/src/arch/arm/insts/sve_macromem.hh
@@ -46,7 +46,8 @@
 namespace ArmISA {
 
 template <typename RegElemType, typename MemElemType,
-          template <typename, typename> class MicroopType>
+          template <typename, typename> class MicroopType,
+          template <typename> class FirstFaultWritebackMicroopType>
 class SveIndexedMemVI : public PredMacroOp
 {
   protected:
@@ -58,17 +59,22 @@
   public:
     SveIndexedMemVI(const char *mnem, ExtMachInst machInst, OpClass __opClass,
                     IntRegIndex _dest, IntRegIndex _gp, IntRegIndex _base,
-                    uint64_t _imm)
+                    uint64_t _imm, bool firstFault)
         : PredMacroOp(mnem, machInst, __opClass),
           dest(_dest), gp(_gp), base(_base), imm(_imm)
     {
         bool isLoad = (__opClass == MemReadOp);
+        assert(!firstFault || isLoad);
 
         int num_elems = ((machInst.sveLen + 1) * 16) / sizeof(RegElemType);
 
         numMicroops = num_elems;
         if (isLoad) {
-            numMicroops++;
+            if (firstFault) {
+                numMicroops += 2;
+            } else {
+                numMicroops++;
+            }
         }
 
         microOps = new StaticInstPtr[numMicroops];
@@ -90,10 +96,16 @@
             *uop = new MicroopType<RegElemType, MemElemType>(
                 mnem, machInst, __opClass, _dest, _gp,
                 isLoad ? (IntRegIndex) VECREG_UREG0 : _base, _imm, i,
-                num_elems);
+                num_elems, firstFault);
         }
 
-        --uop;
+        if (firstFault) {
+            *uop = new FirstFaultWritebackMicroopType<RegElemType>(
+                mnem, machInst, __opClass, num_elems, this);
+        } else {
+            --uop;
+        }
+
         (*uop)->setLastMicroop();
         microOps[0]->setFirstMicroop();
 
@@ -130,7 +142,8 @@
 };
 
 template <typename RegElemType, typename MemElemType,
-          template <typename, typename> class MicroopType>
+          template <typename, typename> class MicroopType,
+          template <typename> class FirstFaultWritebackMicroopType>
 class SveIndexedMemSV : public PredMacroOp
 {
   protected:
@@ -147,19 +160,25 @@
     SveIndexedMemSV(const char *mnem, ExtMachInst machInst, OpClass __opClass,
                     IntRegIndex _dest, IntRegIndex _gp, IntRegIndex _base,
                     IntRegIndex _offset, bool _offsetIs32,
-                    bool _offsetIsSigned, bool _offsetIsScaled)
+                    bool _offsetIsSigned, bool _offsetIsScaled,
+                    bool firstFault)
         : PredMacroOp(mnem, machInst, __opClass),
           dest(_dest), gp(_gp), base(_base), offset(_offset),
           offsetIs32(_offsetIs32), offsetIsSigned(_offsetIsSigned),
           offsetIsScaled(_offsetIsScaled)
     {
         bool isLoad = (__opClass == MemReadOp);
+        assert(!firstFault || isLoad);
 
         int num_elems = ((machInst.sveLen + 1) * 16) / sizeof(RegElemType);
 
         numMicroops = num_elems;
         if (isLoad) {
-            numMicroops++;
+            if (firstFault) {
+                numMicroops += 2;
+            } else {
+                numMicroops++;
+            }
         }
 
         microOps = new StaticInstPtr[numMicroops];
@@ -181,10 +200,16 @@
             *uop = new MicroopType<RegElemType, MemElemType>(
                 mnem, machInst, __opClass, _dest, _gp, _base,
                 isLoad ? (IntRegIndex) VECREG_UREG0 : _offset, _offsetIs32,
-                _offsetIsSigned, _offsetIsScaled, i, num_elems);
+                _offsetIsSigned, _offsetIsScaled, i, num_elems, firstFault);
         }
 
-        --uop;
+        if (firstFault) {
+            *uop = new FirstFaultWritebackMicroopType<RegElemType>(
+                mnem, machInst, __opClass, num_elems, this);
+        } else {
+            --uop;
+        }
+
         (*uop)->setLastMicroop();
         microOps[0]->setFirstMicroop();
 
diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa
index d4e7528..69d80e2 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -2921,9 +2921,6 @@
                     uint8_t dtype = (bits(machInst, 24, 23) << 1) |
                                     bits(machInst, 14);
                     uint8_t ff = bits(machInst, 13);
-                    if (ff) {
-                        return new Unknown64(machInst);
-                    }
                     return decodeSveGatherLoadVIInsts(
                         dtype, machInst, zt, pg, zn, imm, true, ff);
                 } else {
@@ -2952,9 +2949,6 @@
                                 bits(machInst, 14);
                 uint8_t xs = bits(machInst, 22);
                 uint8_t ff = bits(machInst, 13);
-                if (ff) {
-                    return new Unknown64(machInst);
-                }
                 return decodeSveGatherLoadSVInsts(
                         dtype, machInst, zt, pg, rn, zm,
                         true, true, xs, false, ff);
@@ -2980,19 +2974,20 @@
                              bits(machInst, 12, 10);
                     uint8_t xs = bits(machInst, 22);
                     uint8_t ff = bits(machInst, 13);
-                    if (ff) {
-                        return new Unknown64(machInst);
-                    }
                     if (bits(machInst, 14)) {
-                        return new SveIndexedMemSV<uint32_t, uint16_t,
-                                                   SveGatherLoadSVMicroop>(
-                            "ld1", machInst, MemReadOp, zt, pg, rn, zm,
-                            true, xs, true);
+                        return
+                           new SveIndexedMemSV<uint32_t, uint16_t,
+                                               SveGatherLoadSVMicroop,
+                                               SveFirstFaultWritebackMicroop>(
+                            ff ? "ldff1" : "ld1", machInst, MemReadOp, zt, pg,
+                            rn, zm, true, xs, true, ff);
                     } else {
-                        return new SveIndexedMemSV<int32_t, int16_t,
-                                                   SveGatherLoadSVMicroop>(
-                            "ld1", machInst, MemReadOp, zt, pg, rn, zm,
-                            true, xs, true);
+                        return
+                           new SveIndexedMemSV<int32_t, int16_t,
+                                               SveGatherLoadSVMicroop,
+                                               SveFirstFaultWritebackMicroop>(
+                            ff ? "ldff1" : "ld1", machInst, MemReadOp, zt, pg,
+                            rn, zm, true, xs, true, ff);
                     }
                 }
                 break;
@@ -3010,13 +3005,11 @@
                              bits(machInst, 12, 10);
                     uint8_t xs = bits(machInst, 22);
                     uint8_t ff = bits(machInst, 13);
-                    if (ff) {
-                        return new Unknown64(machInst);
-                    }
                     return new SveIndexedMemSV<uint32_t, uint32_t,
-                                               SveGatherLoadSVMicroop>(
-                        "ld1", machInst, MemReadOp, zt, pg, rn, zm,
-                        true, xs, true);
+                                               SveGatherLoadSVMicroop,
+                                               SveFirstFaultWritebackMicroop>(
+                        ff ? "ldff1" : "ld1", machInst, MemReadOp, zt, pg, rn,
+                        zm, true, xs, true, ff);
                 }
                 break;
               case 0x3:
@@ -3083,7 +3076,18 @@
     StaticInstPtr
     decodeSveContigFFLoadSS(ExtMachInst machInst)
     {
-        return new Unknown64(machInst);
+        IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex rn = makeSP((IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+        IntRegIndex rm = makeSP(
+            (IntRegIndex) (uint8_t) bits(machInst, 20, 16));
+        IntRegIndex pg = (IntRegIndex) (uint8_t) bits(machInst, 12, 10);
+
+        if (rm == 0x1f) {
+            return new Unknown64(machInst);
+        }
+
+        return decodeSveContigLoadSSInsts<SveContigFFLoadSS>(
+            bits(machInst, 24, 21), machInst, zt, pg, rn, rm, true);
     }  // decodeSveContigFFLoadSS
 
     StaticInstPtr
@@ -3101,7 +3105,13 @@
     StaticInstPtr
     decodeSveContigNFLoadSI(ExtMachInst machInst)
     {
-        return new Unknown64(machInst);
+        IntRegIndex zt = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex rn = makeSP((IntRegIndex) (uint8_t) bits(machInst, 9, 5));
+        uint64_t imm = sext<4>(bits(machInst, 19, 16));
+        IntRegIndex pg = (IntRegIndex) (uint8_t) bits(machInst, 12, 10);
+
+        return decodeSveContigLoadSIInsts<SveContigNFLoadSI>(
+            bits(machInst, 24, 21), machInst, zt, pg, rn, imm, true);
     }  // decodeSveContigNFLoadSI
 
     StaticInstPtr
@@ -3186,9 +3196,6 @@
                                 bits(machInst, 14);
                 uint8_t xs = bits(machInst, 22);
                 uint8_t ff = bits(machInst, 13);
-                if (ff) {
-                    return new Unknown64(machInst);
-                }
                 return decodeSveGatherLoadSVInsts(
                         dtype, machInst, zt, pg, rn, zm,
                         false, true, xs, false, ff);
@@ -3205,9 +3212,6 @@
                 uint8_t dtype = (bits(machInst, 24, 23) << 1) |
                                 bits(machInst, 14);
                 uint8_t ff = bits(machInst, 13);
-                if (ff) {
-                    return new Unknown64(machInst);
-                }
                 return decodeSveGatherLoadSVInsts(
                         dtype, machInst, zt, pg, rn, zm,
                         false, false, false, false, ff);
@@ -3232,9 +3236,6 @@
                                 bits(machInst, 14);
                 uint8_t xs = bits(machInst, 22);
                 uint8_t ff = bits(machInst, 13);
-                if (ff) {
-                    return new Unknown64(machInst);
-                }
                 return decodeSveGatherLoadSVInsts(
                         dtype, machInst, zt, pg, rn, zm,
                         false, true, xs, true, ff);
@@ -3255,9 +3256,6 @@
                 uint8_t dtype = (bits(machInst, 24, 23) << 1) |
                                 bits(machInst, 14);
                 uint8_t ff = bits(machInst, 13);
-                if (ff) {
-                    return new Unknown64(machInst);
-                }
                 return decodeSveGatherLoadVIInsts(
                     dtype, machInst, zt, pg, zn, imm, false, ff);
             } else {
@@ -3275,9 +3273,6 @@
                     uint8_t dtype = (bits(machInst, 24, 23) << 1) |
                                     bits(machInst, 14);
                     uint8_t ff = bits(machInst, 13);
-                    if (ff) {
-                        return new Unknown64(machInst);
-                    }
                     return decodeSveGatherLoadSVInsts(
                             dtype, machInst, zt, pg, rn, zm,
                             false, false, false, true, ff);
diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa
index 3102e80..e776deb 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -89,13 +89,11 @@
     StaticInstPtr
     decodeSveContigLoadSIInsts(uint8_t dtype, ExtMachInst machInst,
                                IntRegIndex zt, IntRegIndex pg, IntRegIndex rn,
-                               uint64_t imm, bool firstFaulting,
+                               uint64_t imm, bool nonFaulting,
                                bool replicate = false)
     {
-        assert(!(replicate && firstFaulting));
-
-        const char* mn = replicate ? "ld1r" :
-                                     (firstFaulting ? "ldff1" : "ld1");
+        assert(!(nonFaulting && replicate));
+        const char* mn = replicate ? "ld1r" : (nonFaulting ? "ldnf1" : "ld1");
         switch (dtype) {
           case 0x0:
             return new Base<uint8_t, uint8_t>(mn, machInst, zt, pg, rn, imm);
@@ -210,75 +208,87 @@
     decodeSveGatherLoadVIInsts(uint8_t dtype, ExtMachInst machInst,
                                IntRegIndex zt, IntRegIndex pg, IntRegIndex zn,
                                uint64_t imm, bool esizeIs32,
-                               bool firstFaulting)
+                               bool firstFault)
     {
-        const char* mn = firstFaulting ? "ldff1" : "ld1";
+        const char* mn = firstFault ? "ldff1" : "ld1";
         switch (dtype) {
           case 0x0:
             if (esizeIs32) {
                 return new SveIndexedMemVI<int32_t, int8_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             } else {
                 return new SveIndexedMemVI<int64_t, int8_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x1:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint8_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint8_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x2:
             if (esizeIs32) {
                 return new SveIndexedMemVI<int32_t, int16_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             } else {
                 return new SveIndexedMemVI<int64_t, int16_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x3:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint16_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint16_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x4:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemVI<int64_t, int32_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x5:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint32_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint32_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
           case 0x7:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemVI<uint64_t, uint64_t,
-                                           SveGatherLoadVIMicroop>(
-                    mn, machInst, MemReadOp, zt, pg, zn, imm);
+                                           SveGatherLoadVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemReadOp, zt, pg, zn, imm, firstFault);
             }
         }
         return new Unknown64(machInst);
@@ -289,87 +299,99 @@
                                IntRegIndex zt, IntRegIndex pg, IntRegIndex rn,
                                IntRegIndex zm, bool esizeIs32, bool offsetIs32,
                                bool offsetIsSigned, bool offsetIsScaled,
-                               bool firstFaulting)
+                               bool firstFault)
     {
-        const char* mn = firstFaulting ? "ldff1" : "ld1";
+        const char* mn = firstFault ? "ldff1" : "ld1";
         switch (dtype) {
           case 0x0:
             if (esizeIs32) {
                 return new SveIndexedMemSV<int32_t, int8_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             } else {
                 return new SveIndexedMemSV<int64_t, int8_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x1:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint8_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint8_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x2:
             if (esizeIs32) {
                 return new SveIndexedMemSV<int32_t, int16_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             } else {
                 return new SveIndexedMemSV<int64_t, int16_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x3:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint16_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint16_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x4:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemSV<int64_t, int32_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x5:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint32_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint32_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
           case 0x7:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemSV<uint64_t, uint64_t,
-                                           SveGatherLoadSVMicroop>(
+                                           SveGatherLoadSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemReadOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, firstFault);
             }
         }
         return new Unknown64(machInst);
@@ -386,40 +408,47 @@
           case 0x0:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint8_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint8_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             }
           case 0x1:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint16_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint16_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             }
           case 0x2:
             if (esizeIs32) {
                 return new SveIndexedMemVI<uint32_t, uint32_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             } else {
                 return new SveIndexedMemVI<uint64_t, uint32_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             }
           case 0x3:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemVI<uint64_t, uint64_t,
-                                           SveScatterStoreVIMicroop>(
-                    mn, machInst, MemWriteOp, zt, pg, zn, imm);
+                                           SveScatterStoreVIMicroop,
+                                           SveFirstFaultWritebackMicroop>(
+                    mn, machInst, MemWriteOp, zt, pg, zn, imm, false);
             }
         }
         return new Unknown64(machInst);
@@ -437,47 +466,54 @@
           case 0x0:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint8_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint8_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             }
           case 0x1:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint16_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint16_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             }
           case 0x2:
             if (esizeIs32) {
                 return new SveIndexedMemSV<uint32_t, uint32_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             } else {
                 return new SveIndexedMemSV<uint64_t, uint32_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             }
           case 0x3:
             if (esizeIs32) {
                 break;
             } else {
                 return new SveIndexedMemSV<uint64_t, uint64_t,
-                                           SveScatterStoreSVMicroop>(
+                                           SveScatterStoreSVMicroop,
+                                           SveFirstFaultWritebackMicroop>(
                     mn, machInst, MemWriteOp, zt, pg, rn, zm,
-                    offsetIs32, offsetIsSigned, offsetIsScaled);
+                    offsetIs32, offsetIsSigned, offsetIsScaled, false);
             }
         }
         return new Unknown64(machInst);
@@ -505,6 +541,9 @@
         int memAccessSize = %(memacc_size)s;
         EA = XBase + ((int64_t) imm * %(memacc_size)s)''' % {
             'memacc_size': 'eCount / 8' if isPred else 'eCount'}
+        loadRdEnableCode = '''
+        auto rdEn = std::vector<bool>();
+        '''
         if isPred:
             loadMemAccCode = '''
             int index = 0;
@@ -551,6 +590,8 @@
              'tpl_args': '',
              'memacc_code': loadMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
+             'rden_code' : loadRdEnableCode,
+             'fault_code' : '',
              'fa_code' : ''},
             ['IsMemRef', 'IsLoad'])
         storeIop = InstObjParams('str',
@@ -633,6 +674,11 @@
     # Generates definitions for SVE contiguous loads
     def emitSveContigMemInsts(offsetIsImm):
         global header_output, exec_output, decoders
+        # First-faulting instructions only have a scalar plus scalar form,
+        # while non-faulting instructions only have a scalar plus immediate
+        # form, so `offsetIsImm` is used to determine which class of
+        # instructions is generated
+        firstFaulting = not offsetIsImm
         tplHeader = 'template <class RegElemType, class MemElemType>'
         tplArgs = '<RegElemType, MemElemType>'
         eaCode = SPAlignmentCheckCode + '''
@@ -642,6 +688,16 @@
             eaCode += '((int64_t) this->imm * eCount * sizeof(MemElemType))'
         else:
             eaCode += '(XOffset * sizeof(MemElemType));'
+        loadRdEnableCode = '''
+        auto rdEn = std::vector<bool>(sizeof(MemElemType) * eCount, true);
+        for (int i = 0; i < eCount; i++) {
+            if (!GpOp_x[i]) {
+                for (int j = 0; j < sizeof(MemElemType); j++) {
+                    rdEn[sizeof(MemElemType) * i + j] = false;
+                }
+            }
+        }
+        '''
         loadMemAccCode = '''
         for (int i = 0; i < eCount; i++) {
             if (GpOp_x[i]) {
@@ -666,13 +722,60 @@
         storeWrEnableCode = '''
         auto wrEn = std::vector<bool>(sizeof(MemElemType) * eCount, true);
         '''
+        ffrReadBackCode = '''
+        auto& firstFaultReg = Ffr;'''
+        fautlingLoadmemAccCode = '''
+        for (int i = 0; i < eCount; i++) {
+            if (GpOp_x[i] && firstFaultReg[i * sizeof(RegElemType)]) {
+                AA64FpDest_x[i] = memDataView[i];
+            } else {
+                AA64FpDest_x[i] = 0;
+            }
+        }
+        '''
+        nonFaultingCode = 'true ||'
+        faultCode = '''
+        Addr fault_addr;
+        if (fault == NoFault || getFaultVAddr(fault, fault_addr)) {
+            unsigned fault_elem_index;
+            if (fault != NoFault) {
+                assert(fault_addr >= EA);
+                fault_elem_index = (fault_addr - EA) / sizeof(MemElemType);
+            } else {
+                fault_elem_index = eCount + 1;
+            }
+            int first_active_index;
+            for (first_active_index = 0;
+                 first_active_index < eCount && !(GpOp_x[first_active_index]);
+                 first_active_index++);
+            if (%s first_active_index < fault_elem_index) {
+                for (int i = 0; i < eCount; i++) {
+                    for (int j = 0; j < sizeof(RegElemType); j++) {
+                        if (i < fault_elem_index) {
+                            Ffr_ub[i * sizeof(RegElemType) + j] = FfrAux_x[i];
+                        } else {
+                            Ffr_ub[i * sizeof(RegElemType) + j] = 0;
+                        }
+                    }
+                }
+                fault = NoFault;
+                if (first_active_index >= fault_elem_index) {
+                    // non-faulting load needs this
+                    xc->setMemAccPredicate(false);
+                }
+            }
+        }
+        ''' % ('' if firstFaulting else nonFaultingCode)
+
         loadIop = InstObjParams('ld1',
             'SveContigLoadSI' if offsetIsImm else 'SveContigLoadSS',
             'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
             {'tpl_header': tplHeader,
              'tpl_args': tplArgs,
+             'rden_code' : loadRdEnableCode,
              'memacc_code': loadMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
+             'fault_code' : '',
              'fa_code' : ''},
             ['IsMemRef', 'IsLoad'])
         storeIop = InstObjParams('st1',
@@ -685,19 +788,38 @@
              'ea_code' : sveEnabledCheckCode + eaCode,
              'fa_code' : ''},
             ['IsMemRef', 'IsStore'])
+        faultIop = InstObjParams('ldff1' if firstFaulting else 'ldnf1',
+            'SveContigFFLoadSS' if firstFaulting else 'SveContigNFLoadSI',
+            'SveContigMemSS' if firstFaulting else 'SveContigMemSI',
+            {'tpl_header': tplHeader,
+             'tpl_args': tplArgs,
+             'rden_code' : loadRdEnableCode,
+             'memacc_code': fautlingLoadmemAccCode,
+             'ea_code' : sveEnabledCheckCode + eaCode,
+             'fault_code' : faultCode,
+             'fa_code' : ''},
+            ['IsMemRef', 'IsLoad'])
+        faultIop.snippets['memacc_code'] = (ffrReadBackCode +
+                                           faultIop.snippets['memacc_code'])
         if offsetIsImm:
             header_output += SveContigMemSIOpDeclare.subst(loadIop)
             header_output += SveContigMemSIOpDeclare.subst(storeIop)
+            header_output += SveContigMemSIOpDeclare.subst(faultIop)
         else:
             header_output += SveContigMemSSOpDeclare.subst(loadIop)
             header_output += SveContigMemSSOpDeclare.subst(storeIop)
+            header_output += SveContigMemSSOpDeclare.subst(faultIop)
         exec_output += (
             SveContigLoadExecute.subst(loadIop) +
             SveContigLoadInitiateAcc.subst(loadIop) +
             SveContigLoadCompleteAcc.subst(loadIop) +
             SveContigStoreExecute.subst(storeIop) +
             SveContigStoreInitiateAcc.subst(storeIop) +
-            SveContigStoreCompleteAcc.subst(storeIop))
+            SveContigStoreCompleteAcc.subst(storeIop) +
+            SveContigLoadExecute.subst(faultIop) +
+            SveContigLoadInitiateAcc.subst(faultIop) +
+            SveContigLoadCompleteAcc.subst(faultIop))
+
         for args in loadTplArgs:
             substDict = {'tpl_args': '<%s>' % ', '.join(args),
                          'class_name': 'SveContigLoadSI' if offsetIsImm
@@ -708,6 +830,12 @@
                          'class_name': 'SveContigStoreSI' if offsetIsImm
                                        else 'SveContigStoreSS'}
             exec_output += SveContigMemExecDeclare.subst(substDict)
+        for args in loadTplArgs:
+            substDict = {'tpl_args': '<%s>' % ', '.join(args),
+                         'class_name': 'SveContigFFLoadSS' if firstFaulting
+                                       else 'SveContigNFLoadSI'}
+            exec_output += SveContigMemExecDeclare.subst(substDict)
+
 
     # Generates definitions for SVE load-and-replicate instructions
     def emitSveLoadAndRepl():
@@ -773,16 +901,14 @@
         }
         EA = XBase + offset'''
         loadMemAccCode = '''
-            if (GpOp_x[elemIndex]) {
-                AA64FpDest_x[elemIndex] = memData;
-            } else {
-                AA64FpDest_x[elemIndex] = 0;
-            }
+            AA64FpDest_x[elemIndex] = memData;
         '''
         storeMemAccCode = '''
             memData = AA64FpDest_x[elemIndex];
         '''
-        predCheckCode = 'GpOp_x[elemIndex]'
+        predCheckCode = 'GpOp_x[index]'
+        faultStatusSetCode = 'PUreg0_x[elemIndex] = 1;'
+        faultStatusResetCode = 'PUreg0_x[elemIndex] = 0;'
         loadIop = InstObjParams('ld1',
             ('SveGatherLoadVIMicroop'
              if indexed_addr_form == IndexedAddrForm.VEC_PLUS_IMM
@@ -792,6 +918,8 @@
              'tpl_args': tplArgs,
              'memacc_code': loadMemAccCode,
              'ea_code' : sveEnabledCheckCode + eaCode,
+             'fault_status_set_code' : faultStatusSetCode,
+             'fault_status_reset_code' : faultStatusResetCode,
              'pred_check_code' : predCheckCode,
              'fa_code' : ''},
             ['IsMicroop', 'IsMemRef', 'IsLoad'])
@@ -839,6 +967,39 @@
             # TODO: this should become SveMemExecDeclare
             exec_output += SveContigMemExecDeclare.subst(substDict)
 
+    firstFaultTplArgs = ('int32_t', 'int64_t', 'uint32_t', 'uint64_t')
+
+    def emitSveFirstFaultWritebackMicroop():
+        global header_output, exec_output, decoders
+        tplHeader = 'template <class RegElemType>'
+        tplArgs = '<RegElemType>'
+        faultStatusCheckCode = 'PUreg0_x[index]'
+        firstFaultResetCode = '''
+        for(int j = 0; j < sizeof(RegElemType); j++) {
+            Ffr_ub[index * sizeof(RegElemType) + j] = 0;
+        }
+        '''
+        firstFaultForwardCode = '''
+        for(int j = 0; j < sizeof(RegElemType); j++) {
+            Ffr_ub[index * sizeof(RegElemType) + j] = FfrAux_x[index];
+        }
+        '''
+        iop = InstObjParams('ldff1',
+            'SveFirstFaultWritebackMicroop',
+            'MicroOp',
+            {'tpl_header': tplHeader,
+             'tpl_args': tplArgs,
+             'fault_status_check_code' : faultStatusCheckCode,
+             'first_fault_reset_code' : firstFaultResetCode,
+             'first_fault_forward_code' : firstFaultForwardCode},
+             ['IsMicroop'])
+        header_output += SveFirstFaultWritebackMicroopDeclare.subst(iop)
+        exec_output += SveFirstFaultWritebackMicroopExecute.subst(iop)
+        for args in firstFaultTplArgs:
+            substDict = {'targs': args,
+                         'class_name' : 'SveFirstFaultWritebackMicroop' }
+            exec_output += SveOpExecDeclare.subst(substDict)
+
     # Generates definitions for the first microop of SVE gather loads, required
     # to propagate the source vector register to the transfer microops
     def emitSveGatherLoadCpySrcVecMicroop():
@@ -859,9 +1020,11 @@
 
     # LD1[S]{B,H,W,D} (scalar plus immediate)
     # ST1[S]{B,H,W,D} (scalar plus immediate)
+    # LDNF1[S]{B,H,W,D} (scalar plus immediate)
     emitSveContigMemInsts(True)
     # LD1[S]{B,H,W,D} (scalar plus scalar)
     # ST1[S]{B,H,W,D} (scalar plus scalar)
+    # LDFF1[S]{B,H,W,D} (scalar plus scalar)
     emitSveContigMemInsts(False)
 
     # LD1R[S]{B,H,W,D}
@@ -874,12 +1037,16 @@
 
     # LD1[S]{B,H,W,D} (vector plus immediate)
     # ST1[S]{B,H,W,D} (vector plus immediate)
+    # LDFF1[S]{B,H,W,D} (vector plus immediate)
     emitSveIndexedMemMicroops(IndexedAddrForm.VEC_PLUS_IMM)
     # LD1[S]{B,H,W,D} (scalar plus vector)
     # ST1[S]{B,H,W,D} (scalar plus vector)
+    # LDFF1[S]{B,H,W,D} (scalar plus vector)
     emitSveIndexedMemMicroops(IndexedAddrForm.SCA_PLUS_VEC)
 
+    # FFR writeback microop for gather loads
+    emitSveFirstFaultWritebackMicroop()
+
     # Source vector copy microop for gather loads
     emitSveGatherLoadCpySrcVecMicroop()
-
 }};
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index a3b3857..aaa64e7 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -581,6 +581,8 @@
     'PDest': vecPredReg('dest'),
     'PDestMerge': vecPredReg('dest'),
     'Ffr': vecPredReg('PREDREG_FFR'),
+    'FfrAux': vecPredReg('PREDREG_FFR'),
+    'PUreg0': vecPredReg('PREDREG_UREG0'),
 
     #Abstracted control reg operands
     'MiscDest': cntrlReg('dest'),
diff --git a/src/arch/arm/isa/templates/sve_mem.isa b/src/arch/arm/isa/templates/sve_mem.isa
index 2cdf2ff..5e2e553 100644
--- a/src/arch/arm/isa/templates/sve_mem.isa
+++ b/src/arch/arm/isa/templates/sve_mem.isa
@@ -151,13 +151,15 @@
         TheISA::VecRegContainer memData;
         auto memDataView = memData.as<MemElemType>();
 
-        if (fault == NoFault) {
-            fault = xc->readMem(EA, memData.raw_ptr<uint8_t>(), memAccessSize,
-                this->memAccessFlags);
-            %(memacc_code)s;
-        }
+        %(rden_code)s;
+
+        fault = xc->readMem(EA, memData.raw_ptr<uint8_t>(), memAccessSize,
+            this->memAccessFlags, rdEn);
+
+        %(fault_code)s;
 
         if (fault == NoFault) {
+            %(memacc_code)s;
             %(op_wb)s;
         }
 
@@ -178,13 +180,14 @@
 
         %(op_src_decl)s;
         %(op_rd)s;
-
         %(ea_code)s;
 
-        if (fault == NoFault) {
-            fault = xc->initiateMemRead(EA, memAccessSize,
-                this->memAccessFlags);
-        }
+        %(rden_code)s;
+
+        fault = xc->initiateMemRead(EA, memAccessSize, this->memAccessFlags,
+            rdEn);
+
+        %(fault_code)s;
 
         return fault;
     }
@@ -195,7 +198,6 @@
     Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
-        Fault fault = NoFault;
         bool aarch64 M5_VAR_USED = true;
         unsigned eCount = ArmStaticInst::getCurSveVecLen<RegElemType>(
             xc->tcBase());
@@ -206,18 +208,15 @@
         TheISA::VecRegContainer memData;
         auto memDataView = memData.as<MemElemType>();
 
-        memcpy(memData.raw_ptr<uint8_t>(), pkt->getPtr<uint8_t>(),
-            pkt->getSize());
-
-        if (fault == NoFault) {
-            %(memacc_code)s;
+        if (xc->readMemAccPredicate()) {
+            memcpy(memData.raw_ptr<uint8_t>(), pkt->getPtr<uint8_t>(),
+                   pkt->getSize());
         }
 
-        if (fault == NoFault) {
-            %(op_wb)s;
-        }
+        %(memacc_code)s;
+        %(op_wb)s;
 
-        return fault;
+        return NoFault;
     }
 }};
 
@@ -398,24 +397,29 @@
 
         int elemIndex;
         int numElems;
+        bool firstFault;
 
         unsigned memAccessFlags;
 
       public:
         %(class_name)s(const char* mnem, ExtMachInst machInst,
             OpClass __opClass, IntRegIndex _dest, IntRegIndex _gp,
-            IntRegIndex _base, uint64_t _imm, int _elemIndex, int _numElems)
+            IntRegIndex _base, uint64_t _imm, int _elemIndex, int _numElems,
+            bool _firstFault)
             : %(base_class)s(mnem, machInst, %(op_class)s),
               dest(_dest), gp(_gp), base(_base), imm(_imm),
               elemIndex(_elemIndex), numElems(_numElems),
+              firstFault(_firstFault),
               memAccessFlags(ArmISA::TLB::AllowUnaligned |
                              ArmISA::TLB::MustBeOne)
         {
             %(constructor)s;
             if (_opClass == MemReadOp && elemIndex == 0) {
                 // The first micro-op is responsible for pinning the
-                // destination register
-                _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+                // destination and the fault status registers
+                assert(_numDestRegs == 2);
+               _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+               _destRegIdx[1].setNumPinnedWrites(numElems - 1);
             }
         }
 
@@ -471,6 +475,7 @@
 
         int elemIndex;
         int numElems;
+        bool firstFault;
 
         unsigned memAccessFlags;
 
@@ -479,20 +484,22 @@
             OpClass __opClass, IntRegIndex _dest, IntRegIndex _gp,
             IntRegIndex _base, IntRegIndex _offset, bool _offsetIs32,
             bool _offsetIsSigned, bool _offsetIsScaled, int _elemIndex,
-            int _numElems)
+            int _numElems, bool _firstFault)
             : %(base_class)s(mnem, machInst, %(op_class)s),
               dest(_dest), gp(_gp), base(_base), offset(_offset),
               offsetIs32(_offsetIs32), offsetIsSigned(_offsetIsSigned),
               offsetIsScaled(_offsetIsScaled), elemIndex(_elemIndex),
-              numElems(_numElems),
+              numElems(_numElems), firstFault(_firstFault),
               memAccessFlags(ArmISA::TLB::AllowUnaligned |
                              ArmISA::TLB::MustBeOne)
         {
             %(constructor)s;
             if (_opClass == MemReadOp && elemIndex == 0) {
                 // The first micro-op is responsible for pinning the
-                // destination register
-                _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+                // destination and the fault status registers
+                assert(_numDestRegs == 2);
+               _destRegIdx[0].setNumPinnedWrites(numElems - 1);
+               _destRegIdx[1].setNumPinnedWrites(numElems - 1);
             }
         }
 
@@ -542,18 +549,33 @@
         %(op_rd)s;
         %(ea_code)s;
 
-        MemElemType memData;
+        MemElemType memData = 0;
 
+        int index = elemIndex;
         if (%(pred_check_code)s) {
             fault = readMemAtomic(xc, traceData, EA, memData,
                 this->memAccessFlags);
         }
 
         if (fault == NoFault) {
+            %(fault_status_reset_code)s;
             %(memacc_code)s;
             %(op_wb)s;
-        }
+        } else {
+            %(fault_status_set_code)s;
+            if (firstFault) {
+               for (index = 0;
+                    index < numElems && !(%(pred_check_code)s);
+                    index++);
 
+               if (index < elemIndex) {
+                  fault = NoFault;
+                  memData = 0;
+                  %(memacc_code)s;
+                  %(op_wb)s;
+               }
+            }
+        }
         return fault;
     }
 }};
@@ -573,11 +595,27 @@
 
         MemElemType memData;
 
+        int index = elemIndex;
         if (%(pred_check_code)s) {
             fault = initiateMemRead(xc, traceData, EA, memData,
                 this->memAccessFlags);
+            if (fault != NoFault) {
+                %(fault_status_set_code)s;
+                if (firstFault) {
+                    for (index = 0;
+                         index < numElems && !(%(pred_check_code)s);
+                         index++);
+                    if (index < elemIndex) {
+                        fault = NoFault;
+                        xc->setMemAccPredicate(false);
+                    }
+                }
+            } else {
+                %(fault_status_reset_code)s;
+            }
         } else {
             xc->setMemAccPredicate(false);
+            %(fault_status_reset_code)s;
         }
 
         return fault;
@@ -589,26 +627,20 @@
     Fault %(class_name)s%(tpl_args)s::completeAcc(PacketPtr pkt,
         ExecContext *xc, Trace::InstRecord *traceData) const
     {
-        Fault fault = NoFault;
         bool aarch64 M5_VAR_USED = true;
 
         %(op_decl)s;
         %(op_rd)s;
 
         MemElemType memData = 0;
-        if (%(pred_check_code)s) {
+        if (xc->readMemAccPredicate()) {
             getMem(pkt, memData, traceData);
         }
 
-        if (fault == NoFault) {
-            %(memacc_code)s;
-        }
+        %(memacc_code)s;
+        %(op_wb)s;
 
-        if (fault == NoFault) {
-            %(op_wb)s;
-        }
-
-        return fault;
+        return NoFault;
     }
 }};
 
@@ -628,6 +660,7 @@
         MemElemType memData;
         %(memacc_code)s;
 
+        int index = elemIndex;
         if (%(pred_check_code)s) {
             fault = writeMemAtomic(xc, traceData, memData, EA,
                                    this->memAccessFlags, NULL);
@@ -657,6 +690,7 @@
         MemElemType memData;
         %(memacc_code)s;
 
+        int index = elemIndex;
         if (%(pred_check_code)s) {
             fault = writeMemTiming(xc, traceData, memData, EA,
                                    this->memAccessFlags, NULL);
@@ -677,6 +711,64 @@
     }
 }};
 
+def template SveFirstFaultWritebackMicroopDeclare {{
+    %(tpl_header)s
+    class SveFirstFaultWritebackMicroop : public MicroOp
+    {
+      protected:
+        typedef RegElemType TPElem;
+
+        int numElems;
+        StaticInst *macroOp;
+
+      public:
+        SveFirstFaultWritebackMicroop(const char* mnem, ExtMachInst machInst,
+            OpClass __opClass, int _numElems, StaticInst *_macroOp)
+            : MicroOp(mnem, machInst, __opClass),
+              numElems(_numElems), macroOp(_macroOp)
+        {
+            %(constructor)s;
+        }
+
+        Fault execute(ExecContext *, Trace::InstRecord *) const;
+
+        std::string
+        generateDisassembly(Addr pc, const SymbolTable *symtab) const
+        {
+            std::stringstream ss;
+            ccprintf(ss, "%s", macroOp->disassemble(pc, symtab));
+            ccprintf(ss, " (uop%d)", numElems);
+            return ss.str();
+        }
+    };
+}};
+
+def template SveFirstFaultWritebackMicroopExecute {{
+    %(tpl_header)s
+    Fault %(class_name)s%(tpl_args)s::execute(ExecContext *xc,
+        Trace::InstRecord *traceData) const
+    {
+        bool aarch64 M5_VAR_USED = true;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        int  index, firstFaultIndex;
+        for (index = 0;
+             index < numElems && !%(fault_status_check_code)s;
+             index++);
+        firstFaultIndex = index;
+        for (index = 0; index < numElems; index++) {
+            if (index < firstFaultIndex) {
+                %(first_fault_forward_code)s;
+            } else {
+                %(first_fault_reset_code)s;
+            }
+        }
+        return NoFault;
+    }
+}};
+
 def template SveGatherLoadCpySrcVecMicroopDeclare {{
     class SveGatherLoadCpySrcVecMicroop : public MicroOp
     {
diff --git a/src/arch/arm/registers.hh b/src/arch/arm/registers.hh
index 8e6ce79..3790d9d 100644
--- a/src/arch/arm/registers.hh
+++ b/src/arch/arm/registers.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2011, 2014, 2016-2017 ARM Limited
+ * Copyright (c) 2010-2011, 2014, 2016-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -89,8 +89,9 @@
 const int NumFloatRegs = NumFloatV8ArchRegs + NumFloatSpecialRegs;
 const int NumVecRegs = NumVecV8ArchRegs + NumVecSpecialRegs;
 const int VECREG_UREG0 = 32;
-const int NumVecPredRegs = 17;  // P0-P15, FFR
+const int NumVecPredRegs = 18;  // P0-P15, FFR, UREG0
 const int PREDREG_FFR = 16;
+const int PREDREG_UREG0 = 17;
 const int NumCCRegs = NUM_CCREGS;
 const int NumMiscRegs = NUM_MISCREGS;