arch-arm: Add SVE LD1RQ[BHWD]

Add both scalar+scalar and scalar+immediate versions.

Change-Id: If5fa1a71ab0dab93f9d35b544ea0899ece858bea
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19170
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa
index 3bfae1d..c06d7f6 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -3049,12 +3049,62 @@
     StaticInstPtr
     decodeSveLoadBcastQuadSS(ExtMachInst machInst)
     {
+        // bits<22:21> must be 0, and Rm == 31 is UNDEFINED for the
+        // scalar+scalar form of LD1RQ{B,H,W,D}: reject both encodings.
+        IntRegIndex rm = (IntRegIndex)(uint8_t) bits(machInst, 20, 16);
+        if (bits(machInst, 22, 21) != 0x00 || rm == 0x1f) {
+            return new Unknown64(machInst);
+        }
+
+        IntRegIndex zt = (IntRegIndex)(uint8_t) bits(machInst, 4, 0);
+        IntRegIndex rn = makeSP((IntRegIndex)(uint8_t) bits(machInst, 9, 5));
+        IntRegIndex pg = (IntRegIndex)(uint8_t) bits(machInst, 12, 10);
+        switch (bits(machInst, 24, 23)) {  // msz selects the element type
+            case 0:
+                return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
+                        machInst, zt, pg, rn, rm);
+            case 1:
+                return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
+                        machInst, zt, pg, rn, rm);
+            case 2:
+                return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
+                        machInst, zt, pg, rn, rm);
+            case 3:
+                return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
+                        machInst, zt, pg, rn, rm);
+        }
+
         return new Unknown64(machInst);
     }  // decodeSveLoadBcastQuadSS
 
     StaticInstPtr
     decodeSveLoadBcastQuadSI(ExtMachInst machInst)
     {
+        // Only encodings with bits<22:21> == 0 are decoded here; bits<24:23>
+        // select the element type and bits<19:16> hold a signed 4-bit offset.
+        if (bits(machInst, 22, 21) != 0x00) {
+            return new Unknown64(machInst);
+        }
+
+        uint64_t imm = sext<4>(bits(machInst, 19, 16));
+        IntRegIndex pg = (IntRegIndex)(uint8_t) bits(machInst, 12, 10);
+        IntRegIndex rn = makeSP((IntRegIndex)(uint8_t) bits(machInst, 9, 5));
+        IntRegIndex zt = (IntRegIndex)(uint8_t) bits(machInst, 4, 0);
+        switch (bits(machInst, 24, 23)) {
+            case 0:
+                return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
+                        machInst, zt, pg, rn, imm);
+            case 1:
+                return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
+                        machInst, zt, pg, rn, imm);
+            case 2:
+                return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
+                        machInst, zt, pg, rn, imm);
+            case 3:
+                return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
+                        machInst, zt, pg, rn, imm);
+        }
+
         return new Unknown64(machInst);
     }  // decodeSveLoadBcastQuadSI
 
diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa
index 32a078d..d993122 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1475,6 +1475,70 @@
                                        else 'SveStoreRegRegMicroop')
             exec_output += SveStructMemExecDeclare.subst(substDict)
 
+    # Generates definitions for SVE load-and-replicate quadword instructions
+    def emitSveLoadAndReplQuad(offsetIsImm):
+        global header_output, exec_output, decoders
+        tplHeader = 'template <class RegElemType, class MemElemType>'
+        tplArgs = '<RegElemType, MemElemType>'
+        eaCode = SPAlignmentCheckCode + '''
+        int memAccessSize = 16;
+        EA = XBase + '''
+        if offsetIsImm:
+            eaCode += '(((int64_t) this->imm) * 16);'
+        else:
+            eaCode += '(XOffset * sizeof(MemElemType));'
+        loadRdEnableCode = '''
+        eCount = 16/sizeof(RegElemType);
+        auto rdEn = std::vector<bool>(16, true);
+        for (int i = 0; i < eCount; ++i) {
+            if (!GpOp_x[i]) {
+                for (int j = 0; j < sizeof(RegElemType); ++j) {
+                    rdEn[sizeof(RegElemType) * i + j] = false;
+                }
+            }
+        }
+        '''
+        memAccCode = '''
+        __uint128_t qword;
+        RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);
+        for (int i = 0; i < 16/sizeof(RegElemType); ++i) {
+            if (GpOp_x[i]) {
+                qp[i] = memDataView[i];
+            } else {
+                qp[i] = 0;
+            }
+        }
+        eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(
+                xc->tcBase());
+        for (int i = 0; i < eCount; ++i) {
+            AA64FpDest_uq[i] = qword;
+        }
+        '''
+        iop = InstObjParams('ld1rq',
+                'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',
+                'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
+                {'tpl_header': tplHeader,
+                 'tpl_args': tplArgs,
+                 'rden_code': loadRdEnableCode,
+                 'memacc_code': memAccCode,
+                 'ea_code': sveEnabledCheckCode + eaCode,
+                 'fault_code': '',
+                 'fa_code': ''},
+                ['IsMemRef', 'IsLoad'])
+        if offsetIsImm:
+            header_output += SveContigMemSIOpDeclare.subst(iop)
+        else:
+            header_output += SveContigMemSSOpDeclare.subst(iop)
+        exec_output += (
+                SveContigLoadExecute.subst(iop) +
+                SveContigLoadInitiateAcc.subst(iop) +
+                SveContigLoadCompleteAcc.subst(iop))
+        for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
+            substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),
+                    'class_name': 'SveLd1RqSI' if offsetIsImm
+                                  else 'SveLd1RqSS'}
+            exec_output += SveContigMemExecDeclare.subst(substDict)
+
     # LD1[S]{B,H,W,D} (scalar plus immediate)
     # ST1[S]{B,H,W,D} (scalar plus immediate)
     # LDNF1[S]{B,H,W,D} (scalar plus immediate)
@@ -1487,6 +1551,11 @@
     # LD1R[S]{B,H,W,D}
     emitSveLoadAndRepl()
 
+    # LD1RQ{B,H,W,D} (scalar plus immediate), generated as SveLd1RqSI
+    emitSveLoadAndReplQuad(offsetIsImm = True)
+    # LD1RQ{B,H,W,D} (scalar plus scalar), generated as SveLd1RqSS
+    emitSveLoadAndReplQuad(offsetIsImm = False)
+
     # LD{2,3,4}{B,H,W,D} (scalar plus immediate)
     # ST{2,3,4}{B,H,W,D} (scalar plus immediate)
     emitSveStructMemInsts(offsetIsImm = True)
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index 5eae9b4..b0e5b3b 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -48,6 +48,8 @@
     'uw' : 'uint32_t',
     'sd' : 'int64_t',
     'ud' : 'uint64_t',
+    'sq' : '__int128_t',
+    'uq' : '__uint128_t',
     'tud' : 'std::array<uint64_t, 2>',
     'sf' : 'float',
     'df' : 'double',