arch-arm: implement VMINNM and VMAXNM scalar version

ARMv8.2 16-bit versions have not yet been implemented, but placeholders
were created for them.

Refactor the nearby decoding tree to closely match the ARM spec A32 decode
table.

That piece of the tree can also be called from Thumb, which decodes it in
the same way, although the Thumb decode table uses different terminology.

The old code matched neither the A32 nor the T32 terminology, so it is
better to at least match one of them to help verify correctness.

Change-Id: Iabbbca2932557cf6c98ce36690c385c3ddf39ed8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/18690
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index e730833..da439ac 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -2034,6 +2034,7 @@
                               (bits(machInst, 15, 12) << 1));
         }
     }
+
     IntRegIndex decodeFpVm(ExtMachInst machInst, uint32_t size, bool isInt)
     {
         if (!isInt and size == 3) {
@@ -2044,31 +2045,64 @@
                               (bits(machInst, 3, 0) << 1));
         }
     }
-    StaticInstPtr
-    decodeShortFpTransfer(ExtMachInst machInst)
+
+    IntRegIndex decodeFpVn(ExtMachInst machInst, uint32_t size)
     {
-        const uint32_t l = bits(machInst, 20);
-        const uint32_t c = bits(machInst, 8);
-        const uint32_t a = bits(machInst, 23, 21);
-        const uint32_t q = bits(machInst, 6, 5);
-        const uint32_t o1 = bits(machInst, 18);
-        if ((machInst.thumb == 1 && bits(machInst, 28) == 1) ||
-            (machInst.thumb == 0 && machInst.condCode == 0xf)) {
-            // Determine if this is backported aarch64 FP instruction
-            const bool b31_b24 = bits(machInst, 31, 24) == 0xFE;
-            const bool b23 = bits(machInst, 23);
-            const bool b21_b19 = bits(machInst, 21, 19) == 0x7;
-            const bool b11_b9  = bits(machInst, 11, 9) == 0x5;
-            const uint32_t size = bits(machInst, 9, 8);
-            const bool op3 = bits(machInst, 6);
-            const bool b4 = bits(machInst, 4) == 0x0;
-            const uint32_t rm = bits(machInst, 17, 16);
-            IntRegIndex vd = decodeFpVd(machInst, size, false);
-            IntRegIndex vm = decodeFpVm(machInst, size, false);
-            IntRegIndex vdInt = decodeFpVd(machInst, size, true);
-            if (b31_b24 && b23 && b21_b19 && b11_b9 && op3 && b4) {
+        if (size == 3) {
+            return (IntRegIndex)((bits(machInst, 7) << 5) |
+                            (bits(machInst, 19, 16) << 1));
+        } else {
+            return (IntRegIndex)(bits(machInst, 7) |
+                            (bits(machInst, 19, 16) << 1));
+        }
+    }
+
+    StaticInstPtr
+    decodeFloatingPointDataProcessing(ExtMachInst machInst) {
+        const uint32_t op0 = bits(machInst, 23, 20);
+        const uint32_t op1 = bits(machInst, 19, 16);
+        const uint32_t op2 = bits(machInst, 9, 8);
+        const uint32_t op3 = bits(machInst, 6);
+        const uint32_t rm = bits(machInst, 17, 16);
+        const uint32_t size = bits(machInst, 9, 8);
+        IntRegIndex vd = decodeFpVd(machInst, size, false);
+        IntRegIndex vm = decodeFpVm(machInst, size, false);
+        IntRegIndex vdInt = decodeFpVd(machInst, size, true);
+        IntRegIndex vn = decodeFpVn(machInst, size);
+        if (bits(machInst, 31, 24) == 0xFE && !bits(machInst, 4)) {
+            if (bits(op0, 3) == 0 && op2 != 0 && !op3){
+                ConditionCode cond;
+                switch(bits(machInst, 21, 20)) {
+                case 0x0: cond = COND_EQ; break;
+                case 0x1: cond = COND_VS; break;
+                case 0x2: cond = COND_GE; break;
+                case 0x3: cond = COND_GT; break;
+                }
+                if (size == 3) {
+                    return new VselD(machInst, vd, vn, vm, cond);
+                } else {
+                    return new VselS(machInst, vd, vn, vm, cond);
+                }
+            } else if (bits(op0, 3) == 1 && bits(op0, 1, 0) == 0 && op2 != 0) {
+                const bool op = bits(machInst, 6);
+                if (op) {
+                    if (size == 1) {
+                        return new FailUnimplemented("vminnm.f16", machInst);
+                    }
+                    return decodeNeonSizeSingleDouble<VminnmS, VminnmD>(
+                        size, machInst, vd, vn, vm);
+                } else {
+                    if (size == 1) {
+                        return new FailUnimplemented("vmaxnm.f16", machInst);
+                    }
+                    return decodeNeonSizeSingleDouble<VmaxnmS, VmaxnmD>(
+                        size, machInst, vd, vn, vm);
+                }
+            } else if (bits(op0, 3) && bits(op0, 1, 0) == 3 &&
+                        bits(op1, 3) && op2 != 0 && op3)
+                    {
+                const uint32_t o1 = bits(machInst, 18);
                 if (o1 == 0) {
-                    // VINT* Integer Rounding Instruction
                     if (size == 3) {
                         switch(rm) {
                             case 0x0:
@@ -2105,119 +2139,112 @@
                 } else {
                     const bool op = bits(machInst, 7);
                     switch(rm) {
-                      case 0x0:
+                    case 0x0:
                         switch(size) {
-                          case 0x0:
+                        case 0x0:
                             return new Unknown(machInst);
-                          case 0x1:
+                        case 0x1:
                             return new FailUnimplemented(
                                 "vcvta.u32.f16", machInst);
-                          case 0x2:
+                        case 0x2:
                             if (op) {
                                 return new VcvtaFpSIntS(machInst, vdInt, vm);
                             } else {
                                 return new VcvtaFpUIntS(machInst, vdInt, vm);
                             }
-                          case 0x3:
+                        case 0x3:
                             if (op) {
                                 return new VcvtaFpSIntD(machInst, vdInt, vm);
                             } else {
                                 return new VcvtaFpUIntD(machInst, vdInt, vm);
                             }
-                          default: return new Unknown(machInst);
+                        default: return new Unknown(machInst);
                         }
-                      case 0x1:
+                    case 0x1:
                         switch(size) {
-                          case 0x0:
+                        case 0x0:
                             return new Unknown(machInst);
-                          case 0x1:
+                        case 0x1:
                             return new FailUnimplemented(
                                 "vcvtn.u32.f16", machInst);
-                          case 0x2:
+                        case 0x2:
                             if (op) {
                                 return new VcvtnFpSIntS(machInst, vdInt, vm);
                             } else {
                                 return new VcvtnFpUIntS(machInst, vdInt, vm);
                             }
-                          case 0x3:
+                        case 0x3:
                             if (op) {
                                 return new VcvtnFpSIntD(machInst, vdInt, vm);
                             } else {
                                 return new VcvtnFpUIntD(machInst, vdInt, vm);
                             }
-                          default: return new Unknown(machInst);
+                        default: return new Unknown(machInst);
                         }
-                      case 0x2:
+                    case 0x2:
                         switch(size) {
-                          case 0x0:
+                        case 0x0:
                             return new Unknown(machInst);
-                          case 0x1:
+                        case 0x1:
                             return new FailUnimplemented(
                                 "vcvtp.u32.f16", machInst);
-                          case 0x2:
+                        case 0x2:
                             if (op) {
                                 return new VcvtpFpSIntS(machInst, vdInt, vm);
                             } else {
                                 return new VcvtpFpUIntS(machInst, vdInt, vm);
                             }
-                          case 0x3:
+                        case 0x3:
                             if (op) {
                                 return new VcvtpFpSIntD(machInst, vdInt, vm);
                             } else {
                                 return new VcvtpFpUIntD(machInst, vdInt, vm);
                             }
-                          default: return new Unknown(machInst);
+                        default: return new Unknown(machInst);
                         }
-                      case 0x3:
+                    case 0x3:
                         switch(size) {
-                          case 0x0:
+                        case 0x0:
                             return new Unknown(machInst);
-                          case 0x1:
+                        case 0x1:
                             return new FailUnimplemented(
                                 "vcvtm.u32.f16", machInst);
-                          case 0x2:
+                        case 0x2:
                             if (op) {
                                 return new VcvtmFpSIntS(machInst, vdInt, vm);
                             } else {
                                 return new VcvtmFpUIntS(machInst, vdInt, vm);
                             }
-                          case 0x3:
+                        case 0x3:
                             if (op) {
                                 return new VcvtmFpSIntD(machInst, vdInt, vm);
                             } else {
                                 return new VcvtmFpUIntD(machInst, vdInt, vm);
                             }
-                          default: return new Unknown(machInst);
+                        default: return new Unknown(machInst);
                         }
-                      default: return new Unknown(machInst);
+                    default: return new Unknown(machInst);
                     }
                 }
-            } else if (b31_b24 && !b23 && b11_b9 && !op3 && b4){
-                // VSEL* floating point conditional select
-
-                ConditionCode cond;
-                switch(bits(machInst, 21, 20)) {
-                  case 0x0: cond = COND_EQ; break;
-                  case 0x1: cond = COND_VS; break;
-                  case 0x2: cond = COND_GE; break;
-                  case 0x3: cond = COND_GT; break;
-                }
-
-                if (size == 3) {
-                      const IntRegIndex vn =
-                          (IntRegIndex)((bits(machInst, 7) << 5) |
-                                       (bits(machInst, 19, 16) << 1));
-                    return new VselD(machInst, vd, vn, vm, cond);
-                } else {
-                      const IntRegIndex vn =
-                          (IntRegIndex)((bits(machInst, 19, 16) << 1) |
-                                        bits(machInst, 7));
-                      return new VselS(machInst, vd, vn, vm, cond);
-                }
             } else {
                 return new Unknown(machInst);
             }
+        } else {
+            return new Unknown(machInst);
         }
+    }
+
+    StaticInstPtr
+    decodeShortFpTransfer(ExtMachInst machInst)
+    {
+        if ((machInst.thumb == 1 && bits(machInst, 28) == 1) ||
+            (machInst.thumb == 0 && machInst.condCode == 0xf)) {
+                return decodeFloatingPointDataProcessing(machInst);
+        }
+        const uint32_t l = bits(machInst, 20);
+        const uint32_t c = bits(machInst, 8);
+        const uint32_t a = bits(machInst, 23, 21);
+        const uint32_t q = bits(machInst, 6, 5);
         if (l == 0 && c == 0) {
             if (a == 0) {
                 const uint32_t vn = (bits(machInst, 19, 16) << 1) |
diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa
index d8323c4..df4d583 100644
--- a/src/arch/arm/isa/insts/fp.isa
+++ b/src/arch/arm/isa/insts/fp.isa
@@ -578,6 +578,66 @@
     buildBinFpOp("vmul", "Vmul", "FpRegRegRegOp", "SimdFloatMultOp", "fpMulS",
                  "fpMulD")
 
+    def buildBinOp(name, base, opClass, op):
+        '''
+        Create backported aarch64 instructions that use fplib.
+
+        Because they are backported, these instructions are unconditional.
+        '''
+        global header_output, decoder_output, exec_output
+        inst_datas = [
+            (
+                "s",
+                '''
+                FpDest_uw = fplib%(op)s<>(FpOp1_uw, FpOp2_uw, fpscr);
+                '''
+            ),
+            (
+                "d",
+                '''
+                uint64_t op1 = ((uint64_t)FpOp1P0_uw |
+                               ((uint64_t)FpOp1P1_uw << 32));
+                uint64_t op2 = ((uint64_t)FpOp2P0_uw |
+                               ((uint64_t)FpOp2P1_uw << 32));
+                uint64_t dest = fplib%(op)s<>(op1, op2, fpscr);
+                FpDestP0_uw = dest;
+                FpDestP1_uw = dest >> 32;
+                '''
+            )
+        ]
+        Name = name[0].upper() + name[1:]
+        declareTempl = eval(base + "Declare");
+        constructorTempl = eval(base + "Constructor");
+        for size_suffix, code in inst_datas:
+            code = (
+                '''
+                FPSCR fpscr = (FPSCR)FpscrExc;
+                ''' +
+                code +
+                '''
+                FpscrExc = fpscr;
+                '''
+            )
+            iop = InstObjParams(
+                name + size_suffix,
+                Name + size_suffix.upper(),
+                base,
+                {
+                    "code": code % {"op": op},
+                    "op_class": opClass
+                },
+                []
+            )
+            header_output += declareTempl.subst(iop)
+            decoder_output += constructorTempl.subst(iop)
+            exec_output += BasicExecute.subst(iop)
+    ops = [
+        ("vminnm", "FpRegRegRegOp", "SimdFloatCmpOp", "MinNum"),
+        ("vmaxnm", "FpRegRegRegOp", "SimdFloatCmpOp", "MaxNum"),
+    ]
+    for op in ops:
+        buildBinOp(*op)
+
     def buildUnaryFpOp(name, Name, base, opClass, singleOp, doubleOp = None):
         if doubleOp is None:
             doubleOp = singleOp
diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa
index bfebd10..f242451 100644
--- a/src/arch/arm/isa/insts/neon.isa
+++ b/src/arch/arm/isa/insts/neon.isa
@@ -58,6 +58,22 @@
         }
     }
 
+    template <class BaseS, class BaseD>
+    StaticInstPtr
+    decodeNeonSizeSingleDouble(unsigned size,
+                         ExtMachInst machInst, IntRegIndex dest,
+                         IntRegIndex op1, IntRegIndex op2)
+    {
+        switch (size) {
+          case 2:
+            return new BaseS(machInst, dest, op1, op2);
+          case 3:
+            return new BaseD(machInst, dest, op1, op2);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
     template <template <typename T> class Base>
     StaticInstPtr
     decodeNeonSThreeUReg(unsigned size,