arch-arm: implement floating point aarch32 VCVTA family

These instructions (VCVTA, VCVTN, VCVTP and VCVTM) convert floating point
to integer using a rounding mode selected by the instruction encoding,
and were added to aarch32 as an extension to ARMv7.

Change-Id: I62d1705badc95a4e8954a5ad62b2b6bc9e4ffe00
Reviewed-on: https://gem5-review.googlesource.com/c/16788
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index 77a33e6..c159dc6 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2011, 2016-2018 ARM Limited
+// Copyright (c) 2010-2011, 2016-2019 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -2001,6 +2001,26 @@
     decodeShortFpTransfer(ExtMachInst machInst);
     '''
     decoder_output = '''
+    // Register index from bit 22 and bits 15:12. Bit 22 is placed at bit 5
+    // when !isInt and size == 3, and at bit 0 otherwise.
+    IntRegIndex
+    decodeFpVd(ExtMachInst machInst, uint32_t size, bool isInt)
+    {
+        // Shift applied to bit 22 before it is merged into the index.
+        const int topShift = (!isInt && size == 3) ? 5 : 0;
+        return (IntRegIndex)((bits(machInst, 22) << topShift) |
+                             (bits(machInst, 15, 12) << 1));
+    }
+    // Register index from bit 5 and bits 3:0. Bit 5 is placed at bit 5
+    // when !isInt and size == 3, and at bit 0 otherwise.
+    IntRegIndex
+    decodeFpVm(ExtMachInst machInst, uint32_t size, bool isInt)
+    {
+        // Shift applied to bit 5 before it is merged into the index.
+        const int topShift = (!isInt && size == 3) ? 5 : 0;
+        return (IntRegIndex)((bits(machInst, 5) << topShift) |
+                             (bits(machInst, 3, 0) << 1));
+    }
     StaticInstPtr
     decodeShortFpTransfer(ExtMachInst machInst)
     {
@@ -2008,67 +2028,143 @@
         const uint32_t c = bits(machInst, 8);
         const uint32_t a = bits(machInst, 23, 21);
         const uint32_t b = bits(machInst, 6, 5);
+        const uint32_t o1 = bits(machInst, 18);
         if ((machInst.thumb == 1 && bits(machInst, 28) == 1) ||
             (machInst.thumb == 0 && machInst.condCode == 0xf)) {
             // Determine if this is backported aarch64 FP instruction
             const bool b31_b24 = bits(machInst, 31, 24) == 0xFE;
             const bool b23 = bits(machInst, 23);
-            const bool b21_b18 = bits(machInst, 21, 18) == 0xE;
+            const bool b21_b19 = bits(machInst, 21, 19) == 0x7;
             const bool b11_b9  = bits(machInst, 11, 9) == 0x5;
-            const bool sz = bits(machInst, 8);
-            const bool b7_b6   = bits(machInst, 7, 6) == 0x1;
-            const bool b6 = bits(machInst, 6) == 0x0;
+            const uint32_t size = bits(machInst, 9, 8);
+            const bool op3 = bits(machInst, 6);
             const bool b4 = bits(machInst, 4) == 0x0;
-            if (b31_b24 && b23 && b21_b18 && b11_b9 && b7_b6 && b4) {
-                  // VINT* Integer Rounding Instructon
-                  const uint32_t rm = bits(machInst, 17, 16);
-
-                  if (sz) {
-                      const IntRegIndex vd =
-                          (IntRegIndex)((bits(machInst, 22) << 5) |
-                                        (bits(machInst, 15, 12) << 1));
-                      const IntRegIndex vm =
-                          (IntRegIndex)((bits(machInst, 5) << 5) |
-                                        (bits(machInst, 3, 0) << 1));
-                      switch(rm) {
-                        case 0x0:
-                          return decodeVfpRegRegOp<VRIntAD>(machInst, vd, vm,
-                                                            true);
-                        case 0x1:
-                          return decodeVfpRegRegOp<VRIntND>(machInst, vd, vm,
-                                                            true);
-                        case 0x2:
-                          return decodeVfpRegRegOp<VRIntPD>(machInst, vd, vm,
-                                                            true);
-                        case 0x3:
-                          return decodeVfpRegRegOp<VRIntMD>(machInst, vd, vm,
-                                                            true);
-                        default: return new Unknown(machInst);
-                      }
-                  } else {
-                      const IntRegIndex vd =
-                          (IntRegIndex)(bits(machInst, 22) |
-                                       (bits(machInst, 15, 12) << 1));
-                      const IntRegIndex vm =
-                          (IntRegIndex)(bits(machInst, 5) |
-                                        (bits(machInst, 3, 0) << 1));
-                      switch(rm) {
-                        case 0x0:
-                          return decodeVfpRegRegOp<VRIntAS>(machInst, vd, vm,
-                                                            false);
-                        case 0x1:
-                          return decodeVfpRegRegOp<VRIntNS>(machInst, vd, vm,
-                                                            false);
-                        case 0x2:
-                          return decodeVfpRegRegOp<VRIntPS>(machInst, vd, vm,
-                                                            false);
-                        case 0x3:
-                          return decodeVfpRegRegOp<VRIntMS>(machInst, vd, vm,
-                                                            false);
-                        default: return new Unknown(machInst);
-                      }
-                  }
-            } else if (b31_b24 && !b23 && b11_b9 && b6 && b4){
+            const uint32_t rm = bits(machInst, 17, 16);
+            IntRegIndex vd = decodeFpVd(machInst, size, false);
+            IntRegIndex vm = decodeFpVm(machInst, size, false);
+            IntRegIndex vdInt = decodeFpVd(machInst, size, true);
+            if (b31_b24 && b23 && b21_b19 && b11_b9 && op3 && b4) {
+                if (o1 == 0) {
+                    // VINT* Integer Rounding Instruction
+                    if (size == 3) {
+                        switch(rm) {
+                            case 0x0:
+                            return decodeVfpRegRegOp<VRIntAD>(machInst, vd, vm,
+                                                                true);
+                            case 0x1:
+                            return decodeVfpRegRegOp<VRIntND>(machInst, vd, vm,
+                                                                true);
+                            case 0x2:
+                            return decodeVfpRegRegOp<VRIntPD>(machInst, vd, vm,
+                                                                true);
+                            case 0x3:
+                            return decodeVfpRegRegOp<VRIntMD>(machInst, vd, vm,
+                                                                true);
+                            default: return new Unknown(machInst);
+                        }
+                    } else {
+                        switch(rm) {
+                            case 0x0:
+                            return decodeVfpRegRegOp<VRIntAS>(machInst, vd, vm,
+                                                                false);
+                            case 0x1:
+                            return decodeVfpRegRegOp<VRIntNS>(machInst, vd, vm,
+                                                                false);
+                            case 0x2:
+                            return decodeVfpRegRegOp<VRIntPS>(machInst, vd, vm,
+                                                                false);
+                            case 0x3:
+                            return decodeVfpRegRegOp<VRIntMS>(machInst, vd, vm,
+                                                                false);
+                            default: return new Unknown(machInst);
+                        }
+                    }
+                } else {
+                    const bool op = bits(machInst, 7);
+                    switch(rm) {
+                      case 0x0:
+                        switch(size) {
+                          case 0x0:
+                            return new Unknown(machInst);
+                          case 0x1:
+                            return new FailUnimplemented(
+                                "vcvta.u32.f16", machInst);
+                          case 0x2:
+                            if (op) {
+                                return new VcvtaFpSIntS(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtaFpUIntS(machInst, vdInt, vm);
+                            }
+                          case 0x3:
+                            if (op) {
+                                return new VcvtaFpSIntD(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtaFpUIntD(machInst, vdInt, vm);
+                            }
+                        }
+                      case 0x1:
+                        switch(size) {
+                          case 0x0:
+                            return new Unknown(machInst);
+                          case 0x1:
+                            return new FailUnimplemented(
+                                "vcvtn.u32.f16", machInst);
+                          case 0x2:
+                            if (op) {
+                                return new VcvtnFpSIntS(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtnFpUIntS(machInst, vdInt, vm);
+                            }
+                          case 0x3:
+                            if (op) {
+                                return new VcvtnFpSIntD(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtnFpUIntD(machInst, vdInt, vm);
+                            }
+                        }
+                      case 0x2:
+                        switch(size) {
+                          case 0x0:
+                            return new Unknown(machInst);
+                          case 0x1:
+                            return new FailUnimplemented(
+                                "vcvtp.u32.f16", machInst);
+                          case 0x2:
+                            if (op) {
+                                return new VcvtpFpSIntS(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtpFpUIntS(machInst, vdInt, vm);
+                            }
+                          case 0x3:
+                            if (op) {
+                                return new VcvtpFpSIntD(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtpFpUIntD(machInst, vdInt, vm);
+                            }
+                        }
+                      case 0x3:
+                        switch(size) {
+                          case 0x0:
+                            return new Unknown(machInst);
+                          case 0x1:
+                            return new FailUnimplemented(
+                                "vcvtm.u32.f16", machInst);
+                          case 0x2:
+                            if (op) {
+                                return new VcvtmFpSIntS(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtmFpUIntS(machInst, vdInt, vm);
+                            }
+                          case 0x3:
+                            if (op) {
+                                return new VcvtmFpSIntD(machInst, vdInt, vm);
+                            } else {
+                                return new VcvtmFpUIntD(machInst, vdInt, vm);
+                            }
+                        }
+                    }
+                }
+            } else if (b31_b24 && !b23 && b11_b9 && !op3 && b4){
                 // VSEL* floating point conditional select
 
                 ConditionCode cond;
@@ -2079,24 +2175,12 @@
                   case 0x3: cond = COND_GT; break;
                 }
 
-                if (sz) {
-                      const IntRegIndex vd =
-                          (IntRegIndex)((bits(machInst, 22) << 5) |
-                                        (bits(machInst, 15, 12) << 1));
-                      const IntRegIndex vm =
-                          (IntRegIndex)((bits(machInst, 5) << 5) |
-                                        (bits(machInst, 3, 0) << 1));
+                if (size == 3) {
                       const IntRegIndex vn =
                           (IntRegIndex)((bits(machInst, 7) << 5) |
                                        (bits(machInst, 19, 16) << 1));
                     return new VselD(machInst, vd, vn, vm, cond);
                 } else {
-                      const IntRegIndex vd =
-                          (IntRegIndex)(bits(machInst, 22) |
-                                       (bits(machInst, 15, 12) << 1));
-                      const IntRegIndex vm =
-                          (IntRegIndex)(bits(machInst, 5) |
-                                        (bits(machInst, 3, 0) << 1));
                       const IntRegIndex vn =
                           (IntRegIndex)((bits(machInst, 19, 16) << 1) |
                                         bits(machInst, 7));
diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa
index dcf5889..d8323c4 100644
--- a/src/arch/arm/isa/insts/fp.isa
+++ b/src/arch/arm/isa/insts/fp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2013,2016 ARM Limited
+// Copyright (c) 2010-2013,2016,2018-2019 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -993,85 +993,96 @@
     decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntDRIop);
     exec_output += PredOpExecute.subst(vcvtFpSIntDRIop);
 
-    vcvtFpUIntSCode = vfpEnabledCheckCode + '''
+    round_mode_suffix_to_mode = {
+        '': 'VfpRoundZero',
+        'a': 'VfpRoundAway',
+        'm': 'VfpRoundDown',
+        'n': 'VfpRoundNearest',
+        'p': 'VfpRoundUpward',
+    }
+
+    def buildVcvt(code, className, roundModeSuffix):
+        global header_output, decoder_output, exec_output, \
+            vfpEnabledCheckCode, round_mode_suffix_to_mode
+        full_code = vfpEnabledCheckCode + code.format(
+            round_mode=round_mode_suffix_to_mode[roundModeSuffix],
+        )
+        iop = InstObjParams(
+            "vcvt{}".format(roundModeSuffix),
+            className.format(roundModeSuffix),
+            "FpRegRegOp",
+            { "code": full_code,
+              "predicate_test": predicateTest,
+              "op_class": "SimdFloatCvtOp" },
+            []
+        )
+        header_output += FpRegRegOpDeclare.subst(iop);
+        decoder_output += FpRegRegOpConstructor.subst(iop);
+        exec_output += PredOpExecute.subst(iop);
+
+    code = '''
         FPSCR fpscr = (FPSCR) FpscrExc;
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_uw = vfpFpToFixed<float>(FpOp1, false, 32, 0);
+        FpDest_uw = vfpFpToFixed<float>(
+            FpOp1, false, 32, 0, true, {round_mode});
         __asm__ __volatile__("" :: "m" (FpDest_uw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
     '''
-    vcvtFpUIntSIop = InstObjParams("vcvt", "VcvtFpUIntS", "FpRegRegOp",
-                                     { "code": vcvtFpUIntSCode,
-                                       "predicate_test": predicateTest,
-                                       "op_class": "SimdFloatCvtOp" }, [])
-    header_output += FpRegRegOpDeclare.subst(vcvtFpUIntSIop);
-    decoder_output += FpRegRegOpConstructor.subst(vcvtFpUIntSIop);
-    exec_output += PredOpExecute.subst(vcvtFpUIntSIop);
+    for round_mode_suffix in round_mode_suffix_to_mode:
+        buildVcvt(code, "Vcvt{}FpUIntS", round_mode_suffix)
 
-    vcvtFpUIntDCode = vfpEnabledCheckCode + '''
+    code = '''
         FPSCR fpscr = (FPSCR) FpscrExc;
         double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw);
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        uint64_t result = vfpFpToFixed<double>(cOp1, false, 32, 0);
+        uint64_t result = vfpFpToFixed<double>(
+            cOp1, false, 32, 0, true, {round_mode});
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
         FpscrExc = fpscr;
     '''
-    vcvtFpUIntDIop = InstObjParams("vcvt", "VcvtFpUIntD", "FpRegRegOp",
-                                     { "code": vcvtFpUIntDCode,
-                                       "predicate_test": predicateTest,
-                                       "op_class": "SimdFloatCvtOp" }, [])
-    header_output += FpRegRegOpDeclare.subst(vcvtFpUIntDIop);
-    decoder_output += FpRegRegOpConstructor.subst(vcvtFpUIntDIop);
-    exec_output += PredOpExecute.subst(vcvtFpUIntDIop);
+    for round_mode_suffix in round_mode_suffix_to_mode:
+        buildVcvt(code, "Vcvt{}FpUIntD", round_mode_suffix)
 
-    vcvtFpSIntSCode = vfpEnabledCheckCode + '''
+    code = '''
         FPSCR fpscr = (FPSCR) FpscrExc;
         vfpFlushToZero(fpscr, FpOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
-        FpDest_sw = vfpFpToFixed<float>(FpOp1, true, 32, 0);
+        FpDest_sw = vfpFpToFixed<float>(
+            FpOp1, true, 32, 0, true, {round_mode});
         __asm__ __volatile__("" :: "m" (FpDest_sw));
         finishVfp(fpscr, state, fpscr.fz);
         FpscrExc = fpscr;
     '''
-    vcvtFpSIntSIop = InstObjParams("vcvt", "VcvtFpSIntS", "FpRegRegOp",
-                                     { "code": vcvtFpSIntSCode,
-                                       "predicate_test": predicateTest,
-                                       "op_class": "SimdFloatCvtOp" }, [])
-    header_output += FpRegRegOpDeclare.subst(vcvtFpSIntSIop);
-    decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntSIop);
-    exec_output += PredOpExecute.subst(vcvtFpSIntSIop);
+    for round_mode_suffix in round_mode_suffix_to_mode:
+        buildVcvt(code, "Vcvt{}FpSIntS", round_mode_suffix)
 
-    vcvtFpSIntDCode = vfpEnabledCheckCode + '''
+    code = '''
         FPSCR fpscr = (FPSCR) FpscrExc;
         double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw);
         vfpFlushToZero(fpscr, cOp1);
         VfpSavedState state = prepFpState(fpscr.rMode);
         fesetround(FeRoundZero);
         __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
-        int64_t result = vfpFpToFixed<double>(cOp1, true, 32, 0);
+        int64_t result = vfpFpToFixed<double>(
+            cOp1, true, 32, 0, true, {round_mode});
         __asm__ __volatile__("" :: "m" (result));
         finishVfp(fpscr, state, fpscr.fz);
         FpDestP0_uw = result;
         FpscrExc = fpscr;
     '''
-    vcvtFpSIntDIop = InstObjParams("vcvt", "VcvtFpSIntD", "FpRegRegOp",
-                                     { "code": vcvtFpSIntDCode,
-                                       "predicate_test": predicateTest,
-                                       "op_class": "SimdFloatCvtOp" }, [])
-    header_output += FpRegRegOpDeclare.subst(vcvtFpSIntDIop);
-    decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntDIop);
-    exec_output += PredOpExecute.subst(vcvtFpSIntDIop);
+    for round_mode_suffix in round_mode_suffix_to_mode:
+        buildVcvt(code, "Vcvt{}FpSIntD", round_mode_suffix)
 
     vcvtFpSFpDCode = vfpEnabledCheckCode + '''
         FPSCR fpscr = (FPSCR) FpscrExc;