src/arch/arm/insts/vfp.cc - public/gem5 - Git at Google

 /*
  * Copyright (c) 2010-2013, 2019 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
  * not be construed as granting a license to any other intellectual
  * property including but not limited to intellectual property relating
  * to a hardware implementation of the functionality of the software
  * licensed hereunder.  You may use the software subject to the license
  * terms below provided that you ensure that this notice is replicated
  * unmodified and in its entirety in all distributions of the software,
  * modified or unmodified, in source code or in binary form.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met: redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer;
  * redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution;
  * neither the name of the copyright holders nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "arch/arm/insts/vfp.hh"

 namespace gem5
 {

 using namespace ArmISA;

 /*
  * The asm statements below are to keep gcc from reordering code. Otherwise
  * the rounding mode might be set after the operation it was intended for, the
  * exception bits read before it, etc.
  */

 /**
  * Build the disassembly text for a floating-point conditional compare:
  * the mnemonic, op1 and op2 (both printed via printIntReg), the defCc
  * immediate and finally the condition code.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpCondCompRegOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     std::stringstream disasm;
     printMnemonic(disasm, "", false);
     printIntReg(disasm, op1);
     disasm << ", ";
     printIntReg(disasm, op2);
     // Emit the immediate together with the separator that precedes the
     // condition code.
     ccprintf(disasm, ", #%d, ", defCc);
     printCondition(disasm, condCode, true);
     return disasm.str();
 }

 /**
  * Build the disassembly text for a floating-point conditional select:
  * the mnemonic, then dest, op1 and op2 (each printed via printIntReg),
  * then the condition code.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpCondSelOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     std::stringstream disasm;
     printMnemonic(disasm, "", false);
     printIntReg(disasm, dest);
     disasm << ", ";
     printIntReg(disasm, op1);
     disasm << ", ";
     printIntReg(disasm, op2);
     disasm << ", ";
     printCondition(disasm, condCode, true);
     return disasm.str();
 }

 /**
  * Build the disassembly text for a two-register floating-point
  * operation: the mnemonic followed by dest and op1.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     return out.str();
 }

 /**
  * Build the disassembly text for a register/immediate floating-point
  * operation: the mnemonic, dest, then the immediate printed as "#<imm>".
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegImmOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     std::stringstream out;

     printMnemonic(out);
     printFloatReg(out, dest);
     ccprintf(out, ", #%d", imm);

     return out.str();
 }

 /**
  * Build the disassembly text for a register/register/immediate
  * floating-point operation: the mnemonic, dest, op1, then "#<imm>".
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegImmOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     ccprintf(out, ", #%d", imm);
     return out.str();
 }

 /**
  * Build the disassembly text for a three-register floating-point
  * operation: the mnemonic followed by dest, op1 and op2.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegRegOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     out << separator;
     printFloatReg(out, op2);
     return out.str();
 }

 /**
  * Build the disassembly text for a conditional three-register
  * floating-point operation: the mnemonic, the condition (printed
  * directly after the mnemonic), then dest, op1 and op2.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegRegCondOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printCondition(out, cond);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     out << separator;
     printFloatReg(out, op2);
     return out.str();
 }

 /**
  * Build the disassembly text for a four-register floating-point
  * operation: the mnemonic followed by dest, op1, op2 and op3.
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegRegRegOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     out << separator;
     printFloatReg(out, op2);
     out << separator;
     printFloatReg(out, op3);
     return out.str();
 }

 /**
  * Build the disassembly text for a three-register plus immediate
  * floating-point operation: the mnemonic, dest, op1, op2, then "#<imm>".
  *
  * @param pc Instruction address; not used by this implementation.
  * @param symtab Symbol table; not used by this implementation.
  * @return The assembled disassembly string.
  */
 std::string
 FpRegRegRegImmOp::generateDisassembly(
         Addr pc, const loader::SymbolTable *symtab) const
 {
     const char *const separator = ", ";
     std::stringstream out;
     printMnemonic(out);
     printFloatReg(out, dest);
     out << separator;
     printFloatReg(out, op1);
     out << separator;
     printFloatReg(out, op2);
     ccprintf(out, ", #%d", imm);
     return out.str();
 }

 namespace ArmISA
 {

 /**
  * Prepare the host floating-point environment for emulating an
  * operation: remember the current host rounding mode, clear all host
  * exception flags, and select the host rounding mode matching rMode.
  *
  * @param rMode Requested rounding mode (one of the VfpRound* values);
  *              any other value leaves the host rounding mode untouched.
  * @return The host rounding mode that was active on entry, to be handed
  *         back to finishVfp() later.
  */
 VfpSavedState
 prepFpState(uint32_t rMode)
 {
     // Capture the host mode before anything below changes it.
     const int savedRounding = fegetround();
     feclearexcept(FeAllExceptions);

     if (rMode == VfpRoundNearest) {
         fesetround(FeRoundNearest);
     } else if (rMode == VfpRoundUpward) {
         fesetround(FeRoundUpward);
     } else if (rMode == VfpRoundDown) {
         fesetround(FeRoundDown);
     } else if (rMode == VfpRoundZero) {
         fesetround(FeRoundZero);
     }

     return savedRounding;
 }

 /**
  * Transfer the host floating-point exception flags into the cumulative
  * exception bits of fpscr and restore a previously saved host rounding
  * mode.
  *
  * @param fpscr Status register whose ioc/dzc/ofc/ufc/ixc bits are set
  *              (never cleared) according to the host flags.
  * @param state Host rounding mode to reinstate, typically the value
  *              returned by prepFpState().
  * @param flush Whether flush-to-zero is active; when it is, an inexact
  *              flag that accompanies an underflow is not reported.
  * @param mask  Only exception bits that are set in mask may be raised
  *              in fpscr.
  */
 void
 finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
 {
     const int raised = fetestexcept(FeAllExceptions);
     const bool sawUnderflow = (raised & FeUnderflow) != 0;

     if (mask.ioc && (raised & FeInvalid))
         fpscr.ioc = 1;
     if (mask.dzc && (raised & FeDivByZero))
         fpscr.dzc = 1;
     if (mask.ofc && (raised & FeOverflow))
         fpscr.ofc = 1;
     if (mask.ufc && sawUnderflow)
         fpscr.ufc = 1;
     // Inexact is suppressed when it merely accompanies an underflow that
     // is handled by flushing to zero.
     if (mask.ixc && (raised & FeInexact) && !(sawUnderflow && flush))
         fpscr.ixc = 1;

     fesetround(state);
 }

 /**
  * Post-process the host result of a one-operand operation.
  *
  * A NaN result is replaced either by the default quiet NaN (when
  * defaultNan is set or op1 was not a NaN) or by op1 with its quiet bit
  * set. A subnormal result is replaced by a zero of the same sign when
  * flush is set; in that case the host inexact flag is cleared and the
  * host underflow flag is raised. Any other result is returned unchanged.
  *
  * @param flush      Flush subnormal results to zero.
  * @param defaultNan Always produce the default quiet NaN for NaN results.
  * @param val        Result computed on the host.
  * @param op1        Source operand of the operation.
  * @return The corrected result.
  */
 template <class fpType>
 fpType
 fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
 {
     fpType junk = 0.0;
     switch (std::fpclassify(val)) {
       case FP_NAN:
         {
             const uint64_t qnan = (sizeof(val) == sizeof(float)) ?
                 0x7fc00000 : 0x7ff8000000000000ULL;
             if (defaultNan || !std::isnan(op1)) {
                 val = bitsToFp(qnan, junk);
             } else {
                 // Propagate the operand NaN, quietened.
                 val = bitsToFp(fpToBits(op1) | qnan, junk);
             }
         }
         break;
       case FP_SUBNORMAL:
         if (flush) {
             // Keep only the sign bit, i.e. a correctly signed zero.
             const uint64_t signMask = 0x1ULL << (sizeof(fpType) * 8 - 1);
             val = bitsToFp(fpToBits(val) & signMask, junk);
             feclearexcept(FeInexact);
             feraiseexcept(FeUnderflow);
         }
         break;
       default:
         break;
     }
     return val;
 }

 template
 float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
 template
 double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

 /**
  * Post-process the host result of a two-operand operation.
  *
  * For a NaN result the returned value is chosen in this order: the
  * default quiet NaN (if defaultNan is set or neither operand is a NaN),
  * op1 quietened if it is a signalling NaN, op2 quietened if it is a
  * signalling NaN, op1 if it is a NaN, otherwise op2. A subnormal result
  * is replaced by a zero of the same sign when flush is set; in that case
  * the host inexact flag is cleared and the host underflow flag is raised.
  * Any other result is returned unchanged.
  *
  * @param flush      Flush subnormal results to zero.
  * @param defaultNan Always produce the default quiet NaN for NaN results.
  * @param val        Result computed on the host.
  * @param op1        First source operand.
  * @param op2        Second source operand.
  * @return The corrected result.
  */
 template <class fpType>
 fpType
 fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
 {
     fpType junk = 0.0;
     const int valClass = std::fpclassify(val);

     if (valClass == FP_SUBNORMAL && flush) {
         // Keep only the sign bit, i.e. a correctly signed zero.
         const uint64_t signMask = 0x1ULL << (sizeof(fpType) * 8 - 1);
         val = bitsToFp(fpToBits(val) & signMask, junk);
         feclearexcept(FeInexact);
         feraiseexcept(FeUnderflow);
         return val;
     }
     if (valClass != FP_NAN)
         return val;

     const uint64_t qnan = (sizeof(val) == sizeof(float)) ?
         0x7fc00000 : 0x7ff8000000000000ULL;
     const bool nan1 = std::isnan(op1);
     const bool nan2 = std::isnan(op2);
     // A NaN whose quiet bit is clear is a signalling NaN.
     const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
     const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);

     if (defaultNan || (!nan1 && !nan2))
         return bitsToFp(qnan, junk);
     if (signal1)
         return bitsToFp(fpToBits(op1) | qnan, junk);
     if (signal2)
         return bitsToFp(fpToBits(op2) | qnan, junk);
     // At least one operand is a (quiet) NaN here; op1 takes priority.
     return nan1 ? op1 : op2;
 }

 template
 float fixDest<float>(bool flush, bool defaultNan,
                      float val, float op1, float op2);
 template
 double fixDest<double>(bool flush, bool defaultNan,
                        double val, double op1, double op2);

 /**
  * Post-process the host result of a division op1 / op2.
  *
  * The result is first corrected with the two-operand fixDest(). If the
  * raw host result val is exactly the smallest normal number of either
  * sign (exponent field 1, mantissa 0), the division is repeated with
  * the host rounding mode set to FeRoundZero to find out whether the
  * result only became normal through rounding; if the truncated result
  * is flushed by flushToZero(), the host underflow flag is raised and,
  * when flush is set, the flushed value replaces the result.
  *
  * Note that this function changes the host rounding mode and does not
  * restore it.
  *
  * @param flush      Flush subnormal results to zero.
  * @param defaultNan Always produce the default quiet NaN for NaN results.
  * @param val        Quotient computed on the host.
  * @param op1        Dividend.
  * @param op2        Divisor.
  * @return The corrected quotient.
  */
 template <class fpType>
 fpType
 fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
 {
     fpType mid = fixDest(flush, defaultNan, val, op1, op2);
     const bool single = (sizeof(fpType) == sizeof(float));
     const fpType junk = 0.0;
     // Bit patterns below are +/- the smallest normal number.
     if ((single && (val == bitsToFp(0x00800000, junk) ||
                     val == bitsToFp(0x80800000, junk))) ||
         (!single && (val == bitsToFp(0x0010000000000000ULL, junk) ||
                      val == bitsToFp(0x8010000000000000ULL, junk)))
         ) {
         // Empty asm statements keep the compiler from moving the
         // division across the rounding mode change.
         __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
         fesetround(FeRoundZero);
         fpType temp = 0.0;
         __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
         temp = op1 / op2;
         // flushToZero() is defined elsewhere; presumably it zeroes a
         // subnormal argument in place and reports whether it did.
         if (flushToZero(temp)) {
             feraiseexcept(FeUnderflow);
             if (flush) {
                 feclearexcept(FeInexact);
                 mid = temp;
             }
         }
         __asm__ __volatile__("" :: "m" (temp));
     }
     return mid;
 }

 template
 float fixDivDest<float>(bool flush, bool defaultNan,
                         float val, float op1, float op2);
 template
 double fixDivDest<double>(bool flush, bool defaultNan,
                           double val, double op1, double op2);

 /**
  * Convert a double to single precision and correct the host result.
  *
  * If val is a NaN, a single-precision NaN is built from its sign and
  * mantissa bits 50:29 (with the exponent and quiet bit all ones) and
  * used as the operand for NaN propagation in fixDest(). When fpscr.fz
  * is set and the host reports both underflow and inexact, the inexact
  * flag is cleared. If the corrected result is exactly the smallest
  * normal float of either sign, the conversion is repeated with the host
  * rounding mode set to FeRoundZero; if that truncated result is flushed
  * by flushToZero(), underflow is raised and, with fpscr.fz set, the
  * flushed value is returned.
  *
  * Note that this function may change the host rounding mode and does
  * not restore it.
  *
  * @param fpscr Supplies the fz (flush-to-zero) and dn (default NaN) bits.
  * @param val   Double-precision value to convert.
  * @return The single-precision result.
  */
 float
 fixFpDFpSDest(FPSCR fpscr, double val)
 {
     const float junk = 0.0;
     float op1 = 0.0;
     if (std::isnan(val)) {
         uint64_t valBits = fpToBits(val);
         // mask(9) << 22 covers the 8 exponent bits and the quiet bit.
         uint32_t op1Bits = bits(valBits, 50, 29) |
                            (mask(9) << 22) |
                            (bits(valBits, 63) << 31);
         op1 = bitsToFp(op1Bits, junk);
     }
     float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
     if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                     (FeUnderflow | FeInexact)) {
         feclearexcept(FeInexact);
     }
     // Bit patterns below are +/- the smallest normal float.
     if (mid == bitsToFp(0x00800000, junk) ||
         mid == bitsToFp(0x80800000, junk)) {
         // Empty asm statements keep the compiler from moving the
         // conversion across the rounding mode change.
         __asm__ __volatile__("" : "=m" (val) : "m" (val));
         fesetround(FeRoundZero);
         float temp = 0.0;
         __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
         temp = val;
         if (flushToZero(temp)) {
             feraiseexcept(FeUnderflow);
             if (fpscr.fz) {
                 feclearexcept(FeInexact);
                 mid = temp;
             }
         }
         __asm__ __volatile__("" :: "m" (temp));
     }
     return mid;
 }

 /**
  * Convert a float to double precision and correct the host result.
  *
  * If val is a NaN, a double-precision NaN is built from its sign and
  * mantissa bits 21:0 (with the exponent and quiet bit all ones) and
  * used as the operand for NaN propagation in fixDest(). If the
  * corrected result is exactly the smallest normal double of either
  * sign, the conversion is repeated with the host rounding mode set to
  * FeRoundZero; if that result is flushed by flushToZero(), underflow is
  * raised and, with fpscr.fz set, the flushed value is returned.
  *
  * Note that this function may change the host rounding mode and does
  * not restore it.
  *
  * @param fpscr Supplies the fz (flush-to-zero) and dn (default NaN) bits.
  * @param val   Single-precision value to convert.
  * @return The double-precision result.
  */
 double
 fixFpSFpDDest(FPSCR fpscr, float val)
 {
     const double junk = 0.0;
     double op1 = 0.0;
     if (std::isnan(val)) {
         uint32_t valBits = fpToBits(val);
         // mask(12) << 51 covers the 11 exponent bits and the quiet bit.
         uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                            (mask(12) << 51) |
                            ((uint64_t)bits(valBits, 31) << 63);
         op1 = bitsToFp(op1Bits, junk);
     }
     double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
     // Bit patterns below are +/- the smallest normal double.
     if (mid == bitsToFp(0x0010000000000000ULL, junk) ||
         mid == bitsToFp(0x8010000000000000ULL, junk)) {
         // Empty asm statements keep the compiler from moving the
         // conversion across the rounding mode change.
         __asm__ __volatile__("" : "=m" (val) : "m" (val));
         fesetround(FeRoundZero);
         double temp = 0.0;
         __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
         temp = val;
         if (flushToZero(temp)) {
             feraiseexcept(FeUnderflow);
             if (fpscr.fz) {
                 feclearexcept(FeInexact);
                 mid = temp;
             }
         }
         __asm__ __volatile__("" :: "m" (temp));
     }
     return mid;
 }

 /**
  * Convert the raw bits of a single- or double-precision value to a
  * half-precision bit pattern, entirely in integer arithmetic.
  *
  * The result is laid out as sign in bit 15, exponent in bits 14:10 and
  * mantissa in bits 9:0. Exception conditions are recorded directly in
  * fpscr (ioc, ufc, ofc, ixc); the host floating-point environment is
  * not touched.
  *
  * @param fpscr      Status register; exception bits are set here and
  *                   its ufe bit is read for underflow reporting.
  * @param flush      Flush-to-zero indication; the code asserts that no
  *                   denormal operand arrives while it is set.
  * @param defaultNan Return the default NaN (positive, quiet bit only)
  *                   for NaN operands.
  * @param rMode      Rounding mode, compared against the VfpRound* values.
  * @param ahp        Selects the alternative handling of NaNs,
  *                   infinities and overflow (presumably ARM's
  *                   "alternative half-precision" format, which has no
  *                   NaN/infinity encodings -- confirm against callers).
  * @param opBits     Raw bits of the source operand.
  * @param isDouble   True if opBits holds a double, false for a float.
  * @return The half-precision bit pattern.
  */
 static inline uint16_t
 vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
 {
     uint32_t mWidth;
     uint32_t eWidth;
     uint32_t eHalfRange;
     uint32_t sBitPos;

     // Mantissa and exponent field widths of the source format.
     if (isDouble) {
         mWidth = 52;
         eWidth = 11;
     } else {
         mWidth = 23;
         eWidth = 8;
     }
     sBitPos    = eWidth + mWidth;
     // Exponent bias of the source format.
     eHalfRange = (1 << (eWidth-1)) - 1;

     // Extract the operand.
     bool neg = bits(opBits, sBitPos);
     uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
     uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
     // Top 10 mantissa bits; the remainder is kept in "extra" below.
     uint32_t mantissa = oldMantissa >> (mWidth - 10);
     // Do the conversion.
     uint64_t extra = oldMantissa & mask(mWidth - 10);
     if (exponent == mask(eWidth)) {
         if (oldMantissa != 0) {
             // Nans.
             if (bits(mantissa, 9) == 0) {
                 // Signalling nan.
                 fpscr.ioc = 1;
             }
             if (ahp) {
                 // NaN operands become zero (keeping the sign) and are
                 // reported as invalid.
                 mantissa = 0;
                 exponent = 0;
                 fpscr.ioc = 1;
             } else if (defaultNan) {
                 mantissa = (1 << 9);
                 exponent = 0x1f;
                 neg = false;
             } else {
                 // Propagate the NaN with its quiet bit set.
                 exponent = 0x1f;
                 mantissa |= (1 << 9);
             }
         } else {
             // Infinities.
             exponent = 0x1F;
             if (ahp) {
                 // Saturate to the largest magnitude and report invalid.
                 fpscr.ioc = 1;
                 mantissa = 0x3ff;
             } else {
                 mantissa = 0;
             }
         }
     } else if (exponent == 0 && oldMantissa == 0) {
         // Zero, don't need to do anything.
     } else {
         // Normalized or denormalized numbers.

         bool inexact = (extra != 0);

         if (exponent == 0) {
             // Denormalized.
             // If flush to zero is on, this shouldn't happen.
             assert(!flush);

             // Check for underflow
             if (inexact || fpscr.ufe)
                 fpscr.ufc = 1;

             // Handle rounding.
             // NOTE(review): "extra" holds mWidth - 10 dropped bits, but
             // the round-to-nearest test compares it against 1 << 9
             // regardless of mWidth -- confirm this is intended.
             unsigned mode = rMode;
             if ((mode == VfpRoundUpward && !neg && extra) ||
                 (mode == VfpRoundDown && neg && extra) ||
                 (mode == VfpRoundNearest &&
                  (extra > (1 << 9) ||
                   (extra == (1 << 9) && bits(mantissa, 0))))) {
                 mantissa++;
             }

             // See if the number became normalized after rounding.
             if (mantissa == (1 << 10)) {
                 mantissa = 0;
                 exponent = 1;
             }
         } else {
             // Normalized.

             // We need to track the dropped bits differently since
             // more can be dropped by denormalizing.
             // topOne is the most significant dropped bit, restZeros
             // tells whether all dropped bits below it are zero.
             bool topOne = bits(extra, mWidth - 10 - 1);
             bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

             if (exponent <= (eHalfRange - 15)) {
                 // The result is too small. Denormalize.
                 // Make the implicit leading one explicit, then shift
                 // right until the exponent is in range or nothing is
                 // left of the mantissa.
                 mantissa |= (1 << 10);
                 while (mantissa && exponent <= (eHalfRange - 15)) {
                     restZeros = restZeros && !topOne;
                     topOne = bits(mantissa, 0);
                     mantissa = mantissa >> 1;
                     exponent++;
                 }
                 if (topOne || !restZeros)
                     inexact = true;
                 exponent = 0;
             } else {
                 // Change bias.
                 exponent -= (eHalfRange - 15);
             }

             if (exponent == 0 && (inexact || fpscr.ufe)) {
                 // Underflow
                 fpscr.ufc = 1;
             }

             // Handle rounding.
             unsigned mode = rMode;
             bool nonZero = topOne || !restZeros;
             if ((mode == VfpRoundUpward && !neg && nonZero) ||
                 (mode == VfpRoundDown && neg && nonZero) ||
                 (mode == VfpRoundNearest && topOne &&
                  (!restZeros || bits(mantissa, 0)))) {
                 mantissa++;
             }

             // See if we rounded up and need to bump the exponent.
             if (mantissa == (1 << 10)) {
                 mantissa = 0;
                 exponent++;
             }

             // Deal with overflow
             if (ahp) {
                 if (exponent >= 0x20) {
                     exponent = 0x1f;
                     mantissa = 0x3ff;
                     fpscr.ioc = 1;
                     // Suppress inexact exception.
                     inexact = false;
                 }
             } else {
                 if (exponent >= 0x1f) {
                     if ((mode == VfpRoundNearest) ||
                         (mode == VfpRoundUpward && !neg) ||
                         (mode == VfpRoundDown && neg)) {
                         // Overflow to infinity.
                         exponent = 0x1f;
                         mantissa = 0;
                     } else {
                         // Overflow to max normal.
                         exponent = 0x1e;
                         mantissa = 0x3ff;
                     }
                     fpscr.ofc = 1;
                     inexact = true;
                 }
             }
         }

         if (inexact) {
             fpscr.ixc = 1;
         }
     }
     // Reassemble and install the result.
     uint32_t result = bits(mantissa, 9, 0);
     replaceBits(result, 14, 10, exponent);
     if (neg)
         result |= (1 << 15);
     return result;
 }

 /**
  * Convert a single-precision value to a half-precision bit pattern.
  * Thin wrapper that hands the raw bits of op to vcvtFpFpH() with
  * isDouble = false; see that function for the parameter semantics.
  */
 uint16_t
 vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
            uint32_t rMode, bool ahp, float op)
 {
     return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp,
                      fpToBits(op), false);
 }

 /**
  * Convert a double-precision value to a half-precision bit pattern.
  * Thin wrapper that hands the raw bits of op to vcvtFpFpH() with
  * isDouble = true; see that function for the parameter semantics.
  */
 uint16_t
 vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
            uint32_t rMode, bool ahp, double op)
 {
     return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp,
                      fpToBits(op), true);
 }

 /**
  * Expand a half-precision bit pattern to the raw bits of a single- or
  * double-precision value, entirely in integer arithmetic.
  *
  * @param fpscr      Status register; ioc is set when a signalling NaN is
  *                   converted.
  * @param defaultNan Replace NaN operands by the positive NaN that has
  *                   only the quiet bit set.
  * @param ahp        When set, an exponent field of 0x1f is treated like
  *                   any other normal exponent instead of NaN/infinity.
  * @param op         Half-precision bit pattern (sign 15, exponent 14:10,
  *                   mantissa 9:0).
  * @param isDouble   Produce double-precision bits if true, otherwise
  *                   single-precision bits.
  * @return Raw bits of the converted value.
  */
 static inline uint64_t
 vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
 {
     // Field geometry of the destination format.
     const uint32_t mWidth = isDouble ? 52 : 23;
     const uint32_t eWidth = isDouble ? 11 : 8;
     const uint32_t sBitPos = eWidth + mWidth;
     const uint32_t eHalfRange = (1 << (eWidth-1)) - 1;
     // Shift that aligns the 10 bit half mantissa with the top of the
     // destination mantissa, and the exponent bias difference.
     const uint32_t mShift = mWidth - 10;
     const uint32_t rebias = eHalfRange - 15;

     // Split the operand into its fields.
     bool neg = bits(op, 15);
     uint32_t exponent = bits(op, 14, 10);
     uint64_t mantissa = bits(op, 9, 0);

     if (exponent == 0) {
         // Zero or half-precision denormal.
         if (mantissa != 0) {
             // Shift left until the leading one reaches bit 10, lowering
             // the exponent by one for every step.
             exponent = rebias + 1;
             while (mantissa < (1 << 10)) {
                 mantissa <<= 1;
                 exponent--;
             }
         }
         mantissa <<= mShift;
     } else if (exponent == 0x1f && !ahp) {
         // Infinity or NaN.
         exponent = mask(eWidth);
         if (mantissa != 0) {
             mantissa <<= mShift;
             if (bits(mantissa, mWidth-1) == 0) {
                 // Signalling NaN: report it and set the quiet bit.
                 fpscr.ioc = 1;
                 mantissa |= (((uint64_t) 1) << (mWidth-1));
             }
             if (defaultNan) {
                 // Drop the payload below the quiet bit and the sign.
                 mantissa &= ~mask(mWidth-1);
                 neg = false;
             }
         }
     } else {
         // Ordinary normal number: re-bias and widen.
         exponent += rebias;
         mantissa <<= mShift;
     }

     // Put the fields back together in the destination layout.
     uint64_t result = bits(mantissa, mWidth-1, 0);
     replaceBits(result, sBitPos-1, mWidth, exponent);
     if (neg) {
         result |= (((uint64_t) 1) << sBitPos);
     }
     return result;
 }

 /**
  * Convert a half-precision bit pattern to a double. Thin wrapper that
  * calls vcvtFpHFp() with isDouble = true and reinterprets the returned
  * bits as a double.
  */
 double
 vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
 {
     // Only the type of this value matters; it selects the bitsToFp
     // overload.
     const double typeTag = 0.0;
     const uint64_t rawBits = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
     return bitsToFp(rawBits, typeTag);
 }

 /**
  * Convert a half-precision bit pattern to a float. Thin wrapper that
  * calls vcvtFpHFp() with isDouble = false and reinterprets the returned
  * bits as a float.
  */
 float
 vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
 {
     // Only the type of this value matters; it selects the bitsToFp
     // overload.
     const float typeTag = 0.0;
     const uint64_t rawBits = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
     return bitsToFp(rawBits, typeTag);
 }

 /**
  * Convert an unsigned fixed-point number to single precision.
  *
  * The low width bits of val are interpreted as an unsigned integer and
  * divided by 2^imm. The host rounding mode is set to FeRoundNearest and
  * all host exception flags are cleared before the division; the
  * quotient is then corrected by fixDivDest().
  *
  * @param flush      Passed on to fixDivDest().
  * @param defaultNan Passed on to fixDivDest().
  * @param val        Fixed-point value.
  * @param width      Width of the value in bits: 16, 32 or 64. Any other
  *                   value triggers a panic.
  * @param imm        Exponent of the power-of-two scale divisor.
  * @return The converted value.
  */
 float
 vfpUFixedToFpS(bool flush, bool defaultNan,
         uint64_t val, uint8_t width, uint8_t imm)
 {
     fesetround(FeRoundNearest);
     // Discard any bits above the requested width.
     if (width == 16)
         val = (uint16_t)val;
     else if (width == 32)
         val = (uint32_t)val;
     else if (width != 64)
         panic("Unsupported width %d", width);
     float scale = powf(2.0, imm);
     // Empty asm statements keep the compiler from reordering the flag
     // clearing relative to the computations around it.
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     feclearexcept(FeAllExceptions);
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
 }

 /**
  * Convert a signed fixed-point number to single precision.
  *
  * val is narrowed to width bits with szext<> (defined elsewhere;
  * presumably a sign extension from that width) and divided by 2^imm.
  * The host rounding mode is set to FeRoundNearest and all host
  * exception flags are cleared before the division; the quotient is
  * then corrected by fixDivDest().
  *
  * @param flush      Passed on to fixDivDest().
  * @param defaultNan Passed on to fixDivDest().
  * @param val        Fixed-point value.
  * @param width      Width of the value in bits: 16, 32 or 64. Any other
  *                   value triggers a panic.
  * @param imm        Exponent of the power-of-two scale divisor.
  * @return The converted value.
  */
 float
 vfpSFixedToFpS(bool flush, bool defaultNan,
         int64_t val, uint8_t width, uint8_t imm)
 {
     fesetround(FeRoundNearest);
     if (width == 16)
         val = szext<16>(val);
     else if (width == 32)
         val = szext<32>(val);
     else if (width != 64)
         panic("Unsupported width %d", width);

     float scale = powf(2.0, imm);
     // Empty asm statements keep the compiler from reordering the flag
     // clearing relative to the computations around it.
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     feclearexcept(FeAllExceptions);
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
 }


 /**
  * Convert an unsigned fixed-point number to double precision.
  *
  * The low width bits of val are interpreted as an unsigned integer and
  * divided by 2^imm. The host rounding mode is set to FeRoundNearest and
  * all host exception flags are cleared before the division; the
  * quotient is then corrected by fixDivDest().
  *
  * @param flush      Passed on to fixDivDest().
  * @param defaultNan Passed on to fixDivDest().
  * @param val        Fixed-point value.
  * @param width      Width of the value in bits: 16, 32 or 64. Any other
  *                   value triggers a panic.
  * @param imm        Exponent of the power-of-two scale divisor.
  * @return The converted value.
  */
 double
 vfpUFixedToFpD(bool flush, bool defaultNan,
         uint64_t val, uint8_t width, uint8_t imm)
 {
     fesetround(FeRoundNearest);
     // Discard any bits above the requested width.
     if (width == 16)
         val = (uint16_t)val;
     else if (width == 32)
         val = (uint32_t)val;
     else if (width != 64)
         panic("Unsupported width %d", width);

     double scale = pow(2.0, imm);
     // Empty asm statements keep the compiler from reordering the flag
     // clearing relative to the computations around it.
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     feclearexcept(FeAllExceptions);
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
 }

 /**
  * Convert a signed fixed-point number to double precision.
  *
  * val is narrowed to width bits with szext<> (defined elsewhere;
  * presumably a sign extension from that width) and divided by 2^imm.
  * The host rounding mode is set to FeRoundNearest and all host
  * exception flags are cleared before the division; the quotient is
  * then corrected by fixDivDest().
  *
  * @param flush      Passed on to fixDivDest().
  * @param defaultNan Passed on to fixDivDest().
  * @param val        Fixed-point value.
  * @param width      Width of the value in bits: 16, 32 or 64. Any other
  *                   value triggers a panic.
  * @param imm        Exponent of the power-of-two scale divisor.
  * @return The converted value.
  */
 double
 vfpSFixedToFpD(bool flush, bool defaultNan,
         int64_t val, uint8_t width, uint8_t imm)
 {
     fesetround(FeRoundNearest);
     if (width == 16)
         val = szext<16>(val);
     else if (width == 32)
         val = szext<32>(val);
     else if (width != 64)
         panic("Unsupported width %d", width);

     double scale = pow(2.0, imm);
     // Empty asm statements keep the compiler from reordering the flag
     // clearing relative to the computations around it.
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     feclearexcept(FeAllExceptions);
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
 }

 // Reciprocal square root estimate, following the formula given in the
 // architecture reference manual (where it is called
 // recip_sqrt_estimate). The input is quantised to a multiple of 1/512
 // (below 0.5) or 1/256 (otherwise), the reciprocal square root of the
 // interval midpoint is computed, and the result is rounded to a
 // multiple of 1/256.
 static double
 recipSqrtEstimate(double a)
 {
     // Quantisation step depends on which half of the input range a is in.
     const double steps = (a < 0.5) ? 512.0 : 256.0;
     const int64_t quantised = (int64_t)(a * steps);
     const double r = 1.0 / sqrt(((double)quantised + 0.5) / steps);
     // Round to the nearest multiple of 1/256.
     const int64_t rounded = (int64_t)(256.0 * r + 0.5);
     return (double)rounded / 256.0;
 }

 // Single-precision reciprocal square root estimate.
 //
 // This function is only intended for use in Neon instructions because
 // it ignores certain bits in the FPSCR.
 //
 // Special cases: a NaN yields the default quiet NaN (setting ioc if the
 // operand was signalling), a zero yields an infinity of the same sign
 // and sets dzc, any other negative value yields the default quiet NaN
 // and sets ioc, and +infinity yields 0.0. Everything else is scaled
 // into a double, run through recipSqrtEstimate() and repacked as a
 // float.
 float
 fprSqrtEstimate(FPSCR &fpscr, float op)
 {
     const uint32_t qnan = 0x7fc00000;
     // Bit patterns are assembled in unsigned arithmetic; shifting a
     // signed 1 into bit 31 is not portable.
     const uint32_t signBit = 0x80000000;
     const uint32_t infBits = 0x7f800000;
     float junk = 0.0;
     int fpClass = std::fpclassify(op);
     if (fpClass == FP_NAN) {
         if ((fpToBits(op) & qnan) != qnan)
             fpscr.ioc = 1;
         return bitsToFp(qnan, junk);
     } else if (fpClass == FP_ZERO) {
         fpscr.dzc = 1;
         // Return infinity with the same sign as the operand.
         return bitsToFp((std::signbit(op) ? signBit : 0u) | infBits, junk);
     } else if (std::signbit(op)) {
         // Set invalid op bit.
         fpscr.ioc = 1;
         return bitsToFp(qnan, junk);
     } else if (fpClass == FP_INFINITE) {
         return 0.0;
     } else {
         uint64_t opBits = fpToBits(op);
         // The scaled exponent depends on the lowest exponent bit of the
         // operand (0x3fd if it is set, 0x3fe otherwise); the mantissa
         // is moved to the top of the double mantissa.
         const uint64_t scaledExp = bits(opBits, 23) ? 0x3fdULL : 0x3feULL;
         double scaled = bitsToFp((bits(opBits, 22, 0) << 29) |
                                  (scaledExp << 52) |
                                  (bits(opBits, 31) << 63),
                                  (double)0.0);
         uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

         uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

         // Repack: sign of the estimate, computed exponent, and the top
         // 23 mantissa bits of the estimate.
         return bitsToFp((bits(estimate, 63) << 31) |
                         (bits(resultExp, 7, 0) << 23) |
                         (bits(estimate, 51, 29) << 0), junk);
     }
 }

 /**
  * Reciprocal square root estimate for an unsigned 32 bit operand.
  *
  * If the top two bits of op are both clear the result is all ones.
  * Otherwise the operand bits are placed into the mantissa of a double
  * with exponent field 0x3fe (bit 31 set) or 0x3fd (bit 31 clear), the
  * double is run through recipSqrtEstimate(), and the result is bit 31
  * set followed by mantissa bits 51:21 of the estimate.
  *
  * @param op Operand.
  * @return The estimate.
  */
 uint32_t
 unsignedRSqrtEstimate(uint32_t op)
 {
     if (bits(op, 31, 30) == 0) {
         // All ones, spelled out rather than converting -1.
         return 0xffffffff;
     } else {
         double dpOp;
         if (bits(op, 31)) {
             dpOp = bitsToFp((0x3feULL << 52) |
                             (bits((uint64_t)op, 30, 0) << 21),
                             (double)0.0);
         } else {
             dpOp = bitsToFp((0x3fdULL << 52) |
                             (bits((uint64_t)op, 29, 0) << 22),
                             (double)0.0);
         }
         uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
         // Use an unsigned constant: shifting a signed 1 into bit 31 is
         // not portable.
         return (1U << 31) | bits(estimate, 51, 21);
     }
 }

 // Reciprocal estimate, following the formula given in the architecture
 // reference manual (where it is called recip_estimate). The input is
 // quantised to a multiple of 1/512, the reciprocal of the interval
 // midpoint is computed, and the result is rounded to a multiple of
 // 1/256.
 static double
 recipEstimate(double a)
 {
     const int64_t quantised = static_cast<int64_t>(a * 512.0);
     const double r = 1.0 / ((static_cast<double>(quantised) + 0.5) / 512.0);
     // Round to the nearest multiple of 1/256.
     const int64_t rounded = static_cast<int64_t>(256.0 * r + 0.5);
     return static_cast<double>(rounded) / 256.0;
 }

 // Single-precision reciprocal estimate.
 //
 // This function is only intended for use in Neon instructions because
 // it ignores certain bits in the FPSCR.
 //
 // Special cases: a NaN yields the default quiet NaN (setting ioc if the
 // operand was signalling), an infinity yields a zero of the same sign,
 // a zero yields an infinity of the same sign and sets dzc, and an
 // operand of magnitude 2^126 or more yields a zero of the same sign and
 // sets ufc. Everything else is scaled into a double, run through
 // recipEstimate() and repacked as a float.
 float
 fpRecipEstimate(FPSCR &fpscr, float op)
 {
     const uint32_t qnan = 0x7fc00000;
     const uint32_t infBits = 0x7f800000;
     // Sign of the operand as a float bit pattern. Built in unsigned
     // arithmetic; shifting a signed 1 into bit 31 is not portable.
     const uint32_t signBits = std::signbit(op) ? 0x80000000u : 0u;
     float junk = 0.0;
     int fpClass = std::fpclassify(op);
     if (fpClass == FP_NAN) {
         if ((fpToBits(op) & qnan) != qnan)
             fpscr.ioc = 1;
         return bitsToFp(qnan, junk);
     } else if (fpClass == FP_INFINITE) {
         // Zero with the same sign as the operand.
         return bitsToFp(signBits, junk);
     } else if (fpClass == FP_ZERO) {
         fpscr.dzc = 1;
         // Return infinity with the same sign as the operand.
         return bitsToFp(signBits | infBits, junk);
     } else if (fabs(op) >= pow(2.0, 126)) {
         // The reciprocal is too small; report underflow and return a
         // zero with the same sign as the operand.
         fpscr.ufc = 1;
         return bitsToFp(signBits, junk);
     } else {
         uint64_t opBits = fpToBits(op);
         // Positive double with exponent field 0x3fe and the operand's
         // mantissa at the top of the double mantissa.
         double scaled = bitsToFp((bits(opBits, 22, 0) << 29) |
                                  (0x3feULL << 52),
                                  (double)0.0);
         uint64_t resultExp = 253 - bits(opBits, 30, 23);

         uint64_t estimate = fpToBits(recipEstimate(scaled));

         // Repack: sign of the operand, computed exponent, and the top
         // 23 mantissa bits of the estimate.
         return bitsToFp((bits(opBits, 31) << 31) |
                         (bits(resultExp, 7, 0) << 23) |
                         (bits(estimate, 51, 29) << 0), junk);
     }
 }

 /**
  * Reciprocal estimate for an unsigned 32 bit operand.
  *
  * If bit 31 of op is clear the result is all ones. Otherwise bits 30:0
  * of the operand are placed into the mantissa of a double with exponent
  * field 0x3fe, the double is run through recipEstimate(), and the
  * result is bit 31 set followed by mantissa bits 51:21 of the estimate.
  *
  * @param op Operand.
  * @return The estimate.
  */
 uint32_t
 unsignedRecipEstimate(uint32_t op)
 {
     if (bits(op, 31) == 0) {
         // All ones, spelled out rather than converting -1.
         return 0xffffffff;
     } else {
         double dpOp;
         dpOp = bitsToFp((0x3feULL << 52) |
                         (bits((uint64_t)op, 30, 0) << 21),
                         (double)0.0);
         uint64_t estimate = fpToBits(recipEstimate(dpOp));
         // Use an unsigned constant: shifting a signed 1 into bit 31 is
         // not portable.
         return (1U << 31) | bits(estimate, 51, 21);
     }
 }

 /**
  * Build an FPSCR value that starts from zero, has dn and fz set, and
  * copies the ahp and fz16 bits from the given register.
  *
  * @param fpscr Register to take ahp and fz16 from.
  * @return The newly built FPSCR value.
  */
 FPSCR
 fpStandardFPSCRValue(const FPSCR &fpscr)
 {
     FPSCR new_fpscr(0);
     new_fpscr.ahp = fpscr.ahp;
     new_fpscr.dn = 1;
     new_fpscr.fz = 1;
     new_fpscr.fz16 = fpscr.fz16;
     return new_fpscr;
 }

 /**
  * Handle NaN operands of a two-operand operation up front.
  *
  * If neither operand is a NaN, done is set to false and 0.0 is
  * returned. Otherwise done is set to true, fpscr.ioc is set if either
  * operand is a signalling NaN, and the result is chosen in this order:
  * the default quiet NaN if defaultNan is set, op1 quietened if it is
  * signalling, op2 quietened if it is signalling, op1 if it is a NaN,
  * otherwise op2.
  *
  * @param fpscr      Status register; ioc may be set.
  * @param done       Out: whether a NaN result was produced.
  * @param defaultNan Always produce the default quiet NaN.
  * @param op1        First operand.
  * @param op2        Second operand.
  * @return The NaN result, or 0.0 when done is false.
  */
 template <class fpType>
 fpType
 FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                   fpType op1, fpType op2) const
 {
     fpType junk = 0.0;
     const uint64_t qnan = (sizeof(fpType) == sizeof(float)) ?
         0x7fc00000 : 0x7ff8000000000000ULL;
     const bool nan1 = std::isnan(op1);
     const bool nan2 = std::isnan(op2);

     // Nothing to do unless at least one operand is a NaN.
     done = nan1 || nan2;
     if (!done)
         return 0.0;

     // A NaN whose quiet bit is clear is a signalling NaN.
     const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
     const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
     if (signal1 || signal2)
         fpscr.ioc = 1;

     if (defaultNan)
         return bitsToFp(qnan, junk);
     if (signal1)
         return bitsToFp(fpToBits(op1) | qnan, junk);
     if (signal2)
         return bitsToFp(fpToBits(op2) | qnan, junk);
     // Only quiet NaNs are left; op1 takes priority.
     return nan1 ? op1 : op2;
 }

 template
 float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         float op1, float op2) const;
 template
 double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                          double op1, double op2) const;

 // @TODO remove this function when we've finished switching all FMA code to use the new FPLIB
 /**
  * Run a three-operand floating point routine on the host and adjust the
  * outcome so that NaN propagation, flush-to-zero and underflow reporting
  * follow the ARM rules implemented below instead of the host's.
  *
  * @param fpscr FPSCR to update: idc is set when an input operand is
  *              flushed, and it is passed on to finishVfp().
  * @param op1 First operand handed to func.
  * @param op2 Second operand handed to func.
  * @param op3 Third operand handed to func.
  * @param func Host function that performs the actual computation.
  * @param flush Enables the flush-to-zero handling of inputs and result.
  * @param defaultNan If set, every NaN result becomes the default quiet NaN.
  * @param rMode Rounding mode given to prepFpState(); the underflow
  *              correction is skipped when it equals VfpRoundZero.
  * @return The result of func after the ARM specific fix-ups.
  */
 template <class fpType>
 fpType
 FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                 fpType (*func)(fpType, fpType, fpType),
                 bool flush, bool defaultNan, uint32_t rMode) const
 {
     const bool single = (sizeof(fpType) == sizeof(float));
     // Only used to select the float/double overload of bitsToFp().
     fpType junk = 0.0;

     // Record in FPSCR.IDC that at least one input was flushed.
     // NOTE(review): flushToZero() is defined elsewhere; presumably it
     // zeroes denormal operands in place and reports whether it did.
     if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
         fpscr.idc = 1;
     VfpSavedState state = prepFpState(rMode);
     // Empty asm that claims to read and write the operands and the saved
     // state in memory; intended to stop the compiler from moving the
     // computation across the FP state change above.
     __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                              :  "m" (op1),  "m" (op2),  "m" (op3),  "m" (state));
     fpType dest = func(op1, op2, op3);
     __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

     // Get NAN behavior right. This varies between x86 and ARM.
     if (std::isnan(dest)) {
         // Quiet NaN bit pattern for the operand width.
         const uint64_t qnan =
             single ? 0x7fc00000 : 0x7ff8000000000000ULL;
         const bool nan1 = std::isnan(op1);
         const bool nan2 = std::isnan(op2);
         const bool nan3 = std::isnan(op3);
         // A NaN operand that lacks some of the qnan bits is treated as
         // signalling.
         const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
         const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
         const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
         // Selection order: default NaN, then the first signalling operand
         // (with the qnan bits ORed in), then the first NaN operand.
         if ((!nan1 && !nan2 && !nan3) || defaultNan) {
             dest = bitsToFp(qnan, junk);
         } else if (signal1) {
             dest = bitsToFp(fpToBits(op1) | qnan, junk);
         } else if (signal2) {
             dest = bitsToFp(fpToBits(op2) | qnan, junk);
         } else if (signal3) {
             dest = bitsToFp(fpToBits(op3) | qnan, junk);
         } else if (nan1) {
             dest = op1;
         } else if (nan2) {
             dest = op2;
         } else if (nan3) {
             dest = op3;
         }
     } else if (flush && flushToZero(dest)) {
         // The result itself was flushed: report it as an underflow.
         feraiseexcept(FeUnderflow);
     } else if ((
                 (single && (dest == bitsToFp(0x00800000, junk) ||
                      dest == bitsToFp(0x80800000, junk))) ||
                 (!single &&
                     (dest == bitsToFp(0x0010000000000000ULL, junk) ||
                      dest == bitsToFp(0x8010000000000000ULL, junk)))
                ) && rMode != VfpRoundZero) {
         /*
          * Correct for the fact that underflow is detected -before- rounding
          * in ARM and -after- rounding in x86.
          *
          * The result compares equal to one of the boundary patterns above
          * (exponent field 1, zero mantissa, either sign), so redo the
          * same operation rounding toward zero and see whether that value
          * gets flushed.
          */
         fesetround(FeRoundZero);
         __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                  :  "m" (op1),  "m" (op2),  "m" (op3));
         // Must repeat the original computation, i.e. use all three
         // operands (op3 was previously replaced by a second op2 here).
         fpType temp = func(op1, op2, op3);
         __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
         if (flush && flushToZero(temp)) {
             dest = temp;
         }
     }
     finishVfp(fpscr, state, flush);
     return dest;
 }

 // Explicit instantiations of ternaryOp for float and double operands.
 // NOTE(review): presumably needed because the template body lives in this
 // source file rather than in a header -- confirm against vfp.hh.
 template
 float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                       float (*func)(float, float, float),
                       bool flush, bool defaultNan, uint32_t rMode) const;
 template
 double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                        double (*func)(double, double, double),
                        bool flush, bool defaultNan, uint32_t rMode) const;

 /**
  * Run a two-operand floating point routine on the host and adjust the
  * outcome so that NaN propagation, flush-to-zero and underflow reporting
  * follow the ARM rules implemented below instead of the host's.
  *
  * @param fpscr FPSCR to update: idc is set when an input operand is
  *              flushed, and it is passed on to finishVfp().
  * @param op1 First operand handed to func.
  * @param op2 Second operand handed to func.
  * @param func Host function that performs the actual computation.
  * @param flush Enables the flush-to-zero handling of inputs and result.
  * @param defaultNan If set, every NaN result becomes the default quiet NaN.
  * @param rMode Rounding mode given to prepFpState(); the underflow
  *              correction is skipped when it equals VfpRoundZero.
  * @return The result of func after the ARM specific fix-ups.
  */
 template <class fpType>
 fpType
 FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
                fpType (*func)(fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
 {
     const bool single = (sizeof(fpType) == sizeof(float));
     // Only used to select the float/double overload of bitsToFp().
     fpType junk = 0.0;

     // Record in FPSCR.IDC that at least one input was flushed.
     // NOTE(review): flushToZero() is defined elsewhere; presumably it
     // zeroes denormal operands in place and reports whether it did.
     if (flush && flushToZero(op1, op2))
         fpscr.idc = 1;
     VfpSavedState state = prepFpState(rMode);
     // Empty asm that claims to read and write the operands and the saved
     // state in memory; intended to stop the compiler from moving the
     // computation across the FP state change above.
     __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                              : "m" (op1), "m" (op2), "m" (state));
     fpType dest = func(op1, op2);
     __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

     // Get NAN behavior right. This varies between x86 and ARM.
     if (std::isnan(dest)) {
         // Quiet NaN bit pattern for the operand width.
         const uint64_t qnan =
             single ? 0x7fc00000 : 0x7ff8000000000000ULL;
         const bool nan1 = std::isnan(op1);
         const bool nan2 = std::isnan(op2);
         // A NaN operand that lacks some of the qnan bits is treated as
         // signalling.
         const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
         const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
         // Selection order: default NaN, then the first signalling operand
         // (with the qnan bits ORed in), then the first NaN operand.
         if ((!nan1 && !nan2) || (defaultNan == 1)) {
             dest = bitsToFp(qnan, junk);
         } else if (signal1) {
             dest = bitsToFp(fpToBits(op1) | qnan, junk);
         } else if (signal2) {
             dest = bitsToFp(fpToBits(op2) | qnan, junk);
         } else if (nan1) {
             dest = op1;
         } else if (nan2) {
             dest = op2;
         }
     } else if (flush && flushToZero(dest)) {
         // The result itself was flushed: report it as an underflow.
         feraiseexcept(FeUnderflow);
     } else if ((
                 (single && (dest == bitsToFp(0x00800000, junk) ||
                      dest == bitsToFp(0x80800000, junk))) ||
                 (!single &&
                     (dest == bitsToFp(0x0010000000000000ULL, junk) ||
                      dest == bitsToFp(0x8010000000000000ULL, junk)))
                ) && rMode != VfpRoundZero) {
         /*
          * Correct for the fact that underflow is detected -before- rounding
          * in ARM and -after- rounding in x86.
          *
          * The result compares equal to one of the boundary patterns above
          * (exponent field 1, zero mantissa, either sign), so redo the
          * same operation rounding toward zero and see whether that value
          * gets flushed.
          */
         fesetround(FeRoundZero);
         __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                  : "m" (op1), "m" (op2));
         fpType temp = func(op1, op2);
         __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
         if (flush && flushToZero(temp)) {
             dest = temp;
         }
     }
     // NOTE(review): finishVfp() is defined elsewhere; presumably it
     // restores the host FP state saved in 'state' and updates fpscr.
     finishVfp(fpscr, state, flush);
     return dest;
 }

 // Explicit instantiations of binaryOp for float and double operands.
 // NOTE(review): presumably needed because the template body lives in this
 // source file rather than in a header -- confirm against vfp.hh.
 template
 float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                      float (*func)(float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
 template
 double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                       double (*func)(double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

 /**
  * Run a one-operand floating point routine on the host and adjust the
  * outcome so that NaN propagation, flush-to-zero and underflow reporting
  * follow the ARM rules implemented below instead of the host's.
  *
  * Unlike the two- and three-operand variants there is no defaultNan
  * parameter; the default NaN choice is read from fpscr.dn.
  *
  * @param fpscr FPSCR to read dn from and to update: idc is set when the
  *              input operand is flushed, and it is passed on to
  *              finishVfp().
  * @param op1 Operand handed to func.
  * @param func Host function that performs the actual computation.
  * @param flush Enables the flush-to-zero handling of input and result.
  * @param rMode Rounding mode given to prepFpState(); the underflow
  *              correction is skipped when it equals VfpRoundZero.
  * @return The result of func after the ARM specific fix-ups.
  */
 template <class fpType>
 fpType
 FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
               bool flush, uint32_t rMode) const
 {
     const bool single = (sizeof(fpType) == sizeof(float));
     // Only used to select the float/double overload of bitsToFp().
     fpType junk = 0.0;

     // Record in FPSCR.IDC that the input was flushed.
     // NOTE(review): flushToZero() is defined elsewhere; presumably it
     // zeroes a denormal operand in place and reports whether it did.
     if (flush && flushToZero(op1))
         fpscr.idc = 1;
     VfpSavedState state = prepFpState(rMode);
     // Empty asm that claims to read and write the operand and the saved
     // state in memory; intended to stop the compiler from moving the
     // computation across the FP state change above.
     __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                              : "m" (op1), "m" (state));
     fpType dest = func(op1);
     __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

     // Get NAN behavior right. This varies between x86 and ARM.
     if (std::isnan(dest)) {
         // Quiet NaN bit pattern for the operand width.
         const uint64_t qnan =
             single ? 0x7fc00000 : 0x7ff8000000000000ULL;
         const bool nan = std::isnan(op1);
         // Default NaN when the input was not a NaN or fpscr.dn is set;
         // otherwise return the input with the qnan bits ORed in.
         if (!nan || fpscr.dn == 1) {
             dest = bitsToFp(qnan, junk);
         } else if (nan) {
             dest = bitsToFp(fpToBits(op1) | qnan, junk);
         }
     } else if (flush && flushToZero(dest)) {
         // The result itself was flushed: report it as an underflow.
         feraiseexcept(FeUnderflow);
     } else if ((
                 (single && (dest == bitsToFp(0x00800000, junk) ||
                      dest == bitsToFp(0x80800000, junk))) ||
                 (!single &&
                     (dest == bitsToFp(0x0010000000000000ULL, junk) ||
                      dest == bitsToFp(0x8010000000000000ULL, junk)))
                ) && rMode != VfpRoundZero) {
         /*
          * Correct for the fact that underflow is detected -before- rounding
          * in ARM and -after- rounding in x86.
          *
          * The result compares equal to one of the boundary patterns above
          * (exponent field 1, zero mantissa, either sign), so redo the
          * same operation rounding toward zero and see whether that value
          * gets flushed.
          */
         fesetround(FeRoundZero);
         __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
         fpType temp = func(op1);
         __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
         if (flush && flushToZero(temp)) {
             dest = temp;
         }
     }
     // NOTE(review): finishVfp() is defined elsewhere; presumably it
     // restores the host FP state saved in 'state' and updates fpscr.
     finishVfp(fpscr, state, flush);
     return dest;
 }

 // Explicit instantiations of unaryOp for float and double operands.
 // NOTE(review): presumably needed because the template body lives in this
 // source file rather than in a header -- confirm against vfp.hh.
 template
 float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                     bool flush, uint32_t rMode) const;
 template
 double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                      bool flush, uint32_t rMode) const;

 /**
  * Advance a register index by a stride, wrapping around inside the
  * aligned group of eight indices that contains it.
  *
  * The stride is doubled when the 'wide' member is set.
  *
  * @param idx Register index to advance.
  * @param stride Number of slots to move before any doubling.
  * @return The advanced index, in the same group of eight as idx.
  */
 IntRegIndex
 VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
 {
     const unsigned step = wide ? stride * 2 : stride;
     // Split the index into its position within the group and the group's
     // first index, then move the position modulo eight.
     const unsigned slot = idx % 8;
     const IntRegIndex groupBase = static_cast<IntRegIndex>(idx - slot);
     return static_cast<IntRegIndex>(groupBase + ((slot + step) % 8));
 }

 /**
  * Step the destination and both source register indices to the next
  * element, using a stride of 1 when machInst.fpscrStride is 0 and 2
  * otherwise.
  *
  * dest and op1 are always advanced; op2 is left untouched when it lies
  * in the scalar bank. dest must not be in the scalar bank (asserted).
  *
  * @param dest Destination index, updated in place.
  * @param op1 First source index, updated in place.
  * @param op2 Second source index, updated in place unless scalar.
  */
 void
 VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
 {
     const unsigned step = (machInst.fpscrStride != 0) ? 2 : 1;
     assert(!inScalarBank(dest));
     dest = addStride(dest, step);
     op1 = addStride(op1, step);
     if (inScalarBank(op2)) {
         return;
     }
     op2 = addStride(op2, step);
 }

 /**
  * Step the destination and the source register index to the next
  * element, using a stride of 1 when machInst.fpscrStride is 0 and 2
  * otherwise.
  *
  * dest is always advanced; op1 is left untouched when it lies in the
  * scalar bank. dest must not be in the scalar bank (asserted).
  *
  * @param dest Destination index, updated in place.
  * @param op1 Source index, updated in place unless scalar.
  */
 void
 VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
 {
     const unsigned step = (machInst.fpscrStride != 0) ? 2 : 1;
     assert(!inScalarBank(dest));
     dest = addStride(dest, step);
     if (inScalarBank(op1)) {
         return;
     }
     op1 = addStride(op1, step);
 }

 /**
  * Step the destination register index to the next element, using a
  * stride of 1 when machInst.fpscrStride is 0 and 2 otherwise.
  *
  * dest must not be in the scalar bank (asserted).
  *
  * @param dest Destination index, updated in place.
  */
 void
 VfpMacroOp::nextIdxs(IntRegIndex &dest)
 {
     const unsigned step = (machInst.fpscrStride != 0) ? 2 : 1;
     assert(!inScalarBank(dest));
     dest = addStride(dest, step);
 }

 } // namespace ArmISA
 } // namespace gem5