| /* |
| * Copyright (c) 2010-2013, 2019 ARM Limited |
| * All rights reserved |
| * |
| * The license below extends only to copyright in the software and shall |
| * not be construed as granting a license to any other intellectual |
| * property including but not limited to intellectual property relating |
| * to a hardware implementation of the functionality of the software |
| * licensed hereunder. You may use the software subject to the license |
| * terms below provided that you ensure that this notice is replicated |
| * unmodified and in its entirety in all distributions of the software, |
| * modified or unmodified, in source code or in binary form. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer; |
| * redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution; |
| * neither the name of the copyright holders nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "arch/arm/insts/vfp.hh" |
| |
| namespace gem5 |
| { |
| |
| using namespace ArmISA; |
| |
| /* |
| * The asm statements below are to keep gcc from reordering code. Otherwise |
| * the rounding mode might be set after the operation it was intended for, the |
| * exception bits read before it, etc. |
| */ |
| |
| std::string |
| FpCondCompRegOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss, "", false); |
| printIntReg(ss, op1); |
| ccprintf(ss, ", "); |
| printIntReg(ss, op2); |
| ccprintf(ss, ", #%d", defCc); |
| ccprintf(ss, ", "); |
| printCondition(ss, condCode, true); |
| return ss.str(); |
| } |
| |
| std::string |
| FpCondSelOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss, "", false); |
| printIntReg(ss, dest); |
| ccprintf(ss, ", "); |
| printIntReg(ss, op1); |
| ccprintf(ss, ", "); |
| printIntReg(ss, op2); |
| ccprintf(ss, ", "); |
| printCondition(ss, condCode, true); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegImmOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ccprintf(ss, ", #%d", imm); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegImmOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| ccprintf(ss, ", #%d", imm); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegRegOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| ss << ", "; |
| printFloatReg(ss, op2); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegRegCondOp::generateDisassembly( |
        Addr pc, const loader::SymbolTable *symtab) const
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printCondition(ss, cond); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| ss << ", "; |
| printFloatReg(ss, op2); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegRegRegOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| ss << ", "; |
| printFloatReg(ss, op2); |
| ss << ", "; |
| printFloatReg(ss, op3); |
| return ss.str(); |
| } |
| |
| std::string |
| FpRegRegRegImmOp::generateDisassembly( |
| Addr pc, const loader::SymbolTable *symtab) const |
| { |
| std::stringstream ss; |
| printMnemonic(ss); |
| printFloatReg(ss, dest); |
| ss << ", "; |
| printFloatReg(ss, op1); |
| ss << ", "; |
| printFloatReg(ss, op2); |
| ccprintf(ss, ", #%d", imm); |
| return ss.str(); |
| } |
| |
| namespace ArmISA |
| { |
| |
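// Save the host rounding mode, clear any pending host FP exceptions, and
// switch the host to the requested VFP rounding mode. The returned state
// should be handed back to finishVfp() once the operation is done.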
| VfpSavedState |
| prepFpState(uint32_t rMode) |
| { |
| int roundingMode = fegetround(); |
| feclearexcept(FeAllExceptions); |
| switch (rMode) { |
| case VfpRoundNearest: |
| fesetround(FeRoundNearest); |
| break; |
| case VfpRoundUpward: |
| fesetround(FeRoundUpward); |
| break; |
| case VfpRoundDown: |
| fesetround(FeRoundDown); |
| break; |
| case VfpRoundZero: |
| fesetround(FeRoundZero); |
| break; |
| } |
| return roundingMode; |
| } |
| |
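// Transfer the host FP exceptions raised since prepFpState() into the
// cumulative FPSCR flag bits, subject to the given mask. An inexact that
// accompanies an underflow is suppressed when flush-to-zero is enabled,
// matching ARM semantics. The saved host rounding mode is then restored.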
| void |
| finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask) |
| { |
| int exceptions = fetestexcept(FeAllExceptions); |
| bool underflow = false; |
| if ((exceptions & FeInvalid) && mask.ioc) { |
| fpscr.ioc = 1; |
| } |
| if ((exceptions & FeDivByZero) && mask.dzc) { |
| fpscr.dzc = 1; |
| } |
| if ((exceptions & FeOverflow) && mask.ofc) { |
| fpscr.ofc = 1; |
| } |
| if (exceptions & FeUnderflow) { |
| underflow = true; |
| if (mask.ufc) |
| fpscr.ufc = 1; |
| } |
| if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) { |
| fpscr.ixc = 1; |
| } |
| fesetround(state); |
| } |
| |
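// Fix up a host-computed result so its NaN handling matches ARM. A NaN
// result becomes the default quiet NaN if the operand wasn't a NaN or
// default-NaN mode is on; otherwise the operand's NaN is returned with
// the quiet bit set. A subnormal result is flushed to a signed zero with
// underflow (rather than inexact) raised when flush-to-zero is enabled.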
| template <class fpType> |
| fpType |
| fixDest(bool flush, bool defaultNan, fpType val, fpType op1) |
| { |
| int fpClass = std::fpclassify(val); |
| fpType junk = 0.0; |
| if (fpClass == FP_NAN) { |
| const bool single = (sizeof(val) == sizeof(float)); |
| const uint64_t qnan = single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan = std::isnan(op1); |
| if (!nan || defaultNan) { |
| val = bitsToFp(qnan, junk); |
| } else if (nan) { |
| val = bitsToFp(fpToBits(op1) | qnan, junk); |
| } |
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
| uint64_t bitMask = 0x1ULL << (sizeof(fpType) * 8 - 1); |
| val = bitsToFp(fpToBits(val) & bitMask, junk); |
| feclearexcept(FeInexact); |
| feraiseexcept(FeUnderflow); |
| } |
| return val; |
| } |
| |
| template |
| float fixDest<float>(bool flush, bool defaultNan, float val, float op1); |
| template |
| double fixDest<double>(bool flush, bool defaultNan, double val, double op1); |
| |
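// Two operand version of fixDest() above. NaN selection follows the ARM
// priority order: a signaling NaN in op1, then in op2, is quieted and
// returned ahead of any quiet NaN, and the default NaN is used when
// neither operand is a NaN or default-NaN mode is enabled.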
| template <class fpType> |
| fpType |
| fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) |
| { |
| int fpClass = std::fpclassify(val); |
| fpType junk = 0.0; |
| if (fpClass == FP_NAN) { |
| const bool single = (sizeof(val) == sizeof(float)); |
| const uint64_t qnan = single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan1 = std::isnan(op1); |
| const bool nan2 = std::isnan(op2); |
| const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); |
| const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); |
| if ((!nan1 && !nan2) || defaultNan) { |
| val = bitsToFp(qnan, junk); |
| } else if (signal1) { |
| val = bitsToFp(fpToBits(op1) | qnan, junk); |
| } else if (signal2) { |
| val = bitsToFp(fpToBits(op2) | qnan, junk); |
| } else if (nan1) { |
| val = op1; |
| } else if (nan2) { |
| val = op2; |
| } |
| } else if (fpClass == FP_SUBNORMAL && flush) { |
        // Turn val into a zero with the correct sign.
| uint64_t bitMask = 0x1ULL << (sizeof(fpType) * 8 - 1); |
| val = bitsToFp(fpToBits(val) & bitMask, junk); |
| feclearexcept(FeInexact); |
| feraiseexcept(FeUnderflow); |
| } |
| return val; |
| } |
| |
| template |
| float fixDest<float>(bool flush, bool defaultNan, |
| float val, float op1, float op2); |
| template |
| double fixDest<double>(bool flush, bool defaultNan, |
| double val, double op1, double op2); |
| |
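// Like the two operand fixDest(), but with extra care when the quotient
// lands on the smallest normal number. ARM detects underflow before
// rounding while x86 detects it after, so the division is redone with
// round-to-zero to decide whether the result should flush to zero and
// raise underflow instead.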
| template <class fpType> |
| fpType |
| fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) |
| { |
| fpType mid = fixDest(flush, defaultNan, val, op1, op2); |
| const bool single = (sizeof(fpType) == sizeof(float)); |
| const fpType junk = 0.0; |
| if ((single && (val == bitsToFp(0x00800000, junk) || |
| val == bitsToFp(0x80800000, junk))) || |
| (!single && (val == bitsToFp(0x0010000000000000ULL, junk) || |
| val == bitsToFp(0x8010000000000000ULL, junk))) |
| ) { |
| __asm__ __volatile__("" : "=m" (op1) : "m" (op1)); |
| fesetround(FeRoundZero); |
| fpType temp = 0.0; |
| __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); |
| temp = op1 / op2; |
| if (flushToZero(temp)) { |
| feraiseexcept(FeUnderflow); |
| if (flush) { |
| feclearexcept(FeInexact); |
| mid = temp; |
| } |
| } |
| __asm__ __volatile__("" :: "m" (temp)); |
| } |
| return mid; |
| } |
| |
| template |
| float fixDivDest<float>(bool flush, bool defaultNan, |
| float val, float op1, float op2); |
| template |
| double fixDivDest<double>(bool flush, bool defaultNan, |
| double val, double op1, double op2); |
| |
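// Narrow a double precision result to single precision. A NaN input has
// its sign and top payload bits carried over into a single precision NaN
// before the usual fixDest() treatment and underflow correction.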
| float |
| fixFpDFpSDest(FPSCR fpscr, double val) |
| { |
| const float junk = 0.0; |
| float op1 = 0.0; |
| if (std::isnan(val)) { |
| uint64_t valBits = fpToBits(val); |
| uint32_t op1Bits = bits(valBits, 50, 29) | |
| (mask(9) << 22) | |
| (bits(valBits, 63) << 31); |
| op1 = bitsToFp(op1Bits, junk); |
| } |
| float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); |
| if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == |
| (FeUnderflow | FeInexact)) { |
| feclearexcept(FeInexact); |
| } |
| if (mid == bitsToFp(0x00800000, junk) || |
| mid == bitsToFp(0x80800000, junk)) { |
| __asm__ __volatile__("" : "=m" (val) : "m" (val)); |
| fesetround(FeRoundZero); |
| float temp = 0.0; |
| __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); |
| temp = val; |
| if (flushToZero(temp)) { |
| feraiseexcept(FeUnderflow); |
| if (fpscr.fz) { |
| feclearexcept(FeInexact); |
| mid = temp; |
| } |
| } |
| __asm__ __volatile__("" :: "m" (temp)); |
| } |
| return mid; |
| } |
| |
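// Widen a single precision result to double precision; the counterpart
// of fixFpDFpSDest() above.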
| double |
| fixFpSFpDDest(FPSCR fpscr, float val) |
| { |
| const double junk = 0.0; |
| double op1 = 0.0; |
| if (std::isnan(val)) { |
| uint32_t valBits = fpToBits(val); |
| uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) | |
| (mask(12) << 51) | |
| ((uint64_t)bits(valBits, 31) << 63); |
| op1 = bitsToFp(op1Bits, junk); |
| } |
| double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); |
| if (mid == bitsToFp(0x0010000000000000ULL, junk) || |
| mid == bitsToFp(0x8010000000000000ULL, junk)) { |
| __asm__ __volatile__("" : "=m" (val) : "m" (val)); |
| fesetround(FeRoundZero); |
| double temp = 0.0; |
| __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); |
| temp = val; |
| if (flushToZero(temp)) { |
| feraiseexcept(FeUnderflow); |
| if (fpscr.fz) { |
| feclearexcept(FeInexact); |
| mid = temp; |
| } |
| } |
| __asm__ __volatile__("" :: "m" (temp)); |
| } |
| return mid; |
| } |
| |
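// Convert a single or double precision value, passed in as raw bits, to
// half precision: 1 sign bit, 5 exponent bits, and 10 mantissa bits.
// When ahp is set, the ARM alternative half-precision format is used,
// which gives up infinities and NaNs for extra exponent range. Exception
// flags are written directly into fpscr.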
| static inline uint16_t |
| vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan, |
| uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble) |
| { |
| uint32_t mWidth; |
| uint32_t eWidth; |
| uint32_t eHalfRange; |
| uint32_t sBitPos; |
| |
| if (isDouble) { |
| mWidth = 52; |
| eWidth = 11; |
| } else { |
| mWidth = 23; |
| eWidth = 8; |
| } |
| sBitPos = eWidth + mWidth; |
| eHalfRange = (1 << (eWidth-1)) - 1; |
| |
| // Extract the operand. |
| bool neg = bits(opBits, sBitPos); |
| uint32_t exponent = bits(opBits, sBitPos-1, mWidth); |
| uint64_t oldMantissa = bits(opBits, mWidth-1, 0); |
| uint32_t mantissa = oldMantissa >> (mWidth - 10); |
| // Do the conversion. |
| uint64_t extra = oldMantissa & mask(mWidth - 10); |
| if (exponent == mask(eWidth)) { |
| if (oldMantissa != 0) { |
| // Nans. |
| if (bits(mantissa, 9) == 0) { |
| // Signalling nan. |
| fpscr.ioc = 1; |
| } |
| if (ahp) { |
| mantissa = 0; |
| exponent = 0; |
| fpscr.ioc = 1; |
| } else if (defaultNan) { |
| mantissa = (1 << 9); |
| exponent = 0x1f; |
| neg = false; |
| } else { |
| exponent = 0x1f; |
| mantissa |= (1 << 9); |
| } |
| } else { |
| // Infinities. |
| exponent = 0x1F; |
| if (ahp) { |
| fpscr.ioc = 1; |
| mantissa = 0x3ff; |
| } else { |
| mantissa = 0; |
| } |
| } |
| } else if (exponent == 0 && oldMantissa == 0) { |
| // Zero, don't need to do anything. |
| } else { |
| // Normalized or denormalized numbers. |
| |
| bool inexact = (extra != 0); |
| |
| if (exponent == 0) { |
| // Denormalized. |
| // If flush to zero is on, this shouldn't happen. |
| assert(!flush); |
| |
| // Check for underflow |
| if (inexact || fpscr.ufe) |
| fpscr.ufc = 1; |
| |
| // Handle rounding. |
| unsigned mode = rMode; |
| if ((mode == VfpRoundUpward && !neg && extra) || |
| (mode == VfpRoundDown && neg && extra) || |
| (mode == VfpRoundNearest && |
| (extra > (1 << 9) || |
| (extra == (1 << 9) && bits(mantissa, 0))))) { |
| mantissa++; |
| } |
| |
| // See if the number became normalized after rounding. |
| if (mantissa == (1 << 10)) { |
| mantissa = 0; |
| exponent = 1; |
| } |
| } else { |
| // Normalized. |
| |
| // We need to track the dropped bits differently since |
| // more can be dropped by denormalizing. |
| bool topOne = bits(extra, mWidth - 10 - 1); |
| bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0; |
| |
| if (exponent <= (eHalfRange - 15)) { |
| // The result is too small. Denormalize. |
| mantissa |= (1 << 10); |
| while (mantissa && exponent <= (eHalfRange - 15)) { |
| restZeros = restZeros && !topOne; |
| topOne = bits(mantissa, 0); |
| mantissa = mantissa >> 1; |
| exponent++; |
| } |
| if (topOne || !restZeros) |
| inexact = true; |
| exponent = 0; |
| } else { |
| // Change bias. |
| exponent -= (eHalfRange - 15); |
| } |
| |
| if (exponent == 0 && (inexact || fpscr.ufe)) { |
| // Underflow |
| fpscr.ufc = 1; |
| } |
| |
| // Handle rounding. |
| unsigned mode = rMode; |
| bool nonZero = topOne || !restZeros; |
| if ((mode == VfpRoundUpward && !neg && nonZero) || |
| (mode == VfpRoundDown && neg && nonZero) || |
| (mode == VfpRoundNearest && topOne && |
| (!restZeros || bits(mantissa, 0)))) { |
| mantissa++; |
| } |
| |
| // See if we rounded up and need to bump the exponent. |
| if (mantissa == (1 << 10)) { |
| mantissa = 0; |
| exponent++; |
| } |
| |
| // Deal with overflow |
| if (ahp) { |
| if (exponent >= 0x20) { |
| exponent = 0x1f; |
| mantissa = 0x3ff; |
| fpscr.ioc = 1; |
                    // Suppress inexact exception.
| inexact = false; |
| } |
| } else { |
| if (exponent >= 0x1f) { |
| if ((mode == VfpRoundNearest) || |
| (mode == VfpRoundUpward && !neg) || |
| (mode == VfpRoundDown && neg)) { |
| // Overflow to infinity. |
| exponent = 0x1f; |
| mantissa = 0; |
| } else { |
| // Overflow to max normal. |
| exponent = 0x1e; |
| mantissa = 0x3ff; |
| } |
| fpscr.ofc = 1; |
| inexact = true; |
| } |
| } |
| } |
| |
| if (inexact) { |
| fpscr.ixc = 1; |
| } |
| } |
| // Reassemble and install the result. |
| uint32_t result = bits(mantissa, 9, 0); |
| replaceBits(result, 14, 10, exponent); |
| if (neg) |
| result |= (1 << 15); |
| return result; |
| } |
| |
| uint16_t |
| vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, |
| uint32_t rMode, bool ahp, float op) |
| { |
| uint64_t opBits = fpToBits(op); |
| return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false); |
| } |
| |
| uint16_t |
| vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan, |
| uint32_t rMode, bool ahp, double op) |
| { |
| uint64_t opBits = fpToBits(op); |
| return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true); |
| } |
| |
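// Convert a half precision value to single or double precision bits, the
// inverse of vcvtFpFpH() above. Denormal inputs are normalized on the
// way up, and signaling NaNs are quieted with IOC raised.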
| static inline uint64_t |
| vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble) |
| { |
| uint32_t mWidth; |
| uint32_t eWidth; |
| uint32_t eHalfRange; |
| uint32_t sBitPos; |
| |
| if (isDouble) { |
| mWidth = 52; |
| eWidth = 11; |
| } else { |
| mWidth = 23; |
| eWidth = 8; |
| } |
| sBitPos = eWidth + mWidth; |
| eHalfRange = (1 << (eWidth-1)) - 1; |
| |
| // Extract the bitfields. |
| bool neg = bits(op, 15); |
| uint32_t exponent = bits(op, 14, 10); |
| uint64_t mantissa = bits(op, 9, 0); |
| // Do the conversion. |
| if (exponent == 0) { |
| if (mantissa != 0) { |
| // Normalize the value. |
| exponent = exponent + (eHalfRange - 15) + 1; |
| while (mantissa < (1 << 10)) { |
| mantissa = mantissa << 1; |
| exponent--; |
| } |
| } |
| mantissa = mantissa << (mWidth - 10); |
| } else if (exponent == 0x1f && !ahp) { |
| // Infinities and nans. |
| exponent = mask(eWidth); |
| if (mantissa != 0) { |
| // Nans. |
| mantissa = mantissa << (mWidth - 10); |
| if (bits(mantissa, mWidth-1) == 0) { |
| // Signalling nan. |
| fpscr.ioc = 1; |
| mantissa |= (((uint64_t) 1) << (mWidth-1)); |
| } |
| if (defaultNan) { |
| mantissa &= ~mask(mWidth-1); |
| neg = false; |
| } |
| } |
| } else { |
| exponent = exponent + (eHalfRange - 15); |
| mantissa = mantissa << (mWidth - 10); |
| } |
| // Reassemble the result. |
| uint64_t result = bits(mantissa, mWidth-1, 0); |
| replaceBits(result, sBitPos-1, mWidth, exponent); |
| if (neg) { |
| result |= (((uint64_t) 1) << sBitPos); |
| } |
| return result; |
| } |
| |
| double |
| vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) |
| { |
| double junk = 0.0; |
| uint64_t result; |
| |
| result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true); |
| return bitsToFp(result, junk); |
| } |
| |
| float |
| vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) |
| { |
| float junk = 0.0; |
| uint64_t result; |
| |
| result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false); |
| return bitsToFp(result, junk); |
| } |
| |
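// The vfp*FixedToFp* functions below convert a fixed point value with
// imm fractional bits to floating point, i.e. they compute val / 2^imm
// with round-to-nearest and let fixDivDest() apply the ARM NaN and
// flush-to-zero fixups.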
| float |
| vfpUFixedToFpS(bool flush, bool defaultNan, |
| uint64_t val, uint8_t width, uint8_t imm) |
| { |
| fesetround(FeRoundNearest); |
| if (width == 16) |
| val = (uint16_t)val; |
| else if (width == 32) |
| val = (uint32_t)val; |
| else if (width != 64) |
| panic("Unsupported width %d", width); |
| float scale = powf(2.0, imm); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| feclearexcept(FeAllExceptions); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); |
| } |
| |
| float |
| vfpSFixedToFpS(bool flush, bool defaultNan, |
| int64_t val, uint8_t width, uint8_t imm) |
| { |
| fesetround(FeRoundNearest); |
| if (width == 16) |
| val = szext<16>(val); |
| else if (width == 32) |
| val = szext<32>(val); |
| else if (width != 64) |
| panic("Unsupported width %d", width); |
| |
| float scale = powf(2.0, imm); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| feclearexcept(FeAllExceptions); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); |
| } |
| |
| |
| double |
| vfpUFixedToFpD(bool flush, bool defaultNan, |
| uint64_t val, uint8_t width, uint8_t imm) |
| { |
| fesetround(FeRoundNearest); |
| if (width == 16) |
| val = (uint16_t)val; |
| else if (width == 32) |
| val = (uint32_t)val; |
| else if (width != 64) |
| panic("Unsupported width %d", width); |
| |
| double scale = pow(2.0, imm); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| feclearexcept(FeAllExceptions); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); |
| } |
| |
| double |
| vfpSFixedToFpD(bool flush, bool defaultNan, |
| int64_t val, uint8_t width, uint8_t imm) |
| { |
| fesetround(FeRoundNearest); |
| if (width == 16) |
| val = szext<16>(val); |
| else if (width == 32) |
| val = szext<32>(val); |
| else if (width != 64) |
| panic("Unsupported width %d", width); |
| |
| double scale = pow(2.0, imm); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| feclearexcept(FeAllExceptions); |
| __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); |
| return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); |
| } |
| |
| // This function implements a magic formula taken from the architecture |
| // reference manual. It was originally called recip_sqrt_estimate. |
| static double |
| recipSqrtEstimate(double a) |
| { |
| int64_t q0, q1, s; |
| double r; |
| if (a < 0.5) { |
| q0 = (int64_t)(a * 512.0); |
| r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); |
| } else { |
| q1 = (int64_t)(a * 256.0); |
| r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); |
| } |
| s = (int64_t)(256.0 * r + 0.5); |
| return (double)s / 256.0; |
| } |
| |
| // This function is only intended for use in Neon instructions because |
| // it ignores certain bits in the FPSCR. |
| float |
| fprSqrtEstimate(FPSCR &fpscr, float op) |
| { |
| const uint32_t qnan = 0x7fc00000; |
| float junk = 0.0; |
| int fpClass = std::fpclassify(op); |
| if (fpClass == FP_NAN) { |
| if ((fpToBits(op) & qnan) != qnan) |
| fpscr.ioc = 1; |
| return bitsToFp(qnan, junk); |
| } else if (fpClass == FP_ZERO) { |
| fpscr.dzc = 1; |
| // Return infinity with the same sign as the operand. |
| return bitsToFp((std::signbit(op) << 31) | |
| (0xFF << 23) | (0 << 0), junk); |
| } else if (std::signbit(op)) { |
| // Set invalid op bit. |
| fpscr.ioc = 1; |
| return bitsToFp(qnan, junk); |
| } else if (fpClass == FP_INFINITE) { |
| return 0.0; |
| } else { |
| uint64_t opBits = fpToBits(op); |
| double scaled; |
| if (bits(opBits, 23)) { |
| scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | |
| (0x3fdULL << 52) | (bits(opBits, 31) << 63), |
| (double)0.0); |
| } else { |
| scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | |
| (0x3feULL << 52) | (bits(opBits, 31) << 63), |
| (double)0.0); |
| } |
| uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2; |
| |
| uint64_t estimate = fpToBits(recipSqrtEstimate(scaled)); |
| |
| return bitsToFp((bits(estimate, 63) << 31) | |
| (bits(resultExp, 7, 0) << 23) | |
| (bits(estimate, 51, 29) << 0), junk); |
| } |
| } |
| |
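// Fixed point variant of the reciprocal square root estimate, used by
// the Neon integer form of VRSQRTE. Inputs below 0x40000000 are out of
// range and produce a result of all ones.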
| uint32_t |
| unsignedRSqrtEstimate(uint32_t op) |
| { |
| if (bits(op, 31, 30) == 0) { |
| return -1; |
| } else { |
| double dpOp; |
| if (bits(op, 31)) { |
| dpOp = bitsToFp((0ULL << 63) | |
| (0x3feULL << 52) | |
| (bits((uint64_t)op, 30, 0) << 21) | |
| (0 << 0), (double)0.0); |
| } else { |
| dpOp = bitsToFp((0ULL << 63) | |
| (0x3fdULL << 52) | |
| (bits((uint64_t)op, 29, 0) << 22) | |
| (0 << 0), (double)0.0); |
| } |
| uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp)); |
| return (1 << 31) | bits(estimate, 51, 21); |
| } |
| } |
| |
| // This function implements a magic formula taken from the architecture |
| // reference manual. It was originally called recip_estimate. |
| |
| static double |
| recipEstimate(double a) |
| { |
| int64_t q, s; |
| double r; |
| q = (int64_t)(a * 512.0); |
| r = 1.0 / (((double)q + 0.5) / 512.0); |
| s = (int64_t)(256.0 * r + 0.5); |
| return (double)s / 256.0; |
| } |
| |
| // This function is only intended for use in Neon instructions because |
| // it ignores certain bits in the FPSCR. |
| float |
| fpRecipEstimate(FPSCR &fpscr, float op) |
| { |
| const uint32_t qnan = 0x7fc00000; |
| float junk = 0.0; |
| int fpClass = std::fpclassify(op); |
| if (fpClass == FP_NAN) { |
| if ((fpToBits(op) & qnan) != qnan) |
| fpscr.ioc = 1; |
| return bitsToFp(qnan, junk); |
| } else if (fpClass == FP_INFINITE) { |
| return bitsToFp(std::signbit(op) << 31, junk); |
| } else if (fpClass == FP_ZERO) { |
| fpscr.dzc = 1; |
| // Return infinity with the same sign as the operand. |
| return bitsToFp((std::signbit(op) << 31) | |
| (0xFF << 23) | (0 << 0), junk); |
| } else if (fabs(op) >= pow(2.0, 126)) { |
| fpscr.ufc = 1; |
| return bitsToFp(std::signbit(op) << 31, junk); |
| } else { |
| uint64_t opBits = fpToBits(op); |
| double scaled; |
| scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | |
| (0x3feULL << 52) | (0ULL << 63), |
| (double)0.0); |
| uint64_t resultExp = 253 - bits(opBits, 30, 23); |
| |
| uint64_t estimate = fpToBits(recipEstimate(scaled)); |
| |
| return bitsToFp((bits(opBits, 31) << 31) | |
| (bits(resultExp, 7, 0) << 23) | |
| (bits(estimate, 51, 29) << 0), junk); |
| } |
| } |
| |
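// Fixed point variant of the reciprocal estimate, used by the Neon
// integer form of VRECPE. Inputs below 0x80000000 are out of range and
// produce a result of all ones.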
| uint32_t |
| unsignedRecipEstimate(uint32_t op) |
| { |
| if (bits(op, 31) == 0) { |
| return -1; |
| } else { |
| double dpOp; |
| dpOp = bitsToFp((0ULL << 63) | |
| (0x3feULL << 52) | |
| (bits((uint64_t)op, 30, 0) << 21) | |
| (0 << 0), (double)0.0); |
| uint64_t estimate = fpToBits(recipEstimate(dpOp)); |
| return (1 << 31) | bits(estimate, 51, 21); |
| } |
| } |
| |
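// Build the "standard FPSCR value" used by most Neon data processing
// instructions: default NaN and flush-to-zero forced on, with the AHP
// and FZ16 controls inherited from the actual FPSCR.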
| FPSCR |
| fpStandardFPSCRValue(const FPSCR &fpscr) |
| { |
| FPSCR new_fpscr(0); |
| new_fpscr.ahp = fpscr.ahp; |
| new_fpscr.dn = 1; |
| new_fpscr.fz = 1; |
| new_fpscr.fz16 = fpscr.fz16; |
| return new_fpscr; |
| }; |
| |
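// Apply the ARM NaN propagation rules to a pair of operands. If either
// operand is a NaN, done is set and the NaN to return is chosen (with
// IOC raised for signaling NaNs); otherwise done is cleared and the
// caller computes the actual result.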
| template <class fpType> |
| fpType |
| FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, |
| fpType op1, fpType op2) const |
| { |
| done = true; |
| fpType junk = 0.0; |
| fpType dest = 0.0; |
| const bool single = (sizeof(fpType) == sizeof(float)); |
| const uint64_t qnan = |
| single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan1 = std::isnan(op1); |
| const bool nan2 = std::isnan(op2); |
| const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); |
| const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); |
| if (nan1 || nan2) { |
| if (defaultNan) { |
| dest = bitsToFp(qnan, junk); |
| } else if (signal1) { |
| dest = bitsToFp(fpToBits(op1) | qnan, junk); |
| } else if (signal2) { |
| dest = bitsToFp(fpToBits(op2) | qnan, junk); |
| } else if (nan1) { |
| dest = op1; |
| } else if (nan2) { |
| dest = op2; |
| } |
| if (signal1 || signal2) { |
| fpscr.ioc = 1; |
| } |
| } else { |
| done = false; |
| } |
| return dest; |
| } |
| |
| template |
| float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, |
| float op1, float op2) const; |
| template |
| double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, |
| double op1, double op2) const; |
| |
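// ternaryOp(), binaryOp(), and unaryOp() below share the same structure:
// flush subnormal inputs if requested, set up the host FP environment,
// evaluate func between asm barriers, then correct the NaN result and
// the underflow-before-rounding difference between ARM and x86 before
// transferring the host exception flags into the FPSCR.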
// @TODO Remove this function when we've finished switching all FMA code
// to use the new FPLIB.
| template <class fpType> |
| fpType |
| FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3, |
| fpType (*func)(fpType, fpType, fpType), |
| bool flush, bool defaultNan, uint32_t rMode) const |
| { |
| const bool single = (sizeof(fpType) == sizeof(float)); |
| fpType junk = 0.0; |
| |
| if (flush && (flushToZero(op1, op2) || flushToZero(op3))) |
| fpscr.idc = 1; |
| VfpSavedState state = prepFpState(rMode); |
| __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state) |
| : "m" (op1), "m" (op2), "m" (op3), "m" (state)); |
| fpType dest = func(op1, op2, op3); |
| __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); |
| |
| int fpClass = std::fpclassify(dest); |
    // Get NaN behavior right. This varies between x86 and ARM.
| if (fpClass == FP_NAN) { |
| const uint64_t qnan = |
| single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan1 = std::isnan(op1); |
| const bool nan2 = std::isnan(op2); |
| const bool nan3 = std::isnan(op3); |
| const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); |
| const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); |
| const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan); |
| if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) { |
| dest = bitsToFp(qnan, junk); |
| } else if (signal1) { |
| dest = bitsToFp(fpToBits(op1) | qnan, junk); |
| } else if (signal2) { |
| dest = bitsToFp(fpToBits(op2) | qnan, junk); |
| } else if (signal3) { |
| dest = bitsToFp(fpToBits(op3) | qnan, junk); |
| } else if (nan1) { |
| dest = op1; |
| } else if (nan2) { |
| dest = op2; |
| } else if (nan3) { |
| dest = op3; |
| } |
| } else if (flush && flushToZero(dest)) { |
| feraiseexcept(FeUnderflow); |
| } else if (( |
| (single && (dest == bitsToFp(0x00800000, junk) || |
| dest == bitsToFp(0x80800000, junk))) || |
| (!single && |
| (dest == bitsToFp(0x0010000000000000ULL, junk) || |
| dest == bitsToFp(0x8010000000000000ULL, junk))) |
| ) && rMode != VfpRoundZero) { |
| /* |
| * Correct for the fact that underflow is detected -before- rounding |
| * in ARM and -after- rounding in x86. |
| */ |
| fesetround(FeRoundZero); |
| __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3) |
| : "m" (op1), "m" (op2), "m" (op3)); |
        fpType temp = func(op1, op2, op3);
| __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); |
| if (flush && flushToZero(temp)) { |
| dest = temp; |
| } |
| } |
| finishVfp(fpscr, state, flush); |
| return dest; |
| } |
| |
| template |
| float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3, |
| float (*func)(float, float, float), |
| bool flush, bool defaultNan, uint32_t rMode) const; |
| template |
| double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3, |
| double (*func)(double, double, double), |
| bool flush, bool defaultNan, uint32_t rMode) const; |
| |
| template <class fpType> |
| fpType |
| FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, |
| fpType (*func)(fpType, fpType), |
| bool flush, bool defaultNan, uint32_t rMode) const |
| { |
| const bool single = (sizeof(fpType) == sizeof(float)); |
| fpType junk = 0.0; |
| |
| if (flush && flushToZero(op1, op2)) |
| fpscr.idc = 1; |
| VfpSavedState state = prepFpState(rMode); |
| __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state) |
| : "m" (op1), "m" (op2), "m" (state)); |
| fpType dest = func(op1, op2); |
| __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); |
| |
    // Get NaN behavior right. This varies between x86 and ARM.
| if (std::isnan(dest)) { |
| const uint64_t qnan = |
| single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan1 = std::isnan(op1); |
| const bool nan2 = std::isnan(op2); |
| const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); |
| const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); |
| if ((!nan1 && !nan2) || (defaultNan == 1)) { |
| dest = bitsToFp(qnan, junk); |
| } else if (signal1) { |
| dest = bitsToFp(fpToBits(op1) | qnan, junk); |
| } else if (signal2) { |
| dest = bitsToFp(fpToBits(op2) | qnan, junk); |
| } else if (nan1) { |
| dest = op1; |
| } else if (nan2) { |
| dest = op2; |
| } |
| } else if (flush && flushToZero(dest)) { |
| feraiseexcept(FeUnderflow); |
| } else if (( |
| (single && (dest == bitsToFp(0x00800000, junk) || |
| dest == bitsToFp(0x80800000, junk))) || |
| (!single && |
| (dest == bitsToFp(0x0010000000000000ULL, junk) || |
| dest == bitsToFp(0x8010000000000000ULL, junk))) |
| ) && rMode != VfpRoundZero) { |
| /* |
| * Correct for the fact that underflow is detected -before- rounding |
| * in ARM and -after- rounding in x86. |
| */ |
| fesetround(FeRoundZero); |
| __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2) |
| : "m" (op1), "m" (op2)); |
| fpType temp = func(op1, op2); |
| __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); |
| if (flush && flushToZero(temp)) { |
| dest = temp; |
| } |
| } |
| finishVfp(fpscr, state, flush); |
| return dest; |
| } |
| |
| template |
| float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, |
| float (*func)(float, float), |
| bool flush, bool defaultNan, uint32_t rMode) const; |
| template |
| double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, |
| double (*func)(double, double), |
| bool flush, bool defaultNan, uint32_t rMode) const; |
| |
| template <class fpType> |
| fpType |
| FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), |
| bool flush, uint32_t rMode) const |
| { |
| const bool single = (sizeof(fpType) == sizeof(float)); |
| fpType junk = 0.0; |
| |
| if (flush && flushToZero(op1)) |
| fpscr.idc = 1; |
| VfpSavedState state = prepFpState(rMode); |
| __asm__ __volatile__ ("" : "=m" (op1), "=m" (state) |
| : "m" (op1), "m" (state)); |
| fpType dest = func(op1); |
| __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); |
| |
    // Get NaN behavior right. This varies between x86 and ARM.
| if (std::isnan(dest)) { |
| const uint64_t qnan = |
| single ? 0x7fc00000 : 0x7ff8000000000000ULL; |
| const bool nan = std::isnan(op1); |
| if (!nan || fpscr.dn == 1) { |
| dest = bitsToFp(qnan, junk); |
| } else if (nan) { |
| dest = bitsToFp(fpToBits(op1) | qnan, junk); |
| } |
| } else if (flush && flushToZero(dest)) { |
| feraiseexcept(FeUnderflow); |
| } else if (( |
| (single && (dest == bitsToFp(0x00800000, junk) || |
| dest == bitsToFp(0x80800000, junk))) || |
| (!single && |
| (dest == bitsToFp(0x0010000000000000ULL, junk) || |
| dest == bitsToFp(0x8010000000000000ULL, junk))) |
| ) && rMode != VfpRoundZero) { |
| /* |
| * Correct for the fact that underflow is detected -before- rounding |
| * in ARM and -after- rounding in x86. |
| */ |
| fesetround(FeRoundZero); |
| __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1)); |
| fpType temp = func(op1); |
| __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); |
| if (flush && flushToZero(temp)) { |
| dest = temp; |
| } |
| } |
| finishVfp(fpscr, state, flush); |
| return dest; |
| } |
| |
| template |
| float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float), |
| bool flush, uint32_t rMode) const; |
| template |
| double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double), |
| bool flush, uint32_t rMode) const; |
| |
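// Step a register index by the FPSCR vector stride, wrapping within its
// bank of eight single precision registers. The stride is doubled for
// wide (double precision) operations, since those registers are indexed
// as pairs of single precision registers here.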
| RegIndex |
| VfpMacroOp::addStride(RegIndex idx, unsigned stride) |
| { |
| if (wide) { |
| stride *= 2; |
| } |
| unsigned offset = idx % 8; |
| idx = (RegIndex)(idx - offset); |
| offset += stride; |
| idx = (RegIndex)(idx + (offset % 8)); |
| return idx; |
| } |
| |
| void |
| VfpMacroOp::nextIdxs(RegIndex &dest, RegIndex &op1, RegIndex &op2) |
| { |
| unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; |
| assert(!inScalarBank(dest)); |
| dest = addStride(dest, stride); |
| op1 = addStride(op1, stride); |
| if (!inScalarBank(op2)) { |
| op2 = addStride(op2, stride); |
| } |
| } |
| |
| void |
| VfpMacroOp::nextIdxs(RegIndex &dest, RegIndex &op1) |
| { |
| unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; |
| assert(!inScalarBank(dest)); |
| dest = addStride(dest, stride); |
| if (!inScalarBank(op1)) { |
| op1 = addStride(op1, stride); |
| } |
| } |
| |
| void |
| VfpMacroOp::nextIdxs(RegIndex &dest) |
| { |
| unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; |
| assert(!inScalarBank(dest)); |
| dest = addStride(dest, stride); |
| } |
| |
| } // namespace ArmISA |
| } // namespace gem5 |