| // -*- mode: c++ -*- |
| |
| // Copyright (c) 2012-2014 ARM Limited |
| // All rights reserved |
| // |
| // The license below extends only to copyright in the software and shall |
| // not be construed as granting a license to any other intellectual |
| // property including but not limited to intellectual property relating |
| // to a hardware implementation of the functionality of the software |
| // licensed hereunder. You may use the software subject to the license |
| // terms below provided that you ensure that this notice is replicated |
| // unmodified and in its entirety in all distributions of the software, |
| // modified or unmodified, in source code or in binary form. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer; |
| // redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution; |
| // neither the name of the copyright holders nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| // |
| // Authors: Mbou Eyole |
| // Giacomo Gabrielli |
| |
| let {{ |
| |
# Text accumulators that the generator functions in this block append to;
# all three start out empty.
| header_output = '' |
| decoder_output = '' |
| exec_output = '' |
| |
# C++ snippet template with a single %s placeholder. The substituted value
# becomes the first argument of ArmISA::ISA::zeroSveVecRegUpperPart(); the
# second argument is the result of getCurSveVecLen<uint64_t>() for the
# current thread context.
| zeroSveVecRegUpperPartCode = ''' |
| ArmISA::ISA::zeroSveVecRegUpperPart(%s, |
| ArmStaticInst::getCurSveVecLen<uint64_t>(xc->tcBase())); |
| ''' |
| |
| def mkMemAccMicroOp(name): |
# Generate the NEON load and store memory-access micro-ops.
#
# Builds two InstObjParams objects, 'MicroNeonLoad64' (mnemonic name + 'ld')
# and 'MicroNeonStore64' (mnemonic name + 'st'), both with base class
# 'MicroNeonMemOp', and appends the text produced by the NeonLoad*64 /
# NeonStore*64 templates to exec_output and by MicroNeonMemDeclare64 to
# header_output.
#
# name -- mnemonic prefix shared by the load and the store micro-op.
#
# decoder_output is listed in the global statement but is not modified here.
| global header_output, decoder_output, exec_output |
# Snippet that returns an SPAlignmentFault when baseIsSP is set,
# bits(XURa, 3, 0) is non-zero and SPAlignmentCheckEnabled() is true.
| SPAlignmentCheckCodeNeon = ''' |
| if (baseIsSP && bits(XURa, 3, 0) && |
| SPAlignmentCheckEnabled(xc->tcBase())) { |
| return std::make_shared<SPAlignmentFault>(); |
| } |
| ''' |
# Effective-address code: the alignment check above followed by
# EA = XURa + imm.
| eaCode = SPAlignmentCheckCodeNeon + ''' |
| EA = XURa + imm; |
| ''' |
# Declaration of the 16-byte buffer type used for the access; the union
# exposes the same storage as 16 bytes or as four 32-bit words.
| memDecl = ''' |
| const int MaxNumBytes = 16; |
| union MemUnion { |
| uint8_t bytes[MaxNumBytes]; |
| uint32_t floatRegBits[MaxNumBytes / 4]; |
| }; |
| ''' |
| |
| # Do endian conversion for all the elements |
# The snippet packs the four 32-bit words of memUnion into a VReg (lo/hi),
# rewrites every element through gtobe() or gtole() depending on
# isBigEndian64(), using the element width selected by eSize (8-bit elements
# are left as they are), and finally unpacks the VReg back into
# memUnion.floatRegBits.
| convCode = ''' |
| VReg x = {0, 0}; |
| |
| x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) | |
| (XReg) memUnion.floatRegBits[0]; |
| x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) | |
| (XReg) memUnion.floatRegBits[2]; |
| |
| const unsigned eCount = 16 / (1 << eSize); |
| |
| if (isBigEndian64(xc->tcBase())) { |
| for (unsigned i = 0; i < eCount; i++) { |
| switch (eSize) { |
| case 0x3: // 64-bit |
| writeVecElem(&x, (XReg) gtobe( |
| (uint64_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| case 0x2: // 32-bit |
| writeVecElem(&x, (XReg) gtobe( |
| (uint32_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| case 0x1: // 16-bit |
| writeVecElem(&x, (XReg) gtobe( |
| (uint16_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| default: // 8-bit |
| break; // Nothing to do here |
| } |
| } |
| } else { |
| for (unsigned i = 0; i < eCount; i++) { |
| switch (eSize) { |
| case 0x3: // 64-bit |
| writeVecElem(&x, (XReg) gtole( |
| (uint64_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| case 0x2: // 32-bit |
| writeVecElem(&x, (XReg) gtole( |
| (uint32_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| case 0x1: // 16-bit |
| writeVecElem(&x, (XReg) gtole( |
| (uint16_t) readVecElem(x, i, eSize)), i, eSize); |
| break; |
| default: // 8-bit |
| break; // Nothing to do here |
| } |
| } |
| } |
| |
| memUnion.floatRegBits[0] = (uint32_t) x.lo; |
| memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32); |
| memUnion.floatRegBits[2] = (uint32_t) x.hi; |
| memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32); |
| ''' |
| |
| # Offload everything into registers |
# One assignment per 32-bit word: AA64FpDestP<reg>_uw receives
# gtoh(memUnion.floatRegBits[<reg>]) for reg = 0..3.
| regSetCode = '' |
| for reg in range(4): |
| regSetCode += ''' |
| AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]); |
| ''' % { 'reg' : reg } |
| |
| # Pull everything in from registers |
# Mirror image of regSetCode: memUnion.floatRegBits[<reg>] receives
# htog(AA64FpDestP<reg>_uw) for reg = 0..3.
| regGetCode = '' |
| for reg in range(4): |
| regGetCode += ''' |
| memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); |
| ''' % { 'reg' : reg } |
| |
# Loads convert the buffer first and then write the registers; stores read
# the registers first and then convert the buffer.
| loadMemAccCode = convCode + regSetCode |
| storeMemAccCode = regGetCode + convCode |
| |
# simd64EnabledCheckCode is not defined in this part of the file; it is
# prepended to the effective-address code of both micro-ops.
| loadIop = InstObjParams(name + 'ld', |
| 'MicroNeonLoad64', |
| 'MicroNeonMemOp', |
| { 'mem_decl' : memDecl, |
| 'memacc_code' : loadMemAccCode, |
| 'ea_code' : simd64EnabledCheckCode + eaCode, |
| }, |
| [ 'IsMicroop', 'IsMemRef', 'IsLoad' ]) |
# Only the load gets the SVE upper-part zeroing snippet appended to its
# memacc_code, with AA64FpDest substituted for the %s placeholder.
| loadIop.snippets["memacc_code"] += zeroSveVecRegUpperPartCode % \ |
| "AA64FpDest" |
| storeIop = InstObjParams(name + 'st', |
| 'MicroNeonStore64', |
| 'MicroNeonMemOp', |
| { 'mem_decl' : memDecl, |
| 'memacc_code' : storeMemAccCode, |
| 'ea_code' : simd64EnabledCheckCode + eaCode, |
| }, |
| [ 'IsMicroop', 'IsMemRef', 'IsStore' ]) |
| |
# Emit execute / initiateAcc / completeAcc text for both micro-ops, then
# their class declarations.
| exec_output += NeonLoadExecute64.subst(loadIop) + \ |
| NeonLoadInitiateAcc64.subst(loadIop) + \ |
| NeonLoadCompleteAcc64.subst(loadIop) + \ |
| NeonStoreExecute64.subst(storeIop) + \ |
| NeonStoreInitiateAcc64.subst(storeIop) + \ |
| NeonStoreCompleteAcc64.subst(storeIop) |
| header_output += MicroNeonMemDeclare64.subst(loadIop) + \ |
| MicroNeonMemDeclare64.subst(storeIop) |
| |
| def mkMarshalMicroOp(name, Name, numRegs=4): |
# Generate one data-marshalling micro-op and append its declaration and
# execute text to header_output / exec_output.
#
# name    -- mnemonic; it also selects which C++ body is generated. The
#            handled values are 'deint_neon_uop', 'int_neon_uop',
#            'unpack_neon_uop' and 'pack_neon_uop'. There is no else branch,
#            so any other value generates nothing.
# Name    -- class name passed to InstObjParams.
# numRegs -- number of input[] entries that the generated code fills from
#            the AA64FpOp1 operands (default 4).
#
# NOTE(review): the generated C++ also refers to identifiers called numRegs,
# step, dataSize, eSize, numStructElems, lane and replicate; presumably these
# are provided by the micro-op base classes -- confirm against their
# declarations.
#
# decoder_output is listed in the global statement but is not modified here.
| global header_output, decoder_output, exec_output |
| |
# Snippet that copies the 32-bit pieces AA64FpOp1P<p>V<v>_uw (p = 0..3) into
# input[v] for every v below numRegs.
| getInputCodeOp1L = '' |
| for v in range(numRegs): |
| for p in range(4): |
| getInputCodeOp1L += ''' |
| writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw, |
| %(p)d, 0x2); |
| ''' % { 'v' : v, 'p' : p } |
| |
# Same as above, but reading the operands named AA64FpOp1P<p>V<v>S_uw.
| getInputCodeOp1S = '' |
| for v in range(numRegs): |
| for p in range(4): |
| getInputCodeOp1S += ''' |
| writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw, |
| %(p)d, 0x2); |
| ''' % { 'v' : v, 'p' : p } |
| |
| if name == 'deint_neon_uop': |
| |
| eCode = ''' |
| // input data from scratch area |
| VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; |
| VReg output[2]; // output data to arch. SIMD regs |
| VReg temp; |
| temp.lo = 0; |
| temp.hi = 0; |
| ''' |
# Save the current contents of AA64FpDestP<p>V1L_uw in temp; the final else
# branch of the generated code writes temp back to that register.
| for p in range(4): |
| eCode += ''' |
| writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2); |
| ''' % { 'p' : p } |
| eCode += getInputCodeOp1L |
| |
| # Note that numRegs is not always the same as numStructElems; in |
| # particular, for LD1/ST1, numStructElems is 1 but numRegs can be |
| # 1, 2, 3 or 4 |
| |
| eCode += ''' |
| output[0].lo = 0; |
| output[0].hi = 0; |
| output[1].lo = 0; |
| output[1].hi = 0; |
| |
| int eCount = dataSize / (8 << eSize); |
| int eSizeBytes = 1 << eSize; // element size in bytes |
| int numBytes = step * dataSize / 4; |
| int totNumBytes = numRegs * dataSize / 8; |
| |
| int structElemNo, pos, a, b; |
| XReg data; |
| |
| for (int r = 0; r < 2; ++r) { |
| for (int i = 0; i < eCount; ++i) { |
| if (numBytes < totNumBytes) { |
| structElemNo = r + (step * 2); |
| if (numStructElems == 1) { |
| pos = (eSizeBytes * i) + |
| (eCount * structElemNo * eSizeBytes); |
| } else { |
| pos = (numStructElems * eSizeBytes * i) + |
| (structElemNo * eSizeBytes); |
| } |
| a = pos / 16; |
| b = (pos % 16) / eSizeBytes; |
| data = (XReg) readVecElem(input[a], (XReg) b, |
| eSize); |
| writeVecElem(&output[r], data, i, eSize); |
| numBytes += eSizeBytes; |
| } |
| } |
| } |
| ''' |
# output[0] is always written to AA64FpDestP<p>V0L_uw.
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0], |
| %(p)d, 0x2); |
| ''' % { 'p' : p } |
# AA64FpDestP<p>V1L_uw receives output[1] only when numRegs is even or when
# numRegs == 3 and step == 0; otherwise the value saved in temp is written
# back to it.
| eCode += ''' |
| if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) { |
| ''' |
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem( |
| output[1], %(p)d, 0x2); |
| ''' % { 'p' : p } |
| eCode += ''' |
| } else { |
| ''' |
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp, |
| %(p)d, 0x2); |
| ''' % { 'p' : p } |
| eCode += ''' |
| } |
| ''' |
| |
| iop = InstObjParams(name, Name, 'MicroNeonMixOp64', |
| { 'code' : eCode, 'op_class' : 'No_OpClass' }, |
| ['IsMicroop']) |
| header_output += MicroNeonMixDeclare64.subst(iop) |
| exec_output += MicroNeonMixExecute64.subst(iop) |
| |
| elif name == 'int_neon_uop': |
| |
| eCode = ''' |
| // input data from arch. SIMD regs |
| VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; |
| VReg output[2]; // output data to scratch area |
| ''' |
| |
| eCode += getInputCodeOp1S |
| |
| # Note that numRegs is not always the same as numStructElems; in |
| # particular, for LD1/ST1, numStructElems is 1 but numRegs can be |
| # 1, 2, 3 or 4 |
| |
# The generated loop walks byte positions from step * 32 up to (but not
# including) step * 32 + 32 in increments of eSizeBytes, and for every
# position below totNumBytes copies one element from input[j] into the next
# free slot of output[0], moving on to output[1] once numOutputElems
# elements have been written.
| eCode += ''' |
| int eCount = dataSize / (8 << eSize); |
| int eSizeBytes = 1 << eSize; |
| int totNumBytes = numRegs * dataSize / 8; |
| int numOutputElems = 128 / (8 << eSize); |
| int stepOffset = step * 32; |
| |
| for (int i = 0; i < 2; ++i) { |
| output[i].lo = 0; |
| output[i].hi = 0; |
| } |
| |
| int r = 0, k = 0, i, j; |
| XReg data; |
| |
| for (int pos = stepOffset; pos < 32 + stepOffset; |
| pos += eSizeBytes) { |
| if (pos < totNumBytes) { |
| if (numStructElems == 1) { |
| i = (pos / eSizeBytes) % eCount; |
| j = pos / (eCount * eSizeBytes); |
| } else { |
| i = pos / (numStructElems * eSizeBytes); |
| j = (pos % (numStructElems * eSizeBytes)) / |
| eSizeBytes; |
| } |
| data = (XReg) readVecElem(input[j], (XReg) i, eSize); |
| writeVecElem(&output[r], data, k, eSize); |
| k++; |
| if (k == numOutputElems){ |
| k = 0; |
| ++r; |
| } |
| } |
| } |
| ''' |
# Both output registers are written back to AA64FpDestP<p>V<v>_uw.
| for v in range(2): |
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( |
| output[%(v)d], %(p)d, 0x2); |
| ''' % { 'v': v, 'p': p} |
| |
| iop = InstObjParams(name, Name, 'MicroNeonMixOp64', |
| { 'code' : eCode, 'op_class' : 'No_OpClass' }, |
| ['IsMicroop']) |
| header_output += MicroNeonMixDeclare64.subst(iop) |
| exec_output += MicroNeonMixExecute64.subst(iop) |
| |
| elif name == 'unpack_neon_uop': |
| |
| eCode = ''' |
| //input data from scratch area |
| VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; |
| //output data to arch. SIMD regs |
| VReg output[2] = { {0, 0}, {0, 0} }; |
| ''' |
| |
| eCode += getInputCodeOp1L |
| |
| # Fill output regs with register data initially. Note that |
| # elements in output register outside indexed lanes are left |
| # untouched |
| for v in range(2): |
| for p in range(4): |
| eCode += ''' |
| writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw, |
| %(p)d, 0x2); |
| ''' % { 'v': v, 'p': p} |
# The generated loop handles up to two elements per step. When replicate is
# set, the element is written to the first eCount slots of the output
# register and the remaining slots up to eCount128 are zeroed; otherwise
# only slot `lane` is written.
| eCode += ''' |
| int eCount = dataSize / (8 << eSize); |
| int eCount128 = 128 / (8 << eSize); |
| int eSizeBytes = 1 << eSize; |
| int totNumBytes = numStructElems * eSizeBytes; |
| int numInputElems = eCount128; |
| int stepOffset = step * 2 * eSizeBytes; |
| int stepLimit = 2 * eSizeBytes; |
| |
| int r = 0, i, j; |
| XReg data; |
| |
| for (int pos = stepOffset; pos < stepLimit + stepOffset; |
| pos += eSizeBytes) { |
| if (pos < totNumBytes) { |
| r = pos / eSizeBytes; |
| j = r / numInputElems; |
| i = r % numInputElems; |
| data = (XReg) readVecElem(input[j], (XReg) i, eSize); |
| |
| if (replicate) { |
| for (int i = 0; i < eCount128; ++i) { |
| if (i < eCount) { |
| writeVecElem(&output[r % 2], data, i, |
| eSize); |
| } else { // zero extend if necessary |
| writeVecElem(&output[r % 2], (XReg) 0, i, |
| eSize); |
| } |
| } |
| } else { |
| writeVecElem(&output[r % 2], data, lane, eSize); |
| } |
| } |
| } |
| ''' |
# Both output registers are written back to AA64FpDestP<p>V<v>L_uw.
| for v in range(2): |
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem( |
| output[%(v)d], %(p)d, 0x2); |
| ''' % { 'v' : v, 'p' : p } |
| |
# The lane variants use the MicroNeonMixLaneOp64 base class and the
# MicroNeonMixLaneDeclare64 template but share MicroNeonMixExecute64 with
# the non-lane variants; their snippet dictionary has no 'op_class' entry.
| iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', |
| { 'code' : eCode }, ['IsMicroop']) |
| header_output += MicroNeonMixLaneDeclare64.subst(iop) |
| exec_output += MicroNeonMixExecute64.subst(iop) |
| |
| elif name == 'pack_neon_uop': |
| |
| eCode = ''' |
| // input data from arch. SIMD regs |
| VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; |
| VReg output[2]; // output data to scratch area |
| ''' |
| |
| eCode += getInputCodeOp1S |
| |
# The generated loop reads element `lane` of input[j] for each byte position
# below totNumBytes and stores it in slot i of output[r % 2].
| eCode += ''' |
| int eSizeBytes = 1 << eSize; |
| int numOutputElems = 128 / (8 << eSize); |
| int totNumBytes = numStructElems * eSizeBytes; |
| int stepOffset = step * 32; |
| int stepLimit = 32; |
| |
| int r = 0, i, j; |
| XReg data; |
| |
| for (int i = 0; i < 2; ++i) { |
| output[i].lo = 0; |
| output[i].hi = 0; |
| } |
| |
| for (int pos = stepOffset; pos < stepLimit + stepOffset; |
| pos += eSizeBytes) { |
| if (pos < totNumBytes) { |
| r = pos / 16; |
| j = pos / eSizeBytes; |
| i = (pos / eSizeBytes) % numOutputElems; |
| data = (XReg) readVecElem(input[j], lane, eSize); |
| writeVecElem(&output[r % 2], data, i, eSize); |
| } |
| } |
| ''' |
| |
# Both output registers are written back to AA64FpDestP<p>V<v>_uw.
| for v in range(2): |
| for p in range(4): |
| eCode += ''' |
| AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( |
| output[%(v)d], %(p)d, 0x2); |
| ''' % { 'v' : v, 'p' : p } |
| |
| iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', |
| { 'code' : eCode }, ['IsMicroop']) |
| header_output += MicroNeonMixLaneDeclare64.subst(iop) |
| exec_output += MicroNeonMixExecute64.subst(iop) |
| |
| # Generate instructions |
# One memory-access micro-op pair, then the de-interleave and interleave
# micro-ops in four variants each (numRegs = 1..4), and finally the unpack
# and pack micro-ops, which use the default numRegs of 4.
| mkMemAccMicroOp('mem_neon_uop') |
| mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1) |
| mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2) |
| mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3) |
| mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4) |
| mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1) |
| mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2) |
| mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3) |
| mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4) |
| mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64') |
| mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64') |
| |
| }}; |
| |
| let {{ |
| |
# Declarations and constructors for the four instructions VldMult64,
# VstMult64, VldSingle64 and VstSingle64. Each InstObjParams is created with
# an empty code string and an empty flag list. The multiple-structure forms
# use the VMemMult* templates and the single-structure forms use the
# VMemSingle* templates. Nothing is appended to exec_output here.
| iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', []) |
| header_output += VMemMultDeclare64.subst(iop) |
| decoder_output += VMemMultConstructor64.subst(iop) |
| |
| iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', []) |
| header_output += VMemMultDeclare64.subst(iop) |
| decoder_output += VMemMultConstructor64.subst(iop) |
| |
| iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', []) |
| header_output += VMemSingleDeclare64.subst(iop) |
| decoder_output += VMemSingleConstructor64.subst(iop) |
| |
| iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', []) |
| header_output += VMemSingleDeclare64.subst(iop) |
| decoder_output += VMemSingleConstructor64.subst(iop) |
| |
| }}; |