| ;***************************************************************************** |
| ;* sad-a.asm: h264 encoder library |
| ;***************************************************************************** |
| ;* Copyright (C) 2003-2008 x264 project |
| ;* |
| ;* Authors: Loren Merritt <lorenm@u.washington.edu> |
| ;* Laurent Aimar <fenrir@via.ecp.fr> |
| ;* Jason Garrett-Glaser <darkshikari@gmail.com> |
| ;* Alex Izvorski <aizvorksi@gmail.com> |
| ;* |
| ;* This program is free software; you can redistribute it and/or modify |
| ;* it under the terms of the GNU General Public License as published by |
| ;* the Free Software Foundation; either version 2 of the License, or |
| ;* (at your option) any later version. |
| ;* |
| ;* This program is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| ;* GNU General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU General Public License |
| ;* along with this program; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| ;***************************************************************************** |
| |
| %include "x86inc.asm" |
| %include "x86util.asm" |
| |
| SECTION_RODATA |
pb_3:  times 16 db 3 ; pshufb control used by SPLATB_SSSE3 to broadcast one byte
sw_64: dd 64         ; constant 64, for computing complementary shift counts in the mmxext cacheline-split sads
| |
| SECTION .text |
| |
| ;============================================================================= |
| ; SAD MMX |
| ;============================================================================= |
| |
| %macro SAD_INC_2x16P 0 |
| movq mm1, [r0] |
| movq mm2, [r0+8] |
| movq mm3, [r0+r1] |
| movq mm4, [r0+r1+8] |
| psadbw mm1, [r2] |
| psadbw mm2, [r2+8] |
| psadbw mm3, [r2+r3] |
| psadbw mm4, [r2+r3+8] |
| lea r0, [r0+2*r1] |
| paddw mm1, mm2 |
| paddw mm3, mm4 |
| lea r2, [r2+2*r3] |
| paddw mm0, mm1 |
| paddw mm0, mm3 |
| %endmacro |
| |
| %macro SAD_INC_2x8P 0 |
| movq mm1, [r0] |
| movq mm2, [r0+r1] |
| psadbw mm1, [r2] |
| psadbw mm2, [r2+r3] |
| lea r0, [r0+2*r1] |
| paddw mm0, mm1 |
| paddw mm0, mm2 |
| lea r2, [r2+2*r3] |
| %endmacro |
| |
| %macro SAD_INC_2x4P 0 |
| movd mm1, [r0] |
| movd mm2, [r2] |
| punpckldq mm1, [r0+r1] |
| punpckldq mm2, [r2+r3] |
| psadbw mm1, mm2 |
| paddw mm0, mm1 |
| lea r0, [r0+2*r1] |
| lea r2, [r2+2*r3] |
| %endmacro |
| |
| ;----------------------------------------------------------------------------- |
; int x264_pixel_sad_16x16_mmxext( uint8_t *pix1, int i_stride_pix1, uint8_t *pix2, int i_stride_pix2 )
| ;----------------------------------------------------------------------------- |
| %macro SAD 2 |
| cglobal x264_pixel_sad_%1x%2_mmxext, 4,4 |
| pxor mm0, mm0 |
| %rep %2/2 |
| SAD_INC_2x%1P |
| %endrep |
| movd eax, mm0 |
| RET |
| %endmacro |
| |
| SAD 16, 16 |
| SAD 16, 8 |
| SAD 8, 16 |
| SAD 8, 8 |
| SAD 8, 4 |
| SAD 4, 8 |
| SAD 4, 4 |
| |
| |
| |
| ;============================================================================= |
| ; SAD XMM |
| ;============================================================================= |
| |
| %macro SAD_END_SSE2 0 |
| movhlps m1, m0 |
| paddw m0, m1 |
| movd eax, m0 |
| RET |
| %endmacro |
| |
| %macro SAD_W16 1 |
| ;----------------------------------------------------------------------------- |
; int x264_pixel_sad_16x16_sse2( uint8_t *pix1, int i_stride_pix1, uint8_t *pix2, int i_stride_pix2 )
| ;----------------------------------------------------------------------------- |
| cglobal x264_pixel_sad_16x16_%1, 4,4 |
| movdqu m0, [r2] |
| movdqu m1, [r2+r3] |
| lea r2, [r2+2*r3] |
| movdqu m2, [r2] |
| movdqu m3, [r2+r3] |
| lea r2, [r2+2*r3] |
| psadbw m0, [r0] |
| psadbw m1, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m4, [r2] |
| paddw m0, m1 |
| psadbw m2, [r0] |
| psadbw m3, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m5, [r2+r3] |
| lea r2, [r2+2*r3] |
| paddw m2, m3 |
| movdqu m6, [r2] |
| movdqu m7, [r2+r3] |
| lea r2, [r2+2*r3] |
| paddw m0, m2 |
| psadbw m4, [r0] |
| psadbw m5, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m1, [r2] |
| paddw m4, m5 |
| psadbw m6, [r0] |
| psadbw m7, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m2, [r2+r3] |
| lea r2, [r2+2*r3] |
| paddw m6, m7 |
| movdqu m3, [r2] |
| paddw m0, m4 |
| movdqu m4, [r2+r3] |
| lea r2, [r2+2*r3] |
| paddw m0, m6 |
| psadbw m1, [r0] |
| psadbw m2, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m5, [r2] |
| paddw m1, m2 |
| psadbw m3, [r0] |
| psadbw m4, [r0+r1] |
| lea r0, [r0+2*r1] |
| movdqu m6, [r2+r3] |
| lea r2, [r2+2*r3] |
| paddw m3, m4 |
| movdqu m7, [r2] |
| paddw m0, m1 |
| movdqu m1, [r2+r3] |
| paddw m0, m3 |
| psadbw m5, [r0] |
| psadbw m6, [r0+r1] |
| lea r0, [r0+2*r1] |
| paddw m5, m6 |
| psadbw m7, [r0] |
| psadbw m1, [r0+r1] |
| paddw m7, m1 |
| paddw m0, m5 |
| paddw m0, m7 |
| SAD_END_SSE2 |
| |
| ;----------------------------------------------------------------------------- |
; int x264_pixel_sad_16x8_sse2( uint8_t *pix1, int i_stride_pix1, uint8_t *pix2, int i_stride_pix2 )
| ;----------------------------------------------------------------------------- |
| cglobal x264_pixel_sad_16x8_%1, 4,4 |
| movdqu m0, [r2] |
| movdqu m2, [r2+r3] |
| lea r2, [r2+2*r3] |
| movdqu m3, [r2] |
| movdqu m4, [r2+r3] |
| psadbw m0, [r0] |
| psadbw m2, [r0+r1] |
| lea r0, [r0+2*r1] |
| psadbw m3, [r0] |
| psadbw m4, [r0+r1] |
| lea r0, [r0+2*r1] |
| lea r2, [r2+2*r3] |
| paddw m0, m2 |
| paddw m3, m4 |
| paddw m0, m3 |
| movdqu m1, [r2] |
| movdqu m2, [r2+r3] |
| lea r2, [r2+2*r3] |
| movdqu m3, [r2] |
| movdqu m4, [r2+r3] |
| psadbw m1, [r0] |
| psadbw m2, [r0+r1] |
| lea r0, [r0+2*r1] |
| psadbw m3, [r0] |
| psadbw m4, [r0+r1] |
| lea r0, [r0+2*r1] |
| lea r2, [r2+2*r3] |
| paddw m1, m2 |
| paddw m3, m4 |
| paddw m0, m1 |
| paddw m0, m3 |
| SAD_END_SSE2 |
| %endmacro |
| |
| INIT_XMM |
| SAD_W16 sse2 |
%define movdqu lddqu  ; sse3: lddqu in place of movdqu for the unaligned loads
SAD_W16 sse3
%define movdqu movdqa ; aligned variant: the reference is assumed 16-byte aligned too
SAD_W16 sse2_aligned
%undef movdqu
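; SAD_INC_4x8P_SSE packs two 8-pixel rows from each source into one xmm
; register (movq + movhps), so each psadbw covers two rows and every
; invocation handles four.  The first call (arg 0) starts the accumulator,
; subsequent calls (arg 1) add to it.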
| |
| %macro SAD_INC_4x8P_SSE 1 |
| movq m1, [r0] |
| movq m2, [r0+r1] |
| lea r0, [r0+2*r1] |
| movq m3, [r2] |
| movq m4, [r2+r3] |
| lea r2, [r2+2*r3] |
| movhps m1, [r0] |
| movhps m2, [r0+r1] |
| movhps m3, [r2] |
| movhps m4, [r2+r3] |
| lea r0, [r0+2*r1] |
| psadbw m1, m3 |
| psadbw m2, m4 |
| lea r2, [r2+2*r3] |
| %if %1 |
| paddw m0, m1 |
| %else |
| SWAP m0, m1 |
| %endif |
| paddw m0, m2 |
| %endmacro |
| |
; Even on Nehalem, no sizes other than 8x16 benefit from this method.
| cglobal x264_pixel_sad_8x16_sse2, 4,4 |
| SAD_INC_4x8P_SSE 0 |
| SAD_INC_4x8P_SSE 1 |
| SAD_INC_4x8P_SSE 1 |
| SAD_INC_4x8P_SSE 1 |
| SAD_END_SSE2 |
| RET |
| |
| ;----------------------------------------------------------------------------- |
; void x264_intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] )
| ;----------------------------------------------------------------------------- |
| |
; xmm7: DC prediction   xmm6: H prediction   xmm5: V prediction
; xmm4: DC pred score   xmm3: H pred score   xmm2: V pred score
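; The DC value is the rounded average of the 16 pixels above and the 16 pixels
; to the left of the block (top row summed with psadbw against zero, left
; column summed in a scalar loop), broadcast to every byte of m7.  Each .vloop
; iteration then SADs one fenc row against the three predictions and
; accumulates the scores; res[0], res[1] and res[2] receive the V, H and DC
; costs respectively.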
| %macro INTRA_SAD16 1 |
| cglobal x264_intra_sad_x3_16x16_%1,3,5 |
| pxor mm0, mm0 |
| pxor mm1, mm1 |
| psadbw mm0, [r1-FDEC_STRIDE+0] |
| psadbw mm1, [r1-FDEC_STRIDE+8] |
| paddw mm0, mm1 |
| movd r3d, mm0 |
| %ifidn %1, ssse3 |
| mova m1, [pb_3 GLOBAL] |
| %endif |
| %assign n 0 |
| %rep 16 |
| movzx r4d, byte [r1-1+FDEC_STRIDE*n] |
| add r3d, r4d |
| %assign n n+1 |
| %endrep |
| add r3d, 16 |
| shr r3d, 5 |
| imul r3d, 0x01010101 |
| movd m7, r3d |
| mova m5, [r1-FDEC_STRIDE] |
| %if mmsize==16 |
| pshufd m7, m7, 0 |
| %else |
| mova m1, [r1-FDEC_STRIDE+8] |
| punpckldq m7, m7 |
| %endif |
| pxor m4, m4 |
| pxor m3, m3 |
| pxor m2, m2 |
| mov r3d, 15*FENC_STRIDE |
| .vloop: |
| SPLATB m6, r1+r3*2-1, m1 |
| mova m0, [r0+r3] |
| psadbw m0, m7 |
| paddw m4, m0 |
| mova m0, [r0+r3] |
| psadbw m0, m5 |
| paddw m2, m0 |
| %if mmsize==8 |
| mova m0, [r0+r3] |
| psadbw m0, m6 |
| paddw m3, m0 |
| mova m0, [r0+r3+8] |
| psadbw m0, m7 |
| paddw m4, m0 |
| mova m0, [r0+r3+8] |
| psadbw m0, m1 |
| paddw m2, m0 |
| psadbw m6, [r0+r3+8] |
| paddw m3, m6 |
| %else |
| psadbw m6, [r0+r3] |
| paddw m3, m6 |
| %endif |
| add r3d, -FENC_STRIDE |
| jge .vloop |
| %if mmsize==16 |
| pslldq m3, 4 |
| por m3, m2 |
| movhlps m1, m3 |
| paddw m3, m1 |
| movq [r2+0], m3 |
| movhlps m1, m4 |
| paddw m4, m1 |
| %else |
| movd [r2+0], m2 |
| movd [r2+4], m3 |
| %endif |
| movd [r2+8], m4 |
| RET |
| %endmacro |
| |
| INIT_MMX |
| %define SPLATB SPLATB_MMX |
| INTRA_SAD16 mmxext |
| INIT_XMM |
| INTRA_SAD16 sse2 |
| %define SPLATB SPLATB_SSSE3 |
| INTRA_SAD16 ssse3 |
| |
| |
| |
| ;============================================================================= |
| ; SAD x3/x4 MMX |
| ;============================================================================= |
| |
| %macro SAD_X3_START_1x8P 0 |
| movq mm3, [r0] |
| movq mm0, [r1] |
| movq mm1, [r2] |
| movq mm2, [r3] |
| psadbw mm0, mm3 |
| psadbw mm1, mm3 |
| psadbw mm2, mm3 |
| %endmacro |
| |
| %macro SAD_X3_1x8P 2 |
| movq mm3, [r0+%1] |
| movq mm4, [r1+%2] |
| movq mm5, [r2+%2] |
| movq mm6, [r3+%2] |
| psadbw mm4, mm3 |
| psadbw mm5, mm3 |
| psadbw mm6, mm3 |
| paddw mm0, mm4 |
| paddw mm1, mm5 |
| paddw mm2, mm6 |
| %endmacro |
| |
| %macro SAD_X3_START_2x4P 3 |
| movd mm3, [r0] |
| movd %1, [r1] |
| movd %2, [r2] |
| movd %3, [r3] |
| punpckldq mm3, [r0+FENC_STRIDE] |
| punpckldq %1, [r1+r4] |
| punpckldq %2, [r2+r4] |
| punpckldq %3, [r3+r4] |
| psadbw %1, mm3 |
| psadbw %2, mm3 |
| psadbw %3, mm3 |
| %endmacro |
| |
| %macro SAD_X3_2x16P 1 |
| %if %1 |
| SAD_X3_START_1x8P |
| %else |
| SAD_X3_1x8P 0, 0 |
| %endif |
| SAD_X3_1x8P 8, 8 |
| SAD_X3_1x8P FENC_STRIDE, r4 |
| SAD_X3_1x8P FENC_STRIDE+8, r4+8 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X3_2x8P 1 |
| %if %1 |
| SAD_X3_START_1x8P |
| %else |
| SAD_X3_1x8P 0, 0 |
| %endif |
| SAD_X3_1x8P FENC_STRIDE, r4 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X3_2x4P 1 |
| %if %1 |
| SAD_X3_START_2x4P mm0, mm1, mm2 |
| %else |
| SAD_X3_START_2x4P mm4, mm5, mm6 |
| paddw mm0, mm4 |
| paddw mm1, mm5 |
| paddw mm2, mm6 |
| %endif |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X4_START_1x8P 0 |
| movq mm7, [r0] |
| movq mm0, [r1] |
| movq mm1, [r2] |
| movq mm2, [r3] |
| movq mm3, [r4] |
| psadbw mm0, mm7 |
| psadbw mm1, mm7 |
| psadbw mm2, mm7 |
| psadbw mm3, mm7 |
| %endmacro |
| |
| %macro SAD_X4_1x8P 2 |
| movq mm7, [r0+%1] |
| movq mm4, [r1+%2] |
| movq mm5, [r2+%2] |
| movq mm6, [r3+%2] |
| psadbw mm4, mm7 |
| psadbw mm5, mm7 |
| psadbw mm6, mm7 |
| psadbw mm7, [r4+%2] |
| paddw mm0, mm4 |
| paddw mm1, mm5 |
| paddw mm2, mm6 |
| paddw mm3, mm7 |
| %endmacro |
| |
| %macro SAD_X4_START_2x4P 0 |
| movd mm7, [r0] |
| movd mm0, [r1] |
| movd mm1, [r2] |
| movd mm2, [r3] |
| movd mm3, [r4] |
| punpckldq mm7, [r0+FENC_STRIDE] |
| punpckldq mm0, [r1+r5] |
| punpckldq mm1, [r2+r5] |
| punpckldq mm2, [r3+r5] |
| punpckldq mm3, [r4+r5] |
| psadbw mm0, mm7 |
| psadbw mm1, mm7 |
| psadbw mm2, mm7 |
| psadbw mm3, mm7 |
| %endmacro |
| |
| %macro SAD_X4_INC_2x4P 0 |
| movd mm7, [r0] |
| movd mm4, [r1] |
| movd mm5, [r2] |
| punpckldq mm7, [r0+FENC_STRIDE] |
| punpckldq mm4, [r1+r5] |
| punpckldq mm5, [r2+r5] |
| psadbw mm4, mm7 |
| psadbw mm5, mm7 |
| paddw mm0, mm4 |
| paddw mm1, mm5 |
| movd mm4, [r3] |
| movd mm5, [r4] |
| punpckldq mm4, [r3+r5] |
| punpckldq mm5, [r4+r5] |
| psadbw mm4, mm7 |
| psadbw mm5, mm7 |
| paddw mm2, mm4 |
| paddw mm3, mm5 |
| %endmacro |
| |
| %macro SAD_X4_2x16P 1 |
| %if %1 |
| SAD_X4_START_1x8P |
| %else |
| SAD_X4_1x8P 0, 0 |
| %endif |
| SAD_X4_1x8P 8, 8 |
| SAD_X4_1x8P FENC_STRIDE, r5 |
| SAD_X4_1x8P FENC_STRIDE+8, r5+8 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
| |
| %macro SAD_X4_2x8P 1 |
| %if %1 |
| SAD_X4_START_1x8P |
| %else |
| SAD_X4_1x8P 0, 0 |
| %endif |
| SAD_X4_1x8P FENC_STRIDE, r5 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
| |
| %macro SAD_X4_2x4P 1 |
| %if %1 |
| SAD_X4_START_2x4P |
| %else |
| SAD_X4_INC_2x4P |
| %endif |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
| |
| %macro SAD_X3_END 0 |
| %ifdef ARCH_X86_64 |
| movd [r5+0], mm0 |
| movd [r5+4], mm1 |
| movd [r5+8], mm2 |
| %else |
| mov r0, r5m |
| movd [r0+0], mm0 |
| movd [r0+4], mm1 |
| movd [r0+8], mm2 |
| %endif |
| RET |
| %endmacro |
| |
| %macro SAD_X4_END 0 |
| mov r0, r6m |
| movd [r0+0], mm0 |
| movd [r0+4], mm1 |
| movd [r0+8], mm2 |
| movd [r0+12], mm3 |
| RET |
| %endmacro |
| |
| ;----------------------------------------------------------------------------- |
| ; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, |
| ; uint8_t *pix2, int i_stride, int scores[3] ) |
| ;----------------------------------------------------------------------------- |
| %macro SAD_X 3 |
| cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2 |
| SAD_X%1_2x%2P 1 |
| %rep %3/2-1 |
| SAD_X%1_2x%2P 0 |
| %endrep |
| SAD_X%1_END |
| %endmacro |
| |
| SAD_X 3, 16, 16 |
| SAD_X 3, 16, 8 |
| SAD_X 3, 8, 16 |
| SAD_X 3, 8, 8 |
| SAD_X 3, 8, 4 |
| SAD_X 3, 4, 8 |
| SAD_X 3, 4, 4 |
| SAD_X 4, 16, 16 |
| SAD_X 4, 16, 8 |
| SAD_X 4, 8, 16 |
| SAD_X 4, 8, 8 |
| SAD_X 4, 8, 4 |
| SAD_X 4, 4, 8 |
| SAD_X 4, 4, 4 |
| |
| |
| |
| ;============================================================================= |
| ; SAD x3/x4 XMM |
| ;============================================================================= |
| |
| %macro SAD_X3_START_1x16P_SSE2 0 |
| movdqa xmm3, [r0] |
| movdqu xmm0, [r1] |
| movdqu xmm1, [r2] |
| movdqu xmm2, [r3] |
| psadbw xmm0, xmm3 |
| psadbw xmm1, xmm3 |
| psadbw xmm2, xmm3 |
| %endmacro |
| |
| %macro SAD_X3_1x16P_SSE2 2 |
| movdqa xmm3, [r0+%1] |
| movdqu xmm4, [r1+%2] |
| movdqu xmm5, [r2+%2] |
| movdqu xmm6, [r3+%2] |
| psadbw xmm4, xmm3 |
| psadbw xmm5, xmm3 |
| psadbw xmm6, xmm3 |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm6 |
| %endmacro |
| |
| %macro SAD_X3_2x16P_SSE2 1 |
| %if %1 |
| SAD_X3_START_1x16P_SSE2 |
| %else |
| SAD_X3_1x16P_SSE2 0, 0 |
| %endif |
| SAD_X3_1x16P_SSE2 FENC_STRIDE, r4 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X3_START_2x8P_SSE2 0 |
| movq xmm7, [r0] |
| movq xmm0, [r1] |
| movq xmm1, [r2] |
| movq xmm2, [r3] |
| movhps xmm7, [r0+FENC_STRIDE] |
| movhps xmm0, [r1+r4] |
| movhps xmm1, [r2+r4] |
| movhps xmm2, [r3+r4] |
| psadbw xmm0, xmm7 |
| psadbw xmm1, xmm7 |
| psadbw xmm2, xmm7 |
| %endmacro |
| |
| %macro SAD_X3_2x8P_SSE2 0 |
| movq xmm7, [r0] |
| movq xmm3, [r1] |
| movq xmm4, [r2] |
| movq xmm5, [r3] |
| movhps xmm7, [r0+FENC_STRIDE] |
| movhps xmm3, [r1+r4] |
| movhps xmm4, [r2+r4] |
| movhps xmm5, [r3+r4] |
| psadbw xmm3, xmm7 |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| paddw xmm0, xmm3 |
| paddw xmm1, xmm4 |
| paddw xmm2, xmm5 |
| %endmacro |
| |
| %macro SAD_X4_START_2x8P_SSE2 0 |
| movq xmm7, [r0] |
| movq xmm0, [r1] |
| movq xmm1, [r2] |
| movq xmm2, [r3] |
| movq xmm3, [r4] |
| movhps xmm7, [r0+FENC_STRIDE] |
| movhps xmm0, [r1+r5] |
| movhps xmm1, [r2+r5] |
| movhps xmm2, [r3+r5] |
| movhps xmm3, [r4+r5] |
| psadbw xmm0, xmm7 |
| psadbw xmm1, xmm7 |
| psadbw xmm2, xmm7 |
| psadbw xmm3, xmm7 |
| %endmacro |
| |
| %macro SAD_X4_2x8P_SSE2 0 |
| movq xmm7, [r0] |
| movq xmm4, [r1] |
| movq xmm5, [r2] |
| %ifdef ARCH_X86_64 |
| movq xmm6, [r3] |
| movq xmm8, [r4] |
| movhps xmm7, [r0+FENC_STRIDE] |
| movhps xmm4, [r1+r5] |
| movhps xmm5, [r2+r5] |
| movhps xmm6, [r3+r5] |
| movhps xmm8, [r4+r5] |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| psadbw xmm6, xmm7 |
| psadbw xmm8, xmm7 |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm6 |
| paddw xmm3, xmm8 |
| %else |
| movhps xmm7, [r0+FENC_STRIDE] |
| movhps xmm4, [r1+r5] |
| movhps xmm5, [r2+r5] |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| movq xmm6, [r3] |
| movq xmm4, [r4] |
| movhps xmm6, [r3+r5] |
| movhps xmm4, [r4+r5] |
| psadbw xmm6, xmm7 |
| psadbw xmm4, xmm7 |
| paddw xmm2, xmm6 |
| paddw xmm3, xmm4 |
| %endif |
| %endmacro |
| |
| %macro SAD_X4_START_1x16P_SSE2 0 |
| movdqa xmm7, [r0] |
| movdqu xmm0, [r1] |
| movdqu xmm1, [r2] |
| movdqu xmm2, [r3] |
| movdqu xmm3, [r4] |
| psadbw xmm0, xmm7 |
| psadbw xmm1, xmm7 |
| psadbw xmm2, xmm7 |
| psadbw xmm3, xmm7 |
| %endmacro |
| |
| %macro SAD_X4_1x16P_SSE2 2 |
| movdqa xmm7, [r0+%1] |
| movdqu xmm4, [r1+%2] |
| movdqu xmm5, [r2+%2] |
| movdqu xmm6, [r3+%2] |
| %ifdef ARCH_X86_64 |
| movdqu xmm8, [r4+%2] |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| psadbw xmm6, xmm7 |
| psadbw xmm8, xmm7 |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm6 |
| paddw xmm3, xmm8 |
| %else |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| paddw xmm0, xmm4 |
| psadbw xmm6, xmm7 |
| movdqu xmm4, [r4+%2] |
| paddw xmm1, xmm5 |
| psadbw xmm4, xmm7 |
| paddw xmm2, xmm6 |
| paddw xmm3, xmm4 |
| %endif |
| %endmacro |
| |
| %macro SAD_X4_2x16P_SSE2 1 |
| %if %1 |
| SAD_X4_START_1x16P_SSE2 |
| %else |
| SAD_X4_1x16P_SSE2 0, 0 |
| %endif |
| SAD_X4_1x16P_SSE2 FENC_STRIDE, r5 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
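; note: nasm overloads macros by parameter count, so these 1-argument
; SAD_X3/X4_2x8P_SSE2 wrappers coexist with the 0-argument row macros of the
; same name defined above.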
| |
| %macro SAD_X3_2x8P_SSE2 1 |
| %if %1 |
| SAD_X3_START_2x8P_SSE2 |
| %else |
| SAD_X3_2x8P_SSE2 |
| %endif |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X4_2x8P_SSE2 1 |
| %if %1 |
| SAD_X4_START_2x8P_SSE2 |
| %else |
| SAD_X4_2x8P_SSE2 |
| %endif |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
| |
| %macro SAD_X3_END_SSE2 0 |
| movhlps xmm4, xmm0 |
| movhlps xmm5, xmm1 |
| movhlps xmm6, xmm2 |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm6 |
| %ifdef ARCH_X86_64 |
| movd [r5+0], xmm0 |
| movd [r5+4], xmm1 |
| movd [r5+8], xmm2 |
| %else |
| mov r0, r5m |
| movd [r0+0], xmm0 |
| movd [r0+4], xmm1 |
| movd [r0+8], xmm2 |
| %endif |
| RET |
| %endmacro |
| |
| %macro SAD_X4_END_SSE2 0 |
| mov r0, r6m |
| psllq xmm1, 32 |
| psllq xmm3, 32 |
| paddw xmm0, xmm1 |
| paddw xmm2, xmm3 |
| movhlps xmm1, xmm0 |
| movhlps xmm3, xmm2 |
| paddw xmm0, xmm1 |
| paddw xmm2, xmm3 |
| movq [r0+0], xmm0 |
| movq [r0+8], xmm2 |
| RET |
| %endmacro |
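; The _misalign variants SAD the last reference in each group directly from
; memory via psadbw's memory operand instead of a movdqu, saving a load and a
; register.  That is only valid on cpus that accept misaligned sse memory
; operands (amd's misaligned-sse mode), which is when these versions are used.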
| |
| %macro SAD_X3_START_1x16P_SSE2_MISALIGN 0 |
| movdqa xmm2, [r0] |
| movdqu xmm0, [r1] |
| movdqu xmm1, [r2] |
| psadbw xmm0, xmm2 |
| psadbw xmm1, xmm2 |
| psadbw xmm2, [r3] |
| %endmacro |
| |
| %macro SAD_X3_1x16P_SSE2_MISALIGN 2 |
| movdqa xmm3, [r0+%1] |
| movdqu xmm4, [r1+%2] |
| movdqu xmm5, [r2+%2] |
| psadbw xmm4, xmm3 |
| psadbw xmm5, xmm3 |
| psadbw xmm3, [r3+%2] |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm3 |
| %endmacro |
| |
| %macro SAD_X4_START_1x16P_SSE2_MISALIGN 0 |
| movdqa xmm3, [r0] |
| movdqu xmm0, [r1] |
| movdqu xmm1, [r2] |
| movdqu xmm2, [r3] |
| psadbw xmm0, xmm3 |
| psadbw xmm1, xmm3 |
| psadbw xmm2, xmm3 |
| psadbw xmm3, [r4] |
| %endmacro |
| |
| %macro SAD_X4_1x16P_SSE2_MISALIGN 2 |
| movdqa xmm7, [r0+%1] |
| movdqu xmm4, [r1+%2] |
| movdqu xmm5, [r2+%2] |
| movdqu xmm6, [r3+%2] |
| psadbw xmm4, xmm7 |
| psadbw xmm5, xmm7 |
| psadbw xmm6, xmm7 |
| psadbw xmm7, [r4+%2] |
| paddw xmm0, xmm4 |
| paddw xmm1, xmm5 |
| paddw xmm2, xmm6 |
| paddw xmm3, xmm7 |
| %endmacro |
| |
| %macro SAD_X3_2x16P_SSE2_MISALIGN 1 |
| %if %1 |
| SAD_X3_START_1x16P_SSE2_MISALIGN |
| %else |
| SAD_X3_1x16P_SSE2_MISALIGN 0, 0 |
| %endif |
| SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r4] |
| lea r2, [r2+2*r4] |
| lea r3, [r3+2*r4] |
| %endmacro |
| |
| %macro SAD_X4_2x16P_SSE2_MISALIGN 1 |
| %if %1 |
| SAD_X4_START_1x16P_SSE2_MISALIGN |
| %else |
| SAD_X4_1x16P_SSE2_MISALIGN 0, 0 |
| %endif |
| SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5 |
| add r0, 2*FENC_STRIDE |
| lea r1, [r1+2*r5] |
| lea r2, [r2+2*r5] |
| lea r3, [r3+2*r5] |
| lea r4, [r4+2*r5] |
| %endmacro |
| |
| ;----------------------------------------------------------------------------- |
| ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, |
| ; uint8_t *pix2, int i_stride, int scores[3] ) |
| ;----------------------------------------------------------------------------- |
| %macro SAD_X_SSE2 4 |
| cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1 |
| SAD_X%1_2x%2P_SSE2 1 |
| %rep %3/2-1 |
| SAD_X%1_2x%2P_SSE2 0 |
| %endrep |
| SAD_X%1_END_SSE2 |
| %endmacro |
| |
| %macro SAD_X_SSE2_MISALIGN 4 |
| cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1 |
| SAD_X%1_2x%2P_SSE2_MISALIGN 1 |
| %rep %3/2-1 |
| SAD_X%1_2x%2P_SSE2_MISALIGN 0 |
| %endrep |
| SAD_X%1_END_SSE2 |
| %endmacro |
| |
| SAD_X_SSE2 3, 16, 16, sse2 |
| SAD_X_SSE2 3, 16, 8, sse2 |
| SAD_X_SSE2 3, 8, 16, sse2 |
| SAD_X_SSE2 3, 8, 8, sse2 |
| SAD_X_SSE2 3, 8, 4, sse2 |
| SAD_X_SSE2 4, 16, 16, sse2 |
| SAD_X_SSE2 4, 16, 8, sse2 |
| SAD_X_SSE2 4, 8, 16, sse2 |
| SAD_X_SSE2 4, 8, 8, sse2 |
| SAD_X_SSE2 4, 8, 4, sse2 |
| |
| SAD_X_SSE2_MISALIGN 3, 16, 16, sse2 |
| SAD_X_SSE2_MISALIGN 3, 16, 8, sse2 |
| SAD_X_SSE2_MISALIGN 4, 16, 16, sse2 |
| SAD_X_SSE2_MISALIGN 4, 16, 8, sse2 |
| |
%define movdqu lddqu ; sse3: lddqu in place of movdqu for the unaligned loads
| SAD_X_SSE2 3, 16, 16, sse3 |
| SAD_X_SSE2 3, 16, 8, sse3 |
| SAD_X_SSE2 4, 16, 16, sse3 |
| SAD_X_SSE2 4, 16, 8, sse3 |
| %undef movdqu |
| |
| |
| |
| ;============================================================================= |
| ; SAD cacheline split |
| ;============================================================================= |
| |
| ; Core2 (Conroe) can load unaligned data just as quickly as aligned data... |
| ; unless the unaligned data spans the border between 2 cachelines, in which |
; case it's really slow. The exact numbers may differ, but all Intel CPUs prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok though.)
| ; LDDQU was supposed to fix this, but it only works on Pentium 4. |
| ; So in the split case we load aligned data and explicitly perform the |
| ; alignment between registers. Like on archs that have only aligned loads, |
| ; except complicated by the fact that PALIGNR takes only an immediate, not |
| ; a variable alignment. |
| ; It is also possible to hoist the realignment to the macroblock level (keep |
| ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory |
; needed for that method often makes it slower.
| |
| ; sad 16x16 costs on Core2: |
| ; good offsets: 49 cycles (50/64 of all mvs) |
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
| |
| ; computed jump assumes this loop is exactly 80 bytes |
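; (each iteration rebuilds two misaligned rows from their aligned 16-byte
; halves: the low half is shifted right by the misalignment, the high half
; left by 16 minus the misalignment, and the two are or'd together)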
| %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment |
| ALIGN 16 |
| sad_w16_align%1_sse2: |
| movdqa xmm1, [r2+16] |
| movdqa xmm2, [r2+r3+16] |
| movdqa xmm3, [r2] |
| movdqa xmm4, [r2+r3] |
| pslldq xmm1, 16-%1 |
| pslldq xmm2, 16-%1 |
| psrldq xmm3, %1 |
| psrldq xmm4, %1 |
| por xmm1, xmm3 |
| por xmm2, xmm4 |
| psadbw xmm1, [r0] |
| psadbw xmm2, [r0+r1] |
| paddw xmm0, xmm1 |
| paddw xmm0, xmm2 |
| lea r0, [r0+2*r1] |
| lea r2, [r2+2*r3] |
| dec r4 |
| jg sad_w16_align%1_sse2 |
| ret |
| %endmacro |
| |
| ; computed jump assumes this loop is exactly 64 bytes |
| %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment |
| ALIGN 16 |
| sad_w16_align%1_ssse3: |
| movdqa xmm1, [r2+16] |
| movdqa xmm2, [r2+r3+16] |
| palignr xmm1, [r2], %1 |
| palignr xmm2, [r2+r3], %1 |
| psadbw xmm1, [r0] |
| psadbw xmm2, [r0+r1] |
| paddw xmm0, xmm1 |
| paddw xmm0, xmm2 |
| lea r0, [r0+2*r1] |
| lea r2, [r2+2*r3] |
| dec r4 |
| jg sad_w16_align%1_ssse3 |
| ret |
| %endmacro |
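; The cache64 wrappers first check whether a 16-byte load at the given ref
; offset can straddle a 64-byte line at all; if not, they tail-jump to the
; plain sse2 function.  Otherwise r2 is rounded down to 16-byte alignment and
; the low 4 bits of the original pointer select one of the 15 fixed-shift
; loops above through a computed jump (r4d = misalignment * loop size);
; sad_w16_addr extrapolates the address of the nonexistent align0 loop from
; the spacing between align1 and align2.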
| |
| %macro SAD16_CACHELINE_FUNC 2 ; cpu, height |
| cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0 |
| mov eax, r2m |
| and eax, 0x37 |
| cmp eax, 0x30 |
| jle x264_pixel_sad_16x%2_sse2 |
| PROLOGUE 4,6 |
| mov r4d, r2d |
| and r4d, 15 |
| %ifidn %1, ssse3 |
| shl r4d, 6 ; code size = 64 |
| %else |
| lea r4, [r4*5] |
| shl r4d, 4 ; code size = 80 |
| %endif |
| %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1)) |
| %ifdef PIC |
| lea r5, [sad_w16_addr GLOBAL] |
| add r5, r4 |
| %else |
| lea r5, [sad_w16_addr + r4 GLOBAL] |
| %endif |
| and r2, ~15 |
| mov r4d, %2/2 |
| pxor xmm0, xmm0 |
| call r5 |
| movhlps xmm1, xmm0 |
| paddw xmm0, xmm1 |
| movd eax, xmm0 |
| RET |
| %endmacro |
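; For the mmxext cacheline-split kernels, SAD_CACHELINE_START_MMX2 leaves the
; misalignment in bits in mm7 and (64 - that) in mm6; each row is then rebuilt
; from two aligned 8-byte loads with psrlq/psllq/por before the psadbw.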
| |
| %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline |
| mov eax, r2m |
| and eax, 0x17|%1|(%4>>1) |
| cmp eax, 0x10|%1|(%4>>1) |
| jle x264_pixel_sad_%1x%2_mmxext |
| and eax, 7 |
| shl eax, 3 |
| movd mm6, [sw_64 GLOBAL] |
| movd mm7, eax |
| psubw mm6, mm7 |
| PROLOGUE 4,5 |
| and r2, ~7 |
| mov r4d, %3 |
| pxor mm0, mm0 |
| %endmacro |
| |
| %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline |
| cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0 |
| SAD_CACHELINE_START_MMX2 16, %1, %1, %2 |
| .loop: |
| movq mm1, [r2] |
| movq mm2, [r2+8] |
| movq mm3, [r2+16] |
| movq mm4, mm2 |
| psrlq mm1, mm7 |
| psllq mm2, mm6 |
| psllq mm3, mm6 |
| psrlq mm4, mm7 |
| por mm1, mm2 |
| por mm3, mm4 |
| psadbw mm1, [r0] |
| psadbw mm3, [r0+8] |
| paddw mm0, mm1 |
| paddw mm0, mm3 |
| add r2, r3 |
| add r0, r1 |
| dec r4 |
| jg .loop |
| movd eax, mm0 |
| RET |
| %endmacro |
| |
| %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline |
| cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0 |
| SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 |
| .loop: |
| movq mm1, [r2+8] |
| movq mm2, [r2+r3+8] |
| movq mm3, [r2] |
| movq mm4, [r2+r3] |
| psllq mm1, mm6 |
| psllq mm2, mm6 |
| psrlq mm3, mm7 |
| psrlq mm4, mm7 |
| por mm1, mm3 |
| por mm2, mm4 |
| psadbw mm1, [r0] |
| psadbw mm2, [r0+r1] |
| paddw mm0, mm1 |
| paddw mm0, mm2 |
| lea r2, [r2+2*r3] |
| lea r0, [r0+2*r1] |
| dec r4 |
| jg .loop |
| movd eax, mm0 |
| RET |
| %endmacro |
| |
| ; sad_x3/x4_cache64: check each mv. |
| ; if they're all within a cacheline, use normal sad_x3/x4. |
| ; otherwise, send them individually to sad_cache64. |
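; CHECK_SPLIT branches to .split when a width-byte load at the given mv offset
; would run past the end of a cacheline (the masks keep the 16-byte load that
; is split exactly in half, noted above as cheap, on the fast path).  The
; split path makes one call per reference to the single-ref cacheline sad,
; passing fenc (stride FENC_STRIDE) as the first pixel/stride pair and the
; reference as the second.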
| %macro CHECK_SPLIT 3 ; pix, width, cacheline |
| mov eax, %1 |
| and eax, 0x17|%2|(%3>>1) |
| cmp eax, 0x10|%2|(%3>>1) |
| jg .split |
| %endmacro |
| |
| %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver |
| cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0 |
| CHECK_SPLIT r1m, %1, %3 |
| CHECK_SPLIT r2m, %1, %3 |
| CHECK_SPLIT r3m, %1, %3 |
| jmp x264_pixel_sad_x3_%1x%2_%4 |
| .split: |
| %ifdef ARCH_X86_64 |
| push r3 |
| push r2 |
| mov r2, r1 |
| mov r1, FENC_STRIDE |
| mov r3, r4 |
| mov r10, r0 |
| mov r11, r5 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11], eax |
| pop r2 |
| mov r0, r10 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11+4], eax |
| pop r2 |
| mov r0, r10 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11+8], eax |
| %else |
| push edi |
| mov edi, [esp+28] |
| push dword [esp+24] |
| push dword [esp+16] |
| push dword 16 |
| push dword [esp+20] |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov ecx, [esp+32] |
| mov [edi], eax |
| mov [esp+8], ecx |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov ecx, [esp+36] |
| mov [edi+4], eax |
| mov [esp+8], ecx |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [edi+8], eax |
| add esp, 16 |
| pop edi |
| %endif |
| ret |
| %endmacro |
| |
| %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver |
| cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 |
| CHECK_SPLIT r1m, %1, %3 |
| CHECK_SPLIT r2m, %1, %3 |
| CHECK_SPLIT r3m, %1, %3 |
| CHECK_SPLIT r4m, %1, %3 |
| jmp x264_pixel_sad_x4_%1x%2_%4 |
| .split: |
| %ifdef ARCH_X86_64 |
| mov r11, r6m |
| push r4 |
| push r3 |
| push r2 |
| mov r2, r1 |
| mov r1, FENC_STRIDE |
| mov r3, r5 |
| mov r10, r0 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11], eax |
| pop r2 |
| mov r0, r10 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11+4], eax |
| pop r2 |
| mov r0, r10 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11+8], eax |
| pop r2 |
| mov r0, r10 |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [r11+12], eax |
| %else |
| push edi |
| mov edi, [esp+32] |
| push dword [esp+28] |
| push dword [esp+16] |
| push dword 16 |
| push dword [esp+20] |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov ecx, [esp+32] |
| mov [edi], eax |
| mov [esp+8], ecx |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov ecx, [esp+36] |
| mov [edi+4], eax |
| mov [esp+8], ecx |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov ecx, [esp+40] |
| mov [edi+8], eax |
| mov [esp+8], ecx |
| call x264_pixel_sad_%1x%2_cache%3_%5 |
| mov [edi+12], eax |
| add esp, 16 |
| pop edi |
| %endif |
| ret |
| %endmacro |
| |
| %macro SADX34_CACHELINE_FUNC 5 |
| SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5 |
| SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5 |
| %endmacro |
| |
| |
; instantiate the cacheline-split sads (they use only aligned loads internally)
| |
| %ifndef ARCH_X86_64 |
| SAD16_CACHELINE_FUNC_MMX2 8, 32 |
| SAD16_CACHELINE_FUNC_MMX2 16, 32 |
| SAD8_CACHELINE_FUNC_MMX2 4, 32 |
| SAD8_CACHELINE_FUNC_MMX2 8, 32 |
| SAD8_CACHELINE_FUNC_MMX2 16, 32 |
| SAD16_CACHELINE_FUNC_MMX2 8, 64 |
| SAD16_CACHELINE_FUNC_MMX2 16, 64 |
| %endif ; !ARCH_X86_64 |
| SAD8_CACHELINE_FUNC_MMX2 4, 64 |
| SAD8_CACHELINE_FUNC_MMX2 8, 64 |
| SAD8_CACHELINE_FUNC_MMX2 16, 64 |
| |
| %ifndef ARCH_X86_64 |
| SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext |
| %endif ; !ARCH_X86_64 |
| SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext |
| SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext |
| |
| %ifndef ARCH_X86_64 |
| SAD16_CACHELINE_FUNC sse2, 8 |
| SAD16_CACHELINE_FUNC sse2, 16 |
| %assign i 1 |
| %rep 15 |
| SAD16_CACHELINE_LOOP_SSE2 i |
| %assign i i+1 |
| %endrep |
| SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2 |
| SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2 |
| %endif ; !ARCH_X86_64 |
| |
| SAD16_CACHELINE_FUNC ssse3, 8 |
| SAD16_CACHELINE_FUNC ssse3, 16 |
| %assign i 1 |
| %rep 15 |
| SAD16_CACHELINE_LOOP_SSSE3 i |
| %assign i i+1 |
| %endrep |
| SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3 |
| SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3 |