;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
pw_64: times 8 dw 64
sw_64: dd 64

SECTION .text

;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
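; In C terms the kernels below compute, roughly (reference sketch only):
;   dst[x] = clip_uint8( ( src1[x]*i_weight + src2[x]*(64-i_weight) + 32 ) >> 6 );
; with i_weight taken from r6m and the final clip done by packuswb.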
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
%define t2 r2
%define t3 r3
%define t4 r4
%define t5 r5
%define t6d r10d
%define t7d r11d
%macro AVG_START 0
PROLOGUE 6,7
.height_loop:
%endmacro
%else
%define t0 r1
%define t1 r2
%define t2 r3
%define t3 r4
%define t4 r5
%define t5 r6
%define t6d r1d
%define t7d r2d
%macro AVG_START 0
PROLOGUE 0,7
mov t0, r0m
mov t1, r1m
mov t2, r2m
mov t3, r3m
mov t4, r4m
mov t5, r5m
.height_loop:
%endmacro
%endif

%macro SPLATW 2
%if mmsize==16
pshuflw %1, %2, 0
punpcklqdq %1, %1
%else
pshufw %1, %2, 0
%endif
%endmacro

%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m5
paddw m0, m1
paddw m0, m6
psraw m0, 6
%endmacro

%macro BIWEIGHT_START_MMX 0
movd m4, r6m
SPLATW m4, m4 ; weight_dst
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
%endmacro
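
; SSSE3 variant: each word of m5 holds the byte pair (weight, 64-weight), so
; after punpcklbw interleaves src1/src2 bytes, a single pmaddubsw produces
; src1[x]*weight + src2[x]*(64-weight) per pixel.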

%macro BIWEIGHT_SSSE3 2
movh m0, %1
movh m1, %2
punpcklbw m0, m1
pmaddubsw m0, m5
paddw m0, m6
psraw m0, 6
%endmacro

%macro BIWEIGHT_START_SSSE3 0
movzx t6d, byte r6m ; FIXME x86_64
mov t7d, 64
sub t7d, t6d
shl t7d, 8
add t6d, t7d
movd m5, t6d
mova m6, [pw_32 GLOBAL]
SPLATW m5, m5 ; weight_dst,src
%endmacro

%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/2
packuswb m0, m0
movh [%1], m0
%else
SWAP 0, 2
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
packuswb m2, m0
mova [%1], m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
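; Note: these kernels are reached by a tail jump from the AVGH wrappers below,
; which pass the row count in eax rather than on the stack.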
%macro AVG_WEIGHT 2
cglobal x264_pixel_avg_weight_w%2_%1, 0,0
BIWEIGHT_START
AVG_START
%if %2==8 && mmsize==16
BIWEIGHT [t2], [t4]
SWAP 0, 2
BIWEIGHT [t2+t3], [t4+t5]
packuswb m2, m0
movlps [t0], m2
movhps [t0+t1], m2
%else
%assign x 0
%rep 1+%2/(mmsize*2)
BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%assign x x+mmsize
%endrep
%endif
lea t0, [t0+t1*2]
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
sub eax, 2
jg .height_loop
REP_RET
%endmacro

%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX
AVG_WEIGHT mmxext, 4
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8
AVG_WEIGHT sse2, 16
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX
AVG_WEIGHT ssse3, 4
INIT_XMM
AVG_WEIGHT ssse3, 8
AVG_WEIGHT ssse3, 16



;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
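; Dispatch: a weight of 32 collapses the weighted sum to the plain average
; (a+b+1)>>1, which pavgb computes directly; any other weight falls through to
; the pixel_avg_weight kernels above. For the 16-wide SSE2/SSSE3 variants, the
; SSE2 kernel is used only when src2 (r4m) is 16-byte aligned.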
%macro AVGH 3
cglobal x264_pixel_avg_%1x%2_%3,0,0
mov eax, %2
cmp dword r6m, 32
jne x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz x264_pixel_avg_w%1_sse2
%endif
jmp x264_pixel_avg_w%1_mmxext
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
; int height, int weight );
;-----------------------------------------------------------------------------
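; (The height counter is carried in eax, set by the AVGH wrappers; each loop
; iteration processes two rows.)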

%macro AVG_END 0
sub eax, 2
lea t4, [t4+t5*2]
lea t2, [t2+t3*2]
lea t0, [t0+t1*2]
jg .height_loop
REP_RET
%endmacro

%macro AVG_FUNC 3
cglobal %1
AVG_START
%2 m0, [t2]
%2 m1, [t2+t3]
pavgb m0, [t4]
pavgb m1, [t4+t5]
%3 [t0], m0
%3 [t0+t1], m1
AVG_END
%endmacro

INIT_MMX
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext

AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext

cglobal x264_pixel_avg_w16_mmxext
AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
movq mm3, [t2+t3+8]
pavgb mm0, [t4 ]
pavgb mm1, [t4+8]
pavgb mm2, [t4+t5 ]
pavgb mm3, [t4+t5+8]
movq [t0 ], mm0
movq [t0+8], mm1
movq [t0+t1 ], mm2
movq [t0+t1+8], mm3
AVG_END

AVGH 16, 16, mmxext
AVGH 16, 8, mmxext

INIT_XMM
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
AVGH 8, 8, sse2
AVGH 8, 4, sse2
AVGH 16, 16, ssse3
AVGH 16, 8, ssse3
AVGH 8, 16, ssse3
AVGH 8, 8, ssse3
AVGH 8, 4, ssse3
INIT_MMX
AVGH 4, 8, ssse3
AVGH 4, 4, ssse3
AVGH 4, 2, ssse3



;=============================================================================
; pixel avg2
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src_stride,
; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
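; avg2 averages two blocks that share a stride (used for halfpel interpolation):
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1   (done with pavgb)
; After "sub r4, r2", src2 is addressed as [r2+r4], so advancing r2 walks both
; sources with a single pointer.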
%macro AVG2_W8 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
%2 mm0, [r2]
%2 mm1, [r2+r3]
pavgb mm0, [r2+r4]
pavgb mm1, [r2+r6]
%2 [r0], mm0
%2 [r0+r1], mm1
sub r5d, 2
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
jg .height_loop
REP_RET
%endmacro

AVG2_W8 4, movd
AVG2_W8 8, movq

%macro AVG2_W16 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movq mm0, [r2]
%2 mm1, [r2+8]
movq mm2, [r2+r3]
%2 mm3, [r2+r3+8]
pavgb mm0, [r2+r4]
pavgb mm1, [r2+r4+8]
pavgb mm2, [r2+r6]
pavgb mm3, [r2+r6+8]
movq [r0], mm0
%2 [r0+8], mm1
movq [r0+r1], mm2
%2 [r0+r1+8], mm3
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
%endmacro

AVG2_W16 12, movd
AVG2_W16 16, movq

cglobal x264_pixel_avg2_w20_mmxext, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movq mm0, [r2]
movq mm1, [r2+8]
movd mm2, [r2+16]
movq mm3, [r2+r3]
movq mm4, [r2+r3+8]
movd mm5, [r2+r3+16]
pavgb mm0, [r2+r4]
pavgb mm1, [r2+r4+8]
pavgb mm2, [r2+r4+16]
pavgb mm3, [r2+r6]
pavgb mm4, [r2+r6+8]
pavgb mm5, [r2+r6+16]
movq [r0], mm0
movq [r0+8], mm1
movd [r0+16], mm2
movq [r0+r1], mm3
movq [r0+r1+8], mm4
movd [r0+r1+16], mm5
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET

cglobal x264_pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
movdqu xmm1, [r2+r4]
movdqu xmm3, [r2+r6]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
movdqa [r0], xmm0
movdqa [r0+r1], xmm2
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET

%macro AVG2_W20 1
cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
movd mm4, [r2+16]
movd mm5, [r2+r3+16]
%ifidn %1, sse2_misalign
pavgb xmm0, [r2+r4]
pavgb xmm2, [r2+r6]
%else
movdqu xmm1, [r2+r4]
movdqu xmm3, [r2+r6]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
%endif
pavgb mm4, [r2+r4+16]
pavgb mm5, [r2+r6+16]
movdqa [r0], xmm0
movd [r0+16], mm4
movdqa [r0+r1], xmm2
movd [r0+r1+16], mm5
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
REP_RET
%endmacro

AVG2_W20 sse2
AVG2_W20 sse2_misalign

; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
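; The misaligned load is emulated from two aligned loads:
;   x = ( lo >> 8*(a&7) ) | ( hi << (64 - 8*(a&7)) )
; where lo/hi are the aligned qwords straddled by address a; INIT_SHIFT
; precomputes both shift counts (one pair per source).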

%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
movd %1, [sw_64 GLOBAL]
movd %2, eax
psubw %1, %2
%endmacro

%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
mov eax, r2m
and eax, 0x1f|(%2>>1)
cmp eax, (32-%1)|(%2>>1)
jle x264_pixel_avg2_w%1_%3
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
jmp x264_pixel_avg2_w16_cache_mmxext
%else
jmp x264_pixel_avg2_w%1_cache_mmxext
%endif
%endmacro

%macro AVG_CACHELINE_START 0
%assign stack_offset 0
INIT_SHIFT mm6, mm7
mov eax, r4m
INIT_SHIFT mm4, mm5
PROLOGUE 6,6
and r2, ~7
and r4, ~7
sub r4, r2
.height_loop:
%endmacro

%macro AVG_CACHELINE_LOOP 2
movq mm0, [r2+8+%1]
movq mm1, [r2+%1]
movq mm2, [r2+r4+8+%1]
movq mm3, [r2+r4+%1]
psllq mm0, mm6
psrlq mm1, mm7
psllq mm2, mm4
psrlq mm3, mm5
por mm0, mm1
por mm2, mm3
pavgb mm0, mm2
%2 [r0+%1], mm0
%endmacro

x264_pixel_avg2_w8_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET

x264_pixel_avg2_w16_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET

x264_pixel_avg2_w20_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
AVG_CACHELINE_LOOP 16, movd
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET

%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK 8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
%endif

AVG_CACHELINE_CHECK 8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

;=============================================================================
; pixel copy
;=============================================================================

%macro COPY4 4
%2 m0, [r2]
%2 m1, [r2+r3]
%2 m2, [r2+r3*2]
%2 m3, [r2+%4]
%1 [r0], m0
%1 [r0+r1], m1
%1 [r0+r1*2], m2
%1 [r0+%3], m3
%endmacro

INIT_MMX
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx, 4,6
cmp dword r4m, 4
lea r5, [r3*3]
lea r4, [r1*3]
je .end
COPY4 movd, movd, r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
.end:
COPY4 movd, movd, r4, r5
RET

cglobal x264_mc_copy_w8_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
COPY4 movq, movq, r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
REP_RET

cglobal x264_mc_copy_w16_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
movq mm0, [r2]
movq mm1, [r2+8]
movq mm2, [r2+r3]
movq mm3, [r2+r3+8]
movq mm4, [r2+r3*2]
movq mm5, [r2+r3*2+8]
movq mm6, [r2+r6]
movq mm7, [r2+r6+8]
movq [r0], mm0
movq [r0+8], mm1
movq [r0+r1], mm2
movq [r0+r1+8], mm3
movq [r0+r1*2], mm4
movq [r0+r1*2+8], mm5
movq [r0+r5], mm6
movq [r0+r5+8], mm7
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
REP_RET

INIT_XMM
%macro COPY_W16_SSE2 2
cglobal %1, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
COPY4 movdqa, %2, r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
REP_RET
%endmacro

COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; Cacheline split with MMX has too much overhead; the speed benefit is near-zero.
; But with SSE3's lddqu the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa



;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
cglobal x264_prefetch_fenc_mmxext, 5,5
mov eax, r4d
and eax, 3
imul eax, r1d
lea r0, [r0+rax*4+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0]
prefetcht0 [r0+r1]

and r4d, 6
imul r4d, r3d
lea r2, [r2+r4+64]
prefetcht0 [r2]
prefetcht0 [r2+r3]
ret

%else
cglobal x264_prefetch_fenc_mmxext
mov r2, [esp+20]
mov r1, [esp+8]
mov r0, [esp+4]
and r2, 3
imul r2, r1
lea r0, [r0+r2*4+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0]
prefetcht0 [r0+r1]

mov r2, [esp+20]
mov r1, [esp+16]
mov r0, [esp+12]
and r2, 6
imul r2, r1
lea r0, [r0+r2+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext, 3,3
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
prefetcht0 [r0+r1*2]
prefetcht0 [r0+r2]
lea r0, [r0+r1*4]
prefetcht0 [r0]
prefetcht0 [r0+r1]
prefetcht0 [r0+r1*2]
prefetcht0 [r0+r2]
ret



;=============================================================================
; chroma MC
;=============================================================================

%define t0d eax
%define t0 rax
%ifdef ARCH_X86_64
%define t1d r10d
%else
%define t1d r1d
%endif

%macro MC_CHROMA_START 0
movifnidn r2d, r2m
movifnidn r3d, r3m
movifnidn r4d, r4m
movifnidn r5d, r5m
mov t0d, r5d
mov t1d, r4d
sar t0d, 3
sar t1d, 3
imul t0d, r3d
add t0d, t1d
movsxdifnidn t0, t0d
add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro

;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src, int src_stride,
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
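; The 2-D case is the standard H.264 chroma bilinear filter; in C terms:
;   cA = (8-dx)*(8-dy)  cB = dx*(8-dy)  cC = (8-dx)*dy  cD = dx*dy
;   dst[x] = ( cA*src[x] + cB*src[x+1] + cC*src[x+s] + cD*src[x+s+1] + 32 ) >> 6
; (s = src_stride). If dx or dy is 0, it degenerates to the 1-D path
; (.mc1dx / .mc1dy).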
%macro MC_CHROMA 1
cglobal x264_mc_chroma_%1, 0,6
%if mmsize == 16
cmp dword r6m, 4
jle x264_mc_chroma_mmxext %+ .skip_prologue
%endif
.skip_prologue:
MC_CHROMA_START
pxor m3, m3
and r4d, 7 ; dx &= 7
jz .mc1dy
and r5d, 7 ; dy &= 7
jz .mc1dx

movd m5, r4d
movd m6, r5d
SPLATW m5, m5 ; m5 = dx
SPLATW m6, m6 ; m6 = dy

mova m4, [pw_8 GLOBAL]
mova m0, m4
psubw m4, m5 ; m4 = 8-dx
psubw m0, m6 ; m0 = 8-dy

mova m7, m5
pmullw m5, m0 ; m5 = dx*(8-dy) = cB
pmullw m7, m6 ; m7 = dx*dy = cD
pmullw m6, m4 ; m6 = (8-dx)*dy = cC
pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA

mov r4d, r7m
%ifdef ARCH_X86_64
mov r10, r0
mov r11, r2
%else
mov r0, r0m
mov r1, r1m
mov r5, r2
%endif

.loop2d:
movh m1, [r2+r3]
movh m0, [r2]
punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
punpcklbw m0, m3
pmullw m1, m6 ; 2nd line * cC
pmullw m0, m4 ; 1st line * cA
paddw m0, m1 ; m0 <- result

movh m2, [r2+1]
movh m1, [r2+r3+1]
punpcklbw m2, m3
punpcklbw m1, m3

paddw m0, [pw_32 GLOBAL]

pmullw m2, m5 ; line * cB
pmullw m1, m7 ; line * cD
paddw m0, m2
paddw m0, m1
psrlw m0, 6

packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4
movh [r0], m0

add r2, r3
add r0, r1 ; dst_stride
dec r4d
jnz .loop2d

%if mmsize == 8
sub dword r6m, 8
jnz .finish ; width != 8 so assume 4
%ifdef ARCH_X86_64
lea r0, [r10+4] ; dst
lea r2, [r11+4] ; src
%else
mov r0, r0m
lea r2, [r5+4]
add r0, 4
%endif
mov r4d, r7m ; height
jmp .loop2d
%else
REP_RET
%endif ; mmsize
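
; 1-D special case (dx==0 or dy==0):
;   dst[x] = ( (8-d)*src[x] + d*src[x+pel_offset] + 4 ) >> 3
; with pel_offset = 1 for horizontal and src_stride for vertical filtering.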

.mc1dy:
and r5d, 7
movd m6, r5d
mov r5, r3 ; pel_offset = dx ? 1 : src_stride
jmp .mc1d
.mc1dx:
movd m6, r4d
mov r5d, 1
.mc1d:
mova m5, [pw_8 GLOBAL]
SPLATW m6, m6
mova m7, [pw_4 GLOBAL]
psubw m5, m6
movifnidn r0d, r0m
movifnidn r1d, r1m
mov r4d, r7m
%if mmsize == 8
cmp dword r6m, 8
je .loop1d_w8
%endif

.loop1d_w4:
movh m0, [r2+r5]
movh m1, [r2]
punpcklbw m0, m3
punpcklbw m1, m3
pmullw m0, m6
pmullw m1, m5
paddw m0, m7
paddw m0, m1
psrlw m0, 3
packuswb m0, m3
movh [r0], m0
add r2, r3
add r0, r1
dec r4d
jnz .loop1d_w4
.finish:
REP_RET

%if mmsize == 8
.loop1d_w8:
movu m0, [r2+r5]
mova m1, [r2]
mova m2, m0
mova m4, m1
punpcklbw m0, m3
punpcklbw m1, m3
punpckhbw m2, m3
punpckhbw m4, m3
pmullw m0, m6
pmullw m1, m5
pmullw m2, m6
pmullw m4, m5
paddw m0, m7
paddw m2, m7
paddw m0, m1
paddw m2, m4
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
mova [r0], m0
add r2, r3
add r0, r1
dec r4d
jnz .loop1d_w8
REP_RET
%endif ; mmsize
%endmacro ; MC_CHROMA

INIT_MMX
MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2

INIT_MMX
cglobal x264_mc_chroma_ssse3, 0,6
MC_CHROMA_START
and r4d, 7
and r5d, 7
mov t0d, r4d
shl t0d, 8
sub t0d, r4d
mov r4d, 8
add t0d, 8
sub r4d, r5d
imul r5d, t0d ; (x*255+8)*y
imul r4d, t0d ; (x*255+8)*(8-y)
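; x*255+8 == ((8-x) | x<<8), i.e. the byte pair (8-x, x) packed into one word.
; Multiplying that word by y and by (8-y) stays carry-free (each product fits
; in a byte), giving the per-row weights consumed by pmaddubsw on
; (src[x], src[x+1]) byte pairs.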
cmp dword r6m, 4
jg .width8
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
movifnidn r0d, r0m
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
movh m0, [r2]
punpcklbw m0, [r2+1]
add r2, r3
.loop4:
movh m1, [r2]
movh m3, [r2+r3]
punpcklbw m1, [r2+1]
punpcklbw m3, [r2+r3+1]
lea r2, [r2+2*r3]
mova m2, m1
mova m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
mova m0, m4
psrlw m1, 6
psrlw m3, 6
packuswb m1, m1
packuswb m3, m3
movh [r0], m1
movh [r0+r1], m3
sub r4d, 2
lea r0, [r0+2*r1]
jg .loop4
REP_RET

INIT_XMM
.width8:
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
movifnidn r0d, r0m
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
add r2, r3
.loop8:
movh m1, [r2]
movh m2, [r2+1]
movh m3, [r2+r3]
movh m4, [r2+r3+1]
punpcklbw m1, m2
punpcklbw m3, m4
lea r2, [r2+2*r3]
mova m2, m1
mova m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
mova m0, m4
psrlw m1, 6
psrlw m3, 6
packuswb m1, m3
movh [r0], m1
movhps [r0+r1], m1
sub r4d, 2
lea r0, [r0+2*r1]
jg .loop8
REP_RET

; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size