;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pw_1: times 8 dw 1
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
times 16 db 0
mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
SECTION .text
%macro HADDD 2 ; horizontal add of dwords: %1 = sum, %2 = scratch
%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%else
mova %2, %1
psrlq %2, 32
paddd %1, %2
%endif
%endmacro
%macro HADDW 2 ; horizontal add of words: %1 = sum, %2 = scratch
pmaddwd %1, [pw_1 GLOBAL]
HADDD %1, %2
%endmacro
%macro HADDUW 2 ; horizontal add of unsigned words, widened to dwords to avoid overflow
mova %2, %1
pslld %1, 16
psrld %2, 16
psrld %1, 16
paddd %1, %2
HADDD %1, %2
%endmacro
;=============================================================================
; SSD
;=============================================================================
; %1-%4 = load offsets (pix1 row 0, pix2 row 0, pix1 row 1, pix2 row 1)
; %5 = accumulate into m0 (0 on the first iteration), %6 = advance the row pointers
; SSD_HALF and SSD_QUARTER below take the same arguments.
%macro SSD_FULL 6
mova m1, [r0+%1]
mova m2, [r2+%2]
mova m3, [r0+%3]
mova m4, [r2+%4]
mova m5, m2
mova m6, m4
psubusb m2, m1
psubusb m4, m3
psubusb m1, m5
psubusb m3, m6
por m1, m2
por m3, m4
mova m2, m1
mova m4, m3
punpcklbw m1, m7
punpcklbw m3, m7
punpckhbw m2, m7
punpckhbw m4, m7
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
%if %6
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
paddd m1, m2
paddd m3, m4
%if %5
paddd m0, m1
%else
SWAP m0, m1
%endif
paddd m0, m3
%endmacro
%macro SSD_HALF 6
movh m1, [r0+%1]
movh m2, [r2+%2]
movh m3, [r0+%3]
movh m4, [r2+%4]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
psubw m1, m2
psubw m3, m4
pmaddwd m1, m1
pmaddwd m3, m3
%if %6
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
%if %5
paddd m0, m1
%else
SWAP m0, m1
%endif
paddd m0, m3
%endmacro
%macro SSD_QUARTER 6
movd m1, [r0+%1]
movd m2, [r2+%2]
movd m3, [r0+%3]
movd m4, [r2+%4]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
pinsrd m1, [r0+%1], 1
pinsrd m2, [r2+%2], 1
pinsrd m3, [r0+%3], 1
pinsrd m4, [r2+%4], 1
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
psubw m1, m2
psubw m3, m4
pmaddwd m1, m1
pmaddwd m3, m3
%if %6
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
%if %5
paddd m0, m1
%else
SWAP m0, m1
%endif
paddd m0, m3
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
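; What these kernels compute, as a hedged C reference sketch (not taken from
; this project's sources; the names and the w/h parameterization are
; illustrative, assuming <stdint.h>):
;
; static int ssd_wxh( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2,
;                     int w, int h )
; {
;     int ssd = 0;
;     for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < w; x++ )
;         {
;             int d = pix1[x] - pix2[x];
;             ssd += d * d;
;         }
;     return ssd;
; }
;
; The SIMD below gets |a-b| via two saturating byte subtractions plus por,
; then squares and pairwise-sums with pmaddwd.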
%macro SSD 3 ; width, height, cpu name
cglobal x264_pixel_ssd_%1x%2_%3, 4,4
%if %1 >= mmsize
pxor m7, m7
%endif
%assign i 0
%rep %2/2
%if %1 > mmsize
SSD_FULL 0, 0, mmsize, mmsize, i, 0
SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
%elif %1 == mmsize
SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
%else
SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
%endif
%assign i i+1
%endrep
HADDD m0, m1
movd eax, m0
RET
%endmacro
INIT_MMX
SSD 16, 16, mmx
SSD 16, 8, mmx
SSD 8, 16, mmx
SSD 8, 8, mmx
SSD 8, 4, mmx
SSD 4, 8, mmx
SSD 4, 4, mmx
INIT_XMM
SSD 16, 16, sse2
SSD 16, 8, sse2
SSD 8, 16, sse2
SSD 8, 8, sse2
SSD 8, 4, sse2
cglobal x264_pixel_ssd_4x8_sse4, 4,4
SSD_QUARTER 0, 0, r1, r3, 0, 1
SSD_QUARTER 0, 0, r1, r3, 1, 0
HADDD m0, m1
movd eax, m0
RET
cglobal x264_pixel_ssd_4x4_sse4, 4,4
SSD_QUARTER 0, 0, r1, r3, 0, 0
HADDD m0, m1
movd eax, m0
RET
;=============================================================================
; variance
;=============================================================================
%macro VAR_START 0
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
pxor m7, m7 ; zero
%ifdef ARCH_X86_64
%define t3d r3d
%else
%define t3d r2d
%endif
%endmacro
%macro VAR_END 1 ; %1 = log2(pixel count)
%if mmsize == 16
movhlps m0, m5
paddw m5, m0
%endif
movifnidn r2d, r2m
movd r1d, m5
movd [r2], m5 ; return sum
imul r1d, r1d
HADDD m6, m1
shr r1d, %1
movd eax, m6
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
%macro VAR_2ROW 2 ; %1 = second-load offset (8 = other half of a 16-wide row, or r1 = next row), %2 = loop count
mov t3d, %2
.loop:
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
mova m2, m0
punpcklbw m0, m7
mova m4, m3
punpckhbw m1, m7
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
punpckhbw m4, m7
psadbw m2, m7
paddw m5, m2
mova m2, m3
punpcklbw m3, m7
dec t3d
psadbw m2, m7
pmaddwd m0, m0
paddw m5, m2
pmaddwd m1, m1
paddd m6, m0
pmaddwd m3, m3
paddd m6, m1
pmaddwd m4, m4
paddd m6, m3
paddd m6, m4
jg .loop
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
;-----------------------------------------------------------------------------
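; Hedged reference sketch of the variance kernels (illustrative names, not the
; project's C code): they return sum(pix^2) - (sum(pix)^2 >> shift) with
; shift = log2(pixel count), and store sum(pix) through the int* argument.
;
; static int var_wxh( uint8_t *pix, int stride, int *sum_out,
;                     int w, int h, int shift )
; {
;     uint32_t sum = 0, sqr = 0;
;     for( int y = 0; y < h; y++, pix += stride )
;         for( int x = 0; x < w; x++ )
;         {
;             sum += pix[x];
;             sqr += pix[x] * pix[x];
;         }
;     *sum_out = sum;
;     return sqr - (sum * sum >> shift);
; }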
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START
VAR_2ROW 8, 16
VAR_END 8
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START
VAR_2ROW r1, 4
VAR_END 6
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3
VAR_START
VAR_2ROW r1, 8
VAR_END 8
cglobal x264_pixel_var_8x8_sse2, 2,3
VAR_START
mov t3d, 4
.loop:
movh m0, [r0]
movhps m0, [r0+r1]
lea r0, [r0+r1*2]
mova m1, m0
punpcklbw m0, m7
mova m2, m1
punpckhbw m1, m7
dec t3d
pmaddwd m0, m0
pmaddwd m1, m1
psadbw m2, m7
paddw m5, m2
paddd m6, m0
paddd m6, m1
jnz .loop
VAR_END 6
;=============================================================================
; SATD
;=============================================================================
; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; whereas phaddw-based transform doesn't care what order the coefs end up in.
%macro PHSUMSUB 3
movdqa m%3, m%1
phaddw m%1, m%2
phsubw m%3, m%2
SWAP %2, %3
%endmacro
%macro HADAMARD4_ROW_PHADD 5
PHSUMSUB %1, %2, %5
PHSUMSUB %3, %4, %5
PHSUMSUB %1, %3, %5
PHSUMSUB %2, %4, %5
SWAP %3, %4
%endmacro
%macro HADAMARD4_1D 4
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
%endmacro
%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
%xdefine %%n n%1
HADAMARD4_1D m4, m5, m6, m7
TRANSPOSE4x4W 4, 5, 6, 7, %%n
HADAMARD4_1D m4, m5, m6, m7
ABS2 m4, m5, m3, m %+ %%n
ABS2 m6, m7, m3, m %+ %%n
paddw m6, m4
paddw m7, m5
pavgw m6, m7
SWAP %%n, 6
%endmacro
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
HADAMARD4x4_SUM %1
%endmacro
%macro SATD_8x4_SSE2 1
HADAMARD4_1D m0, m1, m2, m3
%ifidn %1, ssse3_phadd
HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
%else
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
HADAMARD4_1D m0, m1, m2, m3
%endif
ABS4 m0, m1, m2, m3, m4, m5
paddusw m0, m1
paddusw m2, m3
paddusw m6, m0
paddusw m6, m2
%endmacro
%macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
%endmacro
%macro SATD_END_MMX 0
pshufw m1, m0, 01001110b
paddw m0, m1
pshufw m1, m0, 10110001b
paddw m0, m1
movd eax, m0
and eax, 0xffff
RET
%endmacro
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
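; Hedged C reference sketch of 4x4 SATD (illustrative, not the project's C
; code; assumes <stdint.h> and abs() from <stdlib.h>): Hadamard-transform the
; difference block in both dimensions, sum the absolute coefficients, halve
; the result. The SIMD versions may differ by a rounding bias because they
; fold the halving into pavgw.
;
; static int satd_4x4( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int tmp[4][4], sum = 0;
;     for( int i = 0; i < 4; i++, pix1 += stride1, pix2 += stride2 )
;     {
;         int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
;         int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
;         int s01 = a0 + a1, d01 = a0 - a1, s23 = a2 + a3, d23 = a2 - a3;
;         tmp[i][0] = s01 + s23; tmp[i][1] = s01 - s23;
;         tmp[i][2] = d01 + d23; tmp[i][3] = d01 - d23;
;     }
;     for( int i = 0; i < 4; i++ )
;     {
;         int s01 = tmp[0][i] + tmp[1][i], d01 = tmp[0][i] - tmp[1][i];
;         int s23 = tmp[2][i] + tmp[3][i], d23 = tmp[2][i] - tmp[3][i];
;         sum += abs(s01 + s23) + abs(s01 - s23)
;              + abs(d01 + d23) + abs(d01 - d23);
;     }
;     return sum >> 1;
; }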
INIT_MMX
cglobal x264_pixel_satd_16x4_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
SATD_4x4_MMX m2, 8, 0
paddw m0, m1
SATD_4x4_MMX m1, 12, 0
paddw m0, m2
paddw m0, m1
ret
cglobal x264_pixel_satd_8x8_internal_mmxext
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 1
paddw m0, m2
paddw m0, m1
x264_pixel_satd_8x4_internal_mmxext:
SATD_4x4_MMX m2, 0, 0
SATD_4x4_MMX m1, 4, 0
paddw m0, m2
paddw m0, m1
ret
cglobal x264_pixel_satd_16x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
%rep 3
call x264_pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endrep
call x264_pixel_satd_16x4_internal_mmxext
HADDUW m0, m1
movd eax, m0
RET
cglobal x264_pixel_satd_16x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
call x264_pixel_satd_16x4_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call x264_pixel_satd_16x4_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x16_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
call x264_pixel_satd_8x8_internal_mmxext
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call x264_pixel_satd_8x8_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x8_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
call x264_pixel_satd_8x8_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_8x4_mmxext, 4,6
SATD_START_MMX
pxor m0, m0
call x264_pixel_satd_8x4_internal_mmxext
SATD_END_MMX
cglobal x264_pixel_satd_4x8_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
%macro SATD_W4 1
INIT_MMX
cglobal x264_pixel_satd_4x4_%1, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
%endmacro
SATD_W4 mmxext
%macro SATD_START_SSE2 0
pxor m6, m6
lea r4, [3*r1]
lea r5, [3*r3]
%endmacro
%macro SATD_END_SSE2 0
psrlw m6, 1
HADDW m6, m7
movd eax, m6
RET
%endmacro
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
mov r10, r0
mov r11, r2
%endif
%endmacro
%macro RESTORE_AND_INC_POINTERS 0
%ifdef ARCH_X86_64
lea r0, [r10+8]
lea r2, [r11+8]
%else
mov r0, r0m
mov r2, r2m
add r0, 8
add r2, 8
%endif
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 1
INIT_XMM
cglobal x264_pixel_satd_8x8_internal_%1
LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
SATD_8x4_SSE2 %1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
x264_pixel_satd_8x4_internal_%1:
LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
x264_pixel_satd_4x8_internal_%1:
SAVE_MM_PERMUTATION satd_4x8_internal
SATD_8x4_SSE2 %1
ret
cglobal x264_pixel_satd_16x16_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
call x264_pixel_satd_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_16x8_%1, 4,6
SATD_START_SSE2
BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
RESTORE_AND_INC_POINTERS
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x16_%1, 4,6
SATD_START_SSE2
call x264_pixel_satd_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x8_%1, 4,6
SATD_START_SSE2
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_8x4_%1, 4,6
SATD_START_SSE2
call x264_pixel_satd_8x4_internal_%1
SATD_END_SSE2
cglobal x264_pixel_satd_4x8_%1, 4,6
INIT_XMM
LOAD_MM_PERMUTATION satd_4x8_internal
%define movh movd
SATD_START_SSE2
LOAD_DIFF m0, m7, m6, [r0], [r2]
LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
LOAD_DIFF m4, m7, m6, [r0], [r2]
LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
punpcklqdq m0, m4
punpcklqdq m1, m5
LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
punpcklqdq m2, m4
punpcklqdq m3, m5
%define movh movq
call x264_pixel_satd_4x8_internal_%1
SATD_END_SSE2
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
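; Hedged C reference sketch of sa8d (illustrative, not the project's C code):
; the same idea as SATD but with an 8-point Hadamard in each dimension, and
; the sum of absolute coefficients scaled by roughly 1/4. The SIMD below folds
; one halving into pavgw and the other into the trailing add/shr, so its
; rounding may differ slightly from this sketch.
;
; static void hadamard8_1d( int d[8] )
; {
;     for( int s = 1; s < 8; s <<= 1 )          // three butterfly stages
;         for( int i = 0; i < 8; i += 2*s )
;             for( int j = i; j < i+s; j++ )
;             {
;                 int a = d[j], b = d[j+s];
;                 d[j]   = a + b;
;                 d[j+s] = a - b;
;             }
; }
;
; static int sa8d_8x8( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int tmp[8][8], col[8], sum = 0;
;     for( int i = 0; i < 8; i++, pix1 += stride1, pix2 += stride2 )
;     {
;         for( int j = 0; j < 8; j++ )
;             tmp[i][j] = pix1[j] - pix2[j];
;         hadamard8_1d( tmp[i] );
;     }
;     for( int j = 0; j < 8; j++ )
;     {
;         for( int i = 0; i < 8; i++ )
;             col[i] = tmp[i][j];
;         hadamard8_1d( col );
;         for( int i = 0; i < 8; i++ )
;             sum += abs( col[i] );
;     }
;     return (sum + 2) >> 2;
; }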
cglobal x264_pixel_sa8d_8x8_internal_%1
lea r10, [r0+4*r1]
lea r11, [r2+4*r3]
LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2
LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
ABS4 m0, m1, m2, m3, m8, m9
ABS4 m4, m5, m6, m7, m8, m9
paddusw m0, m1
paddusw m2, m3
paddusw m4, m5
paddusw m6, m7
paddusw m0, m2
paddusw m4, m6
pavgw m0, m4
ret
cglobal x264_pixel_sa8d_8x8_%1, 4,6
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
shr eax, 1
ret
cglobal x264_pixel_sa8d_16x16_%1, 4,6
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
add r0, 8
add r2, 8
mova m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
paddusw m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
sub r0, 8
sub r2, 8
paddusw m10, m0
call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
paddusw m0, m10
HADDUW m0, m1
movd eax, m0
add eax, 1
shr eax, 1
ret
%else ; ARCH_X86_32
cglobal x264_pixel_sa8d_8x8_internal_%1
LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
movdqa [esp+4], m2
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
movdqa m2, [esp+4]
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20]
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
%ifidn %1, sse2
movdqa [esp+4], m4
movdqa [esp+20], m2
%endif
ABS2 m6, m3, m4, m2
ABS2 m0, m7, m4, m2
paddusw m0, m6
paddusw m7, m3
%ifidn %1, sse2
movdqa m4, [esp+4]
movdqa m2, [esp+20]
%endif
ABS2 m5, m1, m6, m3
ABS2 m4, m2, m6, m3
paddusw m5, m1
paddusw m4, m2
paddusw m0, m7
paddusw m5, m4
pavgw m0, m5
ret
%endif ; ARCH
%endmacro ; SATDS_SSE2
%macro SA8D_16x16_32 1
%ifndef ARCH_X86_64
cglobal x264_pixel_sa8d_8x8_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 32
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
HADDW m0, m1
movd eax, m0
add eax, 1
shr eax, 1
mov esp, r6
RET
cglobal x264_pixel_sa8d_16x16_%1, 4,7
mov r6, esp
and esp, ~15
sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
mova [esp+32], m0
call x264_pixel_sa8d_8x8_internal_%1
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8
add r2, 8
paddusw m0, [esp+32]
mova [esp+32], m0
call x264_pixel_sa8d_8x8_internal_%1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%if mmsize == 16
paddusw m0, [esp+32]
%endif
mova [esp+48-mmsize], m0
call x264_pixel_sa8d_8x8_internal_%1
paddusw m0, [esp+48-mmsize]
%if mmsize == 16
HADDUW m0, m1
%else
mova m2, [esp+32]
pxor m7, m7
mova m1, m0
mova m3, m2
punpcklwd m0, m7
punpckhwd m1, m7
punpcklwd m2, m7
punpckhwd m3, m7
paddd m0, m1
paddd m2, m3
paddd m0, m2
HADDD m0, m1
%endif
movd eax, m0
add eax, 1
shr eax, 1
mov esp, r6
RET
%endif ; !ARCH_X86_64
%endmacro ; SA8D_16x16_32
;=============================================================================
; INTRA SATD
;=============================================================================
%macro INTRA_SA8D_SSE2 1
%ifdef ARCH_X86_64
INIT_XMM
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_%1
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
movq m1, [r0+1*FENC_STRIDE]
movq m2, [r0+2*FENC_STRIDE]
movq m3, [r0+3*FENC_STRIDE]
movq m4, [r0+4*FENC_STRIDE]
movq m5, [r0+5*FENC_STRIDE]
movq m6, [r0+6*FENC_STRIDE]
movq m7, [r0+7*FENC_STRIDE]
punpcklbw m0, m8
punpcklbw m1, m8
punpcklbw m2, m8
punpcklbw m3, m8
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
; dc
movzx edi, word [r1+0]
add di, word [r1+16]
add edi, 8
and edi, -16
shl edi, 2
pxor m15, m15
movdqa m8, m2
movdqa m9, m3
movdqa m10, m4
movdqa m11, m5
ABS4 m8, m9, m10, m11, m12, m13
paddusw m8, m10
paddusw m9, m11
%ifidn %1, ssse3
pabsw m10, m6
pabsw m11, m7
pabsw m15, m1
%else
movdqa m10, m6
movdqa m11, m7
movdqa m15, m1
ABS2 m10, m11, m13, m14
ABS1 m15, m13
%endif
paddusw m10, m11
paddusw m8, m9
paddusw m15, m10
paddusw m15, m8
movdqa m14, m15 ; 7x8 sum
movdqa m8, [r1+0] ; left edge
movd m9, edi
psllw m8, 3
psubw m8, m0
psubw m9, m0
ABS1 m8, m10
ABS1 m9, m11 ; 1x8 sum
paddusw m14, m8
paddusw m15, m9
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpcklwd m6, m7
punpckldq m0, m2
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
movdqa m1, [r1+16] ; top edge
movdqa m2, m15
psllw m1, 3
psrldq m2, 2 ; 8x7 sum
psubw m0, m1 ; 8x1 sum
ABS1 m0, m1
paddusw m2, m0
; 3x HADDW
movdqa m7, [pw_1 GLOBAL]
pmaddwd m2, m7
pmaddwd m14, m7
pmaddwd m15, m7
movdqa m3, m2
punpckldq m2, m14
punpckhdq m3, m14
pshufd m5, m15, 0xf5
paddd m2, m3
paddd m5, m15
movdqa m3, m2
punpcklqdq m2, m5
punpckhqdq m3, m5
pavgw m3, m2
pxor m0, m0
pavgw m3, m0
movq [r2], m3 ; i8x8_v, i8x8_h
psrldq m3, 8
movd [r2+8], m3 ; i8x8_dc
ret
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
; in: r0 = fenc
; out: m0..m3 = hadamard coefs
INIT_MMX
ALIGN 16
load_hadamard:
pxor m7, m7
movd m0, [r0+0*FENC_STRIDE]
movd m1, [r0+1*FENC_STRIDE]
movd m2, [r0+2*FENC_STRIDE]
movd m3, [r0+3*FENC_STRIDE]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_1D m0, m1, m2, m3
TRANSPOSE4x4W 0, 1, 2, 3, 4
HADAMARD4_1D m0, m1, m2, m3
SAVE_MM_PERMUTATION load_hadamard
ret
%macro SCALAR_SUMSUB 4
add %1, %2
add %3, %4
add %2, %2
add %4, %4
sub %2, %1
sub %4, %3
%endmacro
%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
%ifnidn %1, 0
shl %1d, 5 ; log(FDEC_STRIDE)
%endif
movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
%ifnidn %1, 0
shr %1d, 5
%endif
SCALAR_SUMSUB %2d, %3d, %4d, %5d
SCALAR_SUMSUB %2d, %4d, %3d, %5d
mov [left_1d+2*%1+0], %2w
mov [left_1d+2*%1+2], %3w
mov [left_1d+2*%1+4], %4w
mov [left_1d+2*%1+6], %5w
%endmacro
%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
SCALAR_SUMSUB %2d, %3d, %4d, %5d
SCALAR_SUMSUB %2d, %4d, %3d, %5d
mov [top_1d+2*%1+0], %2w
mov [top_1d+2*%1+2], %3w
mov [top_1d+2*%1+4], %4w
mov [top_1d+2*%1+6], %5w
%endmacro
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
pxor %7, %7
pshufw %4, %1, 01001110b
pshufw %5, %2, 01001110b
pshufw %6, %3, 01001110b
paddw %1, %4
paddw %2, %5
paddw %3, %6
punpcklwd %1, %7
punpcklwd %2, %7
punpcklwd %3, %7
pshufw %4, %1, 01001110b
pshufw %5, %2, 01001110b
pshufw %6, %3, 01001110b
%8 %1, %4
%8 %2, %5
%8 %3, %6
%endmacro
%macro CLEAR_SUMS 0
%ifdef ARCH_X86_64
mov qword [sums+0], 0
mov qword [sums+8], 0
mov qword [sums+16], 0
%else
pxor m7, m7
movq [sums+0], m7
movq [sums+8], m7
movq [sums+16], m7
%endif
%endmacro
; in: m1..m3
; out: m7
; clobber: m4..m6
%macro SUM3x4 1
%ifidn %1, ssse3
pabsw m4, m1
pabsw m5, m2
pabsw m7, m3
paddw m4, m5
%else
movq m4, m1
movq m5, m2
ABS2 m4, m5, m6, m7
movq m7, m3
paddw m4, m5
ABS1 m7, m6
%endif
paddw m7, m4
%endmacro
; in: m0..m3 (4x4), m7 (3x4)
; out: m0 v, m4 h, m5 dc
; clobber: m6
%macro SUM4x3 3 ; dc, left, top
movq m4, %2
movd m5, %1
psllw m4, 2
psubw m4, m0
psubw m5, m0
punpcklwd m0, m1
punpcklwd m2, m3
punpckldq m0, m2 ; transpose
movq m1, %3
psllw m1, 2
psubw m0, m1
ABS2 m4, m5, m2, m3 ; 1x4 sum
ABS1 m0, m1 ; 4x1 sum
%endmacro
%macro INTRA_SATDS_MMX 1
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_4x4_%1, 2,6
%ifdef ARCH_X86_64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
%define t0 r10
%define t0d r10d
%else
; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
SUB esp, 16
%define top_1d esp+8
%define left_1d esp
%define t0 r2
%define t0d r2d
%endif
call load_hadamard
SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
mov t0d, r0d
SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
lea t0d, [t0d + r0d + 4]
and t0d, -8
shl t0d, 1 ; dc
SUM3x4 %1
SUM4x3 t0d, [left_1d], [top_1d]
paddw m4, m7
paddw m5, m7
movq m1, m5
psrlq m1, 16 ; 4x3 sum
paddw m0, m1
SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
mov r2, r2m
%endif
movd [r2+0], m0 ; i4x4_v satd
movd [r2+4], m4 ; i4x4_h satd
movd [r2+8], m5 ; i4x4_dc satd
%ifndef ARCH_X86_64
ADD esp, 16
%endif
RET
%ifdef ARCH_X86_64
%define t0 r10
%define t0d r10d
%define t2 r11
%define t2w r11w
%define t2d r11d
%else
%define t0 r0
%define t0d r0d
%define t2 r2
%define t2w r2w
%define t2d r2d
%endif
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_16x16_%1, 0,7
%ifdef ARCH_X86_64
%assign stack_pad 88
%else
%assign stack_pad 88 + ((stack_offset+88+4)&15)
%endif
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, stack_pad
%define sums rsp+64 ; size 24
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
movifnidn r1d, r1m
CLEAR_SUMS
; 1D hadamards
xor t2d, t2d
mov t0d, 12
.loop_edge:
SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
add t2d, r3d
SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
add t2d, r3d
sub t0d, 4
jge .loop_edge
shr t2d, 1
add t2d, 8
and t2d, -16 ; dc
; 2D hadamards
movifnidn r0d, r0m
xor r3d, r3d
.loop_y:
xor r4d, r4d
.loop_x:
call load_hadamard
SUM3x4 %1
SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
pavgw m4, m7
pavgw m5, m7
paddw m0, [sums+0] ; i16x16_v satd
paddw m4, [sums+8] ; i16x16_h satd
paddw m5, [sums+16] ; i16x16_dc satd
movq [sums+0], m0
movq [sums+8], m4
movq [sums+16], m5
add r0, 4
inc r4d
cmp r4d, 4
jl .loop_x
add r0, 4*FENC_STRIDE-16
inc r3d
cmp r3d, 4
jl .loop_y
; horizontal sum
movifnidn r2d, r2m
movq m2, [sums+16]
movq m1, [sums+8]
movq m0, [sums+0]
movq m7, m2
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m0, 1
pslld m7, 16
psrld m7, 16
paddd m0, m2
psubd m0, m7
movd [r2+8], m2 ; i16x16_dc satd
movd [r2+4], m1 ; i16x16_h satd
movd [r2+0], m0 ; i16x16_v satd
ADD rsp, stack_pad
RET
;-----------------------------------------------------------------------------
; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_satd_x3_8x8c_%1, 0,6
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
SUB rsp, 72
%define sums rsp+48 ; size 24
%define dc_1d rsp+32 ; size 16
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
movifnidn r1d, r1m
CLEAR_SUMS
; 1D hadamards
mov t0d, 4
.loop_edge:
SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
sub t0d, 4
jge .loop_edge
; dc
movzx t2d, word [left_1d+0]
movzx r3d, word [top_1d+0]
movzx r4d, word [left_1d+8]
movzx r5d, word [top_1d+8]
add t2d, r3d
lea r3, [r4 + r5]
lea t2, [2*t2 + 8]
lea r3, [2*r3 + 8]
lea r4, [4*r4 + 8]
lea r5, [4*r5 + 8]
and t2d, -16 ; tl
and r3d, -16 ; br
and r4d, -16 ; bl
and r5d, -16 ; tr
mov [dc_1d+ 0], t2d ; tl
mov [dc_1d+ 4], r5d ; tr
mov [dc_1d+ 8], r4d ; bl
mov [dc_1d+12], r3d ; br
lea r5, [dc_1d]
; 2D hadamards
movifnidn r0d, r0m
movifnidn r2d, r2m
xor r3d, r3d
.loop_y:
xor r4d, r4d
.loop_x:
call load_hadamard
SUM3x4 %1
SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
pavgw m4, m7
pavgw m5, m7
paddw m0, [sums+16] ; i4x4_v satd
paddw m4, [sums+8] ; i4x4_h satd
paddw m5, [sums+0] ; i4x4_dc satd
movq [sums+16], m0
movq [sums+8], m4
movq [sums+0], m5
add r0, 4
inc r4d
cmp r4d, 2
jl .loop_x
add r0, 4*FENC_STRIDE-8
add r5, 8
inc r3d
cmp r3d, 2
jl .loop_y
; horizontal sum
movq m0, [sums+0]
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
psrlq m7, 15
paddw m2, m7
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
psrld m2, 1
movd [r2+0], m0 ; i8x8c_dc satd
movd [r2+4], m1 ; i8x8c_h satd
movd [r2+8], m2 ; i8x8c_v satd
ADD rsp, 72
RET
%endmacro ; INTRA_SATDS_MMX
%macro ABS_MOV_SSSE3 2
pabsw %1, %2
%endmacro
%macro ABS_MOV_MMX 2
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%endmacro
%define ABS_MOV ABS_MOV_MMX
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
cglobal x264_hadamard_ac_4x4_mmxext
movh m0, [r0]
movh m1, [r0+r1]
movh m2, [r0+r1*2]
movh m3, [r0+r2]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_1D m0, m1, m2, m3
TRANSPOSE4x4W 0, 1, 2, 3, 4
HADAMARD4_1D m0, m1, m2, m3
mova [r3], m0
mova [r3+8], m1
mova [r3+16], m2
mova [r3+24], m3
ABS1 m0, m4
ABS1 m1, m4
pand m0, m6
ABS1 m2, m4
ABS1 m3, m4
paddw m0, m1
paddw m2, m3
paddw m0, m2
SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
ret
cglobal x264_hadamard_ac_2x2_mmxext
mova m0, [r3+0x00]
mova m1, [r3+0x20]
mova m2, [r3+0x40]
mova m3, [r3+0x60]
HADAMARD4_1D m0, m1, m2, m3
ABS2 m0, m1, m4, m5
ABS2 m2, m3, m4, m5
SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
ret
cglobal x264_hadamard_ac_8x8_mmxext
mova m6, [mask_ac4 GLOBAL]
pxor m7, m7
call x264_hadamard_ac_4x4_mmxext
add r0, 4
add r3, 32
mova m5, m0
call x264_hadamard_ac_4x4_mmxext
lea r0, [r0+4*r1]
add r3, 64
paddw m5, m0
call x264_hadamard_ac_4x4_mmxext
sub r0, 4
sub r3, 32
paddw m5, m0
call x264_hadamard_ac_4x4_mmxext
paddw m5, m0
sub r3, 64
mova [rsp+gprsize+8], m5 ; save satd
call x264_hadamard_ac_2x2_mmxext
add r3, 8
pand m6, m0
mova m7, m1
paddw m6, m2
paddw m7, m3
%rep 2
call x264_hadamard_ac_2x2_mmxext
add r3, 8
paddw m6, m0
paddw m7, m1
paddw m6, m2
paddw m7, m3
%endrep
call x264_hadamard_ac_2x2_mmxext
sub r3, 24
paddw m6, m0
paddw m7, m1
paddw m6, m2
paddw m7, m3
paddw m6, m7
mova [rsp+gprsize], m6 ; save sa8d
SWAP m0, m6
SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
ret
%macro HADAMARD_AC_WXH_MMX 2
cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 16+128+pad
lea r2, [r1*3]
lea r3, [rsp+16]
call x264_hadamard_ac_8x8_mmxext
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 16
call x264_hadamard_ac_8x8_mmxext
%endif
%if %1==16
neg ysub
sub rsp, 16
lea r0, [r0+ysub*4+8]
neg ysub
call x264_hadamard_ac_8x8_mmxext
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 16
call x264_hadamard_ac_8x8_mmxext
%endif
%endif
mova m1, [rsp+0x08]
%if %1*%2 >= 128
paddusw m0, [rsp+0x10]
paddusw m1, [rsp+0x18]
%endif
%if %1*%2 == 256
mova m2, [rsp+0x20]
paddusw m1, [rsp+0x28]
paddusw m2, [rsp+0x30]
mova m3, m0
paddusw m1, [rsp+0x38]
pxor m3, m2
pand m3, [pw_1 GLOBAL]
pavgw m0, m2
psubusw m0, m3
HADDUW m0, m2
%else
psrlw m0, 1
HADDW m0, m2
%endif
psrlw m1, 1
HADDW m1, m3
movd edx, m0
movd eax, m1
shr edx, 1
%ifdef ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
add rsp, 128+%1*%2/4+pad
RET
%endmacro ; HADAMARD_AC_WXH_MMX
HADAMARD_AC_WXH_MMX 16, 16
HADAMARD_AC_WXH_MMX 8, 16
HADAMARD_AC_WXH_MMX 16, 8
HADAMARD_AC_WXH_MMX 8, 8
%macro HADAMARD_AC_SSE2 1
INIT_XMM
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal x264_hadamard_ac_8x8_%1
%ifdef ARCH_X86_64
%define spill0 m8
%define spill1 m9
%define spill2 m10
%else
%define spill0 [rsp+gprsize]
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
%endif
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r1]
movh m2, [r0+r1*2]
movh m3, [r0+r2]
lea r0, [r0+r1*4]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
HADAMARD4_1D m0, m1, m2, m3
mova spill0, m3
SWAP m3, m7
movh m4, [r0]
movh m5, [r0+r1]
movh m6, [r0+r1*2]
movh m7, [r0+r2]
punpcklbw m4, m3
punpcklbw m5, m3
punpcklbw m6, m3
punpcklbw m7, m3
HADAMARD4_1D m4, m5, m6, m7
mova m3, spill0
%ifdef ARCH_X86_64
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
%else
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
%endif
HADAMARD4_1D m0, m1, m2, m3
HADAMARD4_1D m4, m5, m6, m7
mova spill0, m1
mova spill1, m2
mova spill2, m3
ABS_MOV m1, m0
ABS_MOV m2, m4
ABS_MOV m3, m5
paddw m1, m2
SUMSUB_BA m0, m4
pand m1, [mask_ac4 GLOBAL]
ABS_MOV m2, spill0
paddw m1, m3
ABS_MOV m3, spill1
paddw m1, m2
ABS_MOV m2, spill2
paddw m1, m3
ABS_MOV m3, m6
paddw m1, m2
ABS_MOV m2, m7
paddw m1, m3
mova m3, m7
paddw m1, m2
mova m2, m6
psubw m7, spill2
paddw m3, spill2
mova [rsp+gprsize+32], m1 ; save satd
mova m1, m5
psubw m6, spill1
paddw m2, spill1
psubw m5, spill0
paddw m1, spill0
mova spill1, m7
SBUTTERFLY qdq, 0, 4, 7
SBUTTERFLY qdq, 1, 5, 7
SBUTTERFLY qdq, 2, 6, 7
SUMSUB_BADC m0, m4, m1, m5
SUMSUB_BA m2, m6
ABS1 m0, m7
ABS1 m1, m7
pand m0, [mask_ac8 GLOBAL]
ABS1 m2, m7
ABS1 m4, m7
ABS1 m5, m7
ABS1 m6, m7
mova m7, spill1
paddw m0, m4
SBUTTERFLY qdq, 3, 7, 4
SUMSUB_BA m3, m7
paddw m1, m5
ABS1 m3, m4
ABS1 m7, m4
paddw m2, m6
paddw m3, m7
paddw m0, m1
paddw m2, m3
paddw m0, m2
mova [rsp+gprsize+16], m0 ; save sa8d
SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
ret
HADAMARD_AC_WXH_SSE2 16, 16, %1
HADAMARD_AC_WXH_SSE2 8, 16, %1
HADAMARD_AC_WXH_SSE2 16, 8, %1
HADAMARD_AC_WXH_SSE2 8, 8, %1
%endmacro ; HADAMARD_AC_SSE2
; struct { int satd; int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 3
cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 48+pad
lea r2, [r1*3]
call x264_hadamard_ac_8x8_%3
%if %2==16
%define ysub r2
lea r0, [r0+r1*4]
sub rsp, 32
call x264_hadamard_ac_8x8_%3
%endif
%if %1==16
neg ysub
sub rsp, 32
lea r0, [r0+ysub*4+8]
neg ysub
call x264_hadamard_ac_8x8_%3
%if %2==16
lea r0, [r0+r1*4]
sub rsp, 32
call x264_hadamard_ac_8x8_%3
%endif
%endif
mova m1, [rsp+0x20]
%if %1*%2 >= 128
paddusw m0, [rsp+0x30]
paddusw m1, [rsp+0x40]
%endif
%if %1*%2 == 256
paddusw m0, [rsp+0x50]
paddusw m1, [rsp+0x60]
paddusw m0, [rsp+0x70]
paddusw m1, [rsp+0x80]
psrlw m0, 1
%endif
HADDW m0, m2
HADDW m1, m3
movd edx, m0
movd eax, m1
shr edx, 2 - (%1*%2 >> 8)
shr eax, 1
%ifdef ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
add rsp, 16+%1*%2/2+pad
RET
%endmacro ; HADAMARD_AC_WXH_SSE2
; instantiate satds
%ifndef ARCH_X86_64
cextern x264_pixel_sa8d_8x8_internal_mmxext
SA8D_16x16_32 mmxext
%endif
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
SATDS_SSE2 sse2
SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
HADAMARD_AC_SSE2 sse2
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
%define ABS_MOV ABS_MOV_SSSE3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
SATDS_SSE2 ssse3
SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
HADAMARD_AC_SSE2 ssse3
SATDS_SSE2 ssse3_phadd
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
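; Hedged reference sketch of what the core accumulates (illustrative, not the
; project's C code): for two horizontally adjacent 4x4 blocks, the four
; partial sums (s1, s2, ss, s12) that the SSIM formula needs.
;
; static void ssim_4x4x2_core_ref( const uint8_t *pix1, int stride1,
;                                  const uint8_t *pix2, int stride2,
;                                  int sums[2][4] )
; {
;     for( int z = 0; z < 2; z++ )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[4*z + x + y*stride1];
;                 int b = pix2[4*z + x + y*stride2];
;                 s1  += a;
;                 s2  += b;
;                 ss  += a*a + b*b;
;                 s12 += a*b;
;             }
;         sums[z][0] = s1;
;         sums[z][1] = s2;
;         sums[z][2] = ss;
;         sums[z][3] = s12;
;     }
; }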
cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
%rep 4
movq m5, [r0]
movq m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
paddw m2, m6
movdqa m7, m5
pmaddwd m5, m5
pmaddwd m7, m6
pmaddwd m6, m6
paddd m3, m5
paddd m4, m7
paddd m3, m6
add r0, r1
add r2, r3
%endrep
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1 GLOBAL]
pshufd m5, m3, 0xb1
pmaddwd m1, m7
pmaddwd m2, m7
pshufd m6, m4, 0xb1
packssdw m1, m2
paddd m3, m5
pshufd m1, m1, 0xd8
paddd m4, m6
pmaddwd m1, m7
movdqa m5, m3
punpckldq m3, m4
punpckhdq m5, m4
%ifdef ARCH_X86_64
%define t0 r4
%else
%define t0 eax
mov t0, r4m
%endif
movq [t0+ 0], m1
movq [t0+ 8], m3
psrldq m1, 8
movq [t0+16], m1
movq [t0+24], m5
RET
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
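; Hedged reference sketch (illustrative names, not the project's C code): each
; lane combines the partial sums of four neighbouring 4x4 blocks produced by
; ssim_4x4x2_core (the constants ssim_c1/ssim_c2 above are already pre-scaled
; for that sample count) and evaluates one SSIM term; up to four terms are
; summed per call.
;
; enum { SSIM_C1 = 416, SSIM_C2 = 235963 };  // same values as the rodata above
;
; static float ssim_end1( int s1, int s2, int ss, int s12 )
; {
;     int vars  = ss*64  - s1*s1 - s2*s2;
;     int covar = s12*64 - s1*s2;
;     return (float)(2*s1*s2 + SSIM_C1) * (float)(2*covar + SSIM_C2)
;          / ((float)(s1*s1 + s2*s2 + SSIM_C1) * (float)(vars + SSIM_C2));
; }
;
; static float ssim_end4_ref( int sum0[5][4], int sum1[5][4], int width )
; {
;     float ssim = 0.0f;
;     for( int i = 0; i < width; i++ )
;         ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
;                            sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
;                            sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
;                            sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
;     return ssim;
; }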
cglobal x264_pixel_ssim_end4_sse2, 3,3
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
movdqa m3, [r0+48]
movdqa m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
paddd m3, [r1+48]
paddd m4, [r1+64]
paddd m0, m1
paddd m1, m2
paddd m2, m3
paddd m3, m4
movdqa m5, [ssim_c1 GLOBAL]
movdqa m6, [ssim_c2 GLOBAL]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
movdqa m4, m1
pslld m1, 16
pmaddwd m4, m0 ; s1*s2
por m0, m1
pmaddwd m0, m0 ; s1*s1 + s2*s2
pslld m4, 1
pslld m3, 7
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
paddd m0, m5
paddd m4, m5
paddd m3, m6
paddd m2, m6
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
mulps m4, m3
mulps m0, m2
divps m4, m0 ; ssim
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
%ifdef PIC
lea r3, [mask_ff + 16 GLOBAL]
movdqu m1, [r3 + r2*4]
%else
movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
%endif
pand m4, m1
.skip:
movhlps m0, m4
addps m0, m4
pshuflw m4, m0, 0xE
addss m0, m4
%ifndef ARCH_X86_64
movd r0m, m0
fld dword r0m
%endif
RET
;=============================================================================
; Successive Elimination ADS
;=============================================================================
%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
%define t0 r6
mov r10, rsp
%else
%define t0 r4
mov rbp, rsp
%endif
mov r0d, r5m
sub rsp, r0
sub rsp, %1*4-1
and rsp, ~15
mov t0, rsp
shl r2d, 1
%endmacro
%macro ADS_END 1 ; unroll_size
add r1, 8*%1
add r3, 8*%1
add t0, 4*%1
sub r0d, 4*%1
jg .loop
jmp ads_mvs
%endmacro
%define ABS1 ABS1_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
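; Hedged reference sketch of the ADS threshold test (illustrative, not the
; project's C code): the SIMD below writes one byte per candidate, nonzero
; when the candidate survives, and x264_pixel_ads_mvs at the end of this file
; turns that mask into the mv list; the net effect is:
;
; static int ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
;                      uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )
;                 + abs( enc_dc[1] - sums[8] )
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];
;         if( ads < thresh )
;             mvs[nmv++] = i;
;     }
;     return nmv;
; }
;
; ads2/ads1 are the same idea with two / one partial-SAD terms per candidate.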
cglobal x264_pixel_ads4_mmxext, 4,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
ADS_START 1
.loop:
movq mm0, [r1]
movq mm1, [r1+16]
psubw mm0, mm7
psubw mm1, mm6
ABS1 mm0, mm2
ABS1 mm1, mm3
movq mm2, [r1+r2]
movq mm3, [r1+r2+16]
psubw mm2, mm5
psubw mm3, mm4
paddw mm0, mm1
ABS1 mm2, mm1
ABS1 mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
%ifdef ARCH_X86_64
pshufw mm1, [r10+8], 0
%else
pshufw mm1, [ebp+stack_offset+28], 0
%endif
paddusw mm0, [r3]
psubusw mm1, mm0
packsswb mm1, mm1
movd [t0], mm1
ADS_END 1
cglobal x264_pixel_ads2_mmxext, 4,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
ADS_START 1
.loop:
movq mm0, [r1]
movq mm1, [r1+r2]
psubw mm0, mm7
psubw mm1, mm6
ABS1 mm0, mm2
ABS1 mm1, mm3
paddw mm0, mm1
paddusw mm0, [r3]
movq mm4, mm5
psubusw mm4, mm0
packsswb mm4, mm4
movd [t0], mm4
ADS_END 1
cglobal x264_pixel_ads1_mmxext, 4,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START 2
.loop:
movq mm0, [r1]
movq mm1, [r1+8]
psubw mm0, mm7
psubw mm1, mm7
ABS1 mm0, mm2
ABS1 mm1, mm3
paddusw mm0, [r3]
paddusw mm1, [r3+8]
movq mm4, mm6
movq mm5, mm6
psubusw mm4, mm0
psubusw mm5, mm1
packsswb mm4, mm5
movq [t0], mm4
ADS_END 2
%macro ADS_SSE2 1
cglobal x264_pixel_ads4_%1, 4,7
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, 0xAA
pshufhw xmm5, xmm4, 0
pshufhw xmm4, xmm4, 0xAA
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
punpckhqdq xmm5, xmm5
punpckhqdq xmm4, xmm4
%ifdef ARCH_X86_64
pshuflw xmm8, r6m, 0
punpcklqdq xmm8, xmm8
ADS_START 2
movdqu xmm10, [r1]
movdqu xmm11, [r1+r2]
.loop:
movdqa xmm0, xmm10
movdqu xmm1, [r1+16]
movdqa xmm10, xmm1
psubw xmm0, xmm7
psubw xmm1, xmm6
ABS1 xmm0, xmm2
ABS1 xmm1, xmm3
movdqa xmm2, xmm11
movdqu xmm3, [r1+r2+16]
movdqa xmm11, xmm3
psubw xmm2, xmm5
psubw xmm3, xmm4
paddw xmm0, xmm1
movdqu xmm9, [r3]
ABS1 xmm2, xmm1
ABS1 xmm3, xmm1
paddw xmm0, xmm2
paddw xmm0, xmm3
paddusw xmm0, xmm9
movdqa xmm1, xmm8
psubusw xmm1, xmm0
packsswb xmm1, xmm1
movq [t0], xmm1
%else
ADS_START 2
.loop:
movdqu xmm0, [r1]
movdqu xmm1, [r1+16]
psubw xmm0, xmm7
psubw xmm1, xmm6
ABS1 xmm0, xmm2
ABS1 xmm1, xmm3
movdqu xmm2, [r1+r2]
movdqu xmm3, [r1+r2+16]
psubw xmm2, xmm5
psubw xmm3, xmm4
paddw xmm0, xmm1
ABS1 xmm2, xmm1
ABS1 xmm3, xmm1
paddw xmm0, xmm2
paddw xmm0, xmm3
movd xmm1, [ebp+stack_offset+28]
movdqu xmm2, [r3]
pshuflw xmm1, xmm1, 0
punpcklqdq xmm1, xmm1
paddusw xmm0, xmm2
psubusw xmm1, xmm0
packsswb xmm1, xmm1
movq [t0], xmm1
%endif ; ARCH
ADS_END 2
cglobal x264_pixel_ads2_%1, 4,7
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
pshuflw xmm6, xmm6, 0xAA
pshuflw xmm5, xmm5, 0
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
punpcklqdq xmm5, xmm5
ADS_START 2
.loop:
movdqu xmm0, [r1]
movdqu xmm1, [r1+r2]
psubw xmm0, xmm7
psubw xmm1, xmm6
movdqu xmm4, [r3]
ABS1 xmm0, xmm2
ABS1 xmm1, xmm3
paddw xmm0, xmm1
paddusw xmm0, xmm4
movdqa xmm1, xmm5
psubusw xmm1, xmm0
packsswb xmm1, xmm1
movq [t0], xmm1
ADS_END 2
cglobal x264_pixel_ads1_%1, 4,7
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
pshuflw xmm6, xmm6, 0
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
ADS_START 4
.loop:
movdqu xmm0, [r1]
movdqu xmm1, [r1+16]
psubw xmm0, xmm7
psubw xmm1, xmm7
movdqu xmm2, [r3]
movdqu xmm3, [r3+16]
ABS1 xmm0, xmm4
ABS1 xmm1, xmm5
paddusw xmm0, xmm2
paddusw xmm1, xmm3
movdqa xmm4, xmm6
movdqa xmm5, xmm6
psubusw xmm4, xmm0
psubusw xmm5, xmm1
packsswb xmm4, xmm5
movdqa [t0], xmm4
ADS_END 4
%endmacro
ADS_SSE2 sse2
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
; *(uint32_t*)(masks+width) = 0;
; for( i=0; i<width; i+=8 )
; {
; uint64_t mask = *(uint64_t*)(masks+i);
; if( !mask ) continue;
; for( j=0; j<8; j++ )
; if( mask & (255<<j*8) )
; mvs[nmv++] = i+j;
; }
; return nmv;
; }
cglobal x264_pixel_ads_mvs
ads_mvs:
xor eax, eax
xor esi, esi
%ifdef ARCH_X86_64
; mvs = r4
; masks = rsp
; width = r5
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
mov dword [rsp+r5], 0
jmp .loopi
.loopi0:
add esi, 8
cmp esi, r5d
jge .end
.loopi:
mov rdi, [rsp+rsi]
test rdi, rdi
jz .loopi0
xor ecx, ecx
%macro TEST 1
mov [r4+rax*2], si
test edi, 0xff<<(%1*8)
setne cl
add eax, ecx
inc esi
%endmacro
TEST 0
TEST 1
TEST 2
TEST 3
shr rdi, 32
TEST 0
TEST 1
TEST 2
TEST 3
cmp esi, r5d
jl .loopi
.end:
mov rsp, r10
ret
%else
; no PROLOGUE, inherit from x264_pixel_ads1
mov ebx, [ebp+stack_offset+20] ; mvs
mov edi, [ebp+stack_offset+24] ; width
mov dword [esp+edi], 0
push ebp
jmp .loopi
.loopi0:
add esi, 8
cmp esi, edi
jge .end
.loopi:
mov ebp, [esp+esi+4]
mov edx, [esp+esi+8]
mov ecx, ebp
or ecx, edx
jz .loopi0
xor ecx, ecx
%macro TEST 2
mov [ebx+eax*2], si
test %2, 0xff<<(%1*8)
setne cl
add eax, ecx
inc esi
%endmacro
TEST 0, ebp
TEST 1, ebp
TEST 2, ebp
TEST 3, ebp
TEST 0, edx
TEST 1, edx
TEST 2, edx
TEST 3, edx
cmp esi, edi
jl .loopi
.end:
pop esp
RET
%endif ; ARCH