;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pb_1: times 16 db 1
pw_1: times 8 dw 1
pd_1: times 4 dd 1
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
%endmacro
%macro DQM8 6
dw %1, %4, %5, %4, %1, %4, %5, %4
dw %4, %2, %6, %2, %4, %2, %6, %2
dw %5, %6, %3, %6, %5, %6, %3, %6
; last line not used, just padding for power-of-2 stride
times 8 dw 0
%endmacro
dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29
dequant8_scale:
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
DQM8 26, 23, 42, 24, 33, 31
DQM8 28, 25, 45, 26, 35, 33
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
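
; decimate_mask_table4[m] is the decimate score of a group of (up to 8)
; coefficients whose nonzero-coefficient bitmask is the byte m, precomputed
; from x264_decimate_table4 so the score functions below can account for
; eight coefficients' worth of zero runs with a single table lookup.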
decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

SECTION .text

%macro QUANT_DC_START 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifidn m0, mm0
pshufw m6, m6, 0
pshufw m7, m7, 0
%else
pshuflw m6, m6, 0
pshuflw m7, m7, 0
punpcklqdq m6, m6
punpcklqdq m7, m7
%endif
%endmacro
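
; QUANT_DC_START broadcasts the scalar mf/bias arguments to every word of
; m6/m7, so the *_dc functions below can reuse QUANT_ONE unchanged.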
%macro PABSW_MMX 2
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
%endmacro
%macro PSIGNW_MMX 2
pxor %1, %2
psubw %1, %2
%endmacro
%macro PABSW_SSSE3 2
pabsw %1, %2
%endmacro
%macro PSIGNW_SSSE3 2
psignw %1, %2
%endmacro
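
; Note: after PABSW_MMX %1, %2 the source register %2 no longer holds the
; original coefficients but their per-word sign mask (-1 for negative words,
; 0 otherwise), which PSIGNW_MMX then consumes; the SSSE3 variants leave %2
; untouched and let psignw read the signs directly. Rough scalar sketch of
; the MMX emulation (illustrative C, not taken from x264's C code):
;   int16_t mask = -(c < 0);           // pxor + pcmpgtw
;   int16_t absc = (c ^ mask) - mask;  // pxor + psubw
;   int16_t res  = (v ^ mask) - mask;  // PSIGNW_MMX: reapply the saved sign to v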
%macro QUANT_ONE 3
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
mova m1, %1 ; load dct coeffs
PABSW m0, m1
paddusw m0, %3 ; round
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
%endmacro
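
; Per coefficient this computes, roughly (illustrative C; paddusw saturates
; and pmulhuw keeps the high 16 bits of the unsigned product):
;   int16_t  c = dct[i];
;   uint16_t a = abs(c) + bias[i];              // PABSW + paddusw (saturating)
;   uint16_t q = ((uint32_t)a * mf[i]) >> 16;   // pmulhuw
;   dct[i] = c < 0 ? -q : q;                    // PSIGNW
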
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal %1, 1,1
QUANT_DC_START
%assign x 0
%rep %2
QUANT_ONE [r0+x], m6, m7
%assign x x+mmsize
%endrep
RET
%endmacro
;-----------------------------------------------------------------------------
; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
%rep %2
QUANT_ONE [r0+x], [r1+x], [r2+x]
%assign x x+mmsize
%endrep
RET
%endmacro
INIT_MMX
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
QUANT_DC x264_quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed on x86-64, where SSE2 is always available and faster
QUANT_DC x264_quant_4x4_dc_mmxext, 4
QUANT_AC x264_quant_4x4_mmx, 4
QUANT_AC x264_quant_8x8_mmx, 16
%endif
INIT_XMM
QUANT_DC x264_quant_4x4_dc_sse2, 2
QUANT_AC x264_quant_4x4_sse2, 2
QUANT_AC x264_quant_8x8_sse2, 8
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
QUANT_DC x264_quant_4x4_dc_ssse3, 2
QUANT_AC x264_quant_4x4_ssse3, 2
QUANT_AC x264_quant_8x8_ssse3, 8
INIT_MMX
QUANT_DC x264_quant_2x2_dc_ssse3, 1

;=============================================================================
; dequant
;=============================================================================
%macro DEQUANT16_L 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m5 i_qbits
mova m0, %2
packssdw m0, %3
pmullw m0, %1
psllw m0, m5
mova %1, m0
%endmacro
%macro DEQUANT32_R 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m5 -i_qbits
;;; m6 f
;;; m7 0
mova m0, %1
mova m1, m0
punpcklwd m0, m7
punpckhwd m1, m7
pmaddwd m0, %2
pmaddwd m1, %3
paddd m0, m6
paddd m1, m6
psrad m0, m5
psrad m1, m5
packssdw m0, m1
mova %1, m0
%endmacro
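
; Both dequant paths compute dct[i] * dequant_mf[i_mf][i]; only the final
; scaling differs with the sign of i_qbits. Rough sketch (illustrative C):
;   if( i_qbits >= 0 )
;       dct[i] = (dct[i] * dequant_mf[i]) << i_qbits;        // 16-bit path (DEQUANT16_L)
;   else
;       dct[i] = (dct[i] * dequant_mf[i] + f) >> (-i_qbits); // 32-bit path (DEQUANT32_R)
; where f = 1 << (-i_qbits-1), precomputed into m6 by the .rshift32 setup.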
%macro DEQUANT_LOOP 3
%if 8*(%2-2*%3)
mov t0d, 8*(%2-2*%3)
%%loop:
%1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
%1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
sub t0d, 16*%3
jge %%loop
rep ret
%else
%1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
%1 [r0 ], [r1 ], [r1+ 8*%3]
ret
%endif
%endmacro
%macro DEQUANT16_FLAT 2-8
mova m0, %1
%assign i %0-2
%rep %0-1
%if i
mova m %+ i, [r0+%2]
pmullw m %+ i, m0
%else
pmullw m0, [r0+%2]
%endif
psllw m %+ i, m7
mova [r0+%2], m %+ i
%assign i i-1
%rotate 1
%endrep
%endmacro
%ifdef ARCH_X86_64
%define t0 r4
%define t0d r4d
%define t1 r3
%define t1d r3d
%define t2 r2
%define t2d r2d
%else
%define t0 r2
%define t0d r2d
%define t1 r0
%define t1d r0d
%define t2 r1
%define t2d r1d
%endif
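
; t0-t2 are scratch GPRs: on x86-64 the dct/mf/qp arguments arrive in r0-r2,
; so separate registers are used; on x86-32 the arguments live on the stack
; and r0-r2 are free to be reused.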
%macro DEQUANT_START 2
movifnidn t2d, r2m
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1, [t0*3]
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%ifdef ARCH_X86_64
add r1, t2 ; dequant_mf[i_mf]
%else
add r1, r1m ; dequant_mf[i_mf]
mov r0, r0m ; dct
%endif
sub t0d, %2
jl .rshift32 ; negative qbits => rightshift
%endmacro
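
; DEQUANT_START's scalar setup, roughly (illustrative C sketch; the multiply
; by 0x2b and shift by 8 matches /6 over the QP range used here):
;   i_qbits = (i_qp * 0x2b) >> 8;      // i_qp / 6
;   i_mf    = i_qp - 6*i_qbits;        // i_qp % 6, via two subtractions of 3*i_qbits
;   // r1 then advances by i_mf << %1 bytes, i.e. to dequant_mf[i_mf]
;   i_qbits -= %2;                     // %2 = 4 for 4x4, 6 for 8x8 and the dc variant
;   if( i_qbits < 0 ) goto .rshift32;  // otherwise fall through to .lshift
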
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
DEQUANT_START %3+2, %3
.lshift:
movd m5, t0d
DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
.rshift32:
neg t0d
movd m5, t0d
mova m6, [pd_1 GLOBAL]
pxor m7, m7
pslld m6, m5
psrld m6, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
cglobal x264_dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
cmp t2d, 12
jl x264_dequant_%2x%2_%1
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1, [t0*3]
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
%ifdef PIC
lea r1, [dequant%2_scale GLOBAL]
add r1, t2
%else
lea r1, [dequant%2_scale + t2 GLOBAL]
%endif
movifnidn r0d, r0m
movd m7, t0d
%if %2 == 4
%ifidn %1, mmx
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+8], 8, 24
%else
DEQUANT16_FLAT [r1], 0, 16
%endif
%elifidn %1, mmx
DEQUANT16_FLAT [r1], 0, 8, 64, 72
DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
%else
DEQUANT16_FLAT [r1], 0, 64
DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
DEQUANT16_FLAT [r1+32], 32, 96
%endif
ret
%endmacro ; DEQUANT
%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
DEQUANT mmx, 8, 6, 1
%endif
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
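
;-----------------------------------------------------------------------------
; void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
; Applies a single scale, dequant_mf[i_mf][0], to all 16 DC coefficients, so
; the factor is loaded once and broadcast instead of being reloaded per lane.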
%macro DEQUANT_DC 1
cglobal x264_dequant_4x4dc_%1, 0,3
DEQUANT_START 6, 6
.lshift:
movd m6, [r1]
movd m5, t0d
pslld m6, m5
%if mmsize==16
pshuflw m6, m6, 0
punpcklqdq m6, m6
%else
pshufw m6, m6, 0
%endif
%assign x 0
%rep 16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
pmullw m0, m6
pmullw m1, m6
mova [r0+mmsize*0+x], m0
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
RET
.rshift32:
neg t0d
movd m5, t0d
mova m6, [pw_1 GLOBAL]
mova m7, m6
pslld m6, m5
psrld m6, 1
movd m4, [r1]
%if mmsize==8
punpcklwd m4, m4
%else
pshuflw m4, m4, 0
%endif
punpcklwd m4, m6
%assign x 0
%rep 32/mmsize
mova m0, [r0+x]
mova m1, m0
punpcklwd m0, m7
punpckhwd m1, m7
pmaddwd m0, m4
pmaddwd m1, m4
psrad m0, m5
psrad m1, m5
packssdw m0, m1
mova [r0+x], m0
%assign x x+mmsize
%endrep
RET
%endmacro
INIT_MMX
DEQUANT_DC mmxext
INIT_XMM
DEQUANT_DC sse2
;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
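; Rough per-coefficient sketch (illustrative C; the vector loop below works
; on mmsize coefficients per iteration, from the end of the block down):
;   int v = abs(dct[i]);
;   sum[i] += v;
;   v = v > offset[i] ? v - offset[i] : 0;   // psubusw (saturating)
;   dct[i] = dct[i] < 0 ? -v : v;            // PSIGNW
; The DC coefficient is saved before the loop and written back unchanged.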
%macro DENOISE_DCT 1
cglobal x264_denoise_dct_%1, 4,5
movzx r4d, word [r0] ; backup DC coefficient
pxor m7, m7
.loop:
sub r3, mmsize
mova m2, [r0+r3*2+0*mmsize]
mova m3, [r0+r3*2+1*mmsize]
PABSW m0, m2
PABSW m1, m3
mova m4, m0
mova m5, m1
psubusw m0, [r2+r3*2+0*mmsize]
psubusw m1, [r2+r3*2+1*mmsize]
PSIGNW m0, m2
PSIGNW m1, m3
mova [r0+r3*2+0*mmsize], m0
mova [r0+r3*2+1*mmsize], m1
mova m2, m4
mova m3, m5
punpcklwd m4, m7
punpckhwd m2, m7
punpcklwd m5, m7
punpckhwd m3, m7
paddd m4, [r1+r3*4+0*mmsize]
paddd m2, [r1+r3*4+1*mmsize]
paddd m5, [r1+r3*4+2*mmsize]
paddd m3, [r1+r3*4+3*mmsize]
mova [r1+r3*4+0*mmsize], m4
mova [r1+r3*4+1*mmsize], m2
mova [r1+r3*4+2*mmsize], m5
mova [r1+r3*4+3*mmsize], m3
jg .loop
mov [r0], r4w ; restore DC coefficient
RET
%endmacro
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%ifndef ARCH_X86_64
INIT_MMX
DENOISE_DCT mmx
%endif
INIT_XMM
DENOISE_DCT sse2
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3
;-----------------------------------------------------------------------------
; int x264_decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
%macro DECIMATE_MASK_SSE2 6
%ifidn %5, ssse3
pabsw xmm0, [%3+ 0]
pabsw xmm1, [%3+16]
%else
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+16]
ABS2_MMX xmm0, xmm1, xmm3, xmm4
%endif
packsswb xmm0, xmm1
pxor xmm2, xmm2
pcmpeqb xmm2, xmm0
pcmpgtb xmm0, %4
pmovmskb %1, xmm2
pmovmskb %2, xmm0
%endmacro
%macro DECIMATE_MASK_MMX 6
movq mm0, [%3+ 0]
movq mm1, [%3+ 8]
movq mm2, [%3+16]
movq mm3, [%3+24]
ABS2_MMX mm0, mm1, mm4, mm5
ABS2_MMX mm2, mm3, mm4, mm5
packsswb mm0, mm1
packsswb mm2, mm3
pxor mm4, mm4
pxor mm5, mm5
pcmpeqb mm4, mm0
pcmpeqb mm5, mm2
pcmpgtb mm0, %4
pcmpgtb mm2, %4
pmovmskb %6, mm4
pmovmskb %1, mm5
shl %1, 8
or %1, %6
pmovmskb %6, mm0
pmovmskb %2, mm2
shl %2, 8
or %2, %6
%endmacro
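
; Both DECIMATE_MASK variants produce two bitmasks, one bit per coefficient
; of the 16-coefficient block at %3:
;   %1  bits set where the coefficient is zero
;   %2  bits set where |coefficient| > 1 (compared against pb_1 after packing)
;   %3  source pointer
;   %4  the pb_1 constant (memory operand or preloaded register)
;   %5  cpu name, used to select pabsw on ssse3
;   %6  scratch GPR (mmx variant only)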
cextern x264_decimate_table4
cextern x264_decimate_table8
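
; The score functions below are roughly equivalent to this scalar rule
; (illustrative C; the vector code checks the |coef|>1 condition for the
; whole block up front and scores runs of zeros eight coefficients at a
; time through decimate_mask_table4):
;   int score = 0, idx = n-1;            // n = 15 or 16
;   while( idx >= 0 && !dct[idx] )
;       idx--;
;   while( idx >= 0 )
;   {
;       if( abs(dct[idx--]) > 1 )
;           return 9;
;       int run = 0;
;       while( idx >= 0 && !dct[idx] )
;           idx--, run++;
;       score += x264_decimate_table4[run];
;   }
;   return score;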
%macro DECIMATE4x4 2
; A LUT is faster than bsf on AMD processors, and no slower on Intel.
; This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
lea r10, [x264_decimate_table4 GLOBAL]
lea r11, [decimate_mask_table4 GLOBAL]
%define table r10
%define mask_table r11
%else
%define table x264_decimate_table4
%define mask_table decimate_mask_table4
%endif
DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
xor edx, 0xffff
je .ret
test eax, eax
jne .ret9
%if %1==15
shr edx, 1
%endif
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
je .ret
bsr ecx, ecx
shr edx, 1
shr edx, cl
bsf ecx, edx
shr edx, 1
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
.ret:
REP_RET
.ret9:
mov eax, 9
RET
%endmacro
%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, sse2
DECIMATE4x4 16, ssse3
%macro DECIMATE8x8 1
%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
lea r10, [x264_decimate_table8 GLOBAL]
%define table r10
%else
%define table x264_decimate_table8
%endif
mova m7, [pb_1 GLOBAL]
DECIMATE_MASK r1d, eax, r0, m7, %1, null
test eax, eax
jne .ret9
DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
shl r2d, 16
or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
shl r2, 32
or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
shl r2, 48
or r1, r2
xor r1, -1
je .ret
or eax, r3d
jne .ret9
.loop:
bsf rcx, r1
shr r1, cl
add al, byte [table + rcx]
shr r1, 1
jne .loop
.ret:
REP_RET
.ret9:
mov eax, 9
RET
%else ; ARCH
%ifidn %1, mmxext
cglobal x264_decimate_score64_%1, 1,6
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
mova m7, [pb_1 GLOBAL]
DECIMATE_MASK r3, r2, r0, m7, %1, r5
test r2, r2
jne .ret9
DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
shl r4, 16
or r3, r4
DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
or r2, r1
DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
shl r1, 16
or r4, r1
xor r3, -1
je .tryret
xor r4, -1
.cont:
or r0, r2
jne .ret9 ; if we fall through, r0 == 0, so it can serve as the score accumulator without being cleared
.loop:
bsf ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
add r0b, byte [x264_decimate_table8 + ecx]
shrd r3, r4, 1
shr r4, 1
cmp r0, 6 ;score64's threshold is never higher than 6
jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
test r3, r3
jne .loop
test r4, r4
jne .loop
.ret:
REP_RET
.tryret:
xor r4, -1
jne .cont
REP_RET
.ret9:
mov eax, 9
RET
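
; .largerun: the run of zeros covers the rest of the low dword (>= 32 zeros),
; which scores nothing in x264_decimate_table8, so just consume the next
; nonzero coefficient and resume scanning in the upper half of the mask.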
.largerun:
mov r3, r4
xor r4, r4
bsf ecx, r3
shr r3, cl
shr r3, 1
jne .loop
REP_RET
%endif ; ARCH
%endmacro
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE8x8 mmxext
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
pxor xmm2, xmm2
packsswb xmm0, [%2+16]
pcmpeqb xmm0, xmm2
pmovmskb %1, xmm0
%endmacro
%macro LAST_MASK_MMX 3
movq mm0, [%2+ 0]
movq mm1, [%2+16]
pxor mm2, mm2
packsswb mm0, [%2+ 8]
packsswb mm1, [%2+24]
pcmpeqb mm0, mm2
pcmpeqb mm1, mm2
pmovmskb %1, mm0
pmovmskb %3, mm1
shl %3, 8
or %1, %3
%endmacro
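
;-----------------------------------------------------------------------------
; int x264_coeff_last( int16_t *dct )
;-----------------------------------------------------------------------------
; Returns the index of the last (highest-numbered) nonzero coefficient. The
; LAST_MASK helpers above build a bitmask of zero coefficients; inverting it
; and taking bsr yields the index. coeff_last4 instead loads all four words
; as a single integer and divides the bsr bit position by 16.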
%ifdef ARCH_X86_64
cglobal x264_coeff_last4_mmxext, 1,1
bsr rax, [r0]
shr eax, 4
RET
%else
cglobal x264_coeff_last4_mmxext, 0,3
mov edx, r0m
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
bsr eax, eax
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
bsr eax, r1d
dec eax
RET
cglobal x264_coeff_last16_%1, 1,3
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
bsr eax, r1d
RET
%ifndef ARCH_X86_64
%ifidn %1, mmxext
cglobal x264_coeff_last64_%1, 1,5
%else
cglobal x264_coeff_last64_%1, 1,4
%endif
LAST_MASK r1d, r0, r4d
LAST_MASK r2d, r0+32, r4d
shl r2d, 16
or r1d, r2d
LAST_MASK r2d, r0+64, r4d
LAST_MASK r3d, r0+96, r4d
shl r3d, 16
or r2d, r3d
not r1d
xor r2d, -1
jne .secondhalf
bsr eax, r1d
RET
.secondhalf:
bsr eax, r2d
add eax, 32
RET
%endif
%endmacro
%ifdef ARCH_X86_64
cglobal x264_coeff_last64_sse2, 1,4
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
LAST_MASK_SSE2 r3d, r0+64
LAST_MASK_SSE2 r0d, r0+96
shl r2d, 16
shl r0d, 16
or r1d, r2d
or r3d, r0d
shl r3, 32
or r1, r3
not r1
bsr rax, r1
RET
%endif
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2