src/parsec/disk-image/parsec/parsec-benchmark/pkgs/apps/x264/src/common/x86/cabac-a.asm - public/gem5-resources - Git at Google

 ;*****************************************************************************
 ;* cabac-a.asm: h264 encoder library
 ;*****************************************************************************
 ;* Copyright (C) 2008 x264 project
 ;*
 ;* Author: Loren Merritt <lorenm@u.washington.edu>
 ;*         Jason Garrett-Glaser <darkshikari@gmail.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
 ;* the Free Software Foundation; either version 2 of the License, or
 ;* (at your option) any later version.
 ;*
 ;* This program is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ;* GNU General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU General Public License
 ;* along with this program; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 ;*****************************************************************************

 %include "x86inc.asm"

 SECTION_RODATA

 SECTION .text

 cextern x264_cabac_range_lps
 cextern x264_cabac_transition
 cextern x264_cabac_renorm_shift

 %macro DEF_TMP 16
     %rep 8
         %define t%1d r%9d
         %define t%1b r%9b
         %define t%1  r%9
         %rotate 1
     %endrep
 %endmacro

 ; t3 must be ecx, since it's used for shift.
 %ifdef ARCH_X86_64
     DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
     %define pointer resq
 %else
     DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
     %define pointer resd
 %endif

 struc cb
     .low: resd 1
     .range: resd 1
     .queue: resd 1
     .bytes_outstanding: resd 1
     .start: pointer 1
     .p: pointer 1
     .end: pointer 1
     align 16, resb 1
     .bits_encoded: resd 1
     .state: resb 460
 endstruc

 %macro LOAD_GLOBAL 4
 %ifdef PIC
     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
     lea   r11, [%2 GLOBAL]
     %ifnidn %3, 0
     add   r11, %3
     %endif
     movzx %1, byte [r11+%4]
 %else
     movzx %1, byte [%2+%3+%4]
 %endif
 %endmacro

 cglobal x264_cabac_encode_decision_asm, 0,7
     movifnidn t0d, r0m
     movifnidn t1d, r1m
     mov   t5d, [r0+cb.range]
     movzx t3d, byte [r0+cb.state+t1]
     mov   t4d, t5d
     shr   t5d, 6
     and   t5d, 3
     LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
     sub   t4d, t5d
     mov   t6d, t3d
     shr   t6d, 6
     movifnidn t2d, r2m
     cmp   t6d, t2d
     mov   t6d, [r0+cb.low]
     lea   t7,  [t6+t4]
     cmovne t4d, t5d
     cmovne t6d, t7d
     LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
     movifnidn t1d, r1m
     mov   [r0+cb.state+t1], t3b
 .renorm:
     mov   t3d, t4d
     shr   t3d, 3
     LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
     shl   t4d, t3b
     shl   t6d, t3b
     add   t3d, [r0+cb.queue]
     mov   [r0+cb.range], t4d
     mov   [r0+cb.low], t6d
     mov   [r0+cb.queue], t3d
     cmp   t3d, 8
     jge .putbyte
     REP_RET
 .putbyte:
     ; alive: t0=cb t3=queue t6=low
     add   t3d, 2
     mov   t1d, 1
     mov   t2d, t6d
     shl   t1d, t3b
     shr   t2d, t3b ; out
     dec   t1d
     sub   t3d, 10
     and   t6d, t1d
     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
     mov   [r0+cb.queue], t3d
     mov   [r0+cb.low], t6d
     mov   t1d, t2d
     mov   t4,  [r0+cb.p]
     je .postpone
     mov   t5d, [r0+cb.bytes_outstanding]
     shr   t1d, 8 ; carry
     add   [t4-1], t1b
     test  t5d, t5d
     jz .no_outstanding
     dec   t1d
 .loop_outstanding:
     mov   [t4], t1b
     inc   t4
     dec   t5d
     jg .loop_outstanding
 .no_outstanding:
     mov   [t4], t2b
     inc   t4
     mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
     mov   [r0+cb.p], t4
     RET
 .postpone:
     inc   dword [r0+cb.bytes_outstanding]
     RET
	;*****************************************************************************
	;* cabac-a.asm: h264 encoder library
	;*****************************************************************************
	;* Copyright (C) 2008 x264 project
	;*
	;* Author: Loren Merritt <lorenm@u.washington.edu>
	;* Jason Garrett-Glaser <darkshikari@gmail.com>
	;*
	;* This program is free software; you can redistribute it and/or modify
	;* it under the terms of the GNU General Public License as published by
	;* the Free Software Foundation; either version 2 of the License, or
	;* (at your option) any later version.
	;*
	;* This program is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	;* GNU General Public License for more details.
	;*
	;* You should have received a copy of the GNU General Public License
	;* along with this program; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
	;*****************************************************************************

	%include "x86inc.asm"

	SECTION_RODATA

	SECTION .text

	cextern x264_cabac_range_lps
	cextern x264_cabac_transition
	cextern x264_cabac_renorm_shift

	%macro DEF_TMP 16
	%rep 8
	%define t%1d r%9d
	%define t%1b r%9b
	%define t%1 r%9
	%rotate 1
	%endrep
	%endmacro

	; t3 must be ecx, since it's used for shift.
	%ifdef ARCH_X86_64
	DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
	%define pointer resq
	%else
	DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
	%define pointer resd
	%endif

	struc cb
	.low: resd 1
	.range: resd 1
	.queue: resd 1
	.bytes_outstanding: resd 1
	.start: pointer 1
	.p: pointer 1
	.end: pointer 1
	align 16, resb 1
	.bits_encoded: resd 1
	.state: resb 460
	endstruc

	%macro LOAD_GLOBAL 4
	%ifdef PIC
	; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
	lea r11, [%2 GLOBAL]
	%ifnidn %3, 0
	add r11, %3
	%endif
	movzx %1, byte [r11+%4]
	%else
	movzx %1, byte [%2+%3+%4]
	%endif
	%endmacro

	cglobal x264_cabac_encode_decision_asm, 0,7
	movifnidn t0d, r0m
	movifnidn t1d, r1m
	mov t5d, [r0+cb.range]
	movzx t3d, byte [r0+cb.state+t1]
	mov t4d, t5d
	shr t5d, 6
	and t5d, 3
	LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
	sub t4d, t5d
	mov t6d, t3d
	shr t6d, 6
	movifnidn t2d, r2m
	cmp t6d, t2d
	mov t6d, [r0+cb.low]
	lea t7, [t6+t4]
	cmovne t4d, t5d
	cmovne t6d, t7d
	LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
	movifnidn t1d, r1m
	mov [r0+cb.state+t1], t3b
	.renorm:
	mov t3d, t4d
	shr t3d, 3
	LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
	shl t4d, t3b
	shl t6d, t3b
	add t3d, [r0+cb.queue]
	mov [r0+cb.range], t4d
	mov [r0+cb.low], t6d
	mov [r0+cb.queue], t3d
	cmp t3d, 8
	jge .putbyte
	REP_RET
	.putbyte:
	; alive: t0=cb t3=queue t6=low
	add t3d, 2
	mov t1d, 1
	mov t2d, t6d
	shl t1d, t3b
	shr t2d, t3b ; out
	dec t1d
	sub t3d, 10
	and t6d, t1d
	cmp t2b, 0xff ; FIXME is a 32bit op faster?
	mov [r0+cb.queue], t3d
	mov [r0+cb.low], t6d
	mov t1d, t2d
	mov t4, [r0+cb.p]
	je .postpone
	mov t5d, [r0+cb.bytes_outstanding]
	shr t1d, 8 ; carry
	add [t4-1], t1b
	test t5d, t5d
	jz .no_outstanding
	dec t1d
	.loop_outstanding:
	mov [t4], t1b
	inc t4
	dec t5d
	jg .loop_outstanding
	.no_outstanding:
	mov [t4], t2b
	inc t4
	mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
	mov [r0+cb.p], t4
	RET
	.postpone:
	inc dword [r0+cb.bytes_outstanding]
	RET