;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
SECTION_RODATA
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
SECTION .text
%macro LOAD_ADD 4
movh %4, %3
movh %1, %2
punpcklbw %4, m0
punpcklbw %1, m0
paddw %1, %4
%endmacro
%macro LOAD_ADD_2 6
mova %5, %3
mova %1, %4
mova %6, %5
mova %2, %1
punpcklbw %5, m0
punpcklbw %1, m0
punpckhbw %6, m0
punpckhbw %2, m0
paddw %1, %5
paddw %2, %6
%endmacro
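; LOAD_ADD sums the low halves of two rows of 8-bit pixels as 16-bit words,
; zero-extending against m0 (which callers keep zeroed); LOAD_ADD_2 does the
; same for both the low and high halves, producing two word vectors.  Each
; call builds one symmetric tap pair (a, b or c) of the 6-tap
; [1,-5,20,20,-5,1] half-pel filter used below.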
%macro FILT_V2 0
psubw m1, m2 ; a-b
psubw m4, m5
psubw m2, m3 ; b-c
psubw m5, m6
psllw m2, 2
psllw m5, 2
psubw m1, m2 ; a-5*b+4*c
psubw m4, m5
psllw m3, 4
psllw m6, 4
paddw m1, m3 ; a-5*b+20*c
paddw m4, m6
%endmacro
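; FILT_V2 turns the tap pairs into a-5*b+20*c without multiplies:
; (a-b) - 4*(b-c) = a-5*b+4*c, then +16*c (psllw 4) completes the kernel.
; The sequence runs on two register sets (m1..m3 and m4..m6) so a full
; register width of output words is produced per invocation.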
%macro FILT_H 3
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
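; FILT_H applies the same kernel to 16-bit inputs, dividing as it goes so the
; intermediates stay in range:
;
;   ((a-b)>>2 - b + c)>>2 + c  =  (a - 5*b + 20*c) >> 4
;
; The equality is exact despite the truncating shifts: a-5*b+20*c differs from
; a-b by a multiple of 4, so the bits dropped by the first >>2 are exactly the
; bits a single >>4 would drop anyway.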
%macro FILT_H2 6
psubw %1, %2
psubw %4, %5
psraw %1, 2
psraw %4, 2
psubw %1, %2
psubw %4, %5
paddw %1, %3
paddw %4, %6
psraw %1, 2
psraw %4, 2
paddw %1, %3
paddw %4, %6
%endmacro
%macro FILT_PACK 3
paddw %1, m7
paddw %2, m7
psraw %1, %3
psraw %2, %3
packuswb %1, %2
%endmacro
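; FILT_PACK finishes a pair of filtered word vectors: add the rounding bias
; held in m7 (pw_32 or pw_1, depending on the caller), shift right by %3, and
; packuswb both halves into one register of clamped 8-bit pixels.  Combined
; with FILT_H's >>4 this gives the usual (x+16)>>5 for the h pels and
; (x+512)>>10 for the centre pels.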
%macro PALIGNR_MMX 4
%ifnidn %4, %2
mova %4, %2
%endif
%if mmsize == 8
psllq %1, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %1, 16-%3
psrldq %4, %3
%endif
por %1, %4
%endmacro
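; PALIGNR_MMX emulates SSSE3's palignr with plain shifts: a copy of the second
; operand is shifted right by %3 bytes, the first is shifted left by
; mmsize-%3 bytes, and the two are OR'd.  PALIGNR_SSSE3 maps onto the native
; instruction, so the code below can be assembled for either instruction set
; just by redefining PALIGNR.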
%macro PALIGNR_SSSE3 4
palignr %1, %2, %3
%endmacro
INIT_MMX
%macro HPEL_V 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_v_%1, 5,6
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
add r0, r4
lea r2, [r2+r4*2]
neg r4
pxor m0, m0
.loop:
LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2
mova m7, [pw_16 GLOBAL]
mova [r2+r4*2], m1
mova [r2+r4*2+mmsize], m4
paddw m1, m7
paddw m4, m7
psraw m1, 5
psraw m4, 5
packuswb m1, m4
mova [r0+r4], m1
add r1, mmsize
add r5, mmsize
add r4, mmsize
jl .loop
REP_RET
%endmacro
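; What one output pixel of the vertical pass computes, as an illustrative C
; sketch (clip255() is shorthand for clamping to [0,255], not a real helper):
;
;   int a = src[x-2*stride] + src[x+3*stride];
;   int b = src[x-1*stride] + src[x+2*stride];
;   int c = src[x         ] + src[x+1*stride];
;   int v = a - 5*b + 20*c;          /* 6-tap [1,-5,20,20,-5,1] */
;   buf[x] = v;                      /* 16-bit input for the centre filter */
;   dst[x] = clip255( (v + 16) >> 5 );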
HPEL_V mmxext
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_mmxext, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
movq m7, [pw_32 GLOBAL]
.loop:
movq m1, [src-4]
movq m2, [src-2]
movq m3, [src ]
movq m4, [src+4]
movq m5, [src+6]
paddw m3, [src+2] ; c0
paddw m2, m4 ; b0
paddw m1, m5 ; a0
movq m6, [src+8]
paddw m4, [src+14] ; a1
paddw m5, [src+12] ; b1
paddw m6, [src+10] ; c1
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 6
movntq [r0+r2], m1
add r2, 8
jl .loop
REP_RET
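; The centre filter rereads the 16-bit a-5*b+20*c values that the vertical
; pass wrote to buf and applies the same 6-tap kernel horizontally, so each
; output pixel is filtered in both dimensions.  FILT_H2 yields the result >>4
; and FILT_PACK adds 32 and shifts by 6, i.e. (value+512)>>10 overall.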
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_mmxext, 3,3
add r0, r2
add r1, r2
neg r2
%define src r1+r2
pxor m0, m0
.loop:
movd m1, [src-2]
movd m2, [src-1]
movd m3, [src ]
movd m6, [src+1]
movd m4, [src+2]
movd m5, [src+3]
punpcklbw m1, m0
punpcklbw m2, m0
punpcklbw m3, m0
punpcklbw m6, m0
punpcklbw m4, m0
punpcklbw m5, m0
paddw m3, m6 ; c0
paddw m2, m4 ; b0
paddw m1, m5 ; a0
movd m7, [src+7]
movd m6, [src+6]
punpcklbw m7, m0
punpcklbw m6, m0
paddw m4, m7 ; c1
paddw m5, m6 ; b1
movd m7, [src+5]
movd m6, [src+4]
punpcklbw m7, m0
punpcklbw m6, m0
paddw m6, m7 ; a1
movq m7, [pw_1 GLOBAL]
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntq [r0+r2], m1
add r2, 8
jl .loop
REP_RET
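; The h filter works directly on 8-bit source pixels; per output pixel,
; roughly (illustrative sketch):
;
;   int a = src[x-2] + src[x+3];
;   int b = src[x-1] + src[x+2];
;   int c = src[x  ] + src[x+1];
;   dst[x] = clip255( (a - 5*b + 20*c + 16) >> 5 );
;
; This mmxext version emits 8 pixels per iteration; the SSE2/SSSE3 versions
; below emit 16.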
INIT_XMM
%macro HPEL_C 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_%1, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
%ifidn %1, ssse3
mova m7, [pw_32 GLOBAL]
%define tpw_32 m7
%elifdef ARCH_X86_64
mova m8, [pw_32 GLOBAL]
%define tpw_32 m8
%else
%define tpw_32 [pw_32 GLOBAL]
%endif
.loop:
%ifidn %1,sse2_misalign
movu m0, [src-4]
movu m1, [src-2]
mova m2, [src]
paddw m0, [src+6]
paddw m1, [src+4]
paddw m2, [src+2]
%else
mova m6, [src-16]
mova m2, [src]
mova m3, [src+16]
mova m0, m2
mova m1, m2
mova m4, m3
mova m5, m3
PALIGNR m3, m2, 2, m7
PALIGNR m4, m2, 4, m7
PALIGNR m5, m2, 6, m7
PALIGNR m0, m6, 12, m7
PALIGNR m1, m6, 14, m7
paddw m2, m3
paddw m1, m4
paddw m0, m5
%endif
FILT_H m0, m1, m2
paddw m0, tpw_32
psraw m0, 6
packuswb m0, m0
movq [r0+r2], m0
add r2, 8
jl .loop
REP_RET
%endmacro
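; HPEL_C generates three flavours of the centre filter: plain sse2 rebuilds
; the unaligned neighbours from aligned loads via PALIGNR, sse2_misalign
; simply issues unaligned loads (worthwhile where those are cheap), and ssse3
; uses the native palignr.  The pw_32 rounding constant (tpw_32) lives in a
; register when one is free and is read from memory otherwise.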
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_sse2, 3,3
add r0, r2
add r1, r2
neg r2
%define src r1+r2
pxor m0, m0
.loop:
movh m1, [src-2]
movh m2, [src-1]
movh m3, [src ]
movh m4, [src+1]
movh m5, [src+2]
movh m6, [src+3]
punpcklbw m1, m0
punpcklbw m2, m0
punpcklbw m3, m0
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
paddw m3, m4 ; c0
paddw m2, m5 ; b0
paddw m1, m6 ; a0
movh m4, [src+6]
movh m5, [src+7]
movh m6, [src+10]
movh m7, [src+11]
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
punpcklbw m7, m0
paddw m5, m6 ; b1
paddw m4, m7 ; a1
movh m6, [src+8]
movh m7, [src+9]
punpcklbw m6, m0
punpcklbw m7, m0
paddw m6, m7 ; c1
mova m7, [pw_1 GLOBAL] ; FIXME xmm8
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntdq [r0+r2], m1
add r2, 16
jl .loop
REP_RET
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_ssse3, 3,3
add r0, r2
add r1, r2
neg r2
%define src r1+r2
pxor m0, m0
movh m1, [src-8]
punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
movh m2, [src]
punpcklbw m2, m0
mova m7, [pw_1 GLOBAL]
.loop:
movh m3, [src+8]
punpcklbw m3, m0
mova m4, m2
palignr m2, m1, 14
mova m5, m3
palignr m3, m4, 4
paddw m3, m2
mova m2, m4
palignr m4, m1, 12
mova m1, m5
palignr m5, m2, 6
paddw m5, m4
mova m4, m1
palignr m1, m2, 2
paddw m1, m2
FILT_H m5, m3, m1
movh m1, [src+16]
punpcklbw m1, m0
mova m3, m4
palignr m4, m2, 14
mova m6, m1
palignr m1, m3, 4
paddw m1, m4
mova m4, m3
palignr m3, m2, 12
mova m2, m6
palignr m6, m4, 6
paddw m6, m3
mova m3, m2
palignr m2, m4, 2
paddw m2, m4
FILT_H m6, m1, m2
FILT_PACK m5, m6, 1
movdqa [r0+r2], m5
add r2, 16
mova m2, m3
mova m1, m4
jl .loop
REP_RET
%endif
%define PALIGNR PALIGNR_MMX
%ifndef ARCH_X86_64
HPEL_C sse2
%endif
HPEL_V sse2
HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
%ifdef ARCH_X86_64
%macro DO_FILT_V 5
LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
FILT_V2
mova %1, m1
mova %2, m4
paddw m1, m15
paddw m4, m15
add r3, 16
add r1, 16
psraw m1, 5
psraw m4, 5
packuswb m1, m4
movntps [r11+r4+%5], m1
%endmacro
%macro DO_FILT_H 4
mova m1, %2
PALIGNR m1, %1, 12, m4
mova m2, %2
PALIGNR m2, %1, 14, m4
mova %1, %3
PALIGNR %3, %2, 6, m4
mova m3, %1
PALIGNR m3, %2, 4, m4
mova m4, %1
paddw %3, m1
PALIGNR m4, %2, 2, m1
paddw m3, m2
paddw m4, %2
FILT_H %3, m3, m4
paddw %3, m15
psraw %3, %4
%endmacro
%macro DO_FILT_CC 4
DO_FILT_H %1, %2, %3, 6
DO_FILT_H %2, %1, %4, 6
packuswb %3, %4
movntps [r5+r4], %3
%endmacro
%macro DO_FILT_HH 4
DO_FILT_H %1, %2, %3, 1
DO_FILT_H %2, %1, %4, 1
packuswb %3, %4
movntps [r0+r4], %3
%endmacro
%macro DO_FILT_H2 6
DO_FILT_H %1, %2, %3, 6
psrlw m15, 5
DO_FILT_H %4, %5, %6, 1
packuswb %6, %3
%endmacro
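; The DO_FILT_* macros are building blocks for the combined x86_64-only hpel
; filter below: DO_FILT_V runs the vertical 6-tap and writes dstv, keeping the
; 16-bit intermediates in registers rather than in a buf array; DO_FILT_CC and
; DO_FILT_HH then run the horizontal kernel over those intermediates (centre
; plane) and over the source pixels (h plane).  m15 holds the rounding
; constant and is rescaled in place (pw_16 -> pw_32 -> pw_1 -> pw_16) with
; cheap shifts between uses.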
%macro HPEL 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_%1, 7,7
mov r10, r3
sub r5, 16
mov r11, r1
and r10, 15
sub r3, r10
add r0, r5
add r11, r5
add r10, r5
add r5, r2
mov r2, r4
neg r10
lea r1, [r3+r2]
sub r3, r2
sub r3, r2
mov r4, r10
pxor m0, m0
pcmpeqw m15, m15
psrlw m15, 15 ; pw_1
psllw m15, 4
;ALIGN 16
.loopy:
; first filter_v
; prefetching does not help here! lots of variants tested, all slower
DO_FILT_V m8, m7, m13, m12, 0
;ALIGN 16
.loopx:
DO_FILT_V m6, m5, m11, m10, 16
.lastx:
paddw m15, m15
DO_FILT_CC m9, m8, m7, m6
movdqa m7, m12 ; not really necessary, but seems free and
movdqa m6, m11 ; gives far shorter code
psrlw m15, 5
DO_FILT_HH m14, m13, m7, m6
psllw m15, 4 ; pw_16
movdqa m7, m5
movdqa m12, m10
add r4, 16
jl .loopx
cmp r4, 16
jl .lastx
; setup regs for next y
sub r4, r10
sub r4, r2
sub r1, r4
sub r3, r4
add r0, r2
add r11, r2
add r5, r2
mov r4, r10
sub r6d, 1
jg .loopy
sfence
RET
%endmacro
%define PALIGNR PALIGNR_MMX
HPEL sse2
%define PALIGNR PALIGNR_SSSE3
HPEL ssse3
%endif
cglobal x264_sfence
sfence
ret
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
; uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext, 6,7
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
add r4d, 3
and r4d, ~3
mov r6d, r4d
and r6d, ~15
sub r1, r6
sub r3, r6
.loopy:
mov r6d, r4d
sub r6d, 64
jl .endx
.loopx:
prefetchnta [r2+256]
movq mm0, [r2 ]
movq mm1, [r2+ 8]
movq mm2, [r2+16]
movq mm3, [r2+24]
movq mm4, [r2+32]
movq mm5, [r2+40]
movq mm6, [r2+48]
movq mm7, [r2+56]
movntq [r0 ], mm0
movntq [r0+ 8], mm1
movntq [r0+16], mm2
movntq [r0+24], mm3
movntq [r0+32], mm4
movntq [r0+40], mm5
movntq [r0+48], mm6
movntq [r0+56], mm7
add r2, 64
add r0, 64
sub r6d, 64
jge .loopx
.endx:
prefetchnta [r2+256]
add r6d, 48
jl .end16
.loop16:
movq mm0, [r2 ]
movq mm1, [r2+8]
movntq [r0 ], mm0
movntq [r0+8], mm1
add r2, 16
add r0, 16
sub r6d, 16
jge .loop16
.end16:
add r6d, 12
jl .end4
.loop4:
movd mm2, [r2+r6]
movd [r0+r6], mm2
sub r6d, 4
jge .loop4
.end4:
add r2, r3
add r0, r1
dec r5d
jg .loopy
sfence
emms
RET
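; plane_copy streams each row with non-temporal stores (movntq) so the copy
; does not pollute the cache, prefetching the source 256 bytes ahead; rows are
; handled in 64-, 16- and 4-byte chunks.  The width is rounded up to a
; multiple of 4, so the destination is assumed to tolerate a few bytes of
; overwrite past w.  The trailing sfence orders the non-temporal stores before
; returning.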
; These functions are not general-use: not only do the SSE ones require aligned input,
; they will also fail if given a non-mod16 size or a size less than 64.
; memzero SSE will additionally fail for a non-mod128 size.
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_mmx, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
jmp .copyend
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq mm2, [r1 + r2 + 16]
movq mm3, [r1 + r2 + 24]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
.copyend:
REP_RET
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_sse2, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
movdqa xmm0, [r1 + r2]
movdqa [r0 + r2], xmm0
jmp .copyend
.copy32:
test r2d, 32
jz .copy64
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
jmp .copyend
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
movdqa xmm2, [r1 + r2 + 32]
movdqa [r0 + r2 + 32], xmm2
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 48], xmm3
jg .copy64
.copyend:
REP_RET
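; Both aligned memcpy variants copy backwards from the end of the buffer,
; peeling off the n%32 (MMX) or n%64 (SSE2) remainder first and then looping
; over full 32- or 64-byte blocks, leaving the loop with a single branch.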
;-----------------------------------------------------------------------------
; void *x264_memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
cglobal x264_memzero_aligned_%1, 2,2
pxor m0, m0
.loop:
sub r1d, mmsize*8
%assign i 0
%rep 8
mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
jg .loop
REP_RET
%endmacro
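; memzero_aligned clears 8*mmsize bytes per iteration (64 for MMX, 128 for
; SSE2) with eight unrolled stores, counting n down to zero; hence the
; mod-128 requirement noted above for the SSE2 version.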
INIT_MMX
MEMZERO mmx
INIT_XMM
MEMZERO sse2
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]
pavgb %3, %4
pavgb %4, [r0+r5*2+%7]
PALIGNR %1, %3, 1, m6
PALIGNR %2, %4, 1, m6
pavgb %1, %3
pavgb %2, %4
mova %5, %1
mova %6, %2
pand %1, m7
pand %2, m7
psrlw %5, 8
psrlw %6, 8
%endmacro
%macro FILT16x2 4
mova m3, [r0+%4+mmsize]
mova m2, [r0+%4]
pavgb m3, [r0+%4+r5+mmsize]
pavgb m2, [r0+%4+r5]
PALIGNR %1, m3, 1, m6
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
mova m5, m3
mova m4, %1
pand m3, m7
pand %1, m7
psrlw m5, 8
psrlw m4, 8
packuswb m3, %1
packuswb m5, m4
mova [%2], m3
mova [%3], m5
mova %1, m2
%endmacro
%macro FILT8x2U 3
mova m3, [r0+%3+8]
mova m2, [r0+%3]
pavgb m3, [r0+%3+r5+8]
pavgb m2, [r0+%3+r5]
mova m1, [r0+%3+9]
mova m0, [r0+%3+1]
pavgb m1, [r0+%3+r5+9]
pavgb m0, [r0+%3+r5+1]
pavgb m1, m3
pavgb m0, m2
mova m3, m1
mova m2, m0
pand m1, m7
pand m0, m7
psrlw m3, 8
psrlw m2, 8
packuswb m0, m1
packuswb m2, m3
mova [%1], m0
mova [%2], m2
%endmacro
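; The FILT* macros build one row of the four half-resolution planes: pavgb
; first averages vertically adjacent source rows, then, after shifting the
; data along by one byte (PALIGNR, or offset loads in the mmxext FILT8x2U),
; averages horizontally adjacent results.  The pand/psrlw pairs with the
; 0x00FF mask in m7 split the interleaved averages into even and odd columns,
; which become the differently-phased output planes.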
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 1 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
dec r6d
imul r6d, r5d
add r6d, r7m
lea r0, [r0+r6*2]
; dst += (height-1)*stride + width
mov r6d, r8m
dec r6d
imul r6d, r6m
add r6d, r7m
add r1, r6
add r2, r6
add r3, r6
add r4, r6
; gap = stride - width
mov r6d, r6m
sub r6d, r7m
PUSH r6
%define dst_gap [rsp+gprsize]
mov r6d, r5d
sub r6d, r7m
shl r6d, 1
PUSH r6
%define src_gap [rsp]
%if mmsize == 16
; adjust for the odd end case
mov r6d, r7m
and r6d, 8
sub r1, r6
sub r2, r6
sub r3, r6
sub r4, r6
add dst_gap, r6d
%endif ; mmsize
pcmpeqb m7, m7
psrlw m7, 8
.vloop:
mov r6d, r7m
%ifnidn %1, mmxext
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
pavgb m1, [r0+r5*2]
%endif
%if mmsize == 16
test r6d, 8
jz .hloop
sub r0, 16
FILT8x4 m0, m1, m2, m3, m4, m5, 0
packuswb m0, m4
packuswb m1, m5
movq [r1], m0
movhps [r2], m0
movq [r3], m1
movhps [r4], m1
mova m0, m2
mova m1, m3
sub r6d, 8
%endif ; mmsize
.hloop:
sub r0, mmsize*2
sub r1, mmsize
sub r2, mmsize
sub r3, mmsize
sub r4, mmsize
%ifdef m8
FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
mova m8, m0
mova m9, m1
FILT8x4 m2, m3, m0, m1, m4, m5, 0
packuswb m2, m8
packuswb m3, m9
packuswb m4, m10
packuswb m5, m11
mova [r1], m2
mova [r2], m4
mova [r3], m3
mova [r4], m5
%elifidn %1, mmxext
FILT8x2U r1, r2, 0
FILT8x2U r3, r4, r5
%else
FILT16x2 m0, r1, r2, 0
FILT16x2 m1, r3, r4, r5
%endif
sub r6d, mmsize
jg .hloop
.skip:
mov r6, dst_gap
sub r0, src_gap
sub r1, r6
sub r2, r6
sub r3, r6
sub r4, r6
dec dword r8m
jg .vloop
ADD rsp, 2*gprsize
emms
RET
%endmacro ; FRAME_INIT_LOWRES
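; frame_init_lowres decimates the source by 2 in each direction into four
; planes offset from each other by half a lowres pixel (dst0/dsth/dstv/dstc),
; walking the image from the bottom-right so each row ends with simple
; pointer adjustments by the stride-width gaps pushed on the stack.  Roughly,
; per lowres pixel (illustrative sketch; avg() rounds up like pavgb):
;
;   dst0[x] = avg( avg(src[2x,  y], src[2x,  y+1]),
;                  avg(src[2x+1,y], src[2x+1,y+1]) );
;
; with dsth, dstv and dstc taking the same average shifted by one source
; pixel horizontally, vertically, or both.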
INIT_MMX
%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
%ifndef ARCH_X86_64
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
FRAME_INIT_LOWRES sse2
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3