#ifndef __RTSOFTSSE_H__
#define __RTSOFTSSE_H__
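// Software (scalar) emulation of a subset of the SSE/SSE2/MMX intrinsics,
// allowing code written against them to build on targets without SSE.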
#ifdef _MSC_VER
#pragma warning(disable: 4244)
#pragma warning(disable: 4311)
#endif
#ifdef __INTEL_COMPILER
#pragma warning(disable:1684)
#ifndef _WIN32
#pragma warning(disable:810)
#endif
#endif
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Fall back to the standard keyword if the build does not define _INLINE.
#ifndef _INLINE
#define _INLINE inline
#endif
#ifdef VECTOR_SIZE
#error "macro conflict: VECTOR_SIZE is already defined"
#endif
#define VECTOR_SIZE 4
typedef struct sse_f {
sse_f() {}
float f[VECTOR_SIZE];
} sse_f;
typedef struct sse_i64 {
union {
char b[2*VECTOR_SIZE];
signed short wd[VECTOR_SIZE];
signed int dw[VECTOR_SIZE>>1];
}__sse_i64;
} sse_i64;
typedef struct sse_i {
sse_i() {}
union {
signed char b[4*VECTOR_SIZE];
signed int i[VECTOR_SIZE];
unsigned int ui[VECTOR_SIZE];
sse_i64 l[VECTOR_SIZE>>1];
} __sse_i;
} sse_i;
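// These structs are software stand-ins for the __m128, __m128i and __m64
// vector types, so intrinsic-style code can compile without SSE support.
// A minimal usage sketch (hypothetical caller code):
//   sse_f a = _mm_set_ps1(2.0f);
//   sse_f b = _mm_set_ps1(3.0f);
//   sse_f c = _mm_mul_ps(a, b);   // every c.f[i] is now 6.0f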
#define _MM_ZERO 0.0f
#define _MM_ONE 1.0f
#define _MM_FP(x,n) (x.f[n])
#define _MM_INT(x,n) (*((int *)(&(x.f[n]))))
#define _MM_BITONE (0xffffffff)
#define _MM_BITZERO (0x00000000)
#define _MM_MIN(a,b) (((a) < (b)) ? (a) : (b))
#define _MM_MAX(a,b) (((a) > (b)) ? (a) : (b))
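// Note: like all function-like macros, _MM_MIN/_MM_MAX evaluate their
// arguments twice, so avoid passing expressions with side effects.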
#define APPROXIMATE_ROUNDUP_MODES
#ifdef APPROXIMATE_ROUNDUP_MODES
// Approximate rounding functions
// (do not reproduce exact SSE rounding behavior, but run faster).
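// For example, _emm_round_nearest(2.7) yields 2 here (plain C truncation
// toward zero), whereas true round-to-nearest, as in the exact mode below,
// yields 3.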
_INLINE static int _emm_round_nearest(double d) {
return (int)d;
}
_INLINE static int _emm_round_trunc(double d) {
return (int)d;
}
_INLINE static int _mm_round_trunc(float f) {
return (int)f;
}
_INLINE static double _mminternal_sqrt(double src) {
// The float cast mimics the single-precision result of sqrtss/sqrtps.
return (float)sqrt(src);
}
#else
// Exact rounding functions (x87 inline assembly, MSVC/ICC syntax, IA-32 only).
_INLINE static int _emm_round_nearest(double d) {
int result;
__asm {
fld d
fistp result
}
return result;
}
_INLINE static int _emm_round_trunc(double d) {
int result;
int saved_cw;
int new_cw;
__asm {
push eax
fld d
fstcw saved_cw
mov eax, saved_cw
or eax, 3072
mov new_cw, eax
fldcw new_cw
fistp result
fldcw saved_cw
pop eax
}
return result;
}
_INLINE static int _mm_round_trunc(float f) {
int result;
int saved_cw;
int new_cw;
__asm {
push eax
fld f
fstcw saved_cw
mov eax, saved_cw
or eax, 3072
mov new_cw, eax
fldcw new_cw
fistp result
fldcw saved_cw
pop eax
}
return result;
}
// Helper function for computing sqrt - ported from ICC 9.1.
_INLINE static double _mminternal_sqrt(double src) {
double result;
_asm {
fld QWORD PTR src
fsqrt
fstp result;
}
return result;
}
#endif
#ifndef _MM_NO_ABORT
#define _mminternal_abort(str) \
{ fprintf(stderr, "*** Functionality intrinsics error: %s ***\n", str); \
exit(1); }
#else
#define _mminternal_abort(str) \
{ fprintf(stderr, "*** Functionality intrinsics warning: %s ***\n", str); }
#endif
#ifndef _MM_NO_ALIGN_CHECK
#define _mminternal_assert_16B(addr) \
if ((unsigned long long int)addr % 16 != 0) { \
_mminternal_abort("address must be 16-byte aligned"); \
}
#else
#define _mminternal_assert_16B(addr) ;
#endif
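// A usage sketch (hypothetical caller code): buffers passed to the aligned
// load/store wrappers must be 16-byte aligned, e.g.
//   __declspec(align(16)) float buf[4];          // MSVC
//   float buf[4] __attribute__((aligned(16)));   // GCC/ICC
//   sse_f v = _mm_load_ps(buf);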
_INLINE static sse_i _mm_add_epi32(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[i] + b.__sse_i.i[i];
return result;
}
_INLINE static sse_f _mm_add_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i] + b.f[i];
return result;
}
_INLINE static sse_f _mm_add_ss(sse_f a, sse_f b) {
sse_f result;
result.f[0] = a.f[0] + b.f[0];
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i];
return result;
}
_INLINE static sse_f _mm_and_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = _MM_INT(a,i) & _MM_INT(b,i);
return result;
}
_INLINE static sse_i _mm_and_si128(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[i] & b.__sse_i.i[i];
return result;
}
_INLINE static sse_f _mm_andnot_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (~_MM_INT(a,i)) & _MM_INT(b,i);
return result;
}
_INLINE static sse_i _mm_andnot_si128(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = (~a.__sse_i.i[i]) & b.__sse_i.i[i];
return result;
}
_INLINE static sse_i _mm_cmpeq_epi32(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = (a.__sse_i.i[i] == b.__sse_i.i[i]) ? 0xffffffff: 0x00000000;
return result;
}
_INLINE static sse_f _mm_cmpeq_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) == _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_f _mm_cmpge_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) >= _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_f _mm_cmpgt_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) > _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_f _mm_cmple_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) <= _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_i _mm_cmplt_epi32(sse_i a, sse_i b) {
sse_i result;
int i;
//#pragma unroll(4)
for (i=0;i<VECTOR_SIZE;i++)
result.__sse_i.i[i] = (a.__sse_i.i[i] < b.__sse_i.i[i]) ? 0xffffffff: 0x0;
return result;
}
_INLINE static sse_i _mm_cmpgt_epi32(sse_i a, sse_i b) {
sse_i result;
int i;
//#pragma unroll(4)
for (i=0;i<VECTOR_SIZE;i++)
result.__sse_i.i[i] = (a.__sse_i.i[i] > b.__sse_i.i[i]) ? 0xffffffff: 0x0;
return result;
}
_INLINE static sse_f _mm_cmplt_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) < _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_f _mm_cmple_ss(sse_f a, sse_f b) {
sse_f result;
_MM_INT(result,0) = (_MM_FP(a,0) <= _MM_FP(b,0)) ? _MM_BITONE : _MM_BITZERO;
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_FP(a,i);
return result;
}
_INLINE static sse_f _mm_cmpneq_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = (_MM_FP(a,i) != _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;
return result;
}
_INLINE static sse_f _mm_cvtepi32_ps(sse_i a) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = (float)(a.__sse_i.i[i]);
return result;
}
_INLINE static sse_i _mm_cvtps_epi32(sse_f a) {
sse_i result;
sse_f temp;
temp = a;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = _emm_round_nearest(_MM_FP(temp,i));
return result;
}
_INLINE static sse_f _mm_cvtsi32_ss(sse_f a, int b) {
sse_f result;
_MM_FP(result,0) = (float) b;
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = _MM_INT(a,i);
return result;
}
_INLINE static sse_i _mm_cvttps_epi32(sse_f a) {
sse_i result;
sse_f temp;
temp = a;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = _emm_round_trunc(_MM_FP(temp,i));
return result;
}
_INLINE static sse_i64 _mm_cvtps_pi32(sse_f a) {
sse_i64 result;
result.__sse_i64.dw[0] = _emm_round_nearest(_MM_FP(a,0) );
result.__sse_i64.dw[1] = _emm_round_nearest(_MM_FP(a,1) );
return result;
}
_INLINE static int _mm_cvttss_si32(sse_f a) {
int result;
result = _mm_round_trunc ( _MM_FP(a,0) );
return result;
}
_INLINE static sse_f _mm_div_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_FP(a,i) / _MM_FP(b,i);
return result;
}
_INLINE static sse_f _mm_load_ps(float const *a) {
sse_f result;
_mminternal_assert_16B(a);
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = a[i];
return result;
}
_INLINE static sse_i _mm_load_si128(sse_i *p) {
sse_i result;
_mminternal_assert_16B(p);
result = *p;
return result;
}
_INLINE static void _mm_maskmoveu_si128(sse_i d, sse_i n, char *p) {
int i;
for(i=0;i<16;i++) {
if(n.__sse_i.b[i] & 0x80)
p[i] = d.__sse_i.b[i];
}
}
_INLINE static sse_f _mm_max_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_MAX(_MM_FP(a,i), _MM_FP(b,i));
return result;
}
_INLINE static sse_f _mm_min_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_MIN(_MM_FP(a,i), _MM_FP(b,i));
return result;
}
_INLINE static sse_f _mm_movehl_ps(sse_f a, sse_f b) {
sse_f result;
_MM_FP(result,0) = _MM_FP(b,2);
_MM_FP(result,1) = _MM_FP(b,3);
_MM_FP(result,2) = _MM_FP(a,2);
_MM_FP(result,3) = _MM_FP(a,3);
return result;
}
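// Builds a 16-bit mask from the sign bit of each of the 16 bytes (pmovmskb):
// byte i's top bit becomes bit i of the result.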
_INLINE static int _mm_movemask_epi8(sse_i a) {
int result;
result = ((((a.__sse_i.b[0])>>7)&0x1) | (((a.__sse_i.b[1])>>6)&0x2) |
(((a.__sse_i.b[2])>>5)&0x4) | (((a.__sse_i.b[3])>>4)&0x8) |
(((a.__sse_i.b[4])>>3)&0x10) | (((a.__sse_i.b[5])>>2)&0x20) |
(((a.__sse_i.b[6])>>1)&0x40) | ((a.__sse_i.b[7])&0x80) |
(((a.__sse_i.b[8])<<1)&0x100) | (((a.__sse_i.b[9])<<2)&0x200) |
(((a.__sse_i.b[10])<<3)&0x400) | (((a.__sse_i.b[11])<<4)&0x800) |
(((a.__sse_i.b[12])<<5)&0x1000) | (((a.__sse_i.b[13])<<6)&0x2000) |
(((a.__sse_i.b[14])<<7)&0x4000) | (((a.__sse_i.b[15])<<8)&0x8000));
return result;
}
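// Builds a 4-bit mask from the sign bit of each float element (movmskps).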
_INLINE static int _mm_movemask_ps(sse_f a) {
int result;
result = (((_MM_INT(a,0)>>31)&0x1) | ((_MM_INT(a,1)>>30)&0x2) |
((_MM_INT(a,2)>>29)&0x4) | ((_MM_INT(a,3)>>28)&0x8));
return result;
}
_INLINE static sse_f _mm_mul_ss(sse_f a, sse_f b) {
sse_f result;
_MM_FP(result,0) = _MM_FP(a,0) * _MM_FP(b,0);
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_FP(a,i);
return result;
}
_INLINE static sse_f _mm_mul_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _MM_FP(a,i) * _MM_FP(b,i);
return result;
}
/////////////////////////////////////////////////////////////////////////////
// Helper that truncates a float to approximation precision - ported from ICC 9.1.
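// Clearing the low 11 mantissa bits leaves roughly 12 significant bits,
// matching the relative-error bound (1.5*2^-12) of hardware rcpps/rsqrtps.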
_INLINE static float _mminternal_approx(float x) {
unsigned int *p = (unsigned int*)&x;
*p = *p & 0xfffff800;
return x;
}
_INLINE static sse_f _mm_or_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = _MM_INT(a,i) | _MM_INT(b,i);
return result;
}
_INLINE static sse_i _mm_or_si128(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[i] | b.__sse_i.i[i];
return result;
}
_INLINE static sse_f _mm_rcp_ps(sse_f a) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_FP(result,i) = _mminternal_approx(_MM_ONE / _MM_FP(a,i));
return result;
}
_INLINE static sse_f _mm_rcp_ss(sse_f a) {
sse_f result;
result.f[0] = _mminternal_approx( _MM_ONE / (a.f[0]) );
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i];
return result;
}
//FIXME - compute sqrt more efficiently
_INLINE static sse_f _mm_rsqrt_ss(sse_f a) {
sse_f result;
result.f[0] = _mminternal_approx(1.0f / _mminternal_sqrt(a.f[0]));
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i];
return result;
}
_INLINE static sse_f _mm_rsqrt_ps(sse_f a) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = _mminternal_approx(1.0f / _mminternal_sqrt(a.f[i]));
return result;
}
_INLINE static sse_i _mm_set1_epi32(int i) {
sse_i result;
result.__sse_i.i[3] = result.__sse_i.i[2] = result.__sse_i.i[1] = result.__sse_i.i[0] = i;
return result;
}
_INLINE static sse_i _mm_set_epi32(int i3, int i2, int i1, int i0) {
sse_i result;
result.__sse_i.i[0] = i0;
result.__sse_i.i[1] = i1;
result.__sse_i.i[2] = i2;
result.__sse_i.i[3] = i3;
return result;
}
_INLINE static sse_f _mm_set_ps(float a, float b, float c, float d) {
sse_f result;
// The first argument maps to the highest element, matching the real _mm_set_ps.
result.f[0] = d;
result.f[1] = c;
result.f[2] = b;
result.f[3] = a;
return result;
}
_INLINE static sse_f _mm_set_ps1(float a) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = a;
return result;
}
_INLINE static sse_f _mm_set_ss(float a) {
sse_f result;
result.f[0] = a;
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
result.f[i] = 0.0f;
return result;
}
_INLINE static sse_i _mm_setr_epi32(int i0, int i1, int i2, int i3) {
sse_i result;
result.__sse_i.i[0] = i0;
result.__sse_i.i[1] = i1;
result.__sse_i.i[2] = i2;
result.__sse_i.i[3] = i3;
return result;
}
_INLINE static sse_f _mm_setr_ps(float a, float b, float c, float d) {
sse_f result;
result.f[0] = a;
result.f[1] = b;
result.f[2] = c;
result.f[3] = d;
return result;
}
_INLINE static sse_f _mm_setzero_ps(void) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = 0.0f;
return result;
}
_INLINE static sse_i _mm_slli_epi32(sse_i a, int count) {
sse_i result;
int i;
if (count > 31) {
// Shift counts above 31 zero the whole register, matching pslld.
//#pragma unroll(4)
for (i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = 0;
} else {
//#pragma unroll(4)
for (i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[i] << count;
}
return result;
}
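// Each 2-bit field of imm8 selects the source element for one destination
// lane; e.g. _mm_shuffle_epi32(a, 0x1B) reverses the four elements.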
_INLINE static sse_i _mm_shuffle_epi32(sse_i a, int imm8) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[(imm8 >> 2*i) & 0x3];
return result;
}
_INLINE static sse_f _mm_shuffle_ps(sse_f a, sse_f b, unsigned int imm8) {
sse_f result;
// The low two result elements come from a, the high two from b (shufps).
result.f[0] = a.f[imm8 & 0x3];
result.f[1] = a.f[(imm8 >> 2) & 0x3];
result.f[2] = b.f[(imm8 >> 4) & 0x3];
result.f[3] = b.f[(imm8 >> 6) & 0x3];
return result;
}
_INLINE static void _mm_stream_ps(float *a, sse_f b) {
_mminternal_assert_16B(a);
*(sse_f*)a = b;
}
_INLINE static sse_i _mm_srli_epi32(sse_i a, int count) {
sse_i result;
int i;
if (count > 31) {
// Shift counts above 31 zero the whole register, matching psrld.
//#pragma unroll(4)
for (i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.ui[i] = 0;
} else {
//#pragma unroll(4)
for (i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.ui[i] = a.__sse_i.ui[i] >> count;
}
return result;
}
_INLINE static sse_f _mm_sqrt_ps(sse_f a) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = _mminternal_sqrt(a.f[i]);
return result;
}
_INLINE static void _mm_store_ps(float *v, sse_f a) {
_mminternal_assert_16B(v);
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
v[i] = a.f[i];
}
_INLINE static sse_i _mm_sub_epi32(sse_i a, sse_i b) {
sse_i result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.__sse_i.i[i] = a.__sse_i.i[i] - b.__sse_i.i[i];
return result;
}
_INLINE static sse_f _mm_sub_ss(sse_f a, sse_f b) {
sse_f result;
result.f[0] = a.f[0] - b.f[0];
//#pragma unroll(3)
for (int i = 1; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i];
return result;
}
_INLINE static sse_f _mm_sub_ps(sse_f a, sse_f b) {
sse_f result;
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
result.f[i] = a.f[i] - b.f[i];
return result;
}
_INLINE static sse_f _mm_unpackhi_ps(sse_f a, sse_f b) {
sse_f result;
result.f[0] = a.f[2];
result.f[1] = b.f[2];
result.f[2] = a.f[3];
result.f[3] = b.f[3];
return result;
}
_INLINE static sse_f _mm_unpacklo_ps(sse_f a, sse_f b) {
sse_f result;
result.f[0] = a.f[0];
result.f[1] = b.f[0];
result.f[2] = a.f[1];
result.f[3] = b.f[1];
return result;
}
_INLINE static sse_f _mm_xor_ps(sse_f a, sse_f b) {
sse_f result;
// XOR the raw bit patterns (as xorps does), not value-converted integers.
//#pragma unroll(4)
for (int i = 0; i < VECTOR_SIZE; i++)
_MM_INT(result,i) = _MM_INT(a,i) ^ _MM_INT(b,i);
return result;
}
_INLINE static sse_i64 _mm_packs_pi32(sse_i64 a, sse_i64 b) {
sse_i64 result;
// Saturate each 32-bit value to the signed 16-bit range, as packssdw does.
result.__sse_i64.wd[0] = (signed short)_MM_MAX(-32768, _MM_MIN(32767, a.__sse_i64.dw[0]));
result.__sse_i64.wd[1] = (signed short)_MM_MAX(-32768, _MM_MIN(32767, a.__sse_i64.dw[1]));
result.__sse_i64.wd[2] = (signed short)_MM_MAX(-32768, _MM_MIN(32767, b.__sse_i64.dw[0]));
result.__sse_i64.wd[3] = (signed short)_MM_MAX(-32768, _MM_MIN(32767, b.__sse_i64.dw[1]));
return result;
}
_INLINE static sse_i64 _mm_setzero_si64() {
sse_i64 result;
result.__sse_i64.dw[1] = result.__sse_i64.dw[0] = 0;
return result;
}
_INLINE static sse_i64 _mm_unpacklo_pi16(sse_i64 a, sse_i64 b) {
sse_i64 result;
result.__sse_i64.wd[0] = a.__sse_i64.wd[0];
result.__sse_i64.wd[1] = b.__sse_i64.wd[0];
result.__sse_i64.wd[2] = a.__sse_i64.wd[1];
result.__sse_i64.wd[3] = b.__sse_i64.wd[1];
return result;
}
_INLINE static sse_i64 _mm_unpackhi_pi16(sse_i64 a, sse_i64 b) {
sse_i64 result;
result.__sse_i64.wd[0] = a.__sse_i64.wd[2];
result.__sse_i64.wd[1] = b.__sse_i64.wd[2];
result.__sse_i64.wd[2] = a.__sse_i64.wd[3];
result.__sse_i64.wd[3] = b.__sse_i64.wd[3];
return result;
}
_INLINE static sse_i64 _mm_unpacklo_pi32(sse_i64 a, sse_i64 b) {
sse_i64 result;
result.__sse_i64.dw[0] = a.__sse_i64.dw[0];
result.__sse_i64.dw[1] = b.__sse_i64.dw[0];
return result;
}
_INLINE static sse_i64 _mm_unpackhi_pi32(sse_i64 a, sse_i64 b) {
sse_i64 result;
result.__sse_i64.dw[0] = a.__sse_i64.dw[1];
result.__sse_i64.dw[1] = b.__sse_i64.dw[1];
return result;
}
_INLINE static sse_i64 _mm_packs_pu16(sse_i64 a, sse_i64 b) {
sse_i64 result;
// Saturate each signed 16-bit value to the unsigned 8-bit range, as packuswb does.
result.__sse_i64.b[0] = (char)_MM_MAX(0, _MM_MIN(255, (int)a.__sse_i64.wd[0]));
result.__sse_i64.b[1] = (char)_MM_MAX(0, _MM_MIN(255, (int)a.__sse_i64.wd[1]));
result.__sse_i64.b[2] = (char)_MM_MAX(0, _MM_MIN(255, (int)a.__sse_i64.wd[2]));
result.__sse_i64.b[3] = (char)_MM_MAX(0, _MM_MIN(255, (int)a.__sse_i64.wd[3]));
result.__sse_i64.b[4] = (char)_MM_MAX(0, _MM_MIN(255, (int)b.__sse_i64.wd[0]));
result.__sse_i64.b[5] = (char)_MM_MAX(0, _MM_MIN(255, (int)b.__sse_i64.wd[1]));
result.__sse_i64.b[6] = (char)_MM_MAX(0, _MM_MIN(255, (int)b.__sse_i64.wd[2]));
result.__sse_i64.b[7] = (char)_MM_MAX(0, _MM_MIN(255, (int)b.__sse_i64.wd[3]));
return result;
}
_INLINE static sse_i _mm_setr_epi64(sse_i64 a, sse_i64 b) {
sse_i result;
result.__sse_i.l[0] = a;
result.__sse_i.l[1] = b;
return result;
}
_INLINE static unsigned int _mm_getcsr() {
// The software emulation has no MXCSR register; report zero.
return 0;
}
_INLINE static void _mm_setcsr(unsigned int v) {
// No MXCSR register to update in the software emulation; v is ignored.
return;
}
/////////////////////////////////////////////////////////////////////////////
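// Converts the four floats of a to four signed-saturated 16-bit integers in
// an __m64-style value, composed from the primitives above.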
#define _mm_cvtps_pi16(a) \
_mm_packs_pi32(_mm_cvtps_pi32(a), \
_mm_cvtps_pi32(_mm_movehl_ps((a), (a))))
#undef VECTOR_SIZE
#endif /* __RTSOFTSSE_H__ */