// src/parsec/disk-image/parsec/parsec-benchmark/pkgs/apps/raytrace/src/RTTL/common/RTEmulatedSSE.hxx - public/gem5-resources - Git at Google

 #ifndef __RTSOFTSSE_H__
 #define __RTSOFTSSE_H__

 #ifdef _MSC_VER
 #pragma warning(disable: 4244)
 #pragma warning(disable: 4311)
 #endif
 #ifdef __INTEL_COMPILER
 #pragma warning(disable:1684)
 #ifndef _WIN32
 #pragma warning(disable:810)
 #endif
 #endif

 // VECTOR_SIZE is the lane count used throughout this header (it is removed
 // again with #undef at the end of the header).  Refuse to silently clobber a
 // definition that already exists.  Note that #error prints its text
 // verbatim, so __FILE__/__LINE__ cannot be spliced into the message.
 #ifdef VECTOR_SIZE
 #error "VECTOR_SIZE is already defined; RTEmulatedSSE.hxx needs it for its own lane count"
 #endif
 #define VECTOR_SIZE 4

 // Software stand-in for a packed single-precision vector: VECTOR_SIZE
 // floats, addressed lane by lane as f[0] .. f[VECTOR_SIZE-1].
 typedef struct sse_f {
 	// Empty user-provided constructor: the lanes are left uninitialized.
 	sse_f()	{}

 	float f[VECTOR_SIZE];
 } sse_f;

 // Integer vector half: one piece of storage that can be viewed as
 // 2*VECTOR_SIZE chars, VECTOR_SIZE signed shorts, or VECTOR_SIZE/2 signed
 // ints.  All access goes through the union member __sse_i64.
 typedef struct sse_i64 {
 	union {
 	  char b[2*VECTOR_SIZE];            // byte view
 	  signed short wd[VECTOR_SIZE];     // 16-bit word view
           signed int dw[VECTOR_SIZE>>1];   // 32-bit doubleword view
 	}__sse_i64;
 } sse_i64;

 // Software stand-in for a packed integer vector.  The union __sse_i exposes
 // the same storage as signed bytes, signed 32-bit lanes, unsigned 32-bit
 // lanes, or two sse_i64 halves.
 typedef struct sse_i {
 	// Empty user-provided constructor: the lanes are left uninitialized.
 	sse_i() {}
 	union {
 	  signed char b[4*VECTOR_SIZE];     // byte view
 	  signed int i[VECTOR_SIZE];        // signed 32-bit lanes
 	  unsigned int ui[VECTOR_SIZE];     // unsigned 32-bit lanes
           sse_i64 l[VECTOR_SIZE>>1];       // two sse_i64 halves
 	} __sse_i;
 } sse_i;


 // Scalar constants used by the emulation.
 #define _MM_ZERO 0.0f
 #define _MM_ONE  1.0f

 // Lane accessors for a value `x` that has a float array member `f`; both
 // expand to lvalues.
 //   _MM_FP(x,n)  - lane n as a float.
 //   _MM_INT(x,n) - the same storage accessed as an int through a pointer
 //                  cast (relies on the compiler tolerating this punning).
 // The vector argument is parenthesized so that expressions such as *p or
 // (c ? u : v) can be passed.
 #define _MM_FP(x,n) ((x).f[n])
 #define _MM_INT(x,n) (*((int *)(&((x).f[n]))))

 // All-ones / all-zeros 32-bit masks written by the comparison functions.
 #define _MM_BITONE  (0xffffffff)
 #define _MM_BITZERO (0x00000000)

 // Ternary min/max.  Each argument may be evaluated twice, so do not pass
 // expressions with side effects.
 #define _MM_MIN(a,b) (((a) < (b)) ? (a) : (b))
 #define _MM_MAX(a,b) (((a) > (b)) ? (a) : (b))


 #define APPROXIMATE_ROUNDUP_MODES
 #ifdef  APPROXIMATE_ROUNDUP_MODES

 // Approximate roundup functions
 // (will not provide exact SSE functionality, but works faster).

 // Approximate "round to nearest": just the plain double -> int cast, which
 // drops the fractional part instead of rounding.
 _INLINE static int _emm_round_nearest(double d) {
     const int converted = (int)d;
     return converted;
 }

 // Truncating double -> int conversion implemented with the plain C cast.
 _INLINE static int _emm_round_trunc(double d) {
     const int truncated = (int)d;
     return truncated;
 }

 // Truncating float -> int conversion implemented with the plain C cast.
 _INLINE static int _mm_round_trunc(float f) {
     const int truncated = (int)f;
     return truncated;
 }

 // Square-root helper: calls sqrt() and narrows the result to float
 // precision before handing it back as a double.
 _INLINE static double _mminternal_sqrt(double src) {
     const float root = (float)sqrt(src);
     return root;
 }

 #else

 // Exact roundup functions.

 // Exact variant (compiled only when APPROXIMATE_ROUNDUP_MODES is not
 // defined): converts d to int on the x87 FPU.  fld pushes d, fistp stores
 // it to result as an integer and pops it.  No rounding mode is set here, so
 // the conversion uses whatever mode the FPU control word currently selects
 // (presumably round-to-nearest, as the name suggests - confirm).
 // Uses block-style __asm syntax.
 _INLINE static int _emm_round_nearest(double d) {
 	int result;
 	__asm	{
 		fld d
             fistp result
     }
 	return result;
 }

 // Exact variant: converts d to int on the x87 FPU with a temporarily
 // modified control word.  Sequence: push d (fld), save the control word
 // (fstcw), OR it with 3072 (0x0C00), load the modified word (fldcw), store
 // the integer (fistp), then restore the saved control word.  eax is
 // preserved with push/pop around the sequence.
 // NOTE(review): 0x0C00 presumably sets the rounding-control field to
 // "truncate", matching the function name - confirm against the x87 manual.
 _INLINE static int _emm_round_trunc(double d) {
 	int result;
 	int saved_cw;
 	int new_cw;
 	__asm	{
 		push      eax
             fld       d
             fstcw     saved_cw
             mov       eax, saved_cw
             or        eax, 3072
             mov       new_cw, eax
             fldcw     new_cw
             fistp     result
             fldcw     saved_cw
             pop       eax
     }
 	return result;
 }

 // Exact variant for a float argument: same instruction sequence as
 // _emm_round_trunc.  The control word is saved, OR-ed with 3072 (0x0C00)
 // and reloaded before fistp stores the integer; the saved control word is
 // restored afterwards and eax is preserved with push/pop.
 // NOTE(review): 0x0C00 presumably selects truncation - confirm against the
 // x87 manual.
 _INLINE static int _mm_round_trunc(float f) {
 	int result;
 	int saved_cw;
 	int new_cw;
 	__asm	{
 		push      eax
             fld       f
             fstcw     saved_cw
             mov       eax, saved_cw
             or        eax, 3072
             mov       new_cw, eax
             fldcw     new_cw
             fistp     result
             fldcw     saved_cw
             pop       eax
     }
 	return result;
 }

 // Helper that computes a square root (ported from ICC 9.1).
 // Exact variant: loads src onto the x87 stack (fld), replaces it with its
 // square root (fsqrt) and stores/pops it into result (fstp).  Unlike the
 // approximate variant there is no narrowing cast to float in this code.
 _INLINE static double _mminternal_sqrt(double src) {
     double result;
 	_asm	{
 		fld QWORD PTR src
             fsqrt
             fstp result;
 	}

     return result;
 }
 #endif

 // Reports an emulation error on stderr.  By default the process then exits
 // with status 1; when _MM_NO_ABORT is defined only a warning is printed and
 // execution continues.  The body is wrapped in do { } while (0) so the macro
 // acts as one statement (safe in an unbraced if/else); terminate each use
 // with a semicolon.
 #ifndef _MM_NO_ABORT
 #define _mminternal_abort(str)                                          \
     do { fprintf(stderr, "*** Functionality intrinsics error: %s ***\n", str); \
          exit(1); } while (0)
 #else
 #define _mminternal_abort(str)                                          \
     do { fprintf(stderr, "*** Functionality intrinsics warning: %s ***\n", str); } while (0)
 #endif

 // Alignment check used by the aligned load/store emulations: reports through
 // _mminternal_abort when addr is not a multiple of 16.  Defining
 // _MM_NO_ALIGN_CHECK turns the check into an empty statement.
 // addr is parenthesized before the cast so that an expression such as p + 1
 // is tested as a whole, and the body is a single do { } while (0) statement
 // so the macro cannot capture a following else.
 #ifndef _MM_NO_ALIGN_CHECK
 #define _mminternal_assert_16B(addr)                                    \
     do {                                                                \
         if ((unsigned long long int)(addr) % 16 != 0) {                 \
             _mminternal_abort("address must be 16-byte aligned");       \
         }                                                               \
     } while (0)
 #else
 #define _mminternal_assert_16B(addr) do { } while (0)
 #endif

 // Lane-wise 32-bit integer addition of a and b.
 // The sum is formed on the unsigned view of each lane so that overflow wraps
 // modulo 2^32 instead of being undefined behaviour for signed int; the
 // result is read back through the same union.
 _INLINE static sse_i _mm_add_epi32(sse_i a, sse_i b) {
     sse_i result;

     for (int i = 0; i < VECTOR_SIZE; i++)
         result.__sse_i.ui[i] = a.__sse_i.ui[i] + b.__sse_i.ui[i];

     return result;
 }

 // Lane-wise float addition: out.f[k] = a.f[k] + b.f[k].
 _INLINE static sse_f _mm_add_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane] + b.f[lane];
     }
     return out;
 }

 // Scalar add: lane 0 is a.f[0] + b.f[0]; lanes 1..3 are copied from a.
 _INLINE static sse_f _mm_add_ss(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane];
     }
     out.f[0] = a.f[0] + b.f[0];
     return out;
 }

 // Lane-wise bitwise AND of the integer views (_MM_INT) of a and b.
 _INLINE static sse_f _mm_and_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_INT(out,lane) = _MM_INT(a,lane) & _MM_INT(b,lane);
     }
     return out;
 }

 // Lane-wise bitwise AND of the signed 32-bit lanes of a and b.
 _INLINE static sse_i _mm_and_si128(sse_i a, sse_i b) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.__sse_i.i[lane] = a.__sse_i.i[lane] & b.__sse_i.i[lane];
     }
     return out;
 }

 // Lane-wise (~a) & b on the integer views (_MM_INT) of the float lanes.
 _INLINE static sse_f _mm_andnot_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_INT(out,lane) = (~_MM_INT(a,lane)) & _MM_INT(b,lane);
     }
     return out;
 }

 // Lane-wise (~a) & b on the signed 32-bit lanes.
 _INLINE static sse_i _mm_andnot_si128(sse_i a, sse_i b) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.__sse_i.i[lane] = (~a.__sse_i.i[lane]) & b.__sse_i.i[lane];
     }
     return out;
 }

 // Lane-wise integer equality: 0xffffffff where the lanes match, else 0.
 _INLINE static sse_i _mm_cmpeq_epi32(sse_i a, sse_i b) {
     sse_i mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (a.__sse_i.i[lane] == b.__sse_i.i[lane])
             mask.__sse_i.i[lane] = 0xffffffff;
         else
             mask.__sse_i.i[lane] = 0x00000000;
     }
     return mask;
 }


 // Lane-wise floating-point equality: writes _MM_BITONE where
 // a.f[i] == b.f[i] and _MM_BITZERO otherwise.
 // The float values are compared (not their raw bit patterns), so +0.0f
 // equals -0.0f and a NaN lane never compares equal.  This matches
 // _mm_cmpneq_ps and the other _ps comparisons in this file, which all use
 // _MM_FP for the comparison.
 _INLINE static sse_f _mm_cmpeq_ps(sse_f a, sse_f b) {
     sse_f result;

     for (int i = 0; i < VECTOR_SIZE; i++)
         _MM_INT(result,i) = (_MM_FP(a,i) == _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;

     return result;
 }

 // Lane-wise a >= b on floats: _MM_BITONE where true, _MM_BITZERO otherwise.
 _INLINE static sse_f _mm_cmpge_ps(sse_f a, sse_f b) {
     sse_f mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (_MM_FP(a,lane) >= _MM_FP(b,lane))
             _MM_INT(mask,lane) = _MM_BITONE;
         else
             _MM_INT(mask,lane) = _MM_BITZERO;
     }
     return mask;
 }

 // Lane-wise a > b on floats: _MM_BITONE where true, _MM_BITZERO otherwise.
 _INLINE static sse_f _mm_cmpgt_ps(sse_f a, sse_f b) {
     sse_f mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (_MM_FP(a,lane) > _MM_FP(b,lane))
             _MM_INT(mask,lane) = _MM_BITONE;
         else
             _MM_INT(mask,lane) = _MM_BITZERO;
     }
     return mask;
 }

 // Lane-wise a <= b on floats: _MM_BITONE where true, _MM_BITZERO otherwise.
 _INLINE static sse_f _mm_cmple_ps(sse_f a, sse_f b) {
     sse_f mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (_MM_FP(a,lane) <= _MM_FP(b,lane))
             _MM_INT(mask,lane) = _MM_BITONE;
         else
             _MM_INT(mask,lane) = _MM_BITZERO;
     }
     return mask;
 }

 // Lane-wise signed a < b: 0xffffffff where true, 0 otherwise.
 _INLINE static sse_i _mm_cmplt_epi32(sse_i a, sse_i b) {
     sse_i mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (a.__sse_i.i[lane] < b.__sse_i.i[lane])
             mask.__sse_i.i[lane] = 0xffffffff;
         else
             mask.__sse_i.i[lane] = 0x0;
     }
     return mask;
 }

 // Lane-wise signed a > b: 0xffffffff where true, 0 otherwise.
 _INLINE static sse_i _mm_cmpgt_epi32(sse_i a, sse_i b) {
     sse_i mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (a.__sse_i.i[lane] > b.__sse_i.i[lane])
             mask.__sse_i.i[lane] = 0xffffffff;
         else
             mask.__sse_i.i[lane] = 0x0;
     }
     return mask;
 }

 // Lane-wise a < b on floats: _MM_BITONE where true, _MM_BITZERO otherwise.
 _INLINE static sse_f _mm_cmplt_ps(sse_f a, sse_f b) {
     sse_f mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (_MM_FP(a,lane) < _MM_FP(b,lane))
             _MM_INT(mask,lane) = _MM_BITONE;
         else
             _MM_INT(mask,lane) = _MM_BITZERO;
     }
     return mask;
 }

 // Scalar a <= b: lane 0 receives the mask (_MM_BITONE / _MM_BITZERO),
 // lanes 1..3 are copied from a.
 _INLINE static sse_f _mm_cmple_ss(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_FP(a,lane);
     }
     if (_MM_FP(a,0) <= _MM_FP(b,0))
         _MM_INT(out,0) = _MM_BITONE;
     else
         _MM_INT(out,0) = _MM_BITZERO;
     return out;
 }

 // Lane-wise "not equal" on floats, written as !(a == b): _MM_BITONE where
 // the lanes are not equal, _MM_BITZERO otherwise.
 _INLINE static sse_f _mm_cmpneq_ps(sse_f a, sse_f b) {
     sse_f mask;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         if (!(_MM_FP(a,lane) == _MM_FP(b,lane)))
             _MM_INT(mask,lane) = _MM_BITONE;
         else
             _MM_INT(mask,lane) = _MM_BITZERO;
     }
     return mask;
 }

 // Converts each signed 32-bit lane of a to float.
 _INLINE static sse_f _mm_cvtepi32_ps(sse_i a) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = (float)(a.__sse_i.i[lane]);
     }
     return out;
 }

 // Converts each float lane of a to int through _emm_round_nearest.
 _INLINE static sse_i _mm_cvtps_epi32(sse_f a) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.__sse_i.i[lane] = _emm_round_nearest(_MM_FP(a,lane));
     }
     return out;
 }

 // Lane 0 becomes (float)b; lanes 1..3 are copied bit-for-bit from a.
 _INLINE static sse_f _mm_cvtsi32_ss(sse_f a, int b) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         _MM_INT(out,lane) = _MM_INT(a,lane);
     }
     _MM_FP(out,0) = (float) b;
     return out;
 }

 // Converts each float lane of a to int through _emm_round_trunc.
 _INLINE static sse_i _mm_cvttps_epi32(sse_f a) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.__sse_i.i[lane] = _emm_round_trunc(_MM_FP(a,lane));
     }
     return out;
 }


 // Converts the two low float lanes of a to ints (via _emm_round_nearest)
 // and stores them in the two doubleword slots of an sse_i64.
 _INLINE static sse_i64 _mm_cvtps_pi32(sse_f a) {
     sse_i64 out;
     for (int lane = 0; lane < 2; ++lane) {
         out.__sse_i64.dw[lane] = _emm_round_nearest(_MM_FP(a,lane) );
     }
     return out;
 }

 // Converts lane 0 of a to int through _mm_round_trunc.
 _INLINE static int _mm_cvttss_si32(sse_f a) {
     return _mm_round_trunc(_MM_FP(a,0));
 }

 // Lane-wise float division: out.f[k] = a.f[k] / b.f[k].
 _INLINE static sse_f _mm_div_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_FP(a,lane) / _MM_FP(b,lane);
     }
     return out;
 }


 // Loads VECTOR_SIZE consecutive floats starting at a.  The address is first
 // run through the 16-byte alignment check (_mminternal_assert_16B).
 _INLINE static sse_f _mm_load_ps(float const *a) {
     sse_f loaded;
     _mminternal_assert_16B(a);
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(loaded,lane) = a[lane];
     }
     return loaded;
 }


 // Returns a copy of *p after running p through the 16-byte alignment check.
 _INLINE static sse_i _mm_load_si128(sse_i *p) {
     _mminternal_assert_16B(p);
     sse_i loaded = *p;
     return loaded;
 }

 // Conditional byte store: for each of the 16 bytes, p[k] receives byte k of
 // d when bit 0x80 of byte k of the mask n is set; other bytes of p are left
 // untouched.
 _INLINE static void _mm_maskmoveu_si128(sse_i d, sse_i n, char *p) {
     for (int k = 0; k < 16; ++k) {
         if ((n.__sse_i.b[k] & 0x80) == 0)
             continue;
         p[k] = d.__sse_i.b[k];
     }
 }

 // Lane-wise maximum using _MM_MAX (returns the b lane unless a > b).
 _INLINE static sse_f _mm_max_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_MAX(_MM_FP(a,lane), _MM_FP(b,lane));
     }
     return out;
 }

 // Lane-wise minimum using _MM_MIN (returns the b lane unless a < b).
 _INLINE static sse_f _mm_min_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_MIN(_MM_FP(a,lane), _MM_FP(b,lane));
     }
     return out;
 }

 // Result lanes 0,1 are lanes 2,3 of b; result lanes 2,3 are lanes 2,3 of a.
 _INLINE static sse_f _mm_movehl_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 2; lane < 4; ++lane) {
         _MM_FP(out,lane - 2) = _MM_FP(b,lane);
         _MM_FP(out,lane) = _MM_FP(a,lane);
     }
     return out;
 }


 // Builds a 16-bit mask: bit k of the result is set when bit 0x80 (the sign
 // bit) of byte k of a is set.
 // Written as a loop over the bytes instead of sixteen hand-written shifts.
 // The previous form shifted negative signed values left and right (which is
 // undefined / implementation-defined) and used an inconsistent ">>4" for
 // byte 7; it only produced the right bits thanks to sign extension.
 _INLINE static int _mm_movemask_epi8(sse_i a) {
     int result = 0;

     for (int i = 0; i < 4 * VECTOR_SIZE; i++) {
         if (a.__sse_i.b[i] & 0x80)
             result |= 1 << i;
     }

     return result;
 }

 // Builds a 4-bit mask: bit k of the result is the top bit (bit 31) of the
 // integer view of lane k of a.
 // The lane is converted to unsigned before shifting so the result does not
 // depend on how the compiler right-shifts negative ints.
 _INLINE static int _mm_movemask_ps(sse_f a) {
     int result = 0;

     for (int i = 0; i < VECTOR_SIZE; i++)
         result |= (int)((((unsigned int)_MM_INT(a,i)) >> 31) << i);

     return result;
 }

 // Scalar multiply: lane 0 is a.f[0] * b.f[0]; lanes 1..3 are copied from a.
 _INLINE static sse_f _mm_mul_ss(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_FP(a,lane);
     }
     _MM_FP(out,0) = _MM_FP(a,0) * _MM_FP(b,0);
     return out;
 }

 // Lane-wise float multiplication: out.f[k] = a.f[k] * b.f[k].
 _INLINE static sse_f _mm_mul_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_FP(out,lane) = _MM_FP(a,lane) * _MM_FP(b,lane);
     }
     return out;
 }

 /////////////////////////////////////////////////////////////////////////////
 // This is helper function for getting approx value - Port from ICC9.1
 // Reduces the precision of x by clearing the low 11 bits of its 32-bit
 // representation (mask 0xfffff800), accessed through a pointer cast.
 _INLINE static float _mminternal_approx(float x) {
     unsigned int *bits = (unsigned int*)&x;
     *bits &= 0xfffff800;
     return x;
 }

 // Lane-wise bitwise OR of the integer views (_MM_INT) of a and b.
 _INLINE static sse_f _mm_or_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         _MM_INT(out,lane) = _MM_INT(a,lane) | _MM_INT(b,lane);
     }
     return out;
 }

 // Lane-wise bitwise OR of the signed 32-bit lanes of a and b.
 _INLINE static sse_i _mm_or_si128(sse_i a, sse_i b) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.__sse_i.i[lane] = a.__sse_i.i[lane] | b.__sse_i.i[lane];
     }
     return out;
 }

 // TODO
 // _mm_packs_pi32

 // Lane-wise reciprocal: 1.0f / a.f[k], passed through _mminternal_approx.
 _INLINE static sse_f _mm_rcp_ps(sse_f a) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         const float reciprocal = _MM_ONE / _MM_FP(a,lane);
         _MM_FP(out,lane) = _mminternal_approx(reciprocal);
     }
     return out;
 }

 // Scalar reciprocal: lane 0 is _mminternal_approx(1.0f / a.f[0]); lanes
 // 1..3 are copied from a.
 _INLINE static sse_f _mm_rcp_ss(sse_f a) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane];
     }
     out.f[0] = _mminternal_approx( _MM_ONE / (a.f[0]) );
     return out;
 }
 //FIXME - How to get sqrt efficient
 // Scalar reciprocal square root: lane 0 is
 // _mminternal_approx(1.0f / _mminternal_sqrt(a.f[0])); lanes 1..3 are
 // copied from a.
 _INLINE static sse_f _mm_rsqrt_ss(sse_f a) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane];
     }
     const double root = _mminternal_sqrt(a.f[0]);
     out.f[0] = _mminternal_approx(1.0f / root);
     return out;
 }

 // Lane-wise reciprocal square root:
 // _mminternal_approx(1.0f / _mminternal_sqrt(a.f[k])).
 _INLINE static sse_f _mm_rsqrt_ps(sse_f a) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         const double root = _mminternal_sqrt(a.f[lane]);
         out.f[lane] = _mminternal_approx(1.0f / root);
     }
     return out;
 }

 // Broadcasts i into all four signed 32-bit lanes.
 _INLINE static sse_i _mm_set1_epi32(int i) {
     sse_i out;
     for (int lane = 0; lane < 4; ++lane) {
         out.__sse_i.i[lane] = i;
     }
     return out;
 }

 // Builds a vector from four ints given highest lane first: the last
 // argument (i0) goes to lane 0 and the first (i3) to lane 3.
 _INLINE static sse_i _mm_set_epi32(int i3, int i2, int i1, int i0) {
     sse_i out;
     out.__sse_i.i[3] = i3;
     out.__sse_i.i[2] = i2;
     out.__sse_i.i[1] = i1;
     out.__sse_i.i[0] = i0;
     return out;
 }


 // Builds a vector from four floats given highest lane first: the first
 // argument (a) goes to lane 3 and the last (d) to lane 0.
 // This mirrors _mm_set_epi32 in this file (whose last argument lands in
 // lane 0) and is the reverse of _mm_setr_ps.  The previous body stored the
 // arguments in the same order as _mm_setr_ps, which reversed the lanes
 // compared with the SSE intrinsic this function emulates.
 _INLINE static sse_f _mm_set_ps(float a, float b, float c, float d) {
     sse_f result;

     result.f[0] = d;
     result.f[1] = c;
     result.f[2] = b;
     result.f[3] = a;

     return result;
 }

 // Broadcasts a into every float lane.
 _INLINE static sse_f _mm_set_ps1(float a) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a;
     }
     return out;
 }

 // Lane 0 is a; lanes 1..3 are set to 0.0f.
 _INLINE static sse_f _mm_set_ss(float a) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = 0.0f;
     }
     out.f[0] = a;
     return out;
 }

 // Builds a vector from four ints in lane order: i0 -> lane 0 ... i3 -> lane 3.
 _INLINE static sse_i _mm_setr_epi32(int i0, int i1, int i2, int i3) {
     const int ordered[4] = { i0, i1, i2, i3 };
     sse_i out;
     for (int lane = 0; lane < 4; ++lane) {
         out.__sse_i.i[lane] = ordered[lane];
     }
     return out;
 }

 // Builds a vector from four floats in lane order: a -> lane 0 ... d -> lane 3.
 _INLINE static sse_f _mm_setr_ps(float a, float b, float c, float d) {
     const float ordered[4] = { a, b, c, d };
     sse_f out;
     for (int lane = 0; lane < 4; ++lane) {
         out.f[lane] = ordered[lane];
     }
     return out;
 }

 // Returns a vector with every float lane set to zero.
 _INLINE static sse_f _mm_setzero_ps(void) {
     sse_f zeros;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         zeros.f[lane] = 0.0f;
     }
     return zeros;
 }

 // Shifts every 32-bit lane of a left by count bits, filling with zeros.
 // A count outside 0..31 yields all-zero lanes; negative counts are included
 // in that case because shifting by a negative amount is undefined.
 // The shift is done on the unsigned view of the lanes, because left-shifting
 // a negative signed int is undefined as well.  The shadowed outer loop
 // variable of the previous version is gone.
 _INLINE static sse_i _mm_slli_epi32(sse_i a, int count) {
     sse_i result;

     if (count < 0 || count > 31) {
         for (int i = 0; i < VECTOR_SIZE; i++)
             result.__sse_i.ui[i] = 0;
     } else {
         for (int i = 0; i < VECTOR_SIZE; i++)
             result.__sse_i.ui[i] = a.__sse_i.ui[i] << count;
     }
     return result;
 }

 // Lane permutation: result lane k is the lane of a selected by the two-bit
 // field (imm8 >> 2*k) & 3.
 _INLINE static sse_i _mm_shuffle_epi32(sse_i a, int imm8) {
     sse_i out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         const int pick = (imm8 >> 2*lane) & 0x3;
         out.__sse_i.i[lane] = a.__sse_i.i[pick];
     }
     return out;
 }


 // Two-source shuffle.  Each result lane k is chosen by the two-bit field
 // (imm8 >> 2*k) & 3; lanes 0 and 1 are picked from a, lanes 2 and 3 from b,
 // as in the SSE shufps operation this function emulates.
 // The previous body picked all four lanes from b and ignored a entirely.
 _INLINE static sse_f _mm_shuffle_ps(sse_f a, sse_f b, unsigned int imm8) {
     sse_f result;

     for (int i = 0; i < VECTOR_SIZE; i++) {
         const unsigned int sel = (imm8 >> (2 * i)) & 0x3;
         result.f[i] = (i < 2) ? a.f[sel] : b.f[sel];
     }

     return result;
 }

 // Stores b to the memory at a (viewed as an sse_f) after the 16-byte
 // alignment check.
 _INLINE static void _mm_stream_ps(float *a, sse_f b) {
     _mminternal_assert_16B(a);
     sse_f *dest = (sse_f*)a;
     *dest = b;
 }

 // Shifts every unsigned 32-bit lane of a right by count bits, filling with
 // zeros (logical shift).
 // A count outside 0..31 yields all-zero lanes; negative counts are included
 // in that case because shifting by a negative amount is undefined.  The
 // shadowed outer loop variable of the previous version is gone.
 _INLINE static sse_i _mm_srli_epi32(sse_i a, int count) {
     sse_i result;

     if (count < 0 || count > 31) {
         for (int i = 0; i < VECTOR_SIZE; i++)
             result.__sse_i.ui[i] = 0;
     } else {
         for (int i = 0; i < VECTOR_SIZE; i++)
             result.__sse_i.ui[i] = a.__sse_i.ui[i] >> count;
     }

     return result;
 }


 // Lane-wise square root via _mminternal_sqrt.
 _INLINE static sse_f _mm_sqrt_ps(sse_f a) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = _mminternal_sqrt(a.f[lane]);
     }
     return out;
 }

 // Stores the VECTOR_SIZE float lanes of a to v[0..VECTOR_SIZE-1].
 // v goes through the same 16-byte alignment check as _mm_load_ps and
 // _mm_stream_ps (this was the outstanding FIXME/TODO).  The check can be
 // disabled with _MM_NO_ALIGN_CHECK or downgraded to a warning with
 // _MM_NO_ABORT.
 _INLINE static void _mm_store_ps(float *v, sse_f a) {
     _mminternal_assert_16B(v);
     for (int i = 0; i < VECTOR_SIZE; i++)
         v[i] = a.f[i];
 }

 // Lane-wise 32-bit integer subtraction a - b.
 // The difference is formed on the unsigned view of each lane so that
 // overflow wraps modulo 2^32 instead of being undefined behaviour for signed
 // int; the result is read back through the same union.
 _INLINE static sse_i _mm_sub_epi32(sse_i a, sse_i b) {
     sse_i result;

     for (int i = 0; i < VECTOR_SIZE; i++)
         result.__sse_i.ui[i] = a.__sse_i.ui[i] - b.__sse_i.ui[i];

     return result;
 }


 // Scalar subtract: lane 0 is a.f[0] - b.f[0]; lanes 1..3 are copied from a.
 _INLINE static sse_f _mm_sub_ss(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane];
     }
     out.f[0] = a.f[0] - b.f[0];
     return out;
 }

 // Lane-wise float subtraction: out.f[k] = a.f[k] - b.f[k].
 _INLINE static sse_f _mm_sub_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
         out.f[lane] = a.f[lane] - b.f[lane];
     }
     return out;
 }

 // Interleaves the two high lanes: { a[2], b[2], a[3], b[3] }.
 _INLINE static sse_f _mm_unpackhi_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int pair = 0; pair < 2; ++pair) {
         out.f[2 * pair]     = a.f[2 + pair];
         out.f[2 * pair + 1] = b.f[2 + pair];
     }
     return out;
 }

 // Interleaves the two low lanes: { a[0], b[0], a[1], b[1] }.
 _INLINE static sse_f _mm_unpacklo_ps(sse_f a, sse_f b) {
     sse_f out;
     for (int pair = 0; pair < 2; ++pair) {
         out.f[2 * pair]     = a.f[pair];
         out.f[2 * pair + 1] = b.f[pair];
     }
     return out;
 }

 // Lane-wise bitwise XOR of the integer views (_MM_INT) of a and b, in the
 // same way _mm_and_ps and _mm_or_ps operate on bit patterns.
 // The previous body converted each float to int by value, XOR-ed those
 // numbers and converted the result back to float, which is a numeric
 // operation, not a bitwise XOR of the lanes.
 _INLINE static sse_f _mm_xor_ps(sse_f a, sse_f b) {
     sse_f result;

     for (int i = 0; i < VECTOR_SIZE; i++)
         _MM_INT(result,i) = _MM_INT(a,i) ^ _MM_INT(b,i);

     return result;
 }

 // Packs the two 32-bit lanes of a and the two 32-bit lanes of b into four
 // signed 16-bit lanes: { a[0], a[1], b[0], b[1] }.
 // Each value is clamped to [-32768, 32767] before it is narrowed (signed
 // saturation), instead of being silently truncated as before.
 _INLINE static sse_i64 _mm_packs_pi32(sse_i64 a, sse_i64 b) {
     sse_i64 result;

     for (int i = 0; i < 2; i++) {
         int from_a = a.__sse_i64.dw[i];
         int from_b = b.__sse_i64.dw[i];

         if (from_a > 32767) from_a = 32767;
         else if (from_a < -32768) from_a = -32768;
         if (from_b > 32767) from_b = 32767;
         else if (from_b < -32768) from_b = -32768;

         result.__sse_i64.wd[i]     = (signed short)from_a;
         result.__sse_i64.wd[i + 2] = (signed short)from_b;
     }

     return result;
 }

 // Returns an sse_i64 with both doubleword slots set to 0.
 _INLINE static sse_i64 _mm_setzero_si64() {
     sse_i64 zeros;
     zeros.__sse_i64.dw[0] = 0;
     zeros.__sse_i64.dw[1] = 0;
     return zeros;
 }

 // Interleaves the two low 16-bit words: { a[0], b[0], a[1], b[1] }.
 _INLINE static sse_i64 _mm_unpacklo_pi16(sse_i64 a, sse_i64 b) {
     sse_i64 out;
     for (int pair = 0; pair < 2; ++pair) {
         out.__sse_i64.wd[2 * pair]     = a.__sse_i64.wd[pair];
         out.__sse_i64.wd[2 * pair + 1] = b.__sse_i64.wd[pair];
     }
     return out;
 }

 // Interleaves the two high 16-bit words: { a[2], b[2], a[3], b[3] }.
 _INLINE static sse_i64 _mm_unpackhi_pi16(sse_i64 a, sse_i64 b) {
     sse_i64 out;
     for (int pair = 0; pair < 2; ++pair) {
         out.__sse_i64.wd[2 * pair]     = a.__sse_i64.wd[2 + pair];
         out.__sse_i64.wd[2 * pair + 1] = b.__sse_i64.wd[2 + pair];
     }
     return out;
 }


 // Combines the low doublewords: { a.dw[0], b.dw[0] }.
 _INLINE static sse_i64 _mm_unpacklo_pi32(sse_i64 a, sse_i64 b) {
     sse_i64 out;
     out.__sse_i64.dw[1] = b.__sse_i64.dw[0];
     out.__sse_i64.dw[0] = a.__sse_i64.dw[0];
     return out;
 }

 // Combines the high doublewords: { a.dw[1], b.dw[1] }.
 _INLINE static sse_i64 _mm_unpackhi_pi32(sse_i64 a, sse_i64 b) {
     sse_i64 out;
     out.__sse_i64.dw[1] = b.__sse_i64.dw[1];
     out.__sse_i64.dw[0] = a.__sse_i64.dw[1];
     return out;
 }

 // Packs the four 16-bit words of a followed by the four 16-bit words of b
 // into eight bytes: { a[0..3], b[0..3] }.
 // Each signed word is clamped to [0, 255] before it is narrowed (unsigned
 // saturation), instead of being silently truncated as before.  The byte is
 // stored through unsigned char so that values above 127 keep their bit
 // pattern in the char array.
 _INLINE static sse_i64 _mm_packs_pu16(sse_i64 a, sse_i64 b) {
     sse_i64 result;

     for (int i = 0; i < 4; i++) {
         int from_a = a.__sse_i64.wd[i];
         int from_b = b.__sse_i64.wd[i];

         if (from_a > 255) from_a = 255;
         else if (from_a < 0) from_a = 0;
         if (from_b > 255) from_b = 255;
         else if (from_b < 0) from_b = 0;

         result.__sse_i64.b[i]     = (char)(unsigned char)from_a;
         result.__sse_i64.b[i + 4] = (char)(unsigned char)from_b;
     }

     return result;
 }

 // Builds an sse_i from two sse_i64 halves: a becomes l[0], b becomes l[1].
 _INLINE static sse_i _mm_setr_epi64(sse_i64 a, sse_i64 b) {
     sse_i out;
     out.__sse_i.l[1] = b;
     out.__sse_i.l[0] = a;
     return out;
 }

 // Stub (marked FIXME in the original): no control/status register is
 // emulated, so this always returns 0.
 _INLINE static unsigned int _mm_getcsr() {
     const unsigned int csr = 0;
     return csr;
 }

 // Stub (marked FIXME in the original): the value is accepted and ignored.
 _INLINE static void _mm_setcsr(unsigned int v) {
     (void)v;
 }

 /////////////////////////////////////////////////////////////////////////////

 #define _mm_cvtps_pi16(a)                                   \
     _mm_packs_pi32(_mm_cvtps_pi32(a),                       \
                    _mm_cvtps_pi32(_mm_movehl_ps((a), (a))))


 #undef VECTOR_SIZE

 #endif
	#ifndef __RTSOFTSSE_H__
	#define __RTSOFTSSE_H__

	#ifdef _MSC_VER
	#pragma warning(disable: 4244)
	#pragma warning(disable: 4311)
	#endif
	#ifdef __INTEL_COMPILER
	#pragma warning(disable:1684)
	#ifndef _WIN32
	#pragma warning(disable:810)
	#endif
	#endif

	// VECTOR_SIZE is the lane count used throughout this header.  Refuse to
	// silently clobber a definition that already exists.  Note that #error
	// prints its text verbatim, so __FILE__/__LINE__ cannot be spliced into
	// the message.
	#ifdef VECTOR_SIZE
	#error "VECTOR_SIZE is already defined; RTEmulatedSSE.hxx needs it for its own lane count"
	#endif
	#define VECTOR_SIZE 4

	// Software stand-in for a packed single-precision vector: VECTOR_SIZE
	// floats, addressed lane by lane as f[0] .. f[VECTOR_SIZE-1].
	typedef struct sse_f {
	// Empty user-provided constructor: the lanes are left uninitialized.
	sse_f() {}

	float f[VECTOR_SIZE];
	} sse_f;

	// Integer vector half: one piece of storage that can be viewed as
	// 2*VECTOR_SIZE chars, VECTOR_SIZE signed shorts, or VECTOR_SIZE/2 signed
	// ints.  All access goes through the union member __sse_i64.
	typedef struct sse_i64 {
	union {
	char b[2*VECTOR_SIZE];           // byte view
	signed short wd[VECTOR_SIZE];    // 16-bit word view
	signed int dw[VECTOR_SIZE>>1];   // 32-bit doubleword view
	}__sse_i64;
	} sse_i64;

	// Software stand-in for a packed integer vector.  The union __sse_i
	// exposes the same storage as signed bytes, signed 32-bit lanes, unsigned
	// 32-bit lanes, or two sse_i64 halves.
	typedef struct sse_i {
	// Empty user-provided constructor: the lanes are left uninitialized.
	sse_i() {}
	union {
	signed char b[4*VECTOR_SIZE];    // byte view
	signed int i[VECTOR_SIZE];       // signed 32-bit lanes
	unsigned int ui[VECTOR_SIZE];    // unsigned 32-bit lanes
	sse_i64 l[VECTOR_SIZE>>1];       // two sse_i64 halves
	} __sse_i;
	} sse_i;



	// Scalar constants used by the emulation.
	#define _MM_ZERO 0.0f
	#define _MM_ONE 1.0f

	// Lane accessors for a value `x` that has a float array member `f`; both
	// expand to lvalues.
	//   _MM_FP(x,n)  - lane n as a float.
	//   _MM_INT(x,n) - the same storage accessed as an int through a pointer
	//                  cast (relies on the compiler tolerating this punning).
	// _MM_INT must dereference an int pointer to the lane; without the
	// pointer and dereference operators it would cast the lane address to an
	// int and could not be assigned to.  The vector argument is parenthesized
	// so that expressions such as *p can be passed.
	#define _MM_FP(x,n) ((x).f[n])
	#define _MM_INT(x,n) (*((int *)(&((x).f[n]))))

	// All-ones / all-zeros 32-bit masks written by the comparison functions.
	#define _MM_BITONE (0xffffffff)
	#define _MM_BITZERO (0x00000000)

	// Ternary min/max.  Each argument may be evaluated twice, so do not pass
	// expressions with side effects.
	#define _MM_MIN(a,b) (((a) < (b)) ? (a) : (b))
	#define _MM_MAX(a,b) (((a) > (b)) ? (a) : (b))


	#define APPROXIMATE_ROUNDUP_MODES
	#ifdef APPROXIMATE_ROUNDUP_MODES

	// Approximate roundup functions
	// (will not provide exact SSE functionality, but works faster).

	// Approximate "round to nearest": just the plain double -> int cast,
	// which drops the fractional part instead of rounding.
	_INLINE static int _emm_round_nearest(double d) {
		const int converted = (int)d;
		return converted;
	}

	// Truncating double -> int conversion implemented with the plain C cast.
	_INLINE static int _emm_round_trunc(double d) {
		const int truncated = (int)d;
		return truncated;
	}

	// Truncating float -> int conversion implemented with the plain C cast.
	_INLINE static int _mm_round_trunc(float f) {
		const int truncated = (int)f;
		return truncated;
	}

	// Square-root helper: calls sqrt() and narrows the result to float
	// precision before handing it back as a double.
	_INLINE static double _mminternal_sqrt(double src) {
		const float root = (float)sqrt(src);
		return root;
	}

	#else

	// Exact roundup functions.

	// Exact variant (compiled only when APPROXIMATE_ROUNDUP_MODES is not
	// defined): converts d to int on the x87 FPU.  fld pushes d, fistp stores
	// it to result as an integer and pops it.  No rounding mode is set here,
	// so the conversion uses whatever mode the FPU control word currently
	// selects (presumably round-to-nearest, as the name suggests - confirm).
	_INLINE static int _emm_round_nearest(double d) {
	int result;
	__asm {
	fld d
	fistp result
	}
	return result;
	}

	// Exact variant: converts d to int on the x87 FPU with a temporarily
	// modified control word.  Sequence: push d (fld), save the control word
	// (fstcw), OR it with 3072 (0x0C00), load the modified word (fldcw),
	// store the integer (fistp), then restore the saved control word.  eax is
	// preserved with push/pop around the sequence.
	// NOTE(review): 0x0C00 presumably sets the rounding-control field to
	// "truncate", matching the function name - confirm against the x87 manual.
	_INLINE static int _emm_round_trunc(double d) {
	int result;
	int saved_cw;
	int new_cw;
	__asm {
	push eax
	fld d
	fstcw saved_cw
	mov eax, saved_cw
	or eax, 3072
	mov new_cw, eax
	fldcw new_cw
	fistp result
	fldcw saved_cw
	pop eax
	}
	return result;
	}

	// Exact variant for a float argument: same instruction sequence as
	// _emm_round_trunc.  The control word is saved, OR-ed with 3072 (0x0C00)
	// and reloaded before fistp stores the integer; the saved control word is
	// restored afterwards and eax is preserved with push/pop.
	// NOTE(review): 0x0C00 presumably selects truncation - confirm against
	// the x87 manual.
	_INLINE static int _mm_round_trunc(float f) {
	int result;
	int saved_cw;
	int new_cw;
	__asm {
	push eax
	fld f
	fstcw saved_cw
	mov eax, saved_cw
	or eax, 3072
	mov new_cw, eax
	fldcw new_cw
	fistp result
	fldcw saved_cw
	pop eax
	}
	return result;
	}

	// Helper that computes a square root (ported from ICC 9.1).
	// Exact variant: loads src onto the x87 stack (fld), replaces it with its
	// square root (fsqrt) and stores/pops it into result (fstp).  Unlike the
	// approximate variant there is no narrowing cast to float in this code.
	_INLINE static double _mminternal_sqrt(double src) {
	double result;
	_asm {
	fld QWORD PTR src
	fsqrt
	fstp result;
	}

	return result;
	}
	#endif

	// Reports an emulation error on stderr.  By default the process then
	// exits with status 1; when _MM_NO_ABORT is defined only a warning is
	// printed and execution continues.  The "***" markers of the message are
	// restored (this copy had them reduced to a single "*"), and the body is
	// wrapped in do { } while (0) so the macro acts as one statement;
	// terminate each use with a semicolon.
	#ifndef _MM_NO_ABORT
	#define _mminternal_abort(str) \
	do { fprintf(stderr, "*** Functionality intrinsics error: %s ***\n", str); \
	exit(1); } while (0)
	#else
	#define _mminternal_abort(str) \
	do { fprintf(stderr, "*** Functionality intrinsics warning: %s ***\n", str); } while (0)
	#endif

	// Alignment check used by the aligned load/store emulations: reports
	// through _mminternal_abort when addr is not a multiple of 16.  Defining
	// _MM_NO_ALIGN_CHECK turns the check into an empty statement.
	// addr is parenthesized before the cast so that an expression such as
	// p + 1 is tested as a whole, and the body is a single do { } while (0)
	// statement so the macro cannot capture a following else.
	#ifndef _MM_NO_ALIGN_CHECK
	#define _mminternal_assert_16B(addr) \
	do { \
	if ((unsigned long long int)(addr) % 16 != 0) { \
	_mminternal_abort("address must be 16-byte aligned"); \
	} \
	} while (0)
	#else
	#define _mminternal_assert_16B(addr) do { } while (0)
	#endif

	// Lane-wise 32-bit integer addition of a and b.
	// The sum is formed on the unsigned view of each lane so that overflow
	// wraps modulo 2^32 instead of being undefined behaviour for signed int;
	// the result is read back through the same union.
	_INLINE static sse_i _mm_add_epi32(sse_i a, sse_i b) {
		sse_i result;

		for (int i = 0; i < VECTOR_SIZE; i++)
			result.__sse_i.ui[i] = a.__sse_i.ui[i] + b.__sse_i.ui[i];

		return result;
	}

	// Lane-wise float addition: out.f[k] = a.f[k] + b.f[k].
	_INLINE static sse_f _mm_add_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			out.f[lane] = a.f[lane] + b.f[lane];
		}
		return out;
	}

	// Scalar add: lane 0 is a.f[0] + b.f[0]; lanes 1..3 are copied from a.
	_INLINE static sse_f _mm_add_ss(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
			out.f[lane] = a.f[lane];
		}
		out.f[0] = a.f[0] + b.f[0];
		return out;
	}

	// Lane-wise: _MM_INT(out,k) = _MM_INT(a,k) & _MM_INT(b,k).
	_INLINE static sse_f _mm_and_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_INT(out,lane) = _MM_INT(a,lane) & _MM_INT(b,lane);
		}
		return out;
	}

	// Lane-wise bitwise AND of the signed 32-bit lanes of a and b.
	_INLINE static sse_i _mm_and_si128(sse_i a, sse_i b) {
		sse_i out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			out.__sse_i.i[lane] = a.__sse_i.i[lane] & b.__sse_i.i[lane];
		}
		return out;
	}

	// Lane-wise: _MM_INT(out,k) = (~_MM_INT(a,k)) & _MM_INT(b,k).
	_INLINE static sse_f _mm_andnot_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_INT(out,lane) = (~_MM_INT(a,lane)) & _MM_INT(b,lane);
		}
		return out;
	}

	// Lane-wise (~a) & b on the signed 32-bit lanes.
	_INLINE static sse_i _mm_andnot_si128(sse_i a, sse_i b) {
		sse_i out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			out.__sse_i.i[lane] = (~a.__sse_i.i[lane]) & b.__sse_i.i[lane];
		}
		return out;
	}

	// Lane-wise integer equality: 0xffffffff where the lanes match, else 0.
	_INLINE static sse_i _mm_cmpeq_epi32(sse_i a, sse_i b) {
		sse_i mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (a.__sse_i.i[lane] == b.__sse_i.i[lane])
				mask.__sse_i.i[lane] = 0xffffffff;
			else
				mask.__sse_i.i[lane] = 0x00000000;
		}
		return mask;
	}


	// Lane-wise floating-point equality: writes _MM_BITONE where
	// a.f[i] == b.f[i] and _MM_BITZERO otherwise.
	// The float values are compared (not the _MM_INT views), so +0.0f equals
	// -0.0f and a NaN lane never compares equal.  This matches _mm_cmpneq_ps
	// and the other _ps comparisons in this file, which all use _MM_FP for
	// the comparison.
	_INLINE static sse_f _mm_cmpeq_ps(sse_f a, sse_f b) {
		sse_f result;

		for (int i = 0; i < VECTOR_SIZE; i++)
			_MM_INT(result,i) = (_MM_FP(a,i) == _MM_FP(b,i)) ? _MM_BITONE : _MM_BITZERO;

		return result;
	}

	// Lane-wise a >= b on floats: _MM_BITONE where true, else _MM_BITZERO.
	_INLINE static sse_f _mm_cmpge_ps(sse_f a, sse_f b) {
		sse_f mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (_MM_FP(a,lane) >= _MM_FP(b,lane))
				_MM_INT(mask,lane) = _MM_BITONE;
			else
				_MM_INT(mask,lane) = _MM_BITZERO;
		}
		return mask;
	}

	// Lane-wise a > b on floats: _MM_BITONE where true, else _MM_BITZERO.
	_INLINE static sse_f _mm_cmpgt_ps(sse_f a, sse_f b) {
		sse_f mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (_MM_FP(a,lane) > _MM_FP(b,lane))
				_MM_INT(mask,lane) = _MM_BITONE;
			else
				_MM_INT(mask,lane) = _MM_BITZERO;
		}
		return mask;
	}

	// Lane-wise a <= b on floats: _MM_BITONE where true, else _MM_BITZERO.
	_INLINE static sse_f _mm_cmple_ps(sse_f a, sse_f b) {
		sse_f mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (_MM_FP(a,lane) <= _MM_FP(b,lane))
				_MM_INT(mask,lane) = _MM_BITONE;
			else
				_MM_INT(mask,lane) = _MM_BITZERO;
		}
		return mask;
	}

	// Lane-wise signed a < b: 0xffffffff where true, 0 otherwise.
	_INLINE static sse_i _mm_cmplt_epi32(sse_i a, sse_i b) {
		sse_i mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (a.__sse_i.i[lane] < b.__sse_i.i[lane])
				mask.__sse_i.i[lane] = 0xffffffff;
			else
				mask.__sse_i.i[lane] = 0x0;
		}
		return mask;
	}

	// Lane-wise signed a > b: 0xffffffff where true, 0 otherwise.
	_INLINE static sse_i _mm_cmpgt_epi32(sse_i a, sse_i b) {
		sse_i mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (a.__sse_i.i[lane] > b.__sse_i.i[lane])
				mask.__sse_i.i[lane] = 0xffffffff;
			else
				mask.__sse_i.i[lane] = 0x0;
		}
		return mask;
	}

	// Lane-wise a < b on floats: _MM_BITONE where true, else _MM_BITZERO.
	_INLINE static sse_f _mm_cmplt_ps(sse_f a, sse_f b) {
		sse_f mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (_MM_FP(a,lane) < _MM_FP(b,lane))
				_MM_INT(mask,lane) = _MM_BITONE;
			else
				_MM_INT(mask,lane) = _MM_BITZERO;
		}
		return mask;
	}

	// Scalar a <= b: lane 0 receives the mask (_MM_BITONE / _MM_BITZERO),
	// lanes 1..3 are copied from a.
	_INLINE static sse_f _mm_cmple_ss(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_FP(a,lane);
		}
		if (_MM_FP(a,0) <= _MM_FP(b,0))
			_MM_INT(out,0) = _MM_BITONE;
		else
			_MM_INT(out,0) = _MM_BITZERO;
		return out;
	}

	// Lane-wise "not equal" on floats, written as !(a == b): _MM_BITONE where
	// the lanes are not equal, _MM_BITZERO otherwise.
	_INLINE static sse_f _mm_cmpneq_ps(sse_f a, sse_f b) {
		sse_f mask;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			if (!(_MM_FP(a,lane) == _MM_FP(b,lane)))
				_MM_INT(mask,lane) = _MM_BITONE;
			else
				_MM_INT(mask,lane) = _MM_BITZERO;
		}
		return mask;
	}

	// Converts each signed 32-bit lane of a to float.
	_INLINE static sse_f _mm_cvtepi32_ps(sse_i a) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = (float)(a.__sse_i.i[lane]);
		}
		return out;
	}

	// Converts each float lane of a to int through _emm_round_nearest.
	_INLINE static sse_i _mm_cvtps_epi32(sse_f a) {
		sse_i out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			out.__sse_i.i[lane] = _emm_round_nearest(_MM_FP(a,lane));
		}
		return out;
	}

	// Lane 0 becomes (float)b; lanes 1..3 are copied from a through _MM_INT.
	_INLINE static sse_f _mm_cvtsi32_ss(sse_f a, int b) {
		sse_f out;
		for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
			_MM_INT(out,lane) = _MM_INT(a,lane);
		}
		_MM_FP(out,0) = (float) b;
		return out;
	}

	// Converts each float lane of a to int through _emm_round_trunc.
	_INLINE static sse_i _mm_cvttps_epi32(sse_f a) {
		sse_i out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			out.__sse_i.i[lane] = _emm_round_trunc(_MM_FP(a,lane));
		}
		return out;
	}


	// Converts the two low float lanes of a to ints (via _emm_round_nearest)
	// and stores them in the two doubleword slots of an sse_i64.
	_INLINE static sse_i64 _mm_cvtps_pi32(sse_f a) {
		sse_i64 out;
		for (int lane = 0; lane < 2; ++lane) {
			out.__sse_i64.dw[lane] = _emm_round_nearest(_MM_FP(a,lane) );
		}
		return out;
	}

	// Converts lane 0 of a to int through _mm_round_trunc.
	_INLINE static int _mm_cvttss_si32(sse_f a) {
		return _mm_round_trunc(_MM_FP(a,0));
	}

	// Lane-wise float division: out.f[k] = a.f[k] / b.f[k].
	_INLINE static sse_f _mm_div_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_FP(a,lane) / _MM_FP(b,lane);
		}
		return out;
	}


	// Loads VECTOR_SIZE consecutive floats starting at a.  The address is
	// first run through the 16-byte alignment check (_mminternal_assert_16B).
	_INLINE static sse_f _mm_load_ps(float const *a) {
		sse_f loaded;
		_mminternal_assert_16B(a);
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(loaded,lane) = a[lane];
		}
		return loaded;
	}




	// Returns a copy of *p after running p through the 16-byte alignment
	// check.
	_INLINE static sse_i _mm_load_si128(sse_i *p) {
		_mminternal_assert_16B(p);
		sse_i loaded = *p;
		return loaded;
	}

	// Conditional byte store: for each of the 16 bytes, p[k] receives byte k
	// of d when bit 0x80 of byte k of the mask n is set; other bytes of p are
	// left untouched.
	_INLINE static void _mm_maskmoveu_si128(sse_i d, sse_i n, char *p) {
		for (int k = 0; k < 16; ++k) {
			if ((n.__sse_i.b[k] & 0x80) == 0)
				continue;
			p[k] = d.__sse_i.b[k];
		}
	}

	// Lane-wise maximum using _MM_MAX (returns the b lane unless a > b).
	_INLINE static sse_f _mm_max_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_MAX(_MM_FP(a,lane), _MM_FP(b,lane));
		}
		return out;
	}

	// Lane-wise minimum using _MM_MIN (returns the b lane unless a < b).
	_INLINE static sse_f _mm_min_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_MIN(_MM_FP(a,lane), _MM_FP(b,lane));
		}
		return out;
	}

	// Result lanes 0,1 are lanes 2,3 of b; result lanes 2,3 are lanes 2,3
	// of a.
	_INLINE static sse_f _mm_movehl_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 2; lane < 4; ++lane) {
			_MM_FP(out,lane - 2) = _MM_FP(b,lane);
			_MM_FP(out,lane) = _MM_FP(a,lane);
		}
		return out;
	}



	// Builds a 16-bit mask: bit k of the result is set when bit 0x80 (the
	// sign bit) of byte k of a is set.
	// Written as a loop over the bytes.  The previous text contained escaped
	// "\|" sequences (not valid operators), shifted negative signed values,
	// and used an inconsistent ">>4" for byte 7.
	_INLINE static int _mm_movemask_epi8(sse_i a) {
		int result = 0;

		for (int i = 0; i < 4 * VECTOR_SIZE; i++) {
			if (a.__sse_i.b[i] & 0x80)
				result |= 1 << i;
		}

		return result;
	}

	// Builds a 4-bit mask: bit k of the result is the top bit (bit 31) of
	// _MM_INT(a,k).
	// The lane is converted to unsigned before shifting so the result does
	// not depend on how the compiler right-shifts negative ints.  The previous
	// text contained escaped "\|" sequences, which are not valid operators.
	_INLINE static int _mm_movemask_ps(sse_f a) {
		int result = 0;

		for (int i = 0; i < VECTOR_SIZE; i++)
			result |= (int)((((unsigned int)_MM_INT(a,i)) >> 31) << i);

		return result;
	}

	// Scalar multiply: lane 0 is a.f[0] * b.f[0]; lanes 1..3 are copied
	// from a.
	_INLINE static sse_f _mm_mul_ss(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 1; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_FP(a,lane);
		}
		_MM_FP(out,0) = _MM_FP(a,0) * _MM_FP(b,0);
		return out;
	}

	// Lane-wise float multiplication: out.f[k] = a.f[k] * b.f[k].
	_INLINE static sse_f _mm_mul_ps(sse_f a, sse_f b) {
		sse_f out;
		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			_MM_FP(out,lane) = _MM_FP(a,lane) * _MM_FP(b,lane);
		}
		return out;
	}

	/////////////////////////////////////////////////////////////////////////////
	// This is helper function for getting approx value - Port from ICC9.1
	// Reduces the precision of x by clearing the low 11 bits of its 32-bit
	// representation (mask 0xfffff800), accessed through a pointer cast.
	// The pointer and dereference operators are restored: without them the
	// code masked a truncated copy of the address of x and returned x
	// unchanged.
	_INLINE static float _mminternal_approx(float x) {
		unsigned int *p = (unsigned int*)&x;
		*p = *p & 0xfffff800;
		return x;
	}

	// Bitwise OR of two 4-float vectors, performed on the raw 32-bit
	// patterns of each lane (through _MM_INT), not on the float values.
	_INLINE static sse_f _mm_or_ps(sse_f a, sse_f b) {
		sse_f result;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			_MM_INT(result,i) = _MM_INT(a,i) | _MM_INT(b,i);

		return result;
	}

	// Bitwise OR of two integer vectors, computed one 32-bit lane at a time.
	_INLINE static sse_i _mm_or_si128(sse_i a, sse_i b) {
		sse_i result;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			result.__sse_i.i[i] = a.__sse_i.i[i] | b.__sse_i.i[i];

		return result;
	}

	// NOTE: _mm_packs_pi32 is implemented further down in this file,
	// together with the other 64-bit (sse_i64) helpers.

	// Lane-wise reciprocal: every lane of the result is
	// _mminternal_approx(1 / a[lane]).
	_INLINE static sse_f _mm_rcp_ps(sse_f a) {
		sse_f out;
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			out.f[lane] = _mminternal_approx(_MM_ONE / a.f[lane]);
			++lane;
		}

		return out;
	}

	// Scalar reciprocal: lane 0 of the result is _mminternal_approx(1 / a[0]);
	// lanes 1..3 are taken unchanged from a.
	_INLINE static sse_f _mm_rcp_ss(sse_f a) {
		// Copy a first so that only lane 0 has to be overwritten.
		sse_f out = a;

		out.f[0] = _mminternal_approx(_MM_ONE / (a.f[0]));

		return out;
	}
	// Scalar reciprocal square root: lane 0 of the result is
	// _mminternal_approx(1 / _mminternal_sqrt(a[0])); lanes 1..3 are taken
	// unchanged from a.
	// FIXME - find a more efficient way to compute the square root.
	_INLINE static sse_f _mm_rsqrt_ss(sse_f a) {
		// Copy a first so that only lane 0 has to be overwritten.
		sse_f out = a;

		out.f[0] = _mminternal_approx(1.0f / _mminternal_sqrt(a.f[0]));

		return out;
	}

	// Lane-wise reciprocal square root: every lane of the result is
	// _mminternal_approx(1 / _mminternal_sqrt(a[lane])).
	_INLINE static sse_f _mm_rsqrt_ps(sse_f a) {
		sse_f out;
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			out.f[lane] = _mminternal_approx(1.0f / _mminternal_sqrt(a.f[lane]));
			++lane;
		}

		return out;
	}

	// Broadcasts the integer i into all four 32-bit lanes.
	_INLINE static sse_i _mm_set1_epi32(int i) {
		sse_i out;

		for (int lane = 0; lane < 4; ++lane)
			out.__sse_i.i[lane] = i;

		return out;
	}

	// Builds an integer vector from four values given highest lane first:
	// i0 ends up in lane 0 and i3 in lane 3.
	_INLINE static sse_i _mm_set_epi32(int i3, int i2, int i1, int i0) {
		const int by_lane[4] = { i0, i1, i2, i3 };
		sse_i out;

		for (int lane = 0; lane < 4; ++lane)
			out.__sse_i.i[lane] = by_lane[lane];

		return out;
	}


	// Builds a float vector from four values given highest lane first, as
	// the SSE intrinsic of the same name does: the LAST argument (d) goes to
	// lane 0 and the FIRST argument (a) goes to lane 3.
	// This mirrors _mm_set_epi32 in this file; the in-order variant is
	// _mm_setr_ps. (The previous body filled the lanes in argument order,
	// which made it a duplicate of _mm_setr_ps.)
	_INLINE static sse_f _mm_set_ps(float a, float b, float c, float d) {
		sse_f result;

		result.f[0] = d;
		result.f[1] = c;
		result.f[2] = b;
		result.f[3] = a;

		return result;
	}

	// Broadcasts the float a into all four lanes.
	_INLINE static sse_f _mm_set_ps1(float a) {
		sse_f out;
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			out.f[lane] = a;
			++lane;
		}

		return out;
	}

	// Places a in lane 0 and zero in lanes 1..3.
	_INLINE static sse_f _mm_set_ss(float a) {
		sse_f out;

		// Clear the upper lanes from the top down, then store the scalar.
		for (int lane = VECTOR_SIZE - 1; lane >= 1; --lane)
			out.f[lane] = 0.0f;
		out.f[0] = a;

		return out;
	}

	// Builds an integer vector from four values given in lane order:
	// i0 goes to lane 0, i3 to lane 3.
	_INLINE static sse_i _mm_setr_epi32(int i0, int i1, int i2, int i3) {
		const int by_lane[4] = { i0, i1, i2, i3 };
		sse_i out;

		for (int lane = 0; lane < 4; ++lane)
			out.__sse_i.i[lane] = by_lane[lane];

		return out;
	}

	// Builds a float vector from four values given in lane order:
	// a goes to lane 0, d to lane 3.
	_INLINE static sse_f _mm_setr_ps(float a, float b, float c, float d) {
		const float by_lane[4] = { a, b, c, d };
		sse_f out;

		for (int lane = 0; lane < 4; ++lane)
			out.f[lane] = by_lane[lane];

		return out;
	}

	// Returns a float vector with all four lanes set to zero.
	_INLINE static sse_f _mm_setzero_ps(void) {
		sse_f out;
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			out.f[lane] = 0.0f;
			++lane;
		}

		return out;
	}

	// Shifts each 32-bit lane of a left by count bits, shifting in zeros.
	// A count outside 0..31 yields all-zero lanes.
	// The count is compared as an unsigned value so that a negative count is
	// also treated as "too large" instead of reaching the shift operator,
	// and the shift is done on the unsigned view of the lanes because
	// left-shifting a negative signed value is undefined.
	_INLINE static sse_i _mm_slli_epi32(sse_i a, int count) {
		sse_i result;
		const unsigned int n = (unsigned int)count;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			result.__sse_i.ui[i] = (n > 31u) ? 0u : (a.__sse_i.ui[i] << n);

		return result;
	}

	// Permutes the 32-bit lanes of a. Result lane n is the lane of a selected
	// by the two-bit field of imm8 at bit position 2*n.
	_INLINE static sse_i _mm_shuffle_epi32(sse_i a, int imm8) {
		sse_i out;

		for (int lane = 0; lane < VECTOR_SIZE; ++lane) {
			// The mask keeps the selector in 0..3, so it indexes a directly.
			const int sel = (imm8 >> (2 * lane)) & 0x3;
			out.__sse_i.i[lane] = a.__sse_i.i[sel];
		}

		return out;
	}


	// Shuffles lanes of a and b under control of imm8, as the SSE intrinsic
	// of the same name does:
	//   result[0] = a[imm8 bits 1:0]    result[1] = a[imm8 bits 3:2]
	//   result[2] = b[imm8 bits 5:4]    result[3] = b[imm8 bits 7:6]
	// The low two result lanes come from a and the high two from b. (The
	// previous body read all four lanes from b and never used a.)
	_INLINE static sse_f _mm_shuffle_ps(sse_f a, sse_f b, unsigned int imm8) {
		sse_f result;

		result.f[0] = a.f[ (imm8       & 0x3) ];
		result.f[1] = a.f[ ((imm8 >> 2) & 0x3) ];
		result.f[2] = b.f[ ((imm8 >> 4) & 0x3) ];
		result.f[3] = b.f[ ((imm8 >> 6) & 0x3) ];

		return result;
	}

	// Stores the four lanes of b to a[0..3].
	// The emulation performs an ordinary store, one lane at a time, in the
	// same way as _mm_store_ps. (The previous body, "(sse_f)a = b;", cast the
	// pointer itself to a vector instead of writing through it.)
	// _mminternal_assert_16B is defined elsewhere; presumably it checks that
	// a is 16-byte aligned - confirm against its definition.
	_INLINE static void _mm_stream_ps(float *a, sse_f b) {
		_mminternal_assert_16B(a);

		for (int i = 0; i < VECTOR_SIZE; i++)
			a[i] = b.f[i];
	}

	// Shifts each 32-bit lane of a right by count bits, shifting in zeros
	// (the lanes are accessed through their unsigned view).
	// A count outside 0..31 yields all-zero lanes; the count is compared as
	// an unsigned value so that a negative count is also treated as
	// "too large" instead of reaching the shift operator.
	_INLINE static sse_i _mm_srli_epi32(sse_i a, int count) {
		sse_i result;
		const unsigned int n = (unsigned int)count;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			result.__sse_i.ui[i] = (n > 31u) ? 0u : (a.__sse_i.ui[i] >> n);

		return result;
	}


	// Lane-wise square root, computed with _mminternal_sqrt.
	_INLINE static sse_f _mm_sqrt_ps(sse_f a) {
		sse_f out;
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			out.f[lane] = _mminternal_sqrt(a.f[lane]);
			++lane;
		}

		return out;
	}

	// Writes the four lanes of a to v[0..3].
	// FIXME: the address is expected to be 16-byte aligned.
	// TODO: add an assertion for that; no alignment check is done here.
	_INLINE static void _mm_store_ps(float *v, sse_f a) {
		int lane = 0;

		while (lane < VECTOR_SIZE) {
			v[lane] = a.f[lane];
			++lane;
		}
	}

	// Lane-wise 32-bit subtraction a - b.
	// The subtraction is carried out on the unsigned view (ui) of the lanes
	// so that an out-of-range difference wraps modulo 2^32 instead of being
	// a signed overflow, which is undefined behaviour.
	_INLINE static sse_i _mm_sub_epi32(sse_i a, sse_i b) {
		sse_i result;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			result.__sse_i.ui[i] = a.__sse_i.ui[i] - b.__sse_i.ui[i];

		return result;
	}


	// Scalar subtract: lane 0 of the result is a[0] - b[0]; lanes 1..3 are
	// taken unchanged from a.
	_INLINE static sse_f _mm_sub_ss(sse_f a, sse_f b) {
		// Copy a first so that only lane 0 has to be overwritten.
		sse_f out = a;

		out.f[0] = a.f[0] - b.f[0];

		return out;
	}

	// Lane-wise difference a - b of two 4-float vectors.
	_INLINE static sse_f _mm_sub_ps(sse_f a, sse_f b) {
		sse_f out;
		int lane = VECTOR_SIZE;

		// Lanes are independent, so the iteration order does not matter.
		while (lane-- > 0)
			out.f[lane] = a.f[lane] - b.f[lane];

		return out;
	}

	// Interleaves the high two lanes of a and b:
	// result = { a[2], b[2], a[3], b[3] }.
	_INLINE static sse_f _mm_unpackhi_ps(sse_f a, sse_f b) {
		sse_f out;

		for (int k = 0; k < 2; ++k) {
			out.f[2 * k]     = a.f[k + 2];
			out.f[2 * k + 1] = b.f[k + 2];
		}

		return out;
	}

	// Interleaves the low two lanes of a and b:
	// result = { a[0], b[0], a[1], b[1] }.
	_INLINE static sse_f _mm_unpacklo_ps(sse_f a, sse_f b) {
		sse_f out;

		for (int k = 0; k < 2; ++k) {
			out.f[2 * k]     = a.f[k];
			out.f[2 * k + 1] = b.f[k];
		}

		return out;
	}

	// Bitwise XOR of two 4-float vectors, performed on the raw 32-bit
	// patterns of each lane (through _MM_INT), in the same way as _mm_or_ps.
	// (The previous body converted each float VALUE to int, XORed those
	// integers and converted the result back to float, which is not a
	// bitwise operation on the lanes.)
	_INLINE static sse_f _mm_xor_ps(sse_f a, sse_f b) {
		sse_f result;

		//#pragma unroll(4)
		for (int i = 0; i < VECTOR_SIZE; i++)
			_MM_INT(result,i) = _MM_INT(a,i) ^ _MM_INT(b,i);

		return result;
	}

	// Packs the two 32-bit values of a and the two of b into four signed
	// 16-bit values: result words 0..1 come from a, words 2..3 from b.
	// Each value is saturated to the signed 16-bit range [-32768, 32767]
	// before it is narrowed, matching the signed-saturating pack performed by
	// the intrinsic being emulated. (The previous body narrowed without
	// clamping, so out-of-range inputs were truncated.)
	_INLINE static sse_i64 _mm_packs_pi32(sse_i64 a, sse_i64 b) {
		sse_i64 result;

		for (int i = 0; i < (VECTOR_SIZE>>1); i++) {
			const int from_a = _MM_MAX(-32768, _MM_MIN(32767, a.__sse_i64.dw[i]));
			const int from_b = _MM_MAX(-32768, _MM_MIN(32767, b.__sse_i64.dw[i]));

			result.__sse_i64.wd[i]                    = (signed short)from_a;
			result.__sse_i64.wd[i + (VECTOR_SIZE>>1)] = (signed short)from_b;
		}

		return result;
	}

	// Returns a 64-bit vector with both 32-bit halves set to zero.
	_INLINE static sse_i64 _mm_setzero_si64() {
		sse_i64 out;

		for (int half = 0; half < 2; ++half)
			out.__sse_i64.dw[half] = 0;

		return out;
	}

	// Interleaves the low two 16-bit words of a and b:
	// result = { a[0], b[0], a[1], b[1] }.
	_INLINE static sse_i64 _mm_unpacklo_pi16(sse_i64 a, sse_i64 b) {
		sse_i64 out;

		for (int k = 0; k < 2; ++k) {
			out.__sse_i64.wd[2 * k]     = a.__sse_i64.wd[k];
			out.__sse_i64.wd[2 * k + 1] = b.__sse_i64.wd[k];
		}

		return out;
	}

	// Interleaves the high two 16-bit words of a and b:
	// result = { a[2], b[2], a[3], b[3] }.
	_INLINE static sse_i64 _mm_unpackhi_pi16(sse_i64 a, sse_i64 b) {
		sse_i64 out;

		for (int k = 0; k < 2; ++k) {
			out.__sse_i64.wd[2 * k]     = a.__sse_i64.wd[k + 2];
			out.__sse_i64.wd[2 * k + 1] = b.__sse_i64.wd[k + 2];
		}

		return out;
	}


	// Combines the low 32-bit halves of a and b: result = { a[0], b[0] }.
	_INLINE static sse_i64 _mm_unpacklo_pi32(sse_i64 a, sse_i64 b) {
		const int low_of_a = a.__sse_i64.dw[0];
		const int low_of_b = b.__sse_i64.dw[0];
		sse_i64 out;

		out.__sse_i64.dw[0] = low_of_a;
		out.__sse_i64.dw[1] = low_of_b;

		return out;
	}

	// Combines the high 32-bit halves of a and b: result = { a[1], b[1] }.
	_INLINE static sse_i64 _mm_unpackhi_pi32(sse_i64 a, sse_i64 b) {
		const int high_of_a = a.__sse_i64.dw[1];
		const int high_of_b = b.__sse_i64.dw[1];
		sse_i64 out;

		out.__sse_i64.dw[0] = high_of_a;
		out.__sse_i64.dw[1] = high_of_b;

		return out;
	}

	// Packs the four signed 16-bit words of a and the four of b into eight
	// bytes: result bytes 0..3 come from a, bytes 4..7 from b.
	// Each word is saturated to the unsigned 8-bit range [0, 255] before it
	// is narrowed, matching the unsigned-saturating pack performed by the
	// intrinsic being emulated. (The previous body narrowed without
	// clamping, so out-of-range inputs were truncated.)
	_INLINE static sse_i64 _mm_packs_pu16(sse_i64 a, sse_i64 b) {
		sse_i64 result;

		for (int i = 0; i < VECTOR_SIZE; i++) {
			const int from_a = _MM_MAX(0, _MM_MIN(255, (int)a.__sse_i64.wd[i]));
			const int from_b = _MM_MAX(0, _MM_MIN(255, (int)b.__sse_i64.wd[i]));

			// Go through unsigned char so the stored byte holds the clamped
			// value's bit pattern whatever the signedness of plain char.
			result.__sse_i64.b[i]               = (char)(unsigned char)from_a;
			result.__sse_i64.b[i + VECTOR_SIZE] = (char)(unsigned char)from_b;
		}

		return result;
	}

	// Builds a 128-bit integer vector from two 64-bit halves given in order:
	// a becomes the low half (l[0]) and b the high half (l[1]).
	_INLINE static sse_i _mm_setr_epi64(sse_i64 a, sse_i64 b) {
		sse_i out;

		// The two halves are independent; fill the upper one first.
		out.__sse_i.l[1] = b;
		out.__sse_i.l[0] = a;

		return out;
	}

	// Stub: the emulation keeps no control/status register state, so this
	// always reports 0.
	// FIXME: not implemented.
	_INLINE static unsigned int _mm_getcsr() {
		const unsigned int csr = 0;
		return csr;
	}

	// Stub: accepts a control/status register value and ignores it.
	// FIXME: not implemented.
	_INLINE static void _mm_setcsr(unsigned int v) {
		(void)v;	// intentionally unused
	}

	/////////////////////////////////////////////////////////////////////////////

	// Converts a 4-float vector to four packed 16-bit integers: the vector
	// itself and its high half moved down by _mm_movehl_ps are each passed
	// through _mm_cvtps_pi32 (defined elsewhere in this file), and the two
	// results are combined with _mm_packs_pi32.
	// Note: the argument is expanded three times.
	#define _mm_cvtps_pi16(a) \
		_mm_packs_pi32(_mm_cvtps_pi32((a)), \
		               _mm_cvtps_pi32(_mm_movehl_ps((a), (a))))


	#undef VECTOR_SIZE

	#endif