blob: 3a3dff877679bd535fed86909bddd51b3d966579 [file] [log] [blame]
/*
BG/Q tuned version of short force evaluation kernel: 81% of peak performance
Argonne Leadership Computing Facility, Argonne, IL 60439
Vitali Morozov (morozov@anl.gov)
*/
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
//#include <spi/include/l1p/sprefetch.h>
//#include </opt/ibmcmp/vacpp/bg/12.1/include/builtins.h>
//#include </soft/compilers/ibmcmp-may2012/vacpp/bg/12.1/include/builtins.h>
//#undef __bgq__
#ifdef __bgq__
#include IBMCMP_BUILTINS
int isAligned( void *in );
/*
 Step16_int: QPX-vectorized short-range gravitational force kernel (BG/Q).

 Accumulates into *ax, *ay, *az the short-range force exerted on a particle
 at (xxi, yyi, zzi) by count1 neighbours whose coordinates and masses are
 in xx1/yy1/zz1/mass1.  The scalar remainder loop at the bottom is the
 readable reference for the math; per neighbour:

   r2 = dx*dx + dy*dy + dz*dz
   f  = (r2 + mp_rsm2)^(-3/2) - P5(r2)     (P5 = degree-5 poly, a10..a15)
   force += m * f * (dx, dy, dz)

 where m is zeroed when r2 >= fsrrmax2 (cutoff) and the whole term is
 zeroed when r2 <= 0 (skips a particle interacting with itself).
 Note: *ax/*ay/*az are OVERWRITTEN by the vector partial sums, then the
 scalar tail adds on top -- the caller's prior values are not accumulated.

 NOTE(review): the vec_ld loads assume xx1/yy1/zz1/mass1 are suitably
 aligned for QPX (vec_ld truncates low address bits) -- confirm with the
 isAligned() helper below.
*/
void Step16_int( int count1, float xxi, float yyi, float zzi, float fsrrmax2, float mp_rsm2, float *xx1, float *yy1, float *zz1, float *mass1, float *ax, float *ay, float *az )
{
int i = 0, j, k;
/* prefetch distance: 32 floats = 4 loop iterations ahead of the loads */
const int offset = 32;
vector4double b0, b1, b2, b3, b4, b5, b6;
vector4double c0, c1, c2, c3, c4, c5, c6;
vector4double a0 = (vector4double){ 0.5, 0.5, 0.5, 0.5 }; /* Newton-Raphson rsqrt constant */
/* a1..a3: broadcast of the target particle position */
vector4double a1 = (vector4double){ (double)xxi, (double)xxi, (double)xxi, (double)xxi };
vector4double a2 = (vector4double){ (double)yyi, (double)yyi, (double)yyi, (double)yyi };
vector4double a3 = (vector4double){ (double)zzi, (double)zzi, (double)zzi, (double)zzi };
vector4double a4 = (vector4double){ (double)fsrrmax2, (double)fsrrmax2, (double)fsrrmax2, (double)fsrrmax2 }; /* cutoff radius^2 */
vector4double a5 = (vector4double){ (double)mp_rsm2, (double)mp_rsm2, (double)mp_rsm2, (double)mp_rsm2 }; /* softening^2 */
vector4double a6 = (vector4double){ 0., 0., 0., 0. }; /* zero: madd addend and select source */
/* a7..a9: per-lane force accumulators for ax/ay/az */
vector4double a7 = (vector4double){ 0., 0., 0., 0. };
vector4double a8 = (vector4double){ 0., 0., 0., 0. };
vector4double a9 = (vector4double){ 0., 0., 0., 0. };
/* a10..a15: degree-5 polynomial coefficients in r2 (same values as
   ma0..ma5 in the scalar tail below) */
vector4double a10 = (vector4double){ 0.269327, 0.269327, 0.269327, 0.269327 };
vector4double a11 = (vector4double){ -0.0750978, -0.0750978, -0.0750978, -0.0750978 };
vector4double a12 = (vector4double){ 0.0114808, 0.0114808, 0.0114808, 0.0114808 };
vector4double a13 = (vector4double){ -0.00109313, -0.00109313, -0.00109313, -0.00109313 };
vector4double a14 = (vector4double){ 0.0000605491, 0.0000605491, 0.0000605491, 0.0000605491 };
vector4double a15 = (vector4double){ -0.00000147177, -0.00000147177, -0.00000147177, -0.00000147177 };
/*
int32_t depth = 3;
L1P_SetStreamPolicy( L1P_stream_confirmed );
L1P_SetStreamDepth(depth);
*/
/* Main loop: 8 neighbours per iteration as two 4-wide QPX halves
   (b* = elements i..i+3, c* = i+4..i+7).  j is the byte offset into
   the float arrays: 8 floats * 4 bytes = 32 per iteration. */
for ( i = 0, j = 0; i < count1-7; i = i + 8, j = j + 32 )
{
/* touch the cache lines a few iterations ahead of the loads */
__dcbt( (void *)&xx1 [i+offset] );
__dcbt( (void *)&yy1 [i+offset] );
__dcbt( (void *)&zz1 [i+offset] );
__dcbt( (void *)&mass1[i+offset] );
/* load 4+4 neighbour coordinates (QPX widens float lanes to double) */
b0 = vec_ld( j , xx1 );
c0 = vec_ld( j+16, xx1 );
b1 = vec_ld( j , yy1 );
c1 = vec_ld( j+16, yy1 );
b2 = vec_ld( j , zz1 );
c2 = vec_ld( j+16, zz1 );
/* dx, dy, dz for both halves */
b3 = vec_sub( b0, a1 );
c3 = vec_sub( c0, a1 );
b4 = vec_sub( b1, a2 );
c4 = vec_sub( c1, a2 );
b5 = vec_sub( b2, a3 );
c5 = vec_sub( c2, a3 );
/* r2 = dx^2 + dy^2 + dz^2, kept in b6/c6 for the polynomial and cutoff */
b0 = vec_madd( b3, b3, a6 );
c0 = vec_madd( c3, c3, a6 );
b0 = vec_madd( b4, b4, b0 );
c0 = vec_madd( c4, c4, c0 );
b6 = vec_madd( b5, b5, b0 );
c6 = vec_madd( c5, c5, c0 );
/* recompute r2 into b0/c0 -- numerically identical to b6/c6; presumably
   duplicated for instruction scheduling (TODO confirm) */
b0 = vec_madd( b5, b5, b0 );
c0 = vec_madd( c5, c5, c0 );
/* (r2 + mp_rsm2)^3 */
b0 = vec_add( b0, a5 );
c0 = vec_add( c0, a5 );
b1 = vec_madd( b0, b0, a6 );
c1 = vec_madd( c0, c0, a6 );
b0 = vec_madd( b1, b0, a6 );
c0 = vec_madd( c1, c0, a6 );
/* estimate y ~= 1/sqrt(x) with x = (r2+rsm2)^3, then one Newton-Raphson
   refinement y' = y + y*0.5*(1 - x*y*y), giving (r2+rsm2)^(-3/2) */
b1 = vec_rsqrte( b0 );
c1 = vec_rsqrte( c0 );
b2 = vec_madd( b1, b1, a6 );
c2 = vec_madd( c1, c1, a6 );
b0 = vec_madd( b0, b2, a6 );
c0 = vec_madd( c0, c2, a6 );
b0 = vec_nmsub( a0, b0, a0 );
c0 = vec_nmsub( a0, c0, a0 );
b0 = vec_madd( b1, b0, b1 );
c0 = vec_madd( c1, c0, c1 );
/* P5(r2) by Horner's rule (coefficients a15 down to a10) */
b1 = vec_madd( b6, a15, a14 );
c1 = vec_madd( c6, a15, a14 );
b1 = vec_madd( b6, b1, a13 );
c1 = vec_madd( c6, c1, a13 );
b1 = vec_madd( b6, b1, a12 );
c1 = vec_madd( c6, c1, a12 );
b1 = vec_madd( b6, b1, a11 );
c1 = vec_madd( c6, c1, a11 );
b1 = vec_madd( b6, b1, a10 );
c1 = vec_madd( c6, c1, a10 );
/* f = (r2+rsm2)^(-3/2) - P5(r2) */
b0 = vec_sub( b0, b1 );
c0 = vec_sub( c0, c1 );
/* load masses; select zero where r2 - fsrrmax2 >= 0 (outside the
   cutoff), matching the scalar tail's (r2 < fsrrmax2) ? mass : 0 */
b1 = vec_ld( j , mass1 );
c1 = vec_ld( j+16, mass1 );
b2 = vec_sub( b6, a4 );
c2 = vec_sub( c6, a4 );
b1 = vec_sel( b1, a6, b2 );
c1 = vec_sel( c1, a6, c2 );
/* m * f */
b1 = vec_madd( b1, b0, a6 );
c1 = vec_madd( c1, c0, a6 );
/* zero the whole term where -r2 >= 0, i.e. r2 <= 0 (self-interaction),
   matching the tail's (r2 > 0) ? m*f : 0 */
b2 = vec_sub( a6, b6 );
c2 = vec_sub( a6, c6 );
b0 = vec_sel( b1, a6, b2 );
c0 = vec_sel( c1, a6, c2 );
/* accumulate f*dx, f*dy, f*dz into the lane accumulators */
a7 = vec_madd( b0, b3, a7 );
a7 = vec_madd( c0, c3, a7 );
a8 = vec_madd( b0, b4, a8 );
a8 = vec_madd( c0, c4, a8 );
a9 = vec_madd( b0, b5, a9 );
a9 = vec_madd( c0, c5, a9 );
}
/* horizontal reduction of the 4-lane partial sums (overwrites outputs) */
*ax = ( a7[0] + a7[1] + a7[2] + a7[3] );
*ay = ( a8[0] + a8[1] + a8[2] + a8[3] );
*az = ( a9[0] + a9[1] + a9[2] + a9[3] );
/* Scalar remainder loop for the last count1 % 8 neighbours.
   NOTE(review): this path uses double-precision pow(...,-1.5) where the
   vector path used rsqrte + one NR step, so tail terms can differ from
   the vector path's result in the last few bits. */
const float ma0 = 0.269327, ma1 = -0.0750978, ma2 = 0.0114808, ma3 = -0.00109313, ma4 = 0.0000605491, ma5 = -0.00000147177;
float dxc, dyc, dzc, m, r2, f;
for ( k = i; k < count1; k++ )
{
dxc = xx1[k] - xxi;
dyc = yy1[k] - yyi;
dzc = zz1[k] - zzi;
r2 = dxc * dxc + dyc * dyc + dzc * dzc;
m = ( r2 < fsrrmax2 ) ? mass1[k] : 0.0f;
f = pow( r2 + mp_rsm2, -1.5 ) - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5)))));
f = ( r2 > 0.0f ) ? m * f : 0.0f;
*ax = *ax + f * dxc;
*ay = *ay + f * dyc;
*az = *az + f * dzc;
}
}
/*
 Report the largest alignment (up to 32 bytes) satisfied by a pointer.

 Returns 32, 16, 8, or 4 when the address is a multiple of that value,
 or -1 when it is not even 4-byte aligned.  (A NULL pointer reports 32,
 as in the original.)

 Fix: the original cast the pointer to (int), truncating addresses on
 64-bit platforms (BG/Q is 64-bit PowerPC) and relying on
 implementation-defined pointer-to-int conversion.  Use uintptr_t, the
 type defined for pointer<->integer round-trips (CERT INT36-C).
*/
int isAligned(void *in){
    const uintptr_t addr = (uintptr_t)in;
    if ((addr & (uintptr_t)31) == 0)
        return 32;
    if ((addr & (uintptr_t)15) == 0)
        return 16;
    if ((addr & (uintptr_t)7) == 0)
        return 8;
    if ((addr & (uintptr_t)3) == 0)
        return 4;
    return -1;
}
#endif