blob: 7143ded80e2bd20d8b12d2fe4ccc17b078db12a8 [file] [log] [blame]
/*
BG/Q tuned version of HACC: 69.2% of peak performance on 96 racks of Sequoia
Argonne Leadership Computing Facility, Argonne, IL 60439
Vitali Morozov (morozov@anl.gov)
*/
//#undef __bgq__
#ifdef __bgq__
//#include </soft/compilers/ibmcmp-feb2012/vacpp/bg/12.1/include/builtins.h>
#include IBMCMP_BUILTINS
int isAligned(void *in);
void cm( int count, float *xx, float *yy, float *zz, float *mass, float *xmin, float *xmax, float *xc)
{
// xmin/xmax are currently set to the whole bounding box, but this is too conservative, so we'll
// set them based on the actual particle content.
double x, y, z, m, w;
double xa, ya, za, ma;
int i, j, k;
float x1, x2, y1, y2, z1, z2;
vector4double xv, yv, zv, wv, dv0, dv1, dv2, dv3, dv4, dv5;
vector4double xi0, xi1, yi0, yi1, zi0, zi1;
vector4double xs, ys, zs, ms;
int ALXX, ALYY, ALZZ, ALMM;
ALXX = isAligned( (void *)xx );
//ALYY = isAligned( (void *)yy );
//ALZZ = isAligned( (void *)zz );
//ALMM = isAligned( (void *)mass );
i = 0; j = 0; k = 0;
if ( ( ALXX == 4 ) && ( isAligned( (void *)&xx[1]) == 8 ) ) i = 3;
if ( ( ALXX == 4 ) && ( isAligned( (void *)&xx[1]) >= 16 ) ) i = 1;
if ( ( ALXX == 8 ) ) i = 2;
ma = 0.; xa = 0.; ya = 0.; za = 0.;
x1 = xx[0]; x2 = xx[0];
y1 = yy[0]; y2 = yy[0];
z1 = zz[0]; z2 = zz[0];
for ( k = 0; k < i; k++ )
{
if ( x1 > xx[k] ) x1 = xx[k];
if ( x2 < xx[k] ) x2 = xx[k];
if ( y1 > yy[k] ) y1 = yy[k];
if ( y2 < yy[k] ) y2 = yy[k];
if ( z1 > zz[k] ) z1 = zz[k];
if ( z2 < zz[k] ) z2 = zz[k];
w = mass[k];
xa = xa + w * xx[k];
ya = ya + w * yy[k];
za = za + w * zz[k];
ma = ma + w;
}
xi0 = vec_splats( (double)x1 );
xi1 = vec_splats( (double)x2 );
yi0 = vec_splats( (double)y1 );
yi1 = vec_splats( (double)y2 );
zi0 = vec_splats( (double)z1 );
zi1 = vec_splats( (double)z2 );
xs = vec_splats( 0. );
ys = vec_splats( 0. );
zs = vec_splats( 0. );
ms = vec_splats( 0. );
for ( i = k, j = k * 4; i < count-3; i = i + 4, j = j + 16 )
{
xv = vec_lda( j, xx );
yv = vec_lda( j, yy );
zv = vec_lda( j, zz );
wv = vec_lda( j, mass );
dv0 = vec_cmpgt( xi0, xv );
dv1 = vec_cmplt( xi1, xv );
dv2 = vec_cmpgt( yi0, yv );
dv3 = vec_cmplt( yi1, yv );
dv4 = vec_cmpgt( zi0, zv );
dv5 = vec_cmplt( zi1, zv );
xi0 = vec_sel( xi0, xv, dv0 );
xi1 = vec_sel( xi1, xv, dv1 );
yi0 = vec_sel( yi0, yv, dv2 );
yi1 = vec_sel( yi1, yv, dv3 );
zi0 = vec_sel( zi0, zv, dv4 );
zi1 = vec_sel( zi1, zv, dv5 );
xs = vec_madd( wv, xv, xs );
ys = vec_madd( wv, yv, ys );
zs = vec_madd( wv, zv, zs );
ms = vec_add( ms, wv );
}
if ( i > 0 )
{
if ( x1 > xi0[0] ) x1 = xi0[0];
if ( x1 > xi0[1] ) x1 = xi0[1];
if ( x1 > xi0[2] ) x1 = xi0[2];
if ( x1 > xi0[3] ) x1 = xi0[3];
if ( x2 < xi1[0] ) x2 = xi1[0];
if ( x2 < xi1[1] ) x2 = xi1[1];
if ( x2 < xi1[2] ) x2 = xi1[2];
if ( x2 < xi1[3] ) x2 = xi1[3];
if ( y1 > yi0[0] ) y1 = yi0[0];
if ( y1 > yi0[1] ) y1 = yi0[1];
if ( y1 > yi0[2] ) y1 = yi0[2];
if ( y1 > yi0[3] ) y1 = yi0[3];
if ( y2 < yi1[0] ) y2 = yi1[0];
if ( y2 < yi1[1] ) y2 = yi1[1];
if ( y2 < yi1[2] ) y2 = yi1[2];
if ( y2 < yi1[3] ) y2 = yi1[3];
if ( z1 > zi0[0] ) z1 = zi0[0];
if ( z1 > zi0[1] ) z1 = zi0[1];
if ( z1 > zi0[2] ) z1 = zi0[2];
if ( z1 > zi0[3] ) z1 = zi0[3];
if ( z2 < zi1[0] ) z2 = zi1[0];
if ( z2 < zi1[1] ) z2 = zi1[1];
if ( z2 < zi1[2] ) z2 = zi1[2];
if ( z2 < zi1[3] ) z2 = zi1[3];
xa = xa + ( xs[0] + xs[1] + xs[2] + xs[3] );
ya = ya + ( ys[0] + ys[1] + ys[2] + ys[3] );
za = za + ( zs[0] + zs[1] + zs[2] + zs[3] );
ma = ma + ( ms[0] + ms[1] + ms[2] + ms[3] );
}
for ( k = i; k < count; k++ )
{
if ( x1 > xx[k] ) x1 = xx[k];
if ( x2 < xx[k] ) x2 = xx[k];
if ( y1 > yy[k] ) y1 = yy[k];
if ( y2 < yy[k] ) y2 = yy[k];
if ( z1 > zz[k] ) z1 = zz[k];
if ( z2 < zz[k] ) z2 = zz[k];
w = mass[k];
xa = xa + w * xx[k];
ya = ya + w * yy[k];
za = za + w * zz[k];
ma = ma + w;
}
xmin[0] = x1; xmax[0] = x2;
xmin[1] = y1; xmax[1] = y2;
xmin[2] = z1; xmax[2] = z2;
xc[0] = (float) ( xa / ma);
xc[1] = (float) ( ya / ma);
xc[2] = (float) ( za / ma);
return;
}
#else
#include <math.h>
/*
static inline void cm(ID_T count, const POSVEL_T* __restrict xx, const POSVEL_T* __restrict yy,
const POSVEL_T* __restrict zz, const POSVEL_T* __restrict mass,
POSVEL_T* __restrict xmin, POSVEL_T* __restrict xmax, POSVEL_T* __restrict xc)
*/
void cm( int count, float *xx, float *yy, float *zz, float *mass, float *xmin, float *xmax, float *xc)
{
// xmin/xmax are currently set to the whole bounding box, but this is too conservative, so we'll
// set them based on the actual particle content.
double x = 0, y = 0, z = 0, m = 0;
for (int i = 0; i < count; ++i) {
if (i == 0) {
xmin[0] = xmax[0] = xx[0];
xmin[1] = xmax[1] = yy[0];
xmin[2] = xmax[2] = zz[0];
} else {
xmin[0] = fminf(xmin[0], xx[i]);
xmax[0] = fmaxf(xmax[0], xx[i]);
xmin[1] = fminf(xmin[1], yy[i]);
xmax[1] = fmaxf(xmax[1], yy[i]);
xmin[2] = fminf(xmin[2], zz[i]);
xmax[2] = fmaxf(xmax[2], zz[i]);
}
float w = mass[i];
x += w*xx[i];
y += w*yy[i];
z += w*zz[i];
m += w;
}
xc[0] = (float) (x/m);
xc[1] = (float) (y/m);
xc[2] = (float) (z/m);
}
#endif