src/gpu/halo-finder/src/dfft/solver.hpp - public/gem5-resources - Git at Google

 #ifndef SOLVER_HPP
 #define SOLVER_HPP

 #include "complex-type.h"

 #if !defined(FFTW3) && defined(PENCIL)
 #error PENCIL FFT REQUIRES FFTW3
 #endif

 #if defined(FFTW3) && defined(PENCIL)
 #ifdef ESSL_FFTW
 #include <fftw3_essl.h>
 #else
 #include <fftw3.h>
 #endif
 #elif defined(FFTW3)
 #include <fftw3-mpi.h>
 #else
 #include <fftw_mpi.h>
 #endif

 #include <algorithm>
 #include <vector>
 #include <cassert>

 #include "allocator.hpp"
 #include "distribution.hpp"

 #include "bigchunk.h"

 #define CERRILLOS_SLAB_HACK 1

 const double pi = 3.14159265358979323846;

 // pgCC doesn't yet play well with C99 constructs, so...
 #ifdef __PGI__
 extern "C" long int lrint(double x);
 #endif

 #define FFTW_ADDR(X) reinterpret_cast<fftw_complex*>(&(X)[0])

 ///
 // Abstract base class for Poisson solvers.
 //
 // Derived classes must provide their own implementation of
 // initialize_greens_function(), and, if necessary, override the
 // backward_solve() and backward_solve_gradient() methods.
 ///
 class SolverBase : public Distribution {

 public:

   // methods

   SolverBase()
   {
   }

   SolverBase(MPI_Comm comm, int ng)
     : Distribution(comm, ng)
   {
     initialize(comm);
   }

   SolverBase(MPI_Comm comm, std::vector<int> const & n)
     : Distribution(comm, n)
   {
     initialize(comm);
   }

   virtual ~SolverBase()
   {
 #if defined(FFTW3) && defined(PENCIL)
     fftw_destroy_plan(m_plan_f_x);
     fftw_destroy_plan(m_plan_f_y);
     fftw_destroy_plan(m_plan_f_z);
     fftw_destroy_plan(m_plan_b_x);
     fftw_destroy_plan(m_plan_b_y);
     fftw_destroy_plan(m_plan_b_z);
 #elif defined(FFTW3)
     fftw_destroy_plan(m_plan_f);
     fftw_destroy_plan(m_plan_b);
 #else
     fftwnd_mpi_destroy_plan(m_plan_f);
     fftwnd_mpi_destroy_plan(m_plan_b);
 #endif
   }

   // solve interfaces

   void forward_solve(complex_t const *rho)
   {
 #if defined(FFTW3) && defined(PENCIL)

     distribution_3_to_2(rho, &m_buf1[0], &m_d, 0);            // rho  --> buf1
     fftw_execute(m_plan_f_x);                                 // buf1 --> buf2
     distribution_2_to_3(&m_buf2[0], &m_buf1[0], &m_d, 0);     // buf2 --> buf1
     distribution_3_to_2(&m_buf1[0], &m_buf2[0], &m_d, 1);     // buf1 --> buf2
     fftw_execute(m_plan_f_y);                                 // buf2 --> buf1
     distribution_2_to_3(&m_buf1[0], &m_buf2[0], &m_d, 1);     // buf1 --> buf2
     distribution_3_to_2(&m_buf2[0], &m_buf1[0], &m_d, 2);     // buf2 --> buf1
     fftw_execute(m_plan_f_z);                                 // buf1 --> buf2

 #elif defined(FFTW3)
     distribution_3_to_1(rho, &m_buf1[0], &m_d);             // rho  --> buf1
     fftw_execute(m_plan_f);                                 // buf1 --> buf2
 #else
     distribution_3_to_1(rho, &m_buf2[0], &m_d);             // rho  --> buf2
     fftwnd_mpi(m_plan_f, 1,
 	       FFTW_ADDR(m_buf2),
 	       FFTW_ADDR(m_buf3),
 	       FFTW_NORMAL_ORDER);                          // buf2 -->buf2
 #endif
   }

   void backward_solve(complex_t *phi)
   {
     kspace_solve(&m_buf2[0], &m_buf1[0]);                   // buf2 --> buf1
 #if defined(FFTW3) && defined(PENCIL)
     fftw_execute(m_plan_b_z);                               // buf1 --> buf3
     distribution_2_to_3(&m_buf3[0], &m_buf1[0], &m_d, 2);   // buf3 --> buf1
     distribution_3_to_2(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
     fftw_execute(m_plan_b_y);                               // buf3 --> buf1
     distribution_2_to_3(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
     distribution_3_to_2(&m_buf3[0], &m_buf1[0], &m_d, 0);   // buf3 --> buf1
     fftw_execute(m_plan_b_x);                               // buf1 --> buf3
     distribution_2_to_3(&m_buf3[0], phi, &m_d, 0);          // buf3 --> phi
 #elif defined(FFTW3)
     fftw_execute(m_plan_b);                                 // buf1 --> buf3
     distribution_1_to_3(&m_buf3[0], phi, &m_d);             // buf3 --> phi
 #else
     fftwnd_mpi(m_plan_b, 1,
 	       (fftw_complex *) &m_buf1[0],
 	       (fftw_complex *) &m_buf3[0],
 	       FFTW_NORMAL_ORDER);                           // buf1 -->buf1
     distribution_1_to_3(&m_buf1[0], phi, &m_d);              // buf1 --> phi
 #endif
   }

   void backward_solve_gradient(int axis, complex_t *grad_phi)
   {
     kspace_solve_gradient(axis, &m_buf2[0], &m_buf1[0]);    // buf2 --> buf1
 #if defined(FFTW3) && defined(PENCIL)
     fftw_execute(m_plan_b_z);                               // buf1 --> buf3
     distribution_2_to_3(&m_buf3[0], &m_buf1[0], &m_d, 2);   // buf3 --> buf1
     distribution_3_to_2(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
     fftw_execute(m_plan_b_y);                               // buf3 --> buf1
     distribution_2_to_3(&m_buf1[0], &m_buf3[0], &m_d, 1);   // buf1 --> buf3
     distribution_3_to_2(&m_buf3[0], &m_buf1[0], &m_d, 0);   // buf3 --> buf1
     fftw_execute(m_plan_b_x);                               // buf1 --> buf3
     distribution_2_to_3(&m_buf3[0], grad_phi, &m_d, 0);     // buf3 --> grad_phi
 #elif defined(FFTW3)
     fftw_execute(m_plan_b);                                 // buf1 --> buf3
     distribution_1_to_3(&m_buf3[0], grad_phi, &m_d);        // buf3 --> grad_phi
 #else
     fftwnd_mpi(m_plan_b, 1,
 	       (fftw_complex *) &m_buf1[0],
 	       (fftw_complex *) &m_buf3[0],
 	       FFTW_NORMAL_ORDER);                           // buf1 -> buf1
     distribution_1_to_3(&m_buf1[0], grad_phi, &m_d);         // buf1 --> grad_phi
 #endif
   }

   void solve(const complex_t *rho, complex_t *phi)
   {
     forward_solve(rho);
     backward_solve(phi);
   }

   void solve_gradient(int axis, const complex_t *rho, complex_t *phi)
   {
     forward_solve(rho);
     backward_solve_gradient(axis, phi);
   }

   // interfaces for std::vector

   void forward_solve(std::vector<complex_t> const & rho)
   {
     forward_solve(&rho[0]);
   }

   void backward_solve(std::vector<complex_t> & phi)
   {
     backward_solve(&phi[0]);
   }

   void backward_solve_gradient(int axis, std::vector<complex_t> & phi)
   {
     backward_solve_gradient(axis, &phi[0]);
   }

   void solve(std::vector<complex_t> const & rho, std::vector<complex_t> & phi)
   {
     solve(&rho[0], &phi[0]);
   }

   void solve_gradient(int axis, std::vector<complex_t> const & rho, std::vector<complex_t> & phi)
   {
     solve_gradient(axis, &rho[0], &phi[0]);
   }


   // analysis interfaces

   ///
   // calculate the k-space power spectrum
   //   P(modk) = Sum { |rho(k)|^2 : |k| = modk, k <- [0, ng / 2)^3, periodically extended }
   ///
   void power_spectrum(std::vector<double> & power)
   {
     //intermediate in m_buf2 for both FFTW2 and FFTW3
     std::vector<complex_t, bigchunk_allocator<complex_t> > const & rho = m_buf2;

     int ng = m_d.n[0];
     double volume = 1.0 * ng * ng * ng;
     double kk, tpi;

     tpi = 2.0*atan(1.0)*4.0;

     // cache periodic ksq
     m_pk_ksq.resize(ng);
     m_pk_cic.resize(ng);
     double ksq_max = 0;
     for (int k = 0; k < ng / 2; ++k) {

       m_pk_ksq[k] = k * k;
       ksq_max = max(ksq_max, m_pk_ksq[k]);

       m_pk_ksq[k + ng / 2] = (k - ng / 2) * (k - ng / 2);
       ksq_max = max(ksq_max, m_pk_ksq[k + ng / 2]);

       kk = tpi*k/ng;
       m_pk_cic[k] = pow(sin(0.5*kk)/(0.5*kk),-4.0);

       kk = tpi*(k-ng/2)/ng;
       m_pk_cic[k + ng/2] = pow(sin(0.5*kk)/(0.5*kk),-4.0);
     }
     m_pk_cic[0] = 1.0;

     long modk_max = lrint(sqrt(3 * ksq_max)); // round to nearest integer

     // calculate power spectrum
     power.resize(modk_max + 1);
     power.assign(modk_max + 1, 0.0);

     m_pk_weight.resize(modk_max + 1);
     m_pk_weight.assign(modk_max + 1, 0.0);

     /*
 !-----3-D anti-CIC filter for deconvolution

       forall(ii=1:ng)kk(ii)=(ii-1)*tpi/(1.0*ng)

       do ii=1,ng
       if(ii.ge.ng/2+1)kk(ii)=(ii-ng-1)*tpi/(1.0*ng)
       enddo

       mult(1)=1.0

       forall(ii=2:ng)mult(ii)=
       #      1.0/(sin(kk(ii)/2.0)/(kk(ii)/2.0))**2

       forall(ii=1:ng,jj=1:ng,mm=1:ng)erhotr(ii,jj,mm)=
       #      mult(ii)*mult(jj)*mult(mm)*erhotr(ii,jj,mm)
     */

     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif
     int index = 0;
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       int k0 = local_k0 + self_coord[0] * local_dim[0];
       double ksq0 = m_pk_ksq[k0];

       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
         int k1 = local_k1 + self_coord[1] * local_dim[1];
 	double ksq1 = m_pk_ksq[k1];

 	for (int local_k2 = 0; local_k2 < local_dim[2]; ++local_k2) {
           int k2 = local_k2 + self_coord[2] * local_dim[2];
 	  double ksq2 = m_pk_ksq[k2];
 	  long modk = lrint(sqrt(ksq0 + ksq1 + ksq2)); //round to nearest int
 	  //power[modk] += real(rho[index] * conj(rho[index]));
 	  power[modk] += std::real(rho[index] * conj(rho[index])) * m_pk_cic[k0] * m_pk_cic[k1] * m_pk_cic[k2];
 	  m_pk_weight[modk] += volume;
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }

     // accumulate across processors
     MPI_Allreduce(MPI_IN_PLACE, &power[0], power.size(),
 		  MPI_DOUBLE, MPI_SUM, cart_1d());
     MPI_Allreduce(MPI_IN_PLACE, &m_pk_weight[0], m_pk_weight.size(),
 		  MPI_DOUBLE, MPI_SUM, cart_1d());

     //make sure we don't divide by zero
     for(size_t i = 0; i < m_pk_weight.size(); ++i) {
       m_pk_weight[i] += 1.0 * (m_pk_weight[i] < 1.0);
     }

     // scale power by weight
     std::transform(power.begin(), power.end(),
 		   m_pk_weight.begin(), power.begin(),
 		   std::divides<double>());
   }

   ///
   // General initialization
   ///
   void initialize(MPI_Comm comm, bool transposed_order = false)
   {
     int flags_f;
     int flags_b;

     // distribution_init(comm, &n[0], &n[0], &m_d, false);
     // distribution_assert_commensurate(&m_d);
 #if defined(FFTW3) && !defined(PENCIL)
     fftw_mpi_init();
 #endif
     m_greens_functions_initialized = false;
     m_buf1.resize(local_size());
     m_buf2.resize(local_size());
     m_buf3.resize(local_size());

     // create plan for forward and backward DFT's
     flags_f = flags_b = FFTW_ESTIMATE;
 #if defined(FFTW3) && defined(PENCIL)
     m_plan_f_x = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_x.n[0]), // const int *n,
                                     m_d.process_topology_2_x.n[1] * m_d.process_topology_2_x.n[2], // howmany
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_x.n[0], // int idist,
                                     FFTW_ADDR(m_buf2),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_x.n[0], // int odist,
                                     FFTW_FORWARD, // int sign,
                                     0); // unsigned flags
     m_plan_f_y = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_y.n[1]), // const int *n,
                                     m_d.process_topology_2_y.n[0] * m_d.process_topology_2_y.n[2], // howmany
                                     FFTW_ADDR(m_buf2),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_y.n[1], // int idist,
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_y.n[1], // int odist,
                                     FFTW_FORWARD, // int sign,
                                     0); // unsigned flags
     m_plan_f_z = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_z.n[2]), // const int *n,
                                     m_d.process_topology_2_z.n[1] * m_d.process_topology_2_z.n[0], // howmany
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_z.n[2], // int idist,
                                     FFTW_ADDR(m_buf2),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_z.n[2], // int odist,
                                     FFTW_FORWARD, // int sign,
                                     0); // unsigned flags
     m_plan_b_x = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_x.n[0]), // const int *n,
                                     m_d.process_topology_2_x.n[1] * m_d.process_topology_2_x.n[2], // howmany
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_x.n[0], // int idist,
                                     FFTW_ADDR(m_buf3),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_x.n[0], // int odist,
                                     FFTW_BACKWARD, // int sign,
                                     0); // unsigned flags
     m_plan_b_y = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_y.n[1]), // const int *n,
                                     m_d.process_topology_2_y.n[0] * m_d.process_topology_2_y.n[2], // howmany
                                     FFTW_ADDR(m_buf3),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_y.n[1], // int idist,
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_y.n[1], // int odist,
                                     FFTW_BACKWARD, // int sign,
                                     0); // unsigned flags
     m_plan_b_z = fftw_plan_many_dft(1, // rank
                                     &(m_d.process_topology_2_z.n[2]), // const int *n,
                                     m_d.process_topology_2_z.n[1] * m_d.process_topology_2_z.n[0], // howmany
                                     FFTW_ADDR(m_buf1),
                                     NULL, // const int *inembed,
                                     1, // int istride,
                                     m_d.process_topology_2_z.n[2], // int idist,
                                     FFTW_ADDR(m_buf3),
                                     NULL, // const int *onembed,
                                     1, // int ostride,
                                     m_d.process_topology_2_z.n[2], // int odist,
                                     FFTW_BACKWARD, // int sign,
                                     0); // unsigned flags
 #elif defined(FFTW3)
     if (transposed_order) {
       flags_f |= FFTW_MPI_TRANSPOSED_OUT;
       flags_b |= FFTW_MPI_TRANSPOSED_IN;
     }

 #if CERRILLOS_SLAB_HACK == 1
     //attempt to make mpi fftw 3.3 work on more of cerrillos
     //because of green's function application,
     //does not require changes to simulation code
     flags_f |= FFTW_DESTROY_INPUT;
     flags_b |= FFTW_DESTROY_INPUT;
 #endif

     m_plan_f = fftw_mpi_plan_dft_3d(m_d.n[0], m_d.n[1], m_d.n[2],
 				    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf2),
 				    comm, FFTW_FORWARD, flags_f);
     m_plan_b = fftw_mpi_plan_dft_3d(m_d.n[0], m_d.n[1], m_d.n[2],
 				    FFTW_ADDR(m_buf1), FFTW_ADDR(m_buf3),
 				    comm, FFTW_BACKWARD, flags_b);
 #else
     m_plan_f = fftw3d_mpi_create_plan(comm, m_d.n[0], m_d.n[1], m_d.n[2], FFTW_FORWARD, flags_f);
     m_plan_b = fftw3d_mpi_create_plan(comm, m_d.n[0], m_d.n[1], m_d.n[2], FFTW_BACKWARD, flags_b);
 #endif

   }

   ///
   // Solve for the potential by applying the Green's function to the
   // density (in k-space)
   //   rho           density (input)
   //   phi           potential (output)
   ///
   void kspace_solve(const complex_t *rho, complex_t *phi)
   {
     int k[3];
     int index;

     initialize_greens_function();

     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif

     index = 0;
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       k[0] = local_k0 + self_coord[0] * local_dim[0];
       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
           k[1] = local_k1 + self_coord[1] * local_dim[1];
 	for (int local_k2 = 0;  local_k2 < local_dim[2]; ++local_k2) {
           k[2] = local_k2 + self_coord[2] * local_dim[2];
 	  phi[index] = m_green[index] * rho[index];
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }
   }

   ///
   // Solve for the gradient of the potential along the given axis by
   // applying the derivative Green's function to the density (in
   // k-space)
   //   axis          the axis along which to take the gradient
   //   rho           density (input)
   //   grad_phi      the gradient of the potential (output)
   ///
   void kspace_solve_gradient(int axis, const complex_t *rho, complex_t *grad_phi)
   {
     int k[3];
     int index;

     initialize_greens_function();

     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif

     index = 0;
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       k[0] = local_k0 + self_coord[0] * local_dim[0];
       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
           k[1] = local_k1 + self_coord[1] * local_dim[1];
 	for (int local_k2 = 0;  local_k2 < local_dim[2]; ++local_k2) {
           k[2] = local_k2 + self_coord[2] * local_dim[2];
 	  grad_phi[index] = I * (- m_gradient[k[axis]]) * m_green[index] * rho[index];
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }
   }

   ///
   // Allocate and pre-calculate isotropic Green's function and
   // gradient operator.
   ///
   virtual void initialize_greens_function() = 0;

 protected:
   double max(double a, double b) { return a > b ? a : b; }
   std::vector<double> m_green; // green's function
   std::vector<double> m_gradient; //imaginary part of the gradient in grid units
   std::vector<double> m_pk_cic;
   std::vector<double> m_pk_weight;
   std::vector<int>    m_pk_ksq;
   std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf1;
   std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf2;
   std::vector<complex_t, bigchunk_allocator<complex_t> > m_buf3;
 #if defined(FFTW3) && defined(PENCIL)
   fftw_plan m_plan_f_x;
   fftw_plan m_plan_f_y;
   fftw_plan m_plan_f_z;
   fftw_plan m_plan_b_x;
   fftw_plan m_plan_b_y;
   fftw_plan m_plan_b_z;
 #elif defined(FFTW3)
   fftw_plan m_plan_f;
   fftw_plan m_plan_b;
 #else
   fftwnd_mpi_plan m_plan_f;
   fftwnd_mpi_plan m_plan_b;
 #endif
   bool m_greens_functions_initialized;
 };


 ///
 //  Poison solver using a 2nd-order discrete Green's function, and a
 //  2nd-order derivative.
 //
 //  G(k) = 1 / (2 * (Sum_i cos(2 pi k_i / n) - 3))
 //  (D_i f)(k) =  ( - i * sin(2 pi k / n) / (2 pi / n) )* f(k)
 ///
 class SolverDiscrete : public SolverBase {

 public:

   SolverDiscrete(MPI_Comm comm, int ng)
     : SolverBase(comm, ng)
   {
   }

   SolverDiscrete(MPI_Comm comm, std::vector<int> n)
     : SolverBase(comm, n)
   {
   }

   ///
   // Allocate and pre-calculate isotropic Green's function (1D
   // distribution) and trigonometric factors:
   //   1 / (2 * (Sum_i cos(2 pi k_i / n) - 3))
   ///
   void initialize_greens_function()
   {
     double kstep;
     int index;
     int k[3];
     std::vector<double> cosine;

     if (m_greens_functions_initialized) {
       return;
     }

     m_greens_functions_initialized = true;

     m_green.resize(local_size());
     m_gradient.resize(m_d.n[0]);
     cosine.resize(m_d.n[0]);

     // cache trigonometric factors and imaginary part of gradient
     kstep = 2.0 * pi / (double) m_d.n[0];
     for (int kk = 0; kk < m_d.n[0]; ++kk) {
       cosine[kk] = cos(kk * kstep);
       m_gradient[kk] = sin(kk * kstep); // imaginary part of gradient
     }

     // cache isotropic Green's function (1D or 2D distribution)
     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif
     index = 0;
     double coeff = 0.5 / double(global_size());
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       k[0] = local_k0 + self_coord[0] * local_dim[0];
       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
           k[1] = local_k1 + self_coord[1] * local_dim[1];
 	for (int local_k2 = 0;  local_k2 < local_dim[2]; ++local_k2) {
           k[2] = local_k2 + self_coord[2] * local_dim[2];
 	  m_green[index] = coeff / (cosine[k[0]] + cosine[k[1]] + cosine[k[2]] - 3.0);
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }
     // handle the pole
     if (self() == 0) {
       m_green[0] = 0.0;
     }
   }
 };


 ///
 //  Poison solver using a 6th-order discrete Green's function with a
 //  Gaussian noise-quieting filter function, and a 4th-order
 //  derivative.
 //
 //    G(k) = W(k) * 45/128 / Sum_i [ cos (2 pi k / n)
 //                                 - 5/64 cos(4 pi k / n)
 //                                 + 1/1024 cos(8 pi k_i / n)
 //                                 - 945/1024 ]
 //  or
 //    G(k) = W(k) / Sum_i ( a0 + a1 cos (2 pi k / n)
 //                             + a2 cos (4 pi k / n)
 //                             + a3 cos (8 pi k / n) )
 //  where
 //     a0 = -21/8
 //     a1 =  128/45
 //     a2 = -2/9
 //     a3 =  1/360
 //  satisfying
 //     a0 + a1 + a2 + a3 = 0
 //     (a1 + 4 a2 + 16 a3) = 2
 //  and
 //    W(k) = exp( - k^2 sigma^2 / 4) [ sin(k/2) / (k/2)]^n_s
 //    sigma = 0.8
 //    n_s = 3
 //
 //  The gradient operator (imaginary part, grid units) is
 //    (gradient f)(k) =  b1 sin(2 pi k / n) + b2 sin( 4 pi k / n) f(k)
 //  where
 //     b1 = 4/3
 //     b2 = -1/6
 //  satisfying
 //     b1 + 2 b2 = 1
 //     b1 + 8 b2 = 0
 ///
 class SolverQuiet : public SolverBase {

 public:

   SolverQuiet(MPI_Comm comm, int ng)
     : SolverBase(comm, ng)
   {
   }

   SolverQuiet(MPI_Comm comm, std::vector<int> n)
     : SolverBase(comm, n)
   {
   }

   ///
   // Allocate and pre-calculate isotropic Green's function (1D
   // distribution) and trigonometric factors:
   //   1 / (2 * (Sum_i cos(2 pi k_i / n) - 3))
   ///
   void initialize_greens_function()
   {
     double kstep;
     int index;
     int k[3];
     std::vector<double> c1;
     std::vector<double> c2;
     std::vector<double> c3;
     std::vector<double> filter;
     std::vector<double> kperiodic;
     double const a0 = - 21.0 / 8.0;
     double const a1 =   128.0 / 45.0;
     double const a2 = - 2.0 / 9.0;
     double const a3 =   1.0 / 360.0;
     double const b1 =   4.0 / 3.0;
     double const b2 = - 1.0 / 6.0;
     double const sigma = 0.8;
     double const ns = 3.0;
     int ng = m_d.n[0];

     if (m_greens_functions_initialized) {
       return;
     }

     // check Taylor series coefficent conditions
     assert(fabs(a0 + a1 + a2 + a3) < 1.0e-12);
     assert(fabs(a1 + 4 * a2 + 16 * a3 - 2.0) < 1.0e-12);
     assert(fabs(b1 + 2 * b2 - 1.0) < 1.0e-12);
     assert(fabs(b1 + 8 * b2) < 1.0e-12);

     m_greens_functions_initialized = true;

     m_green.resize(local_size());
     m_gradient.resize(ng);
     c1.resize(ng);
     c2.resize(ng);
     c3.resize(ng);
     kperiodic.resize(ng);
     filter.resize(ng);

     // cache k array with the correct periodicity
     // cache trigonometric factors and imaginary part of gradient
     kstep = 2.0 * pi / static_cast<double>(ng);
     for (int kk = 0; kk < ng; ++kk) {
       c1[kk] = cos(kk * kstep);
       c2[kk] = cos(2 * kk * kstep);
       c3[kk] = cos(4 * kk * kstep);
       if (kk < ng / 2) {
 	kperiodic[kk] = kk * kstep;
 	kperiodic[kk + ng / 2] = (kk - ng / 2) * kstep;
       }
     }

     // cache Green's function filter, and 4th order k-space gradient operator
     filter[0] = 1.0;
     m_gradient[0] = 1.0;
     for (int kk = 1; kk < ng; ++kk) {
       filter[kk] = exp(- 0.25 * sigma * sigma * kperiodic[kk] * kperiodic[kk])
 	* pow(sin(0.5 * kperiodic[kk]) / (0.5 * kperiodic[kk]), ns);
       m_gradient[kk] = (b1 * sin(kperiodic[kk]) + b2 * sin(2 * kperiodic[kk]));
     }

     // cache isotropic Green's function (1D or 2D distribution)
     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif
     index = 0;
     double coeff = 1.0 / double(global_size());
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       k[0] = local_k0 + self_coord[0] * local_dim[0];
       double d0 = a0 + a1 * c1[k[0]] + a2 * c2[k[0]] + a3 * c3[k[0]];
       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
           k[1] = local_k1 + self_coord[1] * local_dim[1];
 	  double d1 = a0 + a1 * c1[k[1]] + a2 * c2[k[1]] + a3 * c3[k[1]];
 	for (int local_k2 = 0;  local_k2 < local_dim[2]; ++local_k2) {
           k[2] = local_k2 + self_coord[2] * local_dim[2];
 	  double filt = coeff * filter[k[0]] * filter[k[1]] * filter[k[2]];
 	  double d2 = a0 + a1 * c1[k[2]] + a2 * c2[k[2]] + a3 * c3[k[2]];
 	  m_green[index] = filt / (d0 + d1 + d2);
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }

     // handle the pole
     if (self() == 0) {
       m_green[0] = 0.0;
     }
   }
 };


 ///
 //  Poisson solver class using continuum Green's function:
 //    - 1 / k^2
 ///
 class SolverContinuum : public SolverBase {

 public:

   virtual void this_class_is_not_yet_tested_and_should_not_be_used() = 0;

   SolverContinuum(MPI_Comm comm, int ng)
     : SolverBase(comm, ng)
   {
   }

   SolverContinuum(MPI_Comm comm, std::vector<int> n)
     : SolverBase(comm, n)
   {
   }

   ///
   // Allocate and pre-calculate isotropic Green's function and gradient.
   ///
   void initialize_greens_function()
   {
     double kstep;
     double coeff;
     int index;
     int k[3];
     int ng = m_d.n[0];

     if (m_greens_functions_initialized) {
       return;
     }

     m_greens_functions_initialized = true;

     m_green.resize(local_size());
     m_gradient.resize(ng);

     // cache imaginary part of gradient, imposing symmetries by hand
     kstep = 2.0 * pi / (double) ng;
     for (int kk = 0; kk < ng / 2; ++kk) {
       m_gradient[kk] = kk * kstep;
       m_gradient[kk + ng / 2] = (kk - ng / 2) * kstep;
     }

     // cache isotropic Green's function (1D or 2D distribution)
     int local_dim[3];
     int self_coord[3];
 #if defined(FFTW3) && defined(PENCIL)
     self_coord[0]=self_2d_z(0);
     self_coord[1]=self_2d_z(1);
     self_coord[2]=self_2d_z(2);
     local_dim[0]=local_ng_2d_z(0);
     local_dim[1]=local_ng_2d_z(1);
     local_dim[2]=local_ng_2d_z(2);
 #else
     self_coord[0]=self_1d(0);
     self_coord[1]=self_1d(1);
     self_coord[2]=self_1d(2);
     local_dim[0]=local_ng_1d(0);
     local_dim[1]=local_ng_1d(1);
     local_dim[2]=local_ng_1d(2);
 #endif
     index = 0;
     coeff = -1.0 / double(global_size());
     for (int local_k0 = 0; local_k0 < local_dim[0]; ++local_k0) {
       k[0] = local_k0 + self_coord[0] * local_dim[0];
       double k0sq = m_gradient[k[0]] * m_gradient[k[0]];
       for (int local_k1 = 0; local_k1 < local_dim[1]; ++local_k1) {
           k[1] = local_k1 + self_coord[1] * local_dim[1];
 	  double k1sq = m_gradient[k[1]] * m_gradient[k[1]];
 	for (int local_k2 = 0;  local_k2 < local_dim[2]; ++local_k2) {
           k[2] = local_k2 + self_coord[2] * local_dim[2];
 	  double k2sq = m_gradient[k[2]] * m_gradient[k[2]];
 	  m_green[index] = coeff / (k0sq + k1sq + k2sq);
 	  index++;
 	}
 	index += m_d.padding[2];
       }
       index += m_d.padding[1];
     }
     // handle the pole
     if (self() == 0) {
       m_green[0] = 0.0;
     }
   }
 };

 #endif