src/parsec/disk-image/parsec/parsec-benchmark/ext/splash2x/kernels/lu_cb/src/lu.C - public/gem5-resources - Git at Google

 /*************************************************************************/
 /*                                                                       */
 /*  Copyright (c) 1994 Stanford University                               */
 /*                                                                       */
 /*  All rights reserved.                                                 */
 /*                                                                       */
 /*  Permission is given to use, copy, and modify this software for any   */
 /*  non-commercial purpose as long as this copyright notice is not       */
 /*  removed.  All other uses, including redistribution in whole or in    */
 /*  part, are forbidden without prior written permission.                */
 /*                                                                       */
 /*  This software is provided with absolutely no warranty and no         */
 /*  support.                                                             */
 /*                                                                       */
 /*************************************************************************/

 /*************************************************************************/
 /*                                                                       */
 /*  Parallel dense blocked LU factorization (no pivoting)                */
 /*                                                                       */
 /*  This version contains two dimensional arrays in which the first      */
 /*  dimension is the block to be operated on, and the second contains    */
 /*  all data points in that block.  In this manner, all data points in   */
 /*  a block (which are operated on by the same processor) are allocated  */
 /*  contiguously and locally, and false sharing is eliminated.           */
 /*                                                                       */
 /*  Command line options:                                                */
 /*                                                                       */
 /*  -nN : Decompose NxN matrix.                                          */
 /*  -pP : P = number of processors.                                      */
 /*  -bB : Use a block size of B. BxB elements should fit in cache for    */
 /*        good performance. Small block sizes (B=8, B=16) work well.     */
 /*  -s  : Print individual processor timing statistics.                  */
 /*  -t  : Test output.                                                   */
 /*  -o  : Print out matrix values.                                       */
 /*  -h  : Print out command line options.                                */
 /*                                                                       */
 /*  Note: This version works under both the FORK and SPROC models        */
 /*                                                                       */
 /*************************************************************************/

 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 #ifdef ENABLE_PARSEC_HOOKS
 #include <hooks.h>
 #endif
 MAIN_ENV

 #define MAXRAND                         32767.0
 #define DEFAULT_N                         512
 #define DEFAULT_P                           1
 #define DEFAULT_B                          16
 #define min(a,b) ((a) < (b) ? (a) : (b))
 //#define PAGE_SIZE                       4096
 #define PAGE_SIZE			1024

 struct GlobalMemory {
   double *t_in_fac;
   double *t_in_solve;
   double *t_in_mod;
   double *t_in_bar;
   double *completion;
   unsigned long starttime;
   unsigned long rf;
   unsigned long rs;
   unsigned long done;
   long id;
   BARDEC(start)
   LOCKDEC(idlock)
 } *Global;

 struct LocalCopies {
   double t_in_fac;
   double t_in_solve;
   double t_in_mod;
   double t_in_bar;
 };

 long n = DEFAULT_N;          /* The size of the matrix */
 long P = DEFAULT_P;          /* Number of processors */
 long block_size = DEFAULT_B; /* Block dimension */
 long nblocks;                /* Number of blocks in each dimension */
 long num_rows;               /* Number of processors per row of processor grid */
 long num_cols;               /* Number of processors per col of processor grid */
 double **a;                  /* a = lu; l and u both placed back in a */
 double *rhs;
 long *proc_bytes;            /* Bytes to malloc per processor to hold blocks of A*/
 double **last_malloc;        /* Starting point of last block of A */

 long test_result = 0;        /* Test result of factorization? */
 long doprint = 0;            /* Print out matrix values? */
 long dostats = 0;            /* Print out individual processor statistics? */

 void SlaveStart(void);
 void OneSolve(long n, long block_size, long MyNum, long dostats);
 void lu0(double *a, long n, long stride);
 void bdiv(double *a, double *diag, long stride_a, long stride_diag, long dimi, long dimk);
 void bmodd(double *a, double *c, long dimi, long dimj, long stride_a, long stride_c);
 void bmod(double *a, double *b, double *c, long dimi, long dimj, long dimk, long stridea, long strideb, long stridec);
 void daxpy(double *a, double *b, long n, double alpha);
 long BlockOwner(long I, long J);
 long BlockOwnerColumn(long I, long J);
 long BlockOwnerRow(long I, long J);
 void lu(long n, long bs, long MyNum, struct LocalCopies *lc, long dostats);
 void InitA(double *rhs);
 double TouchA(long bs, long MyNum);
 void PrintA(void);
 void CheckResult(long n, double **a, double *rhs);
 void printerr(char *s);

 int main(int argc, char *argv[])
 {
 #ifdef ENABLE_PARSEC_HOOKS
 	__parsec_bench_begin (__splash2_lu_cb);
 #endif
   long i, j;
   long ch;
   extern char *optarg;
   double mint, maxt, avgt;
   double min_fac, min_solve, min_mod, min_bar;
   double max_fac, max_solve, max_mod, max_bar;
   double avg_fac, avg_solve, avg_mod, avg_bar;
   long proc_num;
   long edge;
   long size;
   unsigned long start;

   CLOCK(start)

   while ((ch = getopt(argc, argv, "n:p:b:cstoh")) != -1) {
     switch(ch) {
     case 'n': n = atoi(optarg); break;
     case 'p': P = atoi(optarg); break;
     case 'b': block_size = atoi(optarg); break;
     case 's': dostats = 1; break;
     case 't': test_result = !test_result; break;
     case 'o': doprint = !doprint; break;
     case 'h': printf("Usage: LU <options>\n\n");
 	      printf("options:\n");
               printf("  -nN : Decompose NxN matrix.\n");
               printf("  -pP : P = number of processors.\n");
               printf("  -bB : Use a block size of B. BxB elements should fit in cache for \n");
               printf("        good performance. Small block sizes (B=8, B=16) work well.\n");
               printf("  -c  : Copy non-locally allocated blocks to local memory before use.\n");
               printf("  -s  : Print individual processor timing statistics.\n");
               printf("  -t  : Test output.\n");
               printf("  -o  : Print out matrix values.\n");
               printf("  -h  : Print out command line options.\n\n");
               printf("Default: LU -n%1d -p%1d -b%1d\n",
 		     DEFAULT_N,DEFAULT_P,DEFAULT_B);
               exit(0);
               break;
     }
   }

   MAIN_INITENV(,150000000)

   printf("\n");
   printf("Blocked Dense LU Factorization\n");
   printf("     %ld by %ld Matrix\n",n,n);
   printf("     %ld Processors\n",P);
   printf("     %ld by %ld Element Blocks\n",block_size,block_size);
   printf("\n");
   printf("\n");

   num_rows = (long) sqrt((double) P);
   for (;;) {
     num_cols = P/num_rows;
     if (num_rows*num_cols == P)
       break;
     num_rows--;
   }
   nblocks = n/block_size;
   if (block_size * nblocks != n) {
     nblocks++;
   }

   edge = n%block_size;
   if (edge == 0) {
     edge = block_size;
   }

   proc_bytes = (long *) malloc(P*sizeof(long));
   if (proc_bytes == NULL) {
 	  fprintf(stderr,"Could not malloc memory for proc_bytes.\n");
 	  exit(-1);
   }
   last_malloc = (double **) G_MALLOC(P*sizeof(double *));
   if (last_malloc == NULL) {
 	  fprintf(stderr,"Could not malloc memory for last_malloc.\n");
 	  exit(-1);
   }
   for (i=0;i<P;i++) {
     proc_bytes[i] = 0;
     last_malloc[i] = NULL;
   }
   for (i=0;i<nblocks;i++) {
     for (j=0;j<nblocks;j++) {
       proc_num = BlockOwner(i,j);
       if ((i == nblocks-1) && (j == nblocks-1)) {
         size = edge*edge;
       } else if ((i == nblocks-1) || (j == nblocks-1)) {
         size = edge*block_size;
       } else {
         size = block_size*block_size;
       }
       proc_bytes[proc_num] += size*sizeof(double);
     }
   }
   for (i=0;i<P;i++) {
     last_malloc[i] = (double *) G_MALLOC(proc_bytes[i] + PAGE_SIZE)
     if (last_malloc[i] == NULL) {
       fprintf(stderr,"Could not malloc memory blocks for proc %ld\n",i);
       exit(-1);
     }
     last_malloc[i] = (double *) (((unsigned long) last_malloc[i]) + PAGE_SIZE -
                      ((unsigned long) last_malloc[i]) % PAGE_SIZE);

 /* Note that this causes all blocks to start out page-aligned, and that
    for block sizes that exceed cache line size, blocks start at cache-line
    aligned addresses as well.  This reduces false sharing */

   }
   a = (double **) G_MALLOC(nblocks*nblocks*sizeof(double *));
   if (a == NULL) {
     printerr("Could not malloc memory for a\n");
     exit(-1);
   }
   for (i=0;i<nblocks;i++) {
     for (j=0;j<nblocks;j++) {
       proc_num = BlockOwner(i,j);
       a[i+j*nblocks] = last_malloc[proc_num];
       if ((i == nblocks-1) && (j == nblocks-1)) {
         size = edge*edge;
       } else if ((i == nblocks-1) || (j == nblocks-1)) {
         size = edge*block_size;
       } else {
         size = block_size*block_size;
       }
       last_malloc[proc_num] += size;
     }
   }

   rhs = (double *) G_MALLOC(n*sizeof(double));
   if (rhs == NULL) {
     printerr("Could not malloc memory for rhs\n");
     exit(-1);
   }

   Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory));
   Global->t_in_fac = (double *) G_MALLOC(P*sizeof(double));
   Global->t_in_mod = (double *) G_MALLOC(P*sizeof(double));
   Global->t_in_solve = (double *) G_MALLOC(P*sizeof(double));
   Global->t_in_bar = (double *) G_MALLOC(P*sizeof(double));
   Global->completion = (double *) G_MALLOC(P*sizeof(double));

   if (Global == NULL) {
     printerr("Could not malloc memory for Global\n");
     exit(-1);
   } else if (Global->t_in_fac == NULL) {
     printerr("Could not malloc memory for Global->t_in_fac\n");
     exit(-1);
   } else if (Global->t_in_mod == NULL) {
     printerr("Could not malloc memory for Global->t_in_mod\n");
     exit(-1);
   } else if (Global->t_in_solve == NULL) {
     printerr("Could not malloc memory for Global->t_in_solve\n");
     exit(-1);
   } else if (Global->t_in_bar == NULL) {
     printerr("Could not malloc memory for Global->t_in_bar\n");
     exit(-1);
   } else if (Global->completion == NULL) {
     printerr("Could not malloc memory for Global->completion\n");
     exit(-1);
   }

 /* POSSIBLE ENHANCEMENT:  Here is where one might distribute the a[i]
    blocks across physically distributed memories as desired.

    One way to do this is as follows:

    for (i=0;i<nblocks;i++) {
      for (j=0;j<nblocks;j++) {
        proc_num = BlockOwner(i,j);
        if ((i == nblocks-1) && (j == nblocks-1)) {
          size = edge*edge;
        } else if ((i == nblocks-1) || (j == nblocks-1)) {
          size = edge*block_size;
        } else {
          size = block_size*block_size;
        }

        Place all addresses x such that
        (&(a[i+j*nblocks][0]) <= x < &(a[i+j*nblocks][size-1]))
        on node proc_num
      }
    }
 */

   BARINIT(Global->start, P);
   LOCKINIT(Global->idlock);
   Global->id = 0;

   InitA(rhs);
   if (doprint) {
     printf("Matrix before decomposition:\n");
     PrintA();
   }

 #ifdef ENABLE_PARSEC_HOOKS
 	__parsec_roi_begin();
 #endif
   CREATE(SlaveStart, P);
   WAIT_FOR_END(P);
 #ifdef ENABLE_PARSEC_HOOKS
 	__parsec_roi_end();
 #endif

   if (doprint) {
     printf("\nMatrix after decomposition:\n");
     PrintA();
   }

   if (dostats) {
     maxt = avgt = mint = Global->completion[0];
     for (i=1; i<P; i++) {
       if (Global->completion[i] > maxt) {
         maxt = Global->completion[i];
       }
       if (Global->completion[i] < mint) {
         mint = Global->completion[i];
       }
       avgt += Global->completion[i];
     }
     avgt = avgt / P;

     min_fac = max_fac = avg_fac = Global->t_in_fac[0];
     min_solve = max_solve = avg_solve = Global->t_in_solve[0];
     min_mod = max_mod = avg_mod = Global->t_in_mod[0];
     min_bar = max_bar = avg_bar = Global->t_in_bar[0];

     for (i=1; i<P; i++) {
       if (Global->t_in_fac[i] > max_fac) {
         max_fac = Global->t_in_fac[i];
       }
       if (Global->t_in_fac[i] < min_fac) {
         min_fac = Global->t_in_fac[i];
       }
       if (Global->t_in_solve[i] > max_solve) {
         max_solve = Global->t_in_solve[i];
       }
       if (Global->t_in_solve[i] < min_solve) {
         min_solve = Global->t_in_solve[i];
       }
       if (Global->t_in_mod[i] > max_mod) {
         max_mod = Global->t_in_mod[i];
       }
       if (Global->t_in_mod[i] < min_mod) {
         min_mod = Global->t_in_mod[i];
       }
       if (Global->t_in_bar[i] > max_bar) {
         max_bar = Global->t_in_bar[i];
       }
       if (Global->t_in_bar[i] < min_bar) {
         min_bar = Global->t_in_bar[i];
       }
       avg_fac += Global->t_in_fac[i];
       avg_solve += Global->t_in_solve[i];
       avg_mod += Global->t_in_mod[i];
       avg_bar += Global->t_in_bar[i];
     }
     avg_fac = avg_fac/P;
     avg_solve = avg_solve/P;
     avg_mod = avg_mod/P;
     avg_bar = avg_bar/P;
   }
   printf("                            PROCESS STATISTICS\n");
   printf("              Total      Diagonal     Perimeter      Interior       Barrier\n");
   printf(" Proc         Time         Time         Time           Time          Time\n");
   printf("    0    %10.0f    %10.0f    %10.0f    %10.0f    %10.0f\n",
           Global->completion[0],Global->t_in_fac[0],
           Global->t_in_solve[0],Global->t_in_mod[0],
           Global->t_in_bar[0]);
   if (dostats) {
     for (i=1; i<P; i++) {
       printf("  %3ld    %10.0f    %10.0f    %10.0f    %10.0f    %10.0f\n",
               i,Global->completion[i],Global->t_in_fac[i],
 	      Global->t_in_solve[i],Global->t_in_mod[i],
 	      Global->t_in_bar[i]);
     }
     printf("  Avg    %10.0f    %10.0f    %10.0f    %10.0f    %10.0f\n",
            avgt,avg_fac,avg_solve,avg_mod,avg_bar);
     printf("  Min    %10.0f    %10.0f    %10.0f    %10.0f    %10.0f\n",
            mint,min_fac,min_solve,min_mod,min_bar);
     printf("  Max    %10.0f    %10.0f    %10.0f    %10.0f    %10.0f\n",
            maxt,max_fac,max_solve,max_mod,max_bar);
   }
   printf("\n");
   Global->starttime = start;
   printf("                            TIMING INFORMATION\n");
   printf("Start time                        : %16lu\n", Global->starttime);
   printf("Initialization finish time        : %16lu\n", Global->rs);
   printf("Overall finish time               : %16lu\n", Global->rf);
   printf("Total time with initialization    : %16lu\n", Global->rf-Global->starttime);
   printf("Total time without initialization : %16lu\n", Global->rf-Global->rs);
   printf("\n");

   if (test_result) {
     printf("                             TESTING RESULTS\n");
     CheckResult(n, a, rhs);
   }

   MAIN_END;
 #ifdef ENABLE_PARSEC_HOOKS
 	__parsec_bench_end();
 #endif
 }


 void SlaveStart()
 {
   long MyNum;

   LOCK(Global->idlock)
     MyNum = Global->id;
     Global->id ++;
   UNLOCK(Global->idlock)

 /* POSSIBLE ENHANCEMENT:  Here is where one might pin processes to
    processors to avoid migration */

   BARINCLUDE(Global->start);
   OneSolve(n, block_size, MyNum, dostats);
 }


 void OneSolve(long n, long block_size, long MyNum, long dostats)
 {
   unsigned long myrs;
   unsigned long myrf;
   unsigned long mydone;
   struct LocalCopies *lc;

   lc = (struct LocalCopies *) malloc(sizeof(struct LocalCopies));
   if (lc == NULL) {
     fprintf(stderr,"Proc %ld could not malloc memory for lc\n",MyNum);
     exit(-1);
   }
   lc->t_in_fac = 0.0;
   lc->t_in_solve = 0.0;
   lc->t_in_mod = 0.0;
   lc->t_in_bar = 0.0;

   /* barrier to ensure all initialization is done */
   BARRIER(Global->start, P);

   /* to remove cold-start misses, all processors touch their own data */
   TouchA(block_size, MyNum);

   BARRIER(Global->start, P);

 /* POSSIBLE ENHANCEMENT:  Here is where one might reset the
    statistics that one is measuring about the parallel execution */

   if ((MyNum == 0) || (dostats)) {
     CLOCK(myrs);
   }

   lu(n, block_size, MyNum, lc, dostats);

   if ((MyNum == 0) || (dostats)) {
     CLOCK(mydone);
   }

   BARRIER(Global->start, P);

   if ((MyNum == 0) || (dostats)) {
     Global->t_in_fac[MyNum] = lc->t_in_fac;
     Global->t_in_solve[MyNum] = lc->t_in_solve;
     Global->t_in_mod[MyNum] = lc->t_in_mod;
     Global->t_in_bar[MyNum] = lc->t_in_bar;
     Global->completion[MyNum] = mydone-myrs;
   }
   if (MyNum == 0) {
     CLOCK(myrf);
     Global->rs = myrs;
     Global->done = mydone;
     Global->rf = myrf;
   }
 }


 void lu0(double *a, long n, long stride)
 {
   long j;
   long k;
   long length;
   double alpha;

   for (k=0; k<n; k++) {
     /* modify subsequent columns */
     for (j=k+1; j<n; j++) {
       a[k+j*stride] /= a[k+k*stride];
       alpha = -a[k+j*stride];
       length = n-k-1;
       daxpy(&a[k+1+j*stride], &a[k+1+k*stride], n-k-1, alpha);
     }
   }
 }


 void bdiv(double *a, double *diag, long stride_a, long stride_diag, long dimi, long dimk)
 {
   long j;
   long k;
   double alpha;

   for (k=0; k<dimk; k++) {
     for (j=k+1; j<dimk; j++) {
       alpha = -diag[k+j*stride_diag];
       daxpy(&a[j*stride_a], &a[k*stride_a], dimi, alpha);
     }
   }
 }


 void bmodd(double *a, double *c, long dimi, long dimj, long stride_a, long stride_c)
 {
   long j;
   long k;
   long length;
   double alpha;

   for (k=0; k<dimi; k++) {
     for (j=0; j<dimj; j++) {
       c[k+j*stride_c] /= a[k+k*stride_a];
       alpha = -c[k+j*stride_c];
       length = dimi - k - 1;
       daxpy(&c[k+1+j*stride_c], &a[k+1+k*stride_a], dimi-k-1, alpha);
     }
   }
 }


 void bmod(double *a, double *b, double *c, long dimi, long dimj, long dimk, long stridea, long strideb, long stridec)
 {
   long j;
   long k;
   double alpha;

   for (k=0; k<dimk; k++) {
     for (j=0; j<dimj; j++) {
       alpha = -b[k+j*strideb];
       daxpy(&c[j*stridec], &a[k*stridea], dimi, alpha);
     }
   }
 }


 void daxpy(double *a, double *b, long n, double alpha)
 {
   long i;

   for (i=0; i<n; i++) {
     a[i] += alpha*b[i];
   }
 }


 long BlockOwner(long I, long J)
 {
 //	return((J%num_cols) + (I%num_rows)*num_cols);
 	return((I + J*nblocks) % P);
 }

 long BlockOwnerColumn(long I, long J)
 {
 	return(I % P);
 }

 long BlockOwnerRow(long I, long J)
 {
 	return(((J % P) + (P / 2)) % P);
 }

 void lu(long n, long bs, long MyNum, struct LocalCopies *lc, long dostats)
 {
   long i, il, j, jl, k, kl;
   long I, J, K;
   double *A, *B, *C, *D;
   long strI, strJ, strK;
   unsigned long t1, t2, t3, t4, t11, t22;

   for (k=0, K=0; k<n; k+=bs, K++) {
     kl = k + bs;
     if (kl > n) {
       kl = n;
       strK = kl - k;
     } else {
       strK = bs;
     }

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t1);
     }

     /* factor diagonal block */
     if (BlockOwner(K, K) == MyNum) {
       A = a[K+K*nblocks];
       lu0(A, strK, strK);
     }

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t11);
     }

     BARRIER(Global->start, P);

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t2);
     }

     /* divide column k by diagonal block */
     D = a[K+K*nblocks];
     for (i=kl, I=K+1; i<n; i+=bs, I++) {
       if (BlockOwnerColumn(I, K) == MyNum) {  /* parcel out blocks */
 	il = i + bs;
 	if (il > n) {
 	  il = n;
           strI = il - i;
         } else {
           strI = bs;
         }
 	A = a[I+K*nblocks];
 	bdiv(A, D, strI, strK, strI, strK);
       }
     }
     /* modify row k by diagonal block */
     for (j=kl, J=K+1; j<n; j+=bs, J++) {
       if (BlockOwnerRow(K, J) == MyNum) {  /* parcel out blocks */
 	jl = j+bs;
 	if (jl > n) {
 	  jl = n;
           strJ = jl - j;
         } else {
           strJ = bs;
         }
         A = a[K+J*nblocks];
 	bmodd(D, A, strK, strJ, strK, strK);
       }
     }

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t22);
     }

     BARRIER(Global->start, P);

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t3);
     }

     /* modify subsequent block columns */
     for (i=kl, I=K+1; i<n; i+=bs, I++) {
       il = i+bs;
       if (il > n) {
 	il = n;
         strI = il - i;
       } else {
         strI = bs;
       }
       A = a[I+K*nblocks];
       for (j=kl, J=K+1; j<n; j+=bs, J++) {
 	jl = j + bs;
 	if (jl > n) {
 	  jl = n;
           strJ= jl - j;
         } else {
           strJ = bs;
         }
 	if (BlockOwner(I, J) == MyNum) {  /* parcel out blocks */
 	  B = a[K+J*nblocks];
 	  C = a[I+J*nblocks];
 	  bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
 	}
       }
     }

     if ((MyNum == 0) || (dostats)) {
       CLOCK(t4);
       lc->t_in_fac += (t11-t1);
       lc->t_in_solve += (t22-t2);
       lc->t_in_mod += (t4-t3);
       lc->t_in_bar += (t2-t11) + (t3-t22);
     }
   }
 }


 void InitA(double *rhs)
 {
   long i, j;
   long ii, jj;
   long edge;
   long ibs;
   long jbs, skip;

   srand48((long) 1);
   edge = n%block_size;
   for (j=0; j<n; j++) {
     for (i=0; i<n; i++) {
       if ((n - i) <= edge) {
 	ibs = edge;
 	ibs = n-edge;
 	skip = edge;
       } else {
 	ibs = block_size;
 	skip = block_size;
       }
       if ((n - j) <= edge) {
 	jbs = edge;
 	jbs = n-edge;
       } else {
 	jbs = block_size;
       }
       ii = (i/block_size) + (j/block_size)*nblocks;
       jj = (i%ibs)+(j%jbs)*skip;
       a[ii][jj] = ((double) lrand48())/MAXRAND;
       if (i == j) {
 	a[ii][jj] *= 10;
       }
     }
   }

   for (j=0; j<n; j++) {
     rhs[j] = 0.0;
   }
   for (j=0; j<n; j++) {
     for (i=0; i<n; i++) {
       if ((n - i) <= edge) {
 	ibs = edge;
 	ibs = n-edge;
 	skip = edge;
       } else {
 	ibs = block_size;
 	skip = block_size;
       }
       if ((n - j) <= edge) {
 	jbs = edge;
 	jbs = n-edge;
       } else {
 	jbs = block_size;
       }
       ii = (i/block_size) + (j/block_size)*nblocks;
       jj = (i%ibs)+(j%jbs)*skip;
       rhs[i] += a[ii][jj];
     }
   }
 }


 double TouchA(long bs, long MyNum)
 {
   long i, j, I, J;
   double tot = 0.0;
   long ibs;
   long jbs;

   /* touch my portion of A[] */

   for (J=0; J<nblocks; J++) {
     for (I=0; I<nblocks; I++) {
       if (BlockOwner(I, J) == MyNum) {
 	if (J == nblocks-1) {
 	  jbs = n%bs;
 	  if (jbs == 0) {
 	    jbs = bs;
           }
 	} else {
 	  jbs = bs;
 	}
 	if (I == nblocks-1) {
 	  ibs = n%bs;
 	  if (ibs == 0) {
 	    ibs = bs;
           }
 	} else {
 	  ibs = bs;
 	}
 	for (j=0; j<jbs; j++) {
 	  for (i=0; i<ibs; i++) {
 	    tot += a[I+J*nblocks][i+j*ibs];
           }
 	}
       }
     }
   }
   return(tot);
 }


 void PrintA()
 {
   long i, j;
   long ii, jj;
   long edge;
   long ibs, jbs, skip;

   edge = n%block_size;
   for (i=0; i<n; i++) {
     for (j=0; j<n; j++) {
       if ((n - i) <= edge) {
 	ibs = edge;
 	ibs = n-edge;
         skip = edge;
       } else {
 	ibs = block_size;
         skip = block_size;
       }
       if ((n - j) <= edge) {
 	jbs = edge;
 	jbs = n-edge;
       } else {
 	jbs = block_size;
       }
       ii = (i/block_size) + (j/block_size)*nblocks;
       jj = (i%ibs)+(j%jbs)*skip;
       printf("%8.1f ", a[ii][jj]);
     }
     printf("\n");
   }
   fflush(stdout);
 }


 void CheckResult(long n, double **a, double *rhs)
 {
   long i, j, bogus = 0;
   double *y, diff, max_diff;
   long ii, jj;
   long edge;
   long ibs, jbs, skip;

   edge = n%block_size;
   y = (double *) malloc(n*sizeof(double));
   if (y == NULL) {
     printerr("Could not malloc memory for y\n");
     exit(-1);
   }
   for (j=0; j<n; j++) {
     y[j] = rhs[j];
   }
   for (j=0; j<n; j++) {
     if ((n - j) <= edge) {
       jbs = edge;
       jbs = n-edge;
       skip = edge;
     } else {
       jbs = block_size;
       skip = block_size;
     }
     ii = (j/block_size) + (j/block_size)*nblocks;
     jj = (j%jbs)+(j%jbs)*skip;

     y[j] = y[j]/a[ii][jj];
     for (i=j+1; i<n; i++) {
       if ((n - i) <= edge) {
         ibs = edge;
         ibs = n-edge;
         skip = edge;
       } else {
         ibs = block_size;
         skip = block_size;
       }
       ii = (i/block_size) + (j/block_size)*nblocks;
       jj = (i%ibs)+(j%jbs)*skip;

       y[i] -= a[ii][jj]*y[j];
     }
   }

   for (j=n-1; j>=0; j--) {
     for (i=0; i<j; i++) {
       if ((n - i) <= edge) {
 	ibs = edge;
         ibs = n-edge;
         skip = edge;
       } else {
 	ibs = block_size;
         skip = block_size;
       }
       if ((n - j) <= edge) {
 	jbs = edge;
         jbs = n-edge;
       } else {
 	jbs = block_size;
       }
       ii = (i/block_size) + (j/block_size)*nblocks;
       jj = (i%ibs)+(j%jbs)*skip;
       y[i] -= a[ii][jj]*y[j];
     }
   }

   max_diff = 0.0;
   for (j=0; j<n; j++) {
     diff = y[j] - 1.0;
     if (fabs(diff) > 0.00001) {
       bogus = 1;
       max_diff = diff;
     }
   }
   if (bogus) {
     printf("TEST FAILED: (%.5f diff)\n", max_diff);
   } else {
     printf("TEST PASSED\n");
   }
   free(y);
 }


 void printerr(char *s)
 {
   fprintf(stderr,"ERROR: %s\n",s);
 }
	/*************************************************************************/
	/* */
	/* Copyright (c) 1994 Stanford University */
	/* */
	/* All rights reserved. */
	/* */
	/* Permission is given to use, copy, and modify this software for any */
	/* non-commercial purpose as long as this copyright notice is not */
	/* removed. All other uses, including redistribution in whole or in */
	/* part, are forbidden without prior written permission. */
	/* */
	/* This software is provided with absolutely no warranty and no */
	/* support. */
	/* */
	/*************************************************************************/

	/*************************************************************************/
	/* */
	/* Parallel dense blocked LU factorization (no pivoting) */
	/* */
	/* This version contains two dimensional arrays in which the first */
	/* dimension is the block to be operated on, and the second contains */
	/* all data points in that block. In this manner, all data points in */
	/* a block (which are operated on by the same processor) are allocated */
	/* contiguously and locally, and false sharing is eliminated. */
	/* */
	/* Command line options: */
	/* */
	/* -nN : Decompose NxN matrix. */
	/* -pP : P = number of processors. */
	/* -bB : Use a block size of B. BxB elements should fit in cache for */
	/* good performance. Small block sizes (B=8, B=16) work well. */
	/* -s : Print individual processor timing statistics. */
	/* -t : Test output. */
	/* -o : Print out matrix values. */
	/* -h : Print out command line options. */
	/* */
	/* Note: This version works under both the FORK and SPROC models */
	/* */
	/*************************************************************************/

	#include <stdio.h>
	#include <math.h>
	#include <stdlib.h>
	#ifdef ENABLE_PARSEC_HOOKS
	#include <hooks.h>
	#endif
	MAIN_ENV

	#define MAXRAND 32767.0
	#define DEFAULT_N 512
	#define DEFAULT_P 1
	#define DEFAULT_B 16
	#define min(a,b) ((a) < (b) ? (a) : (b))
	//#define PAGE_SIZE 4096
	#define PAGE_SIZE 1024

	struct GlobalMemory {
	double *t_in_fac;
	double *t_in_solve;
	double *t_in_mod;
	double *t_in_bar;
	double *completion;
	unsigned long starttime;
	unsigned long rf;
	unsigned long rs;
	unsigned long done;
	long id;
	BARDEC(start)
	LOCKDEC(idlock)
	} *Global;

	struct LocalCopies {
	double t_in_fac;
	double t_in_solve;
	double t_in_mod;
	double t_in_bar;
	};

	long n = DEFAULT_N; /* The size of the matrix */
	long P = DEFAULT_P; /* Number of processors */
	long block_size = DEFAULT_B; /* Block dimension */
	long nblocks; /* Number of blocks in each dimension */
	long num_rows; /* Number of processors per row of processor grid */
	long num_cols; /* Number of processors per col of processor grid */
	double *a; / a = lu; l and u both placed back in a */
	double *rhs;
	long proc_bytes; / Bytes to malloc per processor to hold blocks of A*/
	double *last_malloc; / Starting point of last block of A */

	long test_result = 0; /* Test result of factorization? */
	long doprint = 0; /* Print out matrix values? */
	long dostats = 0; /* Print out individual processor statistics? */

	void SlaveStart(void);
	void OneSolve(long n, long block_size, long MyNum, long dostats);
	void lu0(double *a, long n, long stride);
	void bdiv(double a, double diag, long stride_a, long stride_diag, long dimi, long dimk);
	void bmodd(double a, double c, long dimi, long dimj, long stride_a, long stride_c);
	void bmod(double a, double b, double *c, long dimi, long dimj, long dimk, long stridea, long strideb, long stridec);
	void daxpy(double a, double b, long n, double alpha);
	long BlockOwner(long I, long J);
	long BlockOwnerColumn(long I, long J);
	long BlockOwnerRow(long I, long J);
	void lu(long n, long bs, long MyNum, struct LocalCopies *lc, long dostats);
	void InitA(double *rhs);
	double TouchA(long bs, long MyNum);
	void PrintA(void);
	void CheckResult(long n, double *a, double rhs);
	void printerr(char *s);

	int main(int argc, char *argv[])
	{
	#ifdef ENABLE_PARSEC_HOOKS
	__parsec_bench_begin (__splash2_lu_cb);
	#endif
	long i, j;
	long ch;
	extern char *optarg;
	double mint, maxt, avgt;
	double min_fac, min_solve, min_mod, min_bar;
	double max_fac, max_solve, max_mod, max_bar;
	double avg_fac, avg_solve, avg_mod, avg_bar;
	long proc_num;
	long edge;
	long size;
	unsigned long start;

	CLOCK(start)

	while ((ch = getopt(argc, argv, "n:p:b:cstoh")) != -1) {
	switch(ch) {
	case 'n': n = atoi(optarg); break;
	case 'p': P = atoi(optarg); break;
	case 'b': block_size = atoi(optarg); break;
	case 's': dostats = 1; break;
	case 't': test_result = !test_result; break;
	case 'o': doprint = !doprint; break;
	case 'h': printf("Usage: LU <options>\n\n");
	printf("options:\n");
	printf(" -nN : Decompose NxN matrix.\n");
	printf(" -pP : P = number of processors.\n");
	printf(" -bB : Use a block size of B. BxB elements should fit in cache for \n");
	printf(" good performance. Small block sizes (B=8, B=16) work well.\n");
	printf(" -c : Copy non-locally allocated blocks to local memory before use.\n");
	printf(" -s : Print individual processor timing statistics.\n");
	printf(" -t : Test output.\n");
	printf(" -o : Print out matrix values.\n");
	printf(" -h : Print out command line options.\n\n");
	printf("Default: LU -n%1d -p%1d -b%1d\n",
	DEFAULT_N,DEFAULT_P,DEFAULT_B);
	exit(0);
	break;
	}
	}

	MAIN_INITENV(,150000000)

	printf("\n");
	printf("Blocked Dense LU Factorization\n");
	printf(" %ld by %ld Matrix\n",n,n);
	printf(" %ld Processors\n",P);
	printf(" %ld by %ld Element Blocks\n",block_size,block_size);
	printf("\n");
	printf("\n");

	num_rows = (long) sqrt((double) P);
	for (;;) {
	num_cols = P/num_rows;
	if (num_rows*num_cols == P)
	break;
	num_rows--;
	}
	nblocks = n/block_size;
	if (block_size * nblocks != n) {
	nblocks++;
	}

	edge = n%block_size;
	if (edge == 0) {
	edge = block_size;
	}

	proc_bytes = (long ) malloc(Psizeof(long));
	if (proc_bytes == NULL) {
	fprintf(stderr,"Could not malloc memory for proc_bytes.\n");
	exit(-1);
	}
	last_malloc = (double *) G_MALLOC(Psizeof(double *));
	if (last_malloc == NULL) {
	fprintf(stderr,"Could not malloc memory for last_malloc.\n");
	exit(-1);
	}
	for (i=0;i<P;i++) {
	proc_bytes[i] = 0;
	last_malloc[i] = NULL;
	}
	for (i=0;i<nblocks;i++) {
	for (j=0;j<nblocks;j++) {
	proc_num = BlockOwner(i,j);
	if ((i == nblocks-1) && (j == nblocks-1)) {
	size = edge*edge;
	} else if ((i == nblocks-1) \|\| (j == nblocks-1)) {
	size = edge*block_size;
	} else {
	size = block_size*block_size;
	}
	proc_bytes[proc_num] += size*sizeof(double);
	}
	}
	for (i=0;i<P;i++) {
	last_malloc[i] = (double *) G_MALLOC(proc_bytes[i] + PAGE_SIZE)
	if (last_malloc[i] == NULL) {
	fprintf(stderr,"Could not malloc memory blocks for proc %ld\n",i);
	exit(-1);
	}
	last_malloc[i] = (double *) (((unsigned long) last_malloc[i]) + PAGE_SIZE -
	((unsigned long) last_malloc[i]) % PAGE_SIZE);

	/* Note that this causes all blocks to start out page-aligned, and that
	for block sizes that exceed cache line size, blocks start at cache-line
	aligned addresses as well. This reduces false sharing */

	}
	a = (double *) G_MALLOC(nblocksnblockssizeof(double ));
	if (a == NULL) {
	printerr("Could not malloc memory for a\n");
	exit(-1);
	}
	for (i=0;i<nblocks;i++) {
	for (j=0;j<nblocks;j++) {
	proc_num = BlockOwner(i,j);
	a[i+j*nblocks] = last_malloc[proc_num];
	if ((i == nblocks-1) && (j == nblocks-1)) {
	size = edge*edge;
	} else if ((i == nblocks-1) \|\| (j == nblocks-1)) {
	size = edge*block_size;
	} else {
	size = block_size*block_size;
	}
	last_malloc[proc_num] += size;
	}
	}

	rhs = (double ) G_MALLOC(nsizeof(double));
	if (rhs == NULL) {
	printerr("Could not malloc memory for rhs\n");
	exit(-1);
	}

	Global = (struct GlobalMemory *) G_MALLOC(sizeof(struct GlobalMemory));
	Global->t_in_fac = (double ) G_MALLOC(Psizeof(double));
	Global->t_in_mod = (double ) G_MALLOC(Psizeof(double));
	Global->t_in_solve = (double ) G_MALLOC(Psizeof(double));
	Global->t_in_bar = (double ) G_MALLOC(Psizeof(double));
	Global->completion = (double ) G_MALLOC(Psizeof(double));

	if (Global == NULL) {
	printerr("Could not malloc memory for Global\n");
	exit(-1);
	} else if (Global->t_in_fac == NULL) {
	printerr("Could not malloc memory for Global->t_in_fac\n");
	exit(-1);
	} else if (Global->t_in_mod == NULL) {
	printerr("Could not malloc memory for Global->t_in_mod\n");
	exit(-1);
	} else if (Global->t_in_solve == NULL) {
	printerr("Could not malloc memory for Global->t_in_solve\n");
	exit(-1);
	} else if (Global->t_in_bar == NULL) {
	printerr("Could not malloc memory for Global->t_in_bar\n");
	exit(-1);
	} else if (Global->completion == NULL) {
	printerr("Could not malloc memory for Global->completion\n");
	exit(-1);
	}

	/* POSSIBLE ENHANCEMENT: Here is where one might distribute the a[i]
	blocks across physically distributed memories as desired.

	One way to do this is as follows:

	for (i=0;i<nblocks;i++) {
	for (j=0;j<nblocks;j++) {
	proc_num = BlockOwner(i,j);
	if ((i == nblocks-1) && (j == nblocks-1)) {
	size = edge*edge;
	} else if ((i == nblocks-1) \|\| (j == nblocks-1)) {
	size = edge*block_size;
	} else {
	size = block_size*block_size;
	}

	Place all addresses x such that
	(&(a[i+jnblocks][0]) <= x < &(a[i+jnblocks][size-1]))
	on node proc_num
	}
	}
	*/

	BARINIT(Global->start, P);
	LOCKINIT(Global->idlock);
	Global->id = 0;

	InitA(rhs);
	if (doprint) {
	printf("Matrix before decomposition:\n");
	PrintA();
	}

	#ifdef ENABLE_PARSEC_HOOKS
	__parsec_roi_begin();
	#endif
	CREATE(SlaveStart, P);
	WAIT_FOR_END(P);
	#ifdef ENABLE_PARSEC_HOOKS
	__parsec_roi_end();
	#endif

	if (doprint) {
	printf("\nMatrix after decomposition:\n");
	PrintA();
	}

	if (dostats) {
	maxt = avgt = mint = Global->completion[0];
	for (i=1; i<P; i++) {
	if (Global->completion[i] > maxt) {
	maxt = Global->completion[i];
	}
	if (Global->completion[i] < mint) {
	mint = Global->completion[i];
	}
	avgt += Global->completion[i];
	}
	avgt = avgt / P;

	min_fac = max_fac = avg_fac = Global->t_in_fac[0];
	min_solve = max_solve = avg_solve = Global->t_in_solve[0];
	min_mod = max_mod = avg_mod = Global->t_in_mod[0];
	min_bar = max_bar = avg_bar = Global->t_in_bar[0];

	for (i=1; i<P; i++) {
	if (Global->t_in_fac[i] > max_fac) {
	max_fac = Global->t_in_fac[i];
	}
	if (Global->t_in_fac[i] < min_fac) {
	min_fac = Global->t_in_fac[i];
	}
	if (Global->t_in_solve[i] > max_solve) {
	max_solve = Global->t_in_solve[i];
	}
	if (Global->t_in_solve[i] < min_solve) {
	min_solve = Global->t_in_solve[i];
	}
	if (Global->t_in_mod[i] > max_mod) {
	max_mod = Global->t_in_mod[i];
	}
	if (Global->t_in_mod[i] < min_mod) {
	min_mod = Global->t_in_mod[i];
	}
	if (Global->t_in_bar[i] > max_bar) {
	max_bar = Global->t_in_bar[i];
	}
	if (Global->t_in_bar[i] < min_bar) {
	min_bar = Global->t_in_bar[i];
	}
	avg_fac += Global->t_in_fac[i];
	avg_solve += Global->t_in_solve[i];
	avg_mod += Global->t_in_mod[i];
	avg_bar += Global->t_in_bar[i];
	}
	avg_fac = avg_fac/P;
	avg_solve = avg_solve/P;
	avg_mod = avg_mod/P;
	avg_bar = avg_bar/P;
	}
	printf(" PROCESS STATISTICS\n");
	printf(" Total Diagonal Perimeter Interior Barrier\n");
	printf(" Proc Time Time Time Time Time\n");
	printf(" 0 %10.0f %10.0f %10.0f %10.0f %10.0f\n",
	Global->completion[0],Global->t_in_fac[0],
	Global->t_in_solve[0],Global->t_in_mod[0],
	Global->t_in_bar[0]);
	if (dostats) {
	for (i=1; i<P; i++) {
	printf(" %3ld %10.0f %10.0f %10.0f %10.0f %10.0f\n",
	i,Global->completion[i],Global->t_in_fac[i],
	Global->t_in_solve[i],Global->t_in_mod[i],
	Global->t_in_bar[i]);
	}
	printf(" Avg %10.0f %10.0f %10.0f %10.0f %10.0f\n",
	avgt,avg_fac,avg_solve,avg_mod,avg_bar);
	printf(" Min %10.0f %10.0f %10.0f %10.0f %10.0f\n",
	mint,min_fac,min_solve,min_mod,min_bar);
	printf(" Max %10.0f %10.0f %10.0f %10.0f %10.0f\n",
	maxt,max_fac,max_solve,max_mod,max_bar);
	}
	printf("\n");
	Global->starttime = start;
	printf(" TIMING INFORMATION\n");
	printf("Start time : %16lu\n", Global->starttime);
	printf("Initialization finish time : %16lu\n", Global->rs);
	printf("Overall finish time : %16lu\n", Global->rf);
	printf("Total time with initialization : %16lu\n", Global->rf-Global->starttime);
	printf("Total time without initialization : %16lu\n", Global->rf-Global->rs);
	printf("\n");

	if (test_result) {
	printf(" TESTING RESULTS\n");
	CheckResult(n, a, rhs);
	}

	MAIN_END;
	#ifdef ENABLE_PARSEC_HOOKS
	__parsec_bench_end();
	#endif
	}


	void SlaveStart()
	{
	long MyNum;

	LOCK(Global->idlock)
	MyNum = Global->id;
	Global->id ++;
	UNLOCK(Global->idlock)

	/* POSSIBLE ENHANCEMENT: Here is where one might pin processes to
	processors to avoid migration */

	BARINCLUDE(Global->start);
	OneSolve(n, block_size, MyNum, dostats);
	}


	void OneSolve(long n, long block_size, long MyNum, long dostats)
	{
	unsigned long myrs;
	unsigned long myrf;
	unsigned long mydone;
	struct LocalCopies *lc;

	lc = (struct LocalCopies *) malloc(sizeof(struct LocalCopies));
	if (lc == NULL) {
	fprintf(stderr,"Proc %ld could not malloc memory for lc\n",MyNum);
	exit(-1);
	}
	lc->t_in_fac = 0.0;
	lc->t_in_solve = 0.0;
	lc->t_in_mod = 0.0;
	lc->t_in_bar = 0.0;

	/* barrier to ensure all initialization is done */
	BARRIER(Global->start, P);

	/* to remove cold-start misses, all processors touch their own data */
	TouchA(block_size, MyNum);

	BARRIER(Global->start, P);

	/* POSSIBLE ENHANCEMENT: Here is where one might reset the
	statistics that one is measuring about the parallel execution */

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(myrs);
	}

	lu(n, block_size, MyNum, lc, dostats);

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(mydone);
	}

	BARRIER(Global->start, P);

	if ((MyNum == 0) \|\| (dostats)) {
	Global->t_in_fac[MyNum] = lc->t_in_fac;
	Global->t_in_solve[MyNum] = lc->t_in_solve;
	Global->t_in_mod[MyNum] = lc->t_in_mod;
	Global->t_in_bar[MyNum] = lc->t_in_bar;
	Global->completion[MyNum] = mydone-myrs;
	}
	if (MyNum == 0) {
	CLOCK(myrf);
	Global->rs = myrs;
	Global->done = mydone;
	Global->rf = myrf;
	}
	}


	void lu0(double *a, long n, long stride)
	{
	long j;
	long k;
	long length;
	double alpha;

	for (k=0; k<n; k++) {
	/* modify subsequent columns */
	for (j=k+1; j<n; j++) {
	a[k+jstride] /= a[k+kstride];
	alpha = -a[k+j*stride];
	length = n-k-1;
	daxpy(&a[k+1+jstride], &a[k+1+kstride], n-k-1, alpha);
	}
	}
	}


	void bdiv(double a, double diag, long stride_a, long stride_diag, long dimi, long dimk)
	{
	long j;
	long k;
	double alpha;

	for (k=0; k<dimk; k++) {
	for (j=k+1; j<dimk; j++) {
	alpha = -diag[k+j*stride_diag];
	daxpy(&a[jstride_a], &a[kstride_a], dimi, alpha);
	}
	}
	}


	void bmodd(double a, double c, long dimi, long dimj, long stride_a, long stride_c)
	{
	long j;
	long k;
	long length;
	double alpha;

	for (k=0; k<dimi; k++) {
	for (j=0; j<dimj; j++) {
	c[k+jstride_c] /= a[k+kstride_a];
	alpha = -c[k+j*stride_c];
	length = dimi - k - 1;
	daxpy(&c[k+1+jstride_c], &a[k+1+kstride_a], dimi-k-1, alpha);
	}
	}
	}


	void bmod(double a, double b, double *c, long dimi, long dimj, long dimk, long stridea, long strideb, long stridec)
	{
	long j;
	long k;
	double alpha;

	for (k=0; k<dimk; k++) {
	for (j=0; j<dimj; j++) {
	alpha = -b[k+j*strideb];
	daxpy(&c[jstridec], &a[kstridea], dimi, alpha);
	}
	}
	}


	void daxpy(double a, double b, long n, double alpha)
	{
	long i;

	for (i=0; i<n; i++) {
	a[i] += alpha*b[i];
	}
	}


	long BlockOwner(long I, long J)
	{
	// return((J%num_cols) + (I%num_rows)*num_cols);
	return((I + J*nblocks) % P);
	}

	long BlockOwnerColumn(long I, long J)
	{
	return(I % P);
	}

	long BlockOwnerRow(long I, long J)
	{
	return(((J % P) + (P / 2)) % P);
	}

	void lu(long n, long bs, long MyNum, struct LocalCopies *lc, long dostats)
	{
	long i, il, j, jl, k, kl;
	long I, J, K;
	double A, B, C, D;
	long strI, strJ, strK;
	unsigned long t1, t2, t3, t4, t11, t22;

	for (k=0, K=0; k<n; k+=bs, K++) {
	kl = k + bs;
	if (kl > n) {
	kl = n;
	strK = kl - k;
	} else {
	strK = bs;
	}

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t1);
	}

	/* factor diagonal block */
	if (BlockOwner(K, K) == MyNum) {
	A = a[K+K*nblocks];
	lu0(A, strK, strK);
	}

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t11);
	}

	BARRIER(Global->start, P);

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t2);
	}

	/* divide column k by diagonal block */
	D = a[K+K*nblocks];
	for (i=kl, I=K+1; i<n; i+=bs, I++) {
	if (BlockOwnerColumn(I, K) == MyNum) { /* parcel out blocks */
	il = i + bs;
	if (il > n) {
	il = n;
	strI = il - i;
	} else {
	strI = bs;
	}
	A = a[I+K*nblocks];
	bdiv(A, D, strI, strK, strI, strK);
	}
	}
	/* modify row k by diagonal block */
	for (j=kl, J=K+1; j<n; j+=bs, J++) {
	if (BlockOwnerRow(K, J) == MyNum) { /* parcel out blocks */
	jl = j+bs;
	if (jl > n) {
	jl = n;
	strJ = jl - j;
	} else {
	strJ = bs;
	}
	A = a[K+J*nblocks];
	bmodd(D, A, strK, strJ, strK, strK);
	}
	}

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t22);
	}

	BARRIER(Global->start, P);

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t3);
	}

	/* modify subsequent block columns */
	for (i=kl, I=K+1; i<n; i+=bs, I++) {
	il = i+bs;
	if (il > n) {
	il = n;
	strI = il - i;
	} else {
	strI = bs;
	}
	A = a[I+K*nblocks];
	for (j=kl, J=K+1; j<n; j+=bs, J++) {
	jl = j + bs;
	if (jl > n) {
	jl = n;
	strJ= jl - j;
	} else {
	strJ = bs;
	}
	if (BlockOwner(I, J) == MyNum) { /* parcel out blocks */
	B = a[K+J*nblocks];
	C = a[I+J*nblocks];
	bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
	}
	}
	}

	if ((MyNum == 0) \|\| (dostats)) {
	CLOCK(t4);
	lc->t_in_fac += (t11-t1);
	lc->t_in_solve += (t22-t2);
	lc->t_in_mod += (t4-t3);
	lc->t_in_bar += (t2-t11) + (t3-t22);
	}
	}
	}


	void InitA(double *rhs)
	{
	long i, j;
	long ii, jj;
	long edge;
	long ibs;
	long jbs, skip;

	srand48((long) 1);
	edge = n%block_size;
	for (j=0; j<n; j++) {
	for (i=0; i<n; i++) {
	if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
	} else {
	ibs = block_size;
	skip = block_size;
	}
	if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
	} else {
	jbs = block_size;
	}
	ii = (i/block_size) + (j/block_size)*nblocks;
	jj = (i%ibs)+(j%jbs)*skip;
	a[ii][jj] = ((double) lrand48())/MAXRAND;
	if (i == j) {
	a[ii][jj] *= 10;
	}
	}
	}

	for (j=0; j<n; j++) {
	rhs[j] = 0.0;
	}
	for (j=0; j<n; j++) {
	for (i=0; i<n; i++) {
	if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
	} else {
	ibs = block_size;
	skip = block_size;
	}
	if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
	} else {
	jbs = block_size;
	}
	ii = (i/block_size) + (j/block_size)*nblocks;
	jj = (i%ibs)+(j%jbs)*skip;
	rhs[i] += a[ii][jj];
	}
	}
	}


	double TouchA(long bs, long MyNum)
	{
	long i, j, I, J;
	double tot = 0.0;
	long ibs;
	long jbs;

	/* touch my portion of A[] */

	for (J=0; J<nblocks; J++) {
	for (I=0; I<nblocks; I++) {
	if (BlockOwner(I, J) == MyNum) {
	if (J == nblocks-1) {
	jbs = n%bs;
	if (jbs == 0) {
	jbs = bs;
	}
	} else {
	jbs = bs;
	}
	if (I == nblocks-1) {
	ibs = n%bs;
	if (ibs == 0) {
	ibs = bs;
	}
	} else {
	ibs = bs;
	}
	for (j=0; j<jbs; j++) {
	for (i=0; i<ibs; i++) {
	tot += a[I+Jnblocks][i+jibs];
	}
	}
	}
	}
	}
	return(tot);
	}


	void PrintA()
	{
	long i, j;
	long ii, jj;
	long edge;
	long ibs, jbs, skip;

	edge = n%block_size;
	for (i=0; i<n; i++) {
	for (j=0; j<n; j++) {
	if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
	} else {
	ibs = block_size;
	skip = block_size;
	}
	if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
	} else {
	jbs = block_size;
	}
	ii = (i/block_size) + (j/block_size)*nblocks;
	jj = (i%ibs)+(j%jbs)*skip;
	printf("%8.1f ", a[ii][jj]);
	}
	printf("\n");
	}
	fflush(stdout);
	}


	void CheckResult(long n, double *a, double rhs)
	{
	long i, j, bogus = 0;
	double *y, diff, max_diff;
	long ii, jj;
	long edge;
	long ibs, jbs, skip;

	edge = n%block_size;
	y = (double ) malloc(nsizeof(double));
	if (y == NULL) {
	printerr("Could not malloc memory for y\n");
	exit(-1);
	}
	for (j=0; j<n; j++) {
	y[j] = rhs[j];
	}
	for (j=0; j<n; j++) {
	if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
	skip = edge;
	} else {
	jbs = block_size;
	skip = block_size;
	}
	ii = (j/block_size) + (j/block_size)*nblocks;
	jj = (j%jbs)+(j%jbs)*skip;

	y[j] = y[j]/a[ii][jj];
	for (i=j+1; i<n; i++) {
	if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
	} else {
	ibs = block_size;
	skip = block_size;
	}
	ii = (i/block_size) + (j/block_size)*nblocks;
	jj = (i%ibs)+(j%jbs)*skip;

	y[i] -= a[ii][jj]*y[j];
	}
	}

	for (j=n-1; j>=0; j--) {
	for (i=0; i<j; i++) {
	if ((n - i) <= edge) {
	ibs = edge;
	ibs = n-edge;
	skip = edge;
	} else {
	ibs = block_size;
	skip = block_size;
	}
	if ((n - j) <= edge) {
	jbs = edge;
	jbs = n-edge;
	} else {
	jbs = block_size;
	}
	ii = (i/block_size) + (j/block_size)*nblocks;
	jj = (i%ibs)+(j%jbs)*skip;
	y[i] -= a[ii][jj]*y[j];
	}
	}

	max_diff = 0.0;
	for (j=0; j<n; j++) {
	diff = y[j] - 1.0;
	if (fabs(diff) > 0.00001) {
	bogus = 1;
	max_diff = diff;
	}
	}
	if (bogus) {
	printf("TEST FAILED: (%.5f diff)\n", max_diff);
	} else {
	printf("TEST PASSED\n");
	}
	free(y);
	}


	void printerr(char *s)
	{
	fprintf(stderr,"ERROR: %s\n",s);
	}