blob: 798cdcb78fb908e48212a322821743c24bc7d592 [file] [log] [blame]
/*************************************************************************/
/* */
/* Copyright (c) 1994 Stanford University */
/* */
/* All rights reserved. */
/* */
/* Permission is given to use, copy, and modify this software for any */
/* non-commercial purpose as long as this copyright notice is not */
/* removed. All other uses, including redistribution in whole or in */
/* part, are forbidden without prior written permission. */
/* */
/* This software is provided with absolutely no warranty and no */
/* support. */
/* */
/*************************************************************************/
EXTERN_ENV
#include "matrix.h"
#include <math.h>
int vMiss=0, wMiss=0, xMiss=0, yMiss=0; /* Local but don't care */
extern int BS;
extern struct GlobalMemory *Global;
extern BMatrix LB;
extern int solution_method;
extern int *node, *firstchild; /* global */
int *block_start, *all_blocks;
/* arguments */
extern int *T, P;
BFac(diag, MyNum, lc)
int MyNum;
struct LocalCopies *lc;
{
int n, is, il, js, jl, ks, kl;
double *A;
n = BLOCK(diag)->length;
A = BLOCK(diag)->nz;
for (js=0; js<n; js+=BS) {
jl = js+BS; if (jl > n) jl = n;
OneFac(&A[js+n*js], jl-js, n);
for (is=jl; is<n; is+=BS) {
il = is+BS; if (il > n) il = n;
CopyBlock(A, lc->blktmp, n, is, js, il, jl, MyNum, lc);
OneDiv(&A[js+n*js], lc->blktmp, jl-js, il-is, n);
CopyBlockBack(A, lc->blktmp, n, is, js, il, jl, MyNum, lc);
for (ks=jl; ks<is; ks+=BS) {
kl = ks+BS; if (kl > n) kl = n;
OneMatmat(lc->blktmp, &A[ks+n*js], &A[is+n*ks], kl-ks, jl-js, il-is, n, n);
}
OneLower(lc->blktmp, &A[is+n*is], il-is, jl-js, n);
}
}
}
/* Factor A (dim n1 by n1), stride n2 */
OneFac(A, n1, n2)
double *A;
{
int i, j, k;
for (j=0; j<n1; j++) {
for (k=0; k<j; k++)
for (i=j; i<n1; i++)
A[i+n2*j] -= A[j+n2*k]*A[i+n2*k];
A[k+n2*k] = sqrt(A[k+n2*k]);
for (i=j+1; i<n1; i++)
A[i+n2*k] /= A[k+n2*k];
}
}
BDiv(diag, below, n1, n3, diag_nz, below_nz, MyNum, lc)
int diag, below, MyNum;
int n1, n3;
double *diag_nz, *below_nz;
struct LocalCopies *lc;
{
int is, il, js, jl, ks, kl;
double *A, *B;
/* diag block is A, dim n1 by n1 */
/* below block is B, dim n3 by n1 */
A = diag_nz; B = below_nz;
if (n1*n3 <= BS*BS) {
OneDiv(A, B, n1, n3, n1);
}
else {
for (js=0; js<n1; js+=BS) {
jl = js+BS; if (jl > n1) jl = n1;
for (is=0; is<n3; is+=BS) {
il = is+BS; if (il > n3) il = n3;
CopyBlock(B, lc->blktmp, n3, is, js, il, jl, MyNum, lc);
OneDiv(&A[js+js*n1], lc->blktmp, jl-js, il-is, n1);
CopyBlockBack(B, lc->blktmp, n3, is, js, il, jl, MyNum, lc);
for (ks=jl; ks<n1; ks+=BS) {
kl = ks+BS; if (kl > n1) kl = n1;
OneMatmat(lc->blktmp, &A[ks+js*n1], &B[is+ks*n3], kl-ks, jl-js, il-is,
n3, n1);
}
}
}
}
}
/* diag block is A, dim n1 by n1, stride n4 */
/* below block is B, dim n3 by n1 */
/* n4 is stride of A */
OneDiv(A, B, n1, n3, n4)
double *A, *B;
{
int i, j, k;
double a_j0k0, a_j0k1, a_j0k2, a_j0k3;
double a_j1k0, a_j1k1, a_j1k2, a_j1k3;
double *b0, *b1, *b2, *b3;
double *dest0, *dest1, *last;
double *tmp;
double t0, t1, tmp0, tmp1;
for (j=0; j<n1-1; j+=2) {
for (k=0; k<j-3; k+=4) {
tmp = &A[j+n4*k];
a_j0k0 = *tmp; a_j1k0 = *(tmp+1); tmp += n4;
a_j0k1 = *tmp; a_j1k1 = *(tmp+1); tmp += n4;
a_j0k2 = *tmp; a_j1k2 = *(tmp+1); tmp += n4;
a_j0k3 = *tmp; a_j1k3 = *(tmp+1);
dest0 = &B[n3*j]; dest1 = last = dest0+n3;
b0 = &B[n3*k]; b1 = b0+n3; b2 = b1+n3; b3 = b2+n3;
while (dest0 != last) {
t0 = *dest0; t1 = *dest1;
tmp0 = *b0++; t0 -= a_j0k0*tmp0; t1 -= a_j1k0*tmp0;
tmp1 = *b1++; t0 -= a_j0k1*tmp1; t1 -= a_j1k1*tmp1;
tmp0 = *b2++; t0 -= a_j0k2*tmp0; t1 -= a_j1k2*tmp0;
tmp1 = *b3++; t0 -= a_j0k3*tmp1; t1 -= a_j1k3*tmp1;
*dest0++ = t0; *dest1++ = t1;
}
}
for (; k<j; k++) {
a_j0k0 = A[j+n4*k];
a_j1k0 = A[j+1+n4*k];
for (i=0; i<n3; i++) {
B[i+n3*(j+0)] -= a_j0k0*B[i+n3*k];
B[i+n3*(j+1)] -= a_j1k0*B[i+n3*k];
}
}
tmp0 = 1.0/A[j+n4*j];
for (i=0; i<n3; i++)
B[i+n3*j] *= tmp0;
for (i=0; i<n3; i++) {
a_j1k0 = A[j+1+n4*k];
B[i+n3*(j+1)] -= a_j1k0*B[i+n3*j];
}
tmp0 = 1.0/A[j+1+n4*(j+1)];
for (i=0; i<n3; i++)
B[i+n3*(j+1)] *= tmp0;
}
for (; j<n1; j++) {
for (k=0; k<j; k++) {
a_j0k0 = A[j+n4*k];
for (i=0; i<n3; i++)
B[i+n3*j] -= a_j0k0*B[i+n3*k];
}
for (i=0; i<n3; i++)
B[i+n3*j] /= A[j+n4*j];
}
}
BMod(top, bend, n1, n2, n3, top_nz, bend_nz, dest_nz, MyNum, lc)
int top, bend, MyNum;
int n1, n2, n3;
double *top_nz, *bend_nz, *dest_nz;
struct LocalCopies *lc;
{
int is, il, ks, kl, hbs;
double *B, *A, *C;
/* three blocks form an L */
/* top part is A, dim n1 by n2 */
/* bend part is B, dim n3 by n2 */
/* result is dim n3 by n1 dense block */
A = top_nz;
B = bend_nz;
C = dest_nz;
if (n2*n3 <= BS*BS) {
OneMatmat(B, A, C, n1, n2, n3, n3, n1);
}
else if (n3 < FitsInCache/16) { /* 16 columns at a time are good enough */
hbs = FitsInCache/n3;
for (ks=0; ks<n2; ks+=hbs) {
kl = ks+hbs; if (kl > n2) kl = n2;
OneMatmat(&B[ks*n3], &A[ks*n1], C, n1, kl-ks, n3, n3, n1);
}
}
else {
for (is=0; is<n3; is+=BS) {
il = is+BS; if (il > n3) il = n3;
for (ks=0; ks<n2; ks+=BS) {
kl = ks+BS; if (kl > n2) kl = n2;
CopyBlock(B, lc->blktmp, n3, is, ks, il, kl, MyNum, lc);
OneMatmat(lc->blktmp, &A[ks*n1], &C[is], n1, kl-ks, il-is, n3, n1);
}
}
}
}
CopyBlock(B, dest, n3, is, ks, il, kl,MyNum,lc)
double *B, *dest;
int MyNum;
struct LocalCopies *lc;
{
int i, k, bs;
double *destination, *bptr, *top_of_B;
bs = il-is;
for (k=ks; k<kl; k++)
for (i=is; i<il; i++) {
dest[(i-is)+(k-ks)*bs] = B[i+k*n3];
}
}
CopyBlockBack(B, src, n3, is, ks, il, kl)
double *B, *src;
{
int i, k, bs;
bs = il-is;
for (k=ks; k<kl; k++)
for (i=is; i<il; i++)
B[i+k*n3] = src[(i-is)+(k-ks)*bs];
}
/* Multiply B (B dim n3 by n2) and AT (A dim n1 by n2) */
/* Result added into C (C dim n3 by n1) */
/* n4 is stride of C, n5 is stride of A */
OneMatmat(B, A, C, n1, n2, n3, n4, n5)
double *B, *A, *C;
{
int i, j, k;
double a_j0k0, a_j0k1, a_j0k2, a_j0k3;
double a_j0k4, a_j0k5, a_j0k6, a_j0k7;
double a_j1k0, a_j1k1, a_j1k2, a_j1k3;
double a_j1k4, a_j1k5, a_j1k6, a_j1k7;
double *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
double *dest0, *dest1, *last;
double *tmp;
double t0, t1, tmp0, tmp1;
for (j=0; j<n1-1; j+=2) {
k = 0;
for (; k<n2-7; k+=8) {
tmp = &A[j+n5*k];
a_j0k0 = *tmp; a_j1k0 = *(tmp+1); tmp += n5;
a_j0k1 = *tmp; a_j1k1 = *(tmp+1); tmp += n5;
a_j0k2 = *tmp; a_j1k2 = *(tmp+1); tmp += n5;
a_j0k3 = *tmp; a_j1k3 = *(tmp+1); tmp += n5;
a_j0k4 = *tmp; a_j1k4 = *(tmp+1); tmp += n5;
a_j0k5 = *tmp; a_j1k5 = *(tmp+1); tmp += n5;
a_j0k6 = *tmp; a_j1k6 = *(tmp+1); tmp += n5;
a_j0k7 = *tmp; a_j1k7 = *(tmp+1);
dest0 = &C[n4*j]; dest1 = dest0 + n4;
last = dest0 + n3;
b0 = &B[n3*k]; b1 = b0+n3; b2 = b1+n3; b3 = b2+n3;
b4 = b3+n3; b5 = b4+n3; b6 = b5+n3; b7 = b6+n3;
while (dest0 != last) {
t0 = *dest0; t1 = *dest1;
tmp0 = *b0++; t0 -= a_j0k0*tmp0; t1 -= a_j1k0*tmp0;
tmp1 = *b1++; t0 -= a_j0k1*tmp1; t1 -= a_j1k1*tmp1;
tmp0 = *b2++; t0 -= a_j0k2*tmp0; t1 -= a_j1k2*tmp0;
tmp1 = *b3++; t0 -= a_j0k3*tmp1; t1 -= a_j1k3*tmp1;
tmp0 = *b4++; t0 -= a_j0k4*tmp0; t1 -= a_j1k4*tmp0;
tmp1 = *b5++; t0 -= a_j0k5*tmp1; t1 -= a_j1k5*tmp1;
tmp0 = *b6++; t0 -= a_j0k6*tmp0; t1 -= a_j1k6*tmp0;
tmp1 = *b7++; t0 -= a_j0k7*tmp1; t1 -= a_j1k7*tmp1;
*dest0++ = t0; *dest1++ = t1;
}
}
for (; k<n2-3; k+=4) {
tmp = &A[j+n5*k];
a_j0k0 = *tmp; a_j1k0 = *(tmp+1); tmp += n5;
a_j0k1 = *tmp; a_j1k1 = *(tmp+1); tmp += n5;
a_j0k2 = *tmp; a_j1k2 = *(tmp+1); tmp += n5;
a_j0k3 = *tmp; a_j1k3 = *(tmp+1);
dest0 = &C[n4*j]; dest1 = dest0 + n4;
last = dest0 + n3;
b0 = &B[n3*k]; b1 = b0+n3; b2 = b1+n3; b3 = b2+n3;
while (dest0 != last) {
t0 = *dest0; t1 = *dest1;
tmp0 = *b0++; t0 -= a_j0k0*tmp0; t1 -= a_j1k0*tmp0;
tmp1 = *b1++; t0 -= a_j0k1*tmp1; t1 -= a_j1k1*tmp1;
tmp0 = *b2++; t0 -= a_j0k2*tmp0; t1 -= a_j1k2*tmp0;
tmp1 = *b3++; t0 -= a_j0k3*tmp1; t1 -= a_j1k3*tmp1;
*dest0++ = t0; *dest1++ = t1;
}
}
for (; k<n2; k++) {
a_j0k0 = A[j+n5*k];
a_j1k0 = A[j+1+n5*k];
for (i=0; i<n3; i++) {
t0 = B[i+n3*k];
C[i+n4*j] -= a_j0k0*t0;
C[i+n4*(j+1)] -= a_j1k0*t0;
}
}
}
for (;j<n1; j++) {
for (k=0; k<n2-3; k+=4) {
tmp = &A[j+n5*k];
a_j0k0 = *tmp; tmp += n5;
a_j0k1 = *tmp; tmp += n5;
a_j0k2 = *tmp; tmp += n5;
a_j0k3 = *tmp; tmp += n5;
b0 = &B[n3*k]; b1 = b0+n3; b2 = b1+n3; b3 = b2+n3;
dest0 = &C[n4*j]; last = dest0 + n3;
while (dest0 != last) {
t0 = *dest0;
t0 -= a_j0k0*(*b0++);
t0 -= a_j0k1*(*b1++);
t0 -= a_j0k2*(*b2++);
t0 -= a_j0k3*(*b3++);
*dest0++ = t0;
}
}
for (; k<n2; k++) {
a_j0k0 = A[j+n5*k];
for (i=0; i<n3; i++)
C[i+n4*j] -= a_j0k0*B[i+n3*k];
}
}
}
/* block is n1 by n2, with given nz */
/* subtract lower triangle of AA^T from 'dest_nz' */
BLMod(left, n1, n2, left_nz, dest_nz, MyNum, lc)
int left, n1, n2, MyNum;
double *left_nz, *dest_nz;
struct LocalCopies *lc;
{
int is, ks, il, kl;
double *A, *C;
A = left_nz;
C = dest_nz;
if (n1*n2 <= BS*BS) {
OneLower(A, C, n1, n2, n1);
}
else {
for (is=0; is<n1; is+=BS) {
il = is+BS; if (il > n1) il = n1;
for (ks=0; ks<n2; ks+=BS) {
kl = ks+BS; if (kl > n2) kl = n2;
CopyBlock(A, lc->blktmp, n1, is, ks, il, kl, MyNum, lc);
OneMatmat(lc->blktmp, &A[ks*n1], &C[is], is, kl-ks, il-is, n1, n1);
OneLower(lc->blktmp, &C[is+is*n1], il-is, kl-ks, n1);
}
}
}
}
/* left part is A, dim n1 by n2 */
/* result is lower triangle of square block, dim n1 by n1 */
/* n3 is stride of C */
OneLower(A, C, n1, n2, n3)
double *A, *C;
{
int i, j, k;
double *tmp;
double a_j0k0, a_j0k1, a_j0k2, a_j0k3;
double a_j1k0, a_j1k1, a_j1k2, a_j1k3;
double *b0, *b1, *b2, *b3;
double t0, t1, tmp0, tmp1;
double *dest0, *dest1, *last;
for (j=0; j<n1-1; j+=2) {
for (k=0; k<n2-3; k+=4) {
tmp = &A[j+n1*k];
a_j0k0 = *tmp; a_j1k0 = *(tmp+1); tmp += n1;
a_j0k1 = *tmp; a_j1k1 = *(tmp+1); tmp += n1;
a_j0k2 = *tmp; a_j1k2 = *(tmp+1); tmp += n1;
a_j0k3 = *tmp; a_j1k3 = *(tmp+1);
b0 = &A[j+n1*k]; b1 = b0+n1; b2 = b1+n1; b3 = b2+n1;
dest0 = &C[j+n3*j];
last = dest0+n1-j;
*dest0++ -= a_j0k0**b0++ + a_j0k1**b1++ + a_j0k2**b2++ + a_j0k3**b3++;
dest1 = dest0 + n3;
while (dest0 != last) {
t0 = *dest0; t1 = *dest1;
tmp0 = *b0++; t0 -= a_j0k0*tmp0; t1 -= a_j1k0*tmp0;
tmp1 = *b1++; t0 -= a_j0k1*tmp1; t1 -= a_j1k1*tmp1;
tmp0 = *b2++; t0 -= a_j0k2*tmp0; t1 -= a_j1k2*tmp0;
tmp1 = *b3++; t0 -= a_j0k3*tmp1; t1 -= a_j1k3*tmp1;
*dest0++ = t0; *dest1++ = t1;
}
}
for (; k<n2; k++) {
a_j0k0 = A[j+n1*k];
a_j1k0 = A[j+1+n1*k];
C[j+n3*j] -= a_j0k0*A[j+n1*k];
for (i=j+1; i<n1; i++) {
C[i+n3*j] -= a_j0k0*A[i+n1*k];
C[i+n3*(j+1)] -= a_j1k0*A[i+n1*k];
}
}
}
for (; j<n1; j++)
for (k=0; k<n2; k++) {
a_j0k0 = A[j+n1*k];
for (i=j; i<n1; i++)
C[i+n3*j] -= a_j0k0*A[i+n1*k];
}
}
FindBlockUpdate(domain, bli, blj, update, stride)
int domain, blj, bli;
double **update;
int *stride;
{
int i;
int into_i, into_j, update_len;
double *domain_update;
into_j = 0;
for (i=LB.col[LB.n+domain]; i<blj; i++)
into_j += BLOCK(i)->length;
into_i = into_j;
for (; i<bli; i++)
into_i += BLOCK(i)->length;
update_len = into_i;
for (; i<LB.col[LB.n+domain+1]; i++)
update_len += BLOCK(i)->length;
domain_update = LB.proc_domain_storage[domain];
*update = &domain_update[into_j*update_len - into_j*(into_j+1)/2 + into_i];
*stride = update_len - into_j - 1;
}