| /*************************************************************************/ |
| /* */ |
| /* Copyright (c) 1994 Stanford University */ |
| /* */ |
| /* All rights reserved. */ |
| /* */ |
| /* Permission is given to use, copy, and modify this software for any */ |
| /* non-commercial purpose as long as this copyright notice is not */ |
| /* removed. All other uses, including redistribution in whole or in */ |
| /* part, are forbidden without prior written permission. */ |
| /* */ |
| /* This software is provided with absolutely no warranty and no */ |
| /* support. */ |
| /* */ |
| /*************************************************************************/ |
| |
| EXTERN_ENV |
| |
| #include <math.h> |
| #include "matrix.h" |
| |
| extern long *node; /* ALL GLOBAL */ |
| extern long postpass_partition_size; |
| extern long distribute; |
| BMatrix LB; |
| extern SMatrix L; |
| long P_dimi, P_dimj; |
| |
| /* perform symbolic factorization of original matrix into block form */ |
| |
| void CreateBlockedMatrix2(SMatrix M, long block_ub, long *T, long *firstchild, long *child, long *PERM, long *INVP, long *domain, long *partition) |
| { |
| long i, j, k, p, which, super, n_nz; |
| long *structure, *nz; |
| Block *blocks; |
| extern long P; |
| extern long *domains, *proc_domains; |
| long num_partitions, piece_size, piece, current; |
| |
| LB.n = M.n; |
| LB.domain = domain; |
| LB.domains = domains; |
| LB.proc_domains = proc_domains; |
| LB.n_domains = proc_domains[P]; |
| LB.entries_allocated = block_ub+20; |
| |
| LB.proc_domain_storage = (double **) MyMalloc(LB.n_domains*sizeof(double *), |
| DISTRIBUTED); |
| for (i=0; i<LB.n_domains; i++) |
| LB.proc_domain_storage[i] = NULL; |
| |
| /* one dummy column for each domain */ |
| LB.partition_size = (long *) MyMalloc((LB.n+LB.n_domains+1)*sizeof(long), |
| DISTRIBUTED); |
| LB.col = (long *) MyMalloc((LB.n+LB.n_domains+1)*sizeof(long), DISTRIBUTED); |
| |
| LB.entry = (Entry *) G_MALLOC(LB.entries_allocated*sizeof(Entry),0); |
| MigrateMem(LB.entry, LB.entries_allocated*sizeof(Entry), DISTRIBUTED); |
| LB.row = (long *) G_MALLOC(LB.entries_allocated*sizeof(long), 0); |
| MigrateMem(LB.row, LB.entries_allocated*sizeof(long), DISTRIBUTED); |
| |
| FindMachineDimensions(P); |
| |
| /* merge partitions */ |
| |
| for (j=0; j<M.n; j+=node[j]) { |
| if (!LB.domain[j]) { |
| num_partitions = FindNumPartitions(node[j], postpass_partition_size); |
| current = j; |
| for (piece=0; piece<num_partitions; piece++) { |
| piece_size = node[j]*(piece+1)/num_partitions - |
| node[j]*piece/num_partitions; |
| for (k=current; k<current+piece_size; k++) { |
| partition[k] = current; |
| } |
| current += piece_size; |
| } |
| } |
| } |
| |
| /* determine block sizes */ |
| |
| j = 0; LB.max_partition = 0; LB.n_partitions = 0; |
| while (j < M.n) { |
| if (LB.domain[j]) { |
| LB.partition_size[j] = 1; |
| j++; |
| } |
| else { |
| k = j; |
| while (partition[k] == j && k < M.n) |
| k++; |
| LB.partition_size[j] = k-j; |
| for (i=j+1; i<k; i++) |
| LB.partition_size[i] = j-i; |
| |
| if (LB.partition_size[j] > LB.max_partition) |
| LB.max_partition = LB.partition_size[j]; |
| LB.n_partitions++; |
| |
| j = k; |
| } |
| } |
| for (j=0; j<LB.n_domains; j++) |
| LB.partition_size[LB.n+j] = 1; |
| if (LB.n_partitions == 0) |
| LB.n_partitions = 1; |
| |
| /* determine numbering based on partitions only */ |
| LB.renumbering = (long *) MyMalloc((LB.n+LB.n_domains)*sizeof(long), |
| DISTRIBUTED); |
| ComputePartitionNumbering(LB.renumbering); |
| for (j=0; j<LB.n_domains; j++) |
| LB.renumbering[LB.n+j] = j%LB.n_partitions; |
| |
| /* determine mapping of rows/columns of blocks to rows/columns of procs */ |
| LB.mapI = (long *) MyMalloc(LB.n_partitions*sizeof(long), DISTRIBUTED); |
| LB.mapJ = (long *) MyMalloc(LB.n_partitions*sizeof(long), DISTRIBUTED); |
| for (i=0; i<LB.n_partitions; i++) |
| LB.mapI[i] = LB.mapJ[i] = i; |
| printf("No redistribution\n"); |
| |
| printf("Supers: "); DumpSizes(LB, domain, node); |
| printf("Blocks: "); DumpSizes(LB, domain, LB.partition_size); |
| printf("%ld partitions\n", LB.n_partitions); |
| |
| structure = (long *) malloc(M.n*sizeof(long)); |
| nz = (long *) malloc(M.n*sizeof(long)); |
| for (i=0; i<M.n; i++) |
| structure[i] = 0; |
| |
| /* find the block structure of the factor */ |
| LB.col[0] = 0; |
| /* first the domains... */ |
| for (super=0; super<LB.n; super+=node[super]) |
| if (domain[super]) { |
| FindSuperStructure(M, super, PERM, INVP, firstchild, child, |
| structure, nz, &n_nz); |
| FindDomStructure(super, nz, n_nz); |
| } |
| else { |
| for (j=super; j<super+node[super]; j+=LB.partition_size[j]) { |
| FindBlStructure(M, j, PERM, INVP, firstchild, child, |
| structure, nz); |
| |
| /* fill in rest of partition */ |
| for (i=j+1; i<j+LB.partition_size[j]; i++) |
| LB.col[i+1] = LB.col[i]; |
| } |
| } |
| /* ...then the domain dummies */ |
| for (j=0; j<LB.n_domains; j++) |
| FindDummyDomainStructure(j); |
| |
| LB.n_entries = LB.col[LB.n+LB.n_domains]; |
| free(structure); free(nz); |
| |
| /* place domain entries on owner clusters */ |
| PlaceDomains(P); |
| |
| /* count how many blocks are needed */ |
| LB.n_blocks = 0; |
| for (j=0; j<LB.n; j+=LB.partition_size[j]) |
| if (!LB.domain[j]) |
| LB.n_blocks += (LB.col[j+1]-LB.col[j]); |
| for (j=0; j<LB.n_domains; j++) |
| LB.n_blocks += (LB.col[LB.n+j+1]-LB.col[LB.n+j]); |
| |
| printf("%ld partitions, %ld blocks\n", LB.n_partitions, LB.n_blocks); |
| |
| /* now allocate storage for blocks and fill in simple info */ |
| blocks = (Block *) MyMalloc(LB.n_blocks*sizeof(Block), DISTRIBUTED); |
| which = 0; |
| for (j=0; j<LB.n; j+=LB.partition_size[j]) { |
| if (!LB.domain[j]) |
| for (i=LB.col[j]; i<LB.col[j+1]; i++) { |
| BLOCK(i) = &blocks[which]; |
| BLOCK(i)->i = LB.row[i]; |
| BLOCK(i)->j = j; |
| if (LB.renumbering[BLOCK(i)->i] < 0 || |
| LB.renumbering[BLOCK(i)->j] < 0) { |
| printf("Block %ld has bad structure\n", which); |
| exit(-1); |
| } |
| BLOCK(i)->done = 0; |
| BLOCK(i)->pair = NULL; |
| which++; |
| } |
| } |
| /* and for domain dummies */ |
| for (p=0; p<P; p++) |
| for (j=LB.proc_domains[p]; j<LB.proc_domains[p+1]; j++) |
| for (i=LB.col[LB.n+j]; i<LB.col[LB.n+j+1]; i++) { |
| BLOCK(i) = &blocks[which]; |
| BLOCK(i)->i = LB.row[i]; |
| BLOCK(i)->j = LB.n+j; |
| BLOCK(i)->nz = NULL; |
| BLOCK(i)->owner = p; |
| BLOCK(i)->done = 0; |
| BLOCK(i)->pair = NULL; |
| which++; |
| } |
| |
| ComputeBlockParents(T); |
| |
| } |
| |
| long FindNumPartitions(long set_size, long piece_size) |
| { |
| long num_partitions; |
| |
| if (set_size <= 4*piece_size/3) |
| num_partitions = 1; |
| else { |
| num_partitions = (set_size+piece_size-1)/piece_size; |
| if (piece_size - set_size/num_partitions > |
| set_size/(num_partitions-1) - piece_size) |
| num_partitions--; |
| } |
| |
| return(num_partitions); |
| } |
| |
| |
| void ComputeBlockParents(long *T) |
| { |
| long b, i, parent_col; |
| |
| /* compute block parents */ |
| |
| for (b=0; b<LB.n; b+=LB.partition_size[b]) |
| if (!LB.domain[b]) { |
| for (i=LB.col[b]; i<LB.col[b+1]; i++) { |
| parent_col = T[b+LB.partition_size[b]-1]; |
| if (parent_col == LB.n) |
| BLOCK(i)->parent = -1; |
| else if (BLOCK(i)->i <= BLOCK(i)->j) |
| BLOCK(i)->parent = -1; /* above diag */ |
| else { |
| BLOCK(i)->parent = FindBlock(BLOCK(i)->i, parent_col); |
| if (BLOCK(i)->parent == -1) |
| printf("Parent not found\n"); |
| } |
| } |
| } |
| |
| /* find parents for domain dummy blocks */ |
| for (b=0; b<LB.n_domains; b++) |
| for (i=LB.col[LB.n+b]; i<LB.col[LB.n+b+1]; i++) { |
| parent_col = T[LB.domains[b]]; |
| BLOCK(i)->parent = FindBlock(BLOCK(i)->i, parent_col); |
| } |
| } |
| |
| |
| /* find non-zero structure of individual blocks */ |
| |
| void FillInStructure(SMatrix M, long *firstchild, long *child, long *PERM, long *INVP) |
| { |
| long i, j, col, super; |
| long *structure, *nz, n_nz; |
| |
| /* all procedures get structure=0, and return structure=0 */ |
| structure = (long *) malloc(M.n*sizeof(long)); |
| nz = (long *) malloc(M.n*sizeof(long)); |
| for (i=0; i<M.n; i++) |
| structure[i] = 0; |
| |
| /* find the structure of dummy domain blocks */ |
| for (j=0; j<LB.n_domains; j++) { |
| col = LB.domains[j]; |
| for (i=LB.col[col]+1; i<LB.col[col+1]; i++) |
| nz[i-LB.col[col]-1] = LB.row[i]; |
| n_nz = LB.col[col+1]-LB.col[col]-1; |
| FindDetailedStructure(LB.n+j, structure, nz, n_nz); |
| } |
| |
| /* find the structure of the individual blocks */ |
| for (super=0; super<LB.n; super+=node[super]) { |
| FindSuperStructure(M, super, PERM, INVP, firstchild, child, |
| structure, nz, &n_nz); |
| if (!LB.domain[super]) |
| for (j=super; j<super+node[super]; j+=LB.partition_size[j]) { |
| FindDetailedStructure(j, structure, nz, n_nz); |
| } |
| } |
| |
| free(structure); free(nz); |
| |
| } |
| |
| /* put original non-zero values into blocks */ |
| |
| void FillInNZ(SMatrix M, long *PERM, long *INVP) |
| { |
| long j; |
| double *scatter; |
| |
| scatter = (double *) malloc(M.n*sizeof(double)); |
| for (j=0; j<M.n; j++) |
| scatter[j] = 0.0; |
| |
| /* fill in non-zeroes */ |
| for (j=0; j<LB.n; j+=LB.partition_size[j]) |
| FillIn(M, j, PERM, INVP, scatter); |
| |
| free(scatter); |
| } |
| |
| |
| void FindDomStructure(long super, long *nz, long n_nz) |
| { |
| long col, i; |
| |
| for (col=super; col<super+node[super]; col++) { |
| LB.col[col+1] = LB.col[col] + n_nz - (col-super); |
| if (LB.col[col+1] > LB.entries_allocated) { |
| printf("Overflow\n"); |
| exit(-1); |
| } |
| |
| for (i=col-super; i<n_nz; i++) |
| LB.row[LB.col[col]+i-(col-super)] = nz[i]; |
| } |
| } |
| |
| void FindDummyDomainStructure(long which_domain) |
| { |
| long col, row, current_block, current_block_last; |
| |
| col = LB.domains[which_domain]; |
| |
| current_block = current_block_last = -1; |
| row = LB.col[col]+1; |
| |
| /* column starts out empty */ |
| LB.col[LB.n+which_domain+1] = LB.col[LB.n+which_domain]; |
| |
| while (row < LB.col[col+1]) { |
| current_block = LB.row[row]; |
| if (LB.partition_size[current_block] < 0) |
| current_block += LB.partition_size[current_block]; |
| current_block_last = current_block + LB.partition_size[current_block]; |
| LB.row[LB.col[LB.n+which_domain+1]] = current_block; |
| LB.col[LB.n+which_domain+1]++; |
| while (LB.row[row] >= current_block && |
| LB.row[row] < current_block_last && |
| row < LB.col[col+1]) |
| row++; |
| } |
| |
| if (LB.col[LB.n+which_domain+1] > LB.entries_allocated) { |
| printf("Overflow!!\n"); |
| exit(-1); |
| } |
| } |
| |
| |
| void CheckColLength(long col, long n_nz) |
| { |
| extern long *nz; |
| |
| if (n_nz != nz[col]) |
| printf("Col %ld: %ld vs %ld\n", col, n_nz, nz[col]); |
| } |
| |
| void FindBlStructure(SMatrix M, long super, long *PERM, long *INVP, long *firstchild, long *child, long *structure, long *nz) |
| { |
| long truecol, i, c, col, the_child, bl, n_nz; |
| |
| n_nz = 0; |
| for (col=super; col<super+node[super]; col++) { |
| truecol = PERM[col]; |
| for (i=M.col[truecol]; i<M.col[truecol+1]; i++) { |
| bl = INVP[M.row[i]]; |
| if (LB.partition_size[bl] < 0) |
| bl += LB.partition_size[bl]; |
| if (bl >= super && !structure[bl]) { |
| structure[bl] = 1; |
| nz[n_nz++] = bl; |
| } |
| } |
| } |
| |
| for (c=firstchild[super]; c<firstchild[super+1]; c++) { |
| the_child = child[c]; |
| if (LB.partition_size[the_child] < 0) |
| the_child += LB.partition_size[the_child]; |
| for (i=LB.col[the_child]; i<LB.col[the_child+1]; i++) { |
| bl = LB.row[i]; |
| if (LB.partition_size[bl] < 0) |
| bl += LB.partition_size[bl]; |
| if (bl >= super && !structure[bl]) { |
| structure[bl] = 1; |
| nz[n_nz++] = bl; |
| } |
| } |
| } |
| |
| /* reset structure[] to zero */ |
| for (i=0; i<n_nz; i++) |
| structure[nz[i]] = 0; |
| |
| InsSort(nz, n_nz); |
| |
| LB.col[super+1] = LB.col[super] + n_nz; |
| if (LB.col[super+1] > LB.entries_allocated) { |
| printf("Overflow\n"); |
| exit(-1); |
| } |
| for (i=0; i<n_nz; i++) |
| LB.row[LB.col[super]+i] = nz[i]; |
| |
| } |
| |
| |
| void FindSuperStructure(SMatrix M, long super, long *PERM, long *INVP, long *firstchild, long *child, long *structure, long *nz, long *n_nz) |
| { |
| long i, truecol, current, bl, c, the_child, row; |
| |
| *n_nz = 0; |
| |
| /* first add non-zeroes that come from columns within block */ |
| for (current=super; current<super+node[super]; current++) { |
| truecol = PERM[current]; |
| for (i=M.col[truecol]; i<M.col[truecol+1]; i++) { |
| row = INVP[M.row[i]]; |
| if (row >= super && !structure[row]) { |
| structure[row] = 1; |
| nz[(*n_nz)++] = row; |
| } |
| } |
| } |
| |
| /* then add non-zeroes that come from children */ |
| for (c=firstchild[super]; c<firstchild[super+1]; c++) { |
| the_child = child[c]; |
| if (LB.partition_size[the_child] < 0) |
| the_child += LB.partition_size[the_child]; |
| if (LB.domain[the_child]) { |
| for (i=LB.col[the_child]; i<LB.col[the_child+1]; i++) { |
| row = LB.row[i]; |
| if (row >= super && !structure[row]) { |
| structure[row] = 1; |
| nz[(*n_nz)++] = row; |
| } |
| } |
| } |
| else { |
| for (i=LB.col[the_child]; i<LB.col[the_child+1]; i++) { |
| for (bl=0; bl<BLOCK(i)->length; bl++) { |
| if (BLOCK(i)->structure) |
| row = LB.row[i]+BLOCK(i)->structure[bl]; |
| else |
| row = LB.row[i]+bl; |
| if (row >= super && !structure[row]) { |
| structure[row] = 1; |
| nz[(*n_nz)++] = row; |
| } |
| } |
| } |
| } |
| } |
| |
| for (i=0; i<*n_nz; i++) |
| structure[nz[i]] = 0; |
| |
| InsSort(nz, *n_nz); |
| |
| CheckColLength(super, *n_nz); |
| |
| } |
| |
| |
| void FindDetailedStructure(long col, long *structure, long *nz, long n_nz) |
| { |
| long i, j, row, n, owner; |
| |
| for (i=0; i<n_nz; i++) |
| structure[nz[i]] = 1; |
| |
| for (i=LB.col[col]; i<LB.col[col+1]; i++) { |
| n = 0; |
| row = LB.row[i]; |
| for (j=0; j<LB.partition_size[row]; j++) |
| if (structure[row+j]) |
| n++; |
| |
| BLOCK(i)->length = n; |
| if (n == LB.partition_size[row]) { |
| BLOCK(i)->structure = NULL; |
| } |
| else { |
| owner = EmbeddedOwner(i); |
| if (owner < 0) |
| printf("%ld,%ld: %ld\n", BLOCKROW(i), BLOCKCOL(i), owner); |
| BLOCK(i)->structure = (long *) MyMalloc(n*sizeof(long), owner); |
| n = 0; |
| for (j=0; j<LB.partition_size[row]; j++) |
| if (structure[row+j]) |
| BLOCK(i)->structure[n++] = j; |
| |
| } |
| } |
| |
| for (i=0; i<n_nz; i++) |
| structure[nz[i]] = 0; |
| } |
| |
| |
| void AllocateNZ() |
| { |
| long i, j, b, size; |
| |
| for (j=0; j<LB.n; j+=LB.partition_size[j]) |
| if (!LB.domain[j]) { |
| for (b=LB.col[j]; b<LB.col[j+1]; b++) { |
| size = LB.partition_size[j]*BLOCK(b)->length; |
| BLOCK(b)->nz = (double *) MyMalloc(size*sizeof(double), BLOCK(b)->owner); |
| for (i=0; i<size; i++) |
| BLOCK(b)->nz[i] = 0.0; |
| } |
| } |
| } |
| |
| |
| |
| void FillIn(SMatrix M, long col, long *PERM, long *INVP, double *scatter) |
| { |
| long i, b, j1, row, truecol; |
| |
| truecol = PERM[col]; |
| if (LB.domain[col]) { |
| for (i=M.col[truecol]; i<M.col[truecol+1]; i++) { |
| row = INVP[M.row[i]]; |
| if (row >= col) { |
| if (M.nz) |
| scatter[row] = M.nz[i]; |
| else |
| scatter[row] = Value(M.row[i], truecol); |
| } |
| |
| } |
| for (i=LB.col[col]; i<LB.col[col+1]; i++) { |
| LB.entry[i].nz = scatter[LB.row[i]]; |
| scatter[LB.row[i]] = 0.0; |
| } |
| } |
| else { |
| |
| for (j1=0; j1<LB.partition_size[col]; j1++) { |
| truecol = PERM[col+j1]; |
| for (i=M.col[truecol]; i<M.col[truecol+1]; i++) { |
| row = INVP[M.row[i]]; |
| if (row >= col+j1) { |
| if (M.nz) |
| scatter[row] = M.nz[i]; |
| else |
| scatter[row] = Value(M.row[i], truecol); |
| } |
| } |
| for (b=LB.col[col]; b<LB.col[col+1]; b++) { |
| for (i=0; i<BLOCK(b)->length; i++) { |
| if (BLOCK(b)->structure) |
| row = LB.row[b] + BLOCK(b)->structure[i]; |
| else row = LB.row[b] + i; |
| BLOCK(b)->nz[i+j1*BLOCK(b)->length] = |
| scatter[row]; |
| scatter[row] = 0.0; |
| } |
| } |
| } |
| } |
| } |
| |
| |
| void InsSort(long *nz, long n) |
| { |
| long i, j, tmp; |
| |
| for (i=1; i<n; i++) { |
| j = i; |
| while (j>0 && nz[j-1] > nz[j]) { |
| tmp = nz[j]; nz[j] = nz[j-1]; nz[j-1] = tmp; j--; |
| } |
| } |
| } |
| |
| |
| /* determine relative indices for all blocks */ |
| |
| long BlDepth(long col) |
| { |
| long current, depth; |
| extern long *T; |
| |
| depth = 0; |
| current = col; |
| while (T[current] != current) { |
| current = T[current]; |
| depth++; |
| } |
| |
| return(depth); |
| } |
| |
| /* must be stable, blocks in same column must remain in sorted order */ |
| void SortByKey(long n, long *blocks, long *keys) |
| { |
| long i, j, blocki, keyi; |
| |
| for (i=0; i<n; i++) { |
| blocki = blocks[i]; |
| keyi = keys[i]; |
| j = i; |
| while ((j > 0) && (keys[j-1] > keyi)) { |
| blocks[j] = blocks[j-1]; |
| keys[j] = keys[j-1]; |
| j--; |
| } |
| blocks[j] = blocki; |
| keys[j] = keyi; |
| } |
| } |
| |
| |
| /* must be stable, blocks in same column must remain in sorted order */ |
| |
| void DumpSizes(BMatrix LB, long *domain, long *sizes) |
| { |
| long i, *buckets, maxm; |
| |
| maxm = 0; |
| for (i=0; i<LB.n; i+=sizes[i]) |
| if (!domain[i] && sizes[i] > maxm) |
| maxm = sizes[i]; |
| |
| buckets = (long *) malloc((maxm+1)*sizeof(long)); |
| for (i=0; i<=maxm; i++) |
| buckets[i] = 0; |
| |
| for (i=0; i<LB.n; i+=sizes[i]) |
| if (!domain[i]) |
| buckets[sizes[i]]++; |
| |
| for (i=0; i<=maxm; i++) { |
| if (buckets[i] == 0) |
| ; |
| else |
| printf("%ld: %ld ", i, buckets[i]); |
| } |
| printf("\n"); |
| |
| free(buckets); |
| } |
| |
| |
| void ComputePartitionNumbering(long *numbering) |
| { |
| long j, which; |
| |
| for (j=0; j<LB.n; j++) |
| numbering[j] = -1; |
| |
| which = 0; |
| for (j=0; j<LB.n; j+=LB.partition_size[j]) |
| if (!LB.domain[j]) |
| numbering[j] = which++; |
| } |
| |
| |
| /* factor P */ |
| void FindMachineDimensions(long P) |
| { |
| long try = 0, div = 0; |
| |
| for (try=(long) sqrt((double) P); try>0; try--) { |
| div = P/try; |
| if (div*try == P) |
| break; |
| } |
| |
| P_dimi = div; P_dimj = try; |
| printf("Processor array is %ld by %ld\n", P_dimi, P_dimj); |
| } |
| |
| |
| long EmbeddedOwner(long block) |
| { |
| long row, col; |
| |
| row = LB.mapI[LB.renumbering[BLOCKROW(block)]] % P_dimi; |
| col = LB.mapJ[LB.renumbering[BLOCKCOL(block)]] % P_dimj; |
| |
| return(row + col*P_dimi); |
| } |
| |