[SPARC64]: Eliminate NR_CPUS limitations.

Cheetah systems can have cpuids as large as 1023, although physical
systems don't have that many cpus.

Only three limitations existed in the kernel preventing arbitrary
NR_CPUS values:

1) dcache dirty cpu state stored in page->flags on
   D-cache aliasing platforms.  With some build time
   calculations and some build-time BUG checks on
   page->flags layout, this one was easily solved.

2) The cheetah XCALL delivery code could only handle
   a cpumask with up to 32 cpus set.  Some simple looping
   logic clears that up too.

3) thread_info->cpu was a u8, easily changed to a u16.

There are a few spots in the kernel that still put NR_CPUS
sized arrays on the kernel stack, but that's not a sparc64
specific problem.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index f7fa873..c550bba 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -400,7 +400,7 @@
 static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
 {
 	u64 pstate, ver;
-	int nack_busy_id, is_jbus;
+	int nack_busy_id, is_jbus, need_more;
 
 	if (cpus_empty(mask))
 		return;
@@ -416,6 +416,7 @@
 	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
 
 retry:
+	need_more = 0;
 	__asm__ __volatile__("wrpr %0, %1, %%pstate\n\t"
 			     : : "r" (pstate), "i" (PSTATE_IE));
 
@@ -444,6 +445,10 @@
 				: /* no outputs */
 				: "r" (target), "i" (ASI_INTR_W));
 			nack_busy_id++;
+			if (nack_busy_id == 32) {
+				need_more = 1;
+				break;
+			}
 		}
 	}
 
@@ -460,6 +465,16 @@
 			if (dispatch_stat == 0UL) {
 				__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 						     : : "r" (pstate));
+				if (unlikely(need_more)) {
+					int i, cnt = 0;
+					for_each_cpu_mask(i, mask) {
+						cpu_clear(i, mask);
+						cnt++;
+						if (cnt == 32)
+							break;
+					}
+					goto retry;
+				}
 				return;
 			}
 			if (!--stuck)
@@ -497,6 +512,8 @@
 				if ((dispatch_stat & check_mask) == 0)
 					cpu_clear(i, mask);
 				this_busy_nack += 2;
+				if (this_busy_nack == 64)
+					break;
 			}
 
 			goto retry;