[SPARC64]: More SUN4V cpu mondo bug fixing. This cpu mondo sending interface isn't all that easy to use correctly... We were clearing out the wrong bits from the "mask" after getting something other than EOK from the hypervisor. It turns out we can simply resend the same cpu_list[] array to the hypervisor, with the 0xffff "done" entries still in there, and it will do the right thing. So don't update or try to rebuild the cpu_list[] array to condense it. This requires the "forward_progress" check to be done slightly differently, but this new scheme is less bug-prone than what we were doing before. Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 3cab0c3e8636d5005041aa52224f796c3a4ef872 [log] [tgz]
author: David S. Miller <davem@sunset.davemloft.net> Thu Mar 02 21:50:47 2006 -0800
committer: David S. Miller <davem@sunset.davemloft.net> Mon Mar 20 01:14:17 2006 -0800
tree: 582c92940f46cb0ecf8fafd4fde1cfd346172366
parent: bcc28ee0bf390df0d81cc9dafe980faef6b2771a [diff] [blame]
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 6bc7fd4..c4548a8 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c

@@ -563,7 +563,7 @@
 	u64 *mondo;
 	cpumask_t error_mask;
 	unsigned long flags, status;
-	int cnt, retries, this_cpu, i;
+	int cnt, retries, this_cpu, prev_sent, i;
 
 	/* We have to do this whole thing with interrupts fully disabled.
 	 * Otherwise if we send an xcall from interrupt context it will
@@ -595,8 +595,9 @@
 
 	cpus_clear(error_mask);
 	retries = 0;
+	prev_sent = 0;
 	do {
-		int forward_progress;
+		int forward_progress, n_sent;
 
 		status = sun4v_cpu_mondo_send(cnt,
 					      tb->cpu_list_pa,
@@ -606,18 +607,23 @@
 		if (likely(status == HV_EOK))
 			break;
 
-		/* First, clear out all the cpus in the mask that were
-		 * successfully sent to.  The hypervisor indicates this
-		 * by setting the cpu list entry of such cpus to 0xffff.
+		/* First, see if we made any forward progress.
+		 *
+		 * The hypervisor indicates successful sends by setting
+		 * cpu list entries to the value 0xffff.
 		 */
-		forward_progress = 0;
+		n_sent = 0;
 		for (i = 0; i < cnt; i++) {
-			if (cpu_list[i] == 0xffff) {
-				cpu_clear(i, mask);
-				forward_progress = 1;
-			}
+			if (likely(cpu_list[i] == 0xffff))
+				n_sent++;
 		}
 
+		forward_progress = 0;
+		if (n_sent > prev_sent)
+			forward_progress = 1;
+
+		prev_sent = n_sent;
+
 		/* If we get a HV_ECPUERROR, then one or more of the cpus
 		 * in the list are in error state.  Use the cpu_state()
 		 * hypervisor call to find out which cpus are in error state.
@@ -634,18 +640,20 @@
 				err = sun4v_cpu_state(cpu);
 				if (err >= 0 &&
 				    err == HV_CPU_STATE_ERROR) {
-					cpu_clear(cpu, mask);
+					cpu_list[i] = 0xffff;
 					cpu_set(cpu, error_mask);
 				}
 			}
 		} else if (unlikely(status != HV_EWOULDBLOCK))
 			goto fatal_mondo_error;
 
-		/* Rebuild the cpu_list[] array and try again.  */
-		cnt = 0;
-		for_each_cpu_mask(i, mask)
-			cpu_list[cnt++] = i;
-
+		/* Don't bother rewriting the CPU list, just leave the
+		 * 0xffff and non-0xffff entries in there and the
+		 * hypervisor will do the right thing.
+		 *
+		 * Only advance timeout state if we didn't make any
+		 * forward progress.
+		 */
 		if (unlikely(!forward_progress)) {
 			if (unlikely(++retries > 10000))
 				goto fatal_mondo_timeout;
commit	3cab0c3e8636d5005041aa52224f796c3a4ef872	[log] [tgz]
author	David S. Miller <davem@sunset.davemloft.net>	Thu Mar 02 21:50:47 2006 -0800
committer	David S. Miller <davem@sunset.davemloft.net>	Mon Mar 20 01:14:17 2006 -0800
tree	582c92940f46cb0ecf8fafd4fde1cfd346172366
parent	bcc28ee0bf390df0d81cc9dafe980faef6b2771a [diff] [blame]