[SPARC64]: Get SUN4V SMP working.

The sibling cpu bringup is extremely fragile.  We can only
perform the most basic calls until we take over the trap
table from the firmware/hypervisor on the new cpu.

This means no accesses through %g4, %g5, or %g6, since the pointers
they normally hold can't be TLB translated without our trap handlers.

In order to achieve this:

1) Change sun4v_init_mondo_queues() so that it can operate in
   several modes.

   It can allocate the queues, or install them in the current
   processor, or both.

   The boot cpu does both in its early call.

   Later, the boot cpu allocates the sibling cpu's queues, starts the
   sibling cpu, and then the sibling cpu loads them in (a sketch of
   the resulting call patterns follows this list).

2) init_cur_cpu_trap() is changed to take a thread_info pointer as an
   argument instead of reading %g6 directly on the current cpu (see
   the second sketch after this list).

3) Create a trampoline stack for the sibling cpus.  We do our basic
   kernel calls using this stack, which is locked into the kernel
   image, then go to our proper thread stack after taking over the
   trap table.

4) While we are in this delicate startup state, we put 0xdeadbeef
   into %g4/%g5/%g6 in order to catch accidental accesses.

5) On the final prom_set_trap_table*() call, we put &init_thread_union
   into %g6.  This is a hack to make prom_world(0) work.  All
   prom_world(0) wants to do is restore the %asi register using
   get_thread_current_ds().
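
   As a rough sketch of the resulting sun4v_init_mondo_queues() call
   patterns (the irq.c, smp.c and trampoline.S hunks below are the
   authoritative change; this is only a summary):

      /* Boot cpu, in its early IRQ init (second irq.c hunk):
       * allocate from bootmem and load the queues in one step.
       */
      sun4v_init_mondo_queues(1, hard_smp_processor_id(), 1, 1);

      /* Boot cpu, just before starting a sibling (smp.c hunk):
       * allocate on behalf of the target cpu, no bootmem, no load.
       */
      sun4v_init_mondo_queues(0, cpu, 1, 0);

      /* Sibling cpu, from the trampoline once basic kernel calls
       * are possible: load only, no allocation.
       */
      sun4v_init_mondo_queues(0, hard_smp_processor_id(), 0, 1);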
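
   And item 2 in C terms, again only a sketch (the setup.c and
   trampoline.S hunks below are what actually changed):

      /* Boot cpu, from the setup.c hunk: */
      init_cur_cpu_trap(current_thread_info());

      /* Sibling cpu, before it owns the trap table: %g6 still holds
       * the 0xdeadbeef poison, so the thread_info pointer comes from
       * the startup cookie kept in %l0 instead ("new_cpu_thread_info"
       * is just a stand-in name for that value).
       */
      init_cur_cpu_trap(new_cpu_thread_info);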

Longer term we should just do the OBP calls to set the trap table by
hand, just like we do for everything else.  That would avoid the silly
prom_world(0) issue, and then we can remove the init_thread_union hack.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 6eb44ca..bb0bb34 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -1018,21 +1018,29 @@
 }
 
 /* Allocate and register the mondo and error queues for this cpu.  */
-void __cpuinit sun4v_init_mondo_queues(int use_bootmem)
+void __cpuinit sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load)
 {
-	int cpu = hard_smp_processor_id();
 	struct trap_per_cpu *tb = &trap_block[cpu];
 
-	alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
-	alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
-	alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
-	alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
-	alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
-	alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
+	if (alloc) {
+		alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
+		alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
 
-	init_cpu_send_mondo_info(tb, use_bootmem);
+		init_cpu_send_mondo_info(tb, use_bootmem);
+	}
 
-	sun4v_register_mondo_queues(cpu);
+	if (load) {
+		if (cpu != hard_smp_processor_id()) {
+			prom_printf("SUN4V: init mondo on cpu %d not %d\n",
+				    cpu, hard_smp_processor_id());
+			prom_halt();
+		}
+		sun4v_register_mondo_queues(cpu);
+	}
 }
 
 /* Only invoked on boot processor. */
@@ -1043,7 +1051,7 @@
 	memset(&ivector_table[0], 0, sizeof(ivector_table));
 
 	if (tlb_type == hypervisor)
-		sun4v_init_mondo_queues(1);
+		sun4v_init_mondo_queues(1, hard_smp_processor_id(), 1, 1);
 
 	/* We need to clear any IRQ's pending in the soft interrupt
 	 * registers, a spurious one could be left around from the
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index 06807cf..9b0c409 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -384,7 +384,7 @@
 	paging_init();
 
 	/* Get boot processor trap_block[] setup.  */
-	init_cur_cpu_trap();
+	init_cur_cpu_trap(current_thread_info());
 }
 
 static int __init set_preferred_console(void)
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 527dfd7..b586345 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -316,6 +316,8 @@
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 }
 
+extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
+
 extern unsigned long sparc64_cpu_startup;
 
 /* The OBP cpu startup callback truncates the 3rd arg cookie to
@@ -339,6 +341,9 @@
 	cpu_set(cpu, cpu_callout_map);
 
 	if (tlb_type == hypervisor) {
+		/* Alloc the mondo queues, cpu will load them.  */
+		sun4v_init_mondo_queues(0, cpu, 1, 0);
+
 		prom_startcpu_cpuid(cpu, entry, cookie);
 	} else {
 		int cpu_node;
@@ -352,6 +357,7 @@
 			break;
 		udelay(100);
 	}
+
 	if (callin_flag) {
 		ret = 0;
 	} else {
diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
index b9c9f54..a4dc01a 100644
--- a/arch/sparc64/kernel/trampoline.S
+++ b/arch/sparc64/kernel/trampoline.S
@@ -30,12 +30,16 @@
 dtlb_load:
 	.asciz	"SUNW,dtlb-load"
 
+	/* XXX __cpuinit this thing XXX */
+#define TRAMP_STACK_SIZE	1024
+	.align	16
+tramp_stack:
+	.skip	TRAMP_STACK_SIZE
+
 	.text
 	.align		8
 	.globl		sparc64_cpu_startup, sparc64_cpu_startup_end
 sparc64_cpu_startup:
-	flushw
-
 	BRANCH_IF_SUN4V(g1, niagara_startup)
 	BRANCH_IF_CHEETAH_BASE(g1, g5, cheetah_startup)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1, g5, cheetah_plus_startup)
@@ -58,6 +62,7 @@
 	or	%g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
 	stxa	%g5, [%g0] ASI_DCU_CONTROL_REG
 	membar	#Sync
+	/* fallthru */
 
 cheetah_generic_startup:
 	mov	TSB_EXTENSION_P, %g3
@@ -90,19 +95,17 @@
 	membar		#Sync
 
 startup_continue:
-	wrpr		%g0, 15, %pil
-
 	sethi		%hi(0x80000000), %g2
 	sllx		%g2, 32, %g2
 	wr		%g2, 0, %tick_cmpr
 
+	mov		%o0, %l0
+
 	BRANCH_IF_SUN4V(g1, niagara_lock_tlb)
 
 	/* Call OBP by hand to lock KERNBASE into i/d tlbs.
 	 * We lock 2 consequetive entries if we are 'bigkernel'.
 	 */
-	mov		%o0, %l0
-
 	sethi		%hi(prom_entry_lock), %g2
 1:	ldstub		[%g2 + %lo(prom_entry_lock)], %g1
 	membar		#StoreLoad | #StoreStore
@@ -112,7 +115,6 @@
 	sethi		%hi(p1275buf), %g2
 	or		%g2, %lo(p1275buf), %g2
 	ldx		[%g2 + 0x10], %l2
-	mov		%sp, %l1
 	add		%l2, -(192 + 128), %sp
 	flushw
 
@@ -308,18 +310,9 @@
 	ta		HV_FAST_TRAP
 
 after_lock_tlb:
-	mov		%l1, %sp
-	flushw
-
-	mov		%l0, %o0
-
 	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
 	wr		%g0, 0, %fprs
 
-	/* XXX Buggy PROM... */
-	srl		%o0, 0, %o0
-	ldx		[%o0], %g6
-
 	wr		%g0, ASI_P, %asi
 
 	mov		PRIMARY_CONTEXT, %g7
@@ -341,22 +334,25 @@
 
 	membar		#Sync
 
-	mov		1, %g5
-	sllx		%g5, THREAD_SHIFT, %g5
-	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
-	add		%g6, %g5, %sp
+	/* Everything we do here, until we properly take over the
+	 * trap table, must be done with extreme care.  We cannot
+	 * make any references to %g6 (current thread pointer),
+	 * %g4 (current task pointer), or %g5 (base of current cpu's
+	 * per-cpu area) until we properly take over the trap table
+	 * from the firmware and hypervisor.
+	 *
+	 * Get onto temporary stack which is in the locked kernel image.
+	 */
+	sethi		%hi(tramp_stack), %g1
+	or		%g1, %lo(tramp_stack), %g1
+	add		%g1, TRAMP_STACK_SIZE, %g1
+	sub		%g1, STACKFRAME_SZ + STACK_BIAS, %sp
 	mov		0, %fp
 
-	wrpr		%g0, 0, %wstate
-	wrpr		%g0, 0, %tl
-
-	/* Load TBA, then we can resurface. */
-	sethi		%hi(sparc64_ttable_tl0), %g5
-	wrpr		%g5, %tba
-
-	ldx		[%g6 + TI_TASK], %g4
-
-	wrpr		%g0, 0, %wstate
+	/* Put garbage in these registers to trap any access to them.  */
+	set		0xdeadbeef, %g4
+	set		0xdeadbeef, %g5
+	set		0xdeadbeef, %g6
 
 	call		init_irqwork_curcpu
 	 nop
@@ -367,11 +363,17 @@
 	bne,pt		%icc, 1f
 	 nop
 
+	call		hard_smp_processor_id
+	 nop
+	
+	mov		%o0, %o1
+	mov		0, %o0
+	mov		0, %o2
 	call		sun4v_init_mondo_queues
-	 mov		0, %o0
+	 mov		1, %o3
 
 1:	call		init_cur_cpu_trap
-	 nop
+	 ldx		[%l0], %o0
 
 	/* Start using proper page size encodings in ctx register.  */
 	sethi		%hi(sparc64_kern_pri_context), %g3
@@ -386,9 +388,14 @@
 
 	membar		#Sync
 
-	rdpr		%pstate, %o1
-	or		%o1, PSTATE_IE, %o1
-	wrpr		%o1, 0, %pstate
+	wrpr		%g0, 0, %wstate
+
+	/* As a hack, put &init_thread_union into %g6.
+	 * prom_world() loads from here to restore the %asi
+	 * register.
+	 */
+	sethi		%hi(init_thread_union), %g6
+	or		%g6, %lo(init_thread_union), %g6
 
 	sethi		%hi(is_sun4v), %o0
 	lduw		[%o0 + %lo(is_sun4v)], %o0
@@ -418,7 +425,20 @@
 1:	call		prom_set_trap_table
 	 sethi		%hi(sparc64_ttable_tl0), %o0
 
-2:	call		smp_callin
+2:	ldx		[%l0], %g6
+	ldx		[%g6 + TI_TASK], %g4
+
+	mov		1, %g5
+	sllx		%g5, THREAD_SHIFT, %g5
+	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
+	add		%g6, %g5, %sp
+	mov		0, %fp
+
+	rdpr		%pstate, %o1
+	or		%o1, PSTATE_IE, %o1
+	wrpr		%o1, 0, %pstate
+
+	call		smp_callin
 	 nop
 	call		cpu_idle
 	 mov		0, %o0
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index 5956d0a..c9484ae 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -2413,12 +2413,12 @@
 /* This can get invoked before sched_init() so play it super safe
  * and use hard_smp_processor_id().
  */
-void init_cur_cpu_trap(void)
+void init_cur_cpu_trap(struct thread_info *t)
 {
 	int cpu = hard_smp_processor_id();
 	struct trap_per_cpu *p = &trap_block[cpu];
 
-	p->thread = current_thread_info();
+	p->thread = t;
 	p->pgd_paddr = 0;
 }
 
diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h
index 5a970f5..771aa94 100644
--- a/include/asm-sparc64/cpudata.h
+++ b/include/asm-sparc64/cpudata.h
@@ -77,7 +77,7 @@
 	unsigned long		__pad2[4];
 } __attribute__((aligned(64)));
 extern struct trap_per_cpu trap_block[NR_CPUS];
-extern void init_cur_cpu_trap(void);
+extern void init_cur_cpu_trap(struct thread_info *);
 extern void setup_tba(void);
 
 #ifdef CONFIG_SMP