ARM: Fix RFE macroop.

This changes the RFE macroop into 3 microops:

URa = [sp]; URb = [sp+4]; // load PC,CPSR values from stack
sp = sp + offset;         // optionally auto-increment
PC = URa; CPSR = URb;     // write to the PC and CPSR.

Importantly:
- writing to PC is handled in the last micro-op.
- loading occurs prior to state changes.
diff --git a/src/arch/arm/isa/insts/ldr.isa b/src/arch/arm/isa/insts/ldr.isa
index c60a91a..2e45f28 100644
--- a/src/arch/arm/isa/insts/ldr.isa
+++ b/src/arch/arm/isa/insts/ldr.isa
@@ -67,7 +67,7 @@
             self.memFlags = ["ArmISA::TLB::MustBeOne"]
             self.codeBlobs = {"postacc_code" : ""}
 
-        def emitHelper(self, base = 'Memory', wbDecl = None, instFlags = []):
+        def emitHelper(self, base = 'Memory', wbDecl = None, instFlags = [], pcDecl = None):
 
             global header_output, decoder_output, exec_output
 
@@ -76,7 +76,8 @@
             (newHeader,
              newDecoder,
              newExec) = self.fillTemplates(self.name, self.Name, codeBlobs,
-                                           self.memFlags, instFlags, base, wbDecl)
+                                           self.memFlags, instFlags, base,
+                                           wbDecl, pcDecl)
 
             header_output += newHeader
             decoder_output += newDecoder
@@ -104,26 +105,18 @@
                 wbDiff = 8
             accCode = '''
             CPSR cpsr = Cpsr;
-            SCTLR sctlr = Sctlr;
-            // Use the version of NPC that gets set before NextThumb
-            pNPC = cSwap<uint32_t>(Mem.ud, cpsr.e);
-            uint32_t tempSpsr = cSwap<uint32_t>(Mem.ud >> 32, cpsr.e);
-            uint32_t newCpsr =
-                cpsrWriteByInstr(cpsr | CondCodes, tempSpsr,
-                                 0xF, true, sctlr.nmfi);
-            Cpsr = ~CondCodesMask & newCpsr;
-            NextThumb = ((CPSR)newCpsr).t;
-            NextJazelle = ((CPSR)newCpsr).j;
-            ForcedItState = ((((CPSR)tempSpsr).it2 << 2) & 0xFC)
-                | (((CPSR)tempSpsr).it1 & 0x3);
-            CondCodes = CondCodesMask & newCpsr;
+            URc = cpsr | CondCodes;
+            URa = cSwap<uint32_t>(Mem.ud, cpsr.e);
+            URb = cSwap<uint32_t>(Mem.ud >> 32, cpsr.e);
             '''
             self.codeBlobs["memacc_code"] = accCode
 
             wbDecl = None
+            pcDecl = "MicroUopSetPCCPSR(machInst, INTREG_UREG0, INTREG_UREG1, INTREG_UREG2);"
+
             if self.writeback:
                 wbDecl = "MicroAddiUop(machInst, base, base, %d);" % wbDiff
-            self.emitHelper('RfeOp', wbDecl, ["IsSerializeAfter", "IsNonSpeculative"])
+            self.emitHelper('RfeOp', wbDecl, ["IsSerializeAfter", "IsNonSpeculative"], pcDecl)
 
     class LoadImmInst(LoadInst):
         def __init__(self, *args, **kargs):
diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa
index 0e3bcc6..d6c9293 100644
--- a/src/arch/arm/isa/insts/macromem.isa
+++ b/src/arch/arm/isa/insts/macromem.isa
@@ -608,23 +608,48 @@
                                     'predicate_test': predicateTest},
                                    ['IsMicroop'])
 
+    setPCCPSRDecl = '''
+                    CPSR cpsrOrCondCodes = URc;
+                    SCTLR sctlr = Sctlr;
+                    pNPC = URa;
+                    uint32_t newCpsr =
+                    cpsrWriteByInstr(cpsrOrCondCodes, URb,
+                                     0xF, true, sctlr.nmfi);
+                    Cpsr = ~CondCodesMask & newCpsr;
+                    NextThumb = ((CPSR)newCpsr).t;
+                    NextJazelle = ((CPSR)newCpsr).j;
+                    ForcedItState = ((((CPSR)URb).it2 << 2) & 0xFC)
+                                    | (((CPSR)URb).it1 & 0x3);
+                    CondCodes = CondCodesMask & newCpsr;
+                    '''
+
+    microUopSetPCCPSRIop = InstObjParams('uopSet_uop', 'MicroUopSetPCCPSR',
+                                         'MicroSetPCCPSR',
+                                         {'code': setPCCPSRDecl,
+                                          'predicate_test': predicateTest},
+                                         ['IsMicroop'])
+
     header_output = MicroIntImmDeclare.subst(microAddiUopIop) + \
                     MicroIntImmDeclare.subst(microSubiUopIop) + \
                     MicroIntRegDeclare.subst(microAddUopIop) + \
                     MicroIntRegDeclare.subst(microSubUopIop) + \
-                    MicroIntMovDeclare.subst(microUopRegMovIop)
+                    MicroIntMovDeclare.subst(microUopRegMovIop) + \
+                    MicroSetPCCPSRDeclare.subst(microUopSetPCCPSRIop)
 
     decoder_output = MicroIntImmConstructor.subst(microAddiUopIop) + \
                      MicroIntImmConstructor.subst(microSubiUopIop) + \
                      MicroIntRegConstructor.subst(microAddUopIop) + \
                      MicroIntRegConstructor.subst(microSubUopIop) + \
-                     MicroIntMovConstructor.subst(microUopRegMovIop)
+                     MicroIntMovConstructor.subst(microUopRegMovIop) + \
+                     MicroSetPCCPSRConstructor.subst(microUopSetPCCPSRIop)
 
     exec_output = PredOpExecute.subst(microAddiUopIop) + \
                   PredOpExecute.subst(microSubiUopIop) + \
                   PredOpExecute.subst(microAddUopIop) + \
                   PredOpExecute.subst(microSubUopIop) + \
-                  PredOpExecute.subst(microUopRegMovIop)
+                  PredOpExecute.subst(microUopRegMovIop) + \
+                  PredOpExecute.subst(microUopSetPCCPSRIop)
+
 }};
 
 let {{
diff --git a/src/arch/arm/isa/insts/mem.isa b/src/arch/arm/isa/insts/mem.isa
index 507f8cd..d0c0f07 100644
--- a/src/arch/arm/isa/insts/mem.isa
+++ b/src/arch/arm/isa/insts/mem.isa
@@ -48,7 +48,7 @@
             self.constructTemplate = eval(self.decConstBase + 'Constructor')
 
         def fillTemplates(self, name, Name, codeBlobs, memFlags, instFlags,
-                          base = 'Memory', wbDecl = None):
+                          base = 'Memory', wbDecl = None, pcDecl = None):
             # Make sure flags are in lists (convert to lists if not).
             memFlags = makeList(memFlags)
             instFlags = makeList(instFlags)
@@ -65,12 +65,26 @@
             macroName = Name
             instFlagsCopy = list(instFlags)
             codeBlobsCopy = dict(codeBlobs)
-            if wbDecl is not None:
+
+            use_uops = 0
+            if wbDecl is not None or pcDecl is not None:
                 instFlagsCopy.append('IsMicroop')
                 Name = Name + 'Acc'
+                use_uops = 1
+
+            use_wb = 0
+            use_pc = 0
+            if wbDecl is not None:
+                use_wb = 1
+            if pcDecl is not None:
+                use_pc = 1
+
             codeBlobsCopy['acc_name'] = Name
             codeBlobsCopy['wb_decl'] = wbDecl
+            codeBlobsCopy['pc_decl'] = pcDecl
             codeBlobsCopy['use_uops'] = 0
+            codeBlobsCopy['use_wb'] = 0
+            codeBlobsCopy['use_pc'] = 0
 
             iop = InstObjParams(name, Name, base,
                                 codeBlobsCopy, instFlagsCopy)
@@ -81,11 +95,14 @@
                           self.initiateAccTemplate.subst(iop) + \
                           self.completeAccTemplate.subst(iop)
 
-            if wbDecl is not None:
+            if wbDecl is not None or pcDecl is not None:
                 iop = InstObjParams(name, macroName, base,
                                     { "wb_decl" : wbDecl,
+                                      "pc_decl" : pcDecl,
                                       "acc_name" : Name,
-                                      "use_uops" : 1 },
+                                      "use_uops" : use_uops,
+                                      "use_pc" : use_pc,
+                                      "use_wb" : use_wb },
                                     ['IsMacroop'])
                 header_output += self.declareTemplate.subst(iop)
                 decoder_output += self.constructTemplate.subst(iop)