Merge branches 'l2', 'pgt2' and 'misc' into for-linus
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 24626b0..be8f634 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -9,6 +9,7 @@
 	select SYS_SUPPORTS_APM_EMULATION
 	select GENERIC_ATOMIC64 if (CPU_V6 || !CPU_32v6K || !AEABI)
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
+	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
 	select HAVE_ARCH_KGDB
 	select HAVE_KPROBES if !XIP_KERNEL
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
@@ -21,6 +22,7 @@
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
+	select HAVE_KERNEL_XZ
 	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
@@ -32,6 +34,7 @@
 	select GENERIC_IRQ_SHOW
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select GENERIC_PCI_IOMAP
+	select HAVE_BPF_JIT if NET
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
@@ -266,6 +269,7 @@
 	select PLAT_VERSATILE
 	select PLAT_VERSATILE_FPGA_IRQ
 	select NEED_MACH_MEMORY_H
+	select SPARSE_IRQ
 	help
 	  Support for ARM's Integrator platform.
 
@@ -312,6 +316,7 @@
 	select HAVE_CLK
 	select HAVE_PATA_PLATFORM
 	select ICST
+	select NO_IOPORT
 	select PLAT_VERSATILE
 	select PLAT_VERSATILE_CLCD
 	help
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index e0d236d..755d9fb 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -100,6 +100,22 @@
 		  Note that the system will appear to hang during boot if there
 		  is nothing connected to read from the DCC.
 
+	config DEBUG_SEMIHOSTING
+		bool "Kernel low-level debug output via semihosting I/O"
+		help
+		  Semihosting enables code running on an ARM target to use
+		  the I/O facilities of a host debugger/emulator through
+		  simple SVC calls. The host debugger or emulator must have
+		  semihosting enabled for the special SVC call to be trapped,
+		  otherwise the kernel will crash.
+
+		  This is known to work with OpenOCD, as well as
+		  ARM's Fast Models, and any other controlling environment
+		  that implements semihosting.
+
+		  For more details about semihosting, please see
+		  chapter 8 of DUI0203I_rvct_developer_guide.pdf from ARM Ltd.
+
 	config AT91_DEBUG_LL_DBGU0
 		bool "Kernel low-level debugging on rm9200, 9260/9g20, 9261/9g10 and 9rl"
 		depends on HAVE_AT91_DBGU0
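
For readers unfamiliar with the mechanism DEBUG_SEMIHOSTING relies on, a minimal sketch of a semihosting call follows. It assumes an ARM-mode target with a semihosting-aware debugger attached; the SYS_WRITE0 number and the svc #0x123456 immediate match the debug.S changes later in this series, and the helper itself is illustrative, not part of the patch.

	/*
	 * Minimal sketch of a semihosting call (assumption: ARM-mode
	 * target, semihosting-enabled debugger). SYS_WRITE0 (0x04)
	 * writes a NUL-terminated string through the host; Thumb mode
	 * would use svc #0xab instead.
	 */
	static void semihost_write0(const char *s)
	{
		register unsigned long r0 asm("r0") = 0x04;	/* SYS_WRITE0 */
		register const char *r1 asm("r1") = s;

		asm volatile("svc #0x123456"
			     : : "r" (r0), "r" (r1) : "memory");
	}
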
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 40319d9..93d63be 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -253,6 +253,7 @@
 
 # If we have a machine-specific directory, then include it in the build.
 core-y				+= arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
+core-y				+= arch/arm/net/
 core-y				+= $(machdirs) $(platdirs)
 
 drivers-$(CONFIG_OPROFILE)      += arch/arm/oprofile/
diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore
index e0936a1..d0d441c 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -1,8 +1,10 @@
+ashldi3.S
 font.c
 lib1funcs.S
 piggy.gzip
 piggy.lzo
 piggy.lzma
+piggy.xzkern
 vmlinux
 vmlinux.lds
 
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index cf0a64c..bb26756 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -92,6 +92,7 @@
 suffix_$(CONFIG_KERNEL_GZIP) = gzip
 suffix_$(CONFIG_KERNEL_LZO)  = lzo
 suffix_$(CONFIG_KERNEL_LZMA) = lzma
+suffix_$(CONFIG_KERNEL_XZ)   = xzkern
 
 # Borrowed libfdt files for the ATAG compatibility mode
 
@@ -112,10 +113,12 @@
 
 targets       := vmlinux vmlinux.lds \
 		 piggy.$(suffix_y) piggy.$(suffix_y).o \
-		 lib1funcs.o lib1funcs.S font.o font.c head.o misc.o $(OBJS)
+		 lib1funcs.o lib1funcs.S ashldi3.o ashldi3.S \
+		 font.o font.c head.o misc.o $(OBJS)
 
 # Make sure files are removed during clean
-extra-y       += piggy.gzip piggy.lzo piggy.lzma lib1funcs.S $(libfdt) $(libfdt_hdrs)
+extra-y       += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \
+		 lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs)
 
 ifeq ($(CONFIG_FUNCTION_TRACER),y)
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
@@ -151,6 +154,12 @@
 $(obj)/lib1funcs.S: $(srctree)/arch/$(SRCARCH)/lib/lib1funcs.S
 	$(call cmd,shipped)
 
+# For __aeabi_llsl
+ashldi3 = $(obj)/ashldi3.o
+
+$(obj)/ashldi3.S: $(srctree)/arch/$(SRCARCH)/lib/ashldi3.S
+	$(call cmd,shipped)
+
 # We need to prevent any GOTOFF relocs being used with references
 # to symbols in the .bss section since we cannot relocate them
 # independently from the rest at run time.  This can be achieved by
@@ -172,7 +181,7 @@
 fi
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/$(HEAD) $(obj)/piggy.$(suffix_y).o \
-	 	$(addprefix $(obj)/, $(OBJS)) $(lib1funcs) FORCE
+		$(addprefix $(obj)/, $(OBJS)) $(lib1funcs) $(ashldi3) FORCE
 	@$(check_for_multiple_zreladdr)
 	$(call if_changed,ld)
 	@$(check_for_bad_syms)
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index 07be5a2..f41b38c 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -44,6 +44,12 @@
 #include "../../../../lib/decompress_unlzma.c"
 #endif
 
+#ifdef CONFIG_KERNEL_XZ
+#define memmove memmove
+#define memcpy memcpy
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
 	return decompress(input, len, NULL, NULL, output, NULL, error);
diff --git a/arch/arm/boot/compressed/piggy.xzkern.S b/arch/arm/boot/compressed/piggy.xzkern.S
new file mode 100644
index 0000000..5703f30
--- /dev/null
+++ b/arch/arm/boot/compressed/piggy.xzkern.S
@@ -0,0 +1,6 @@
+	.section .piggydata,#alloc
+	.globl	input_data
+input_data:
+	.incbin	"arch/arm/boot/compressed/piggy.xzkern"
+	.globl	input_data_end
+input_data_end:
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index b2dc2dd..a3bc86f 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -697,13 +697,12 @@
 	 * For primary GICs, skip over SGIs.
 	 * For secondary GICs, skip over PPIs, too.
 	 */
-	domain->hwirq_base = 32;
-	if (gic_nr == 0) {
-		if ((irq_start & 31) > 0) {
-			domain->hwirq_base = 16;
-			if (irq_start != -1)
-				irq_start = (irq_start & ~31) + 16;
-		}
+	if (gic_nr == 0 && (irq_start & 31) > 0) {
+		domain->hwirq_base = 16;
+		if (irq_start != -1)
+			irq_start = (irq_start & ~31) + 16;
+	} else {
+		domain->hwirq_base = 32;
 	}
 
 	/*
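
The irq_start rounding above can be checked in isolation; a small standalone sketch with an assumed example value:

	#include <stdio.h>

	/* Demonstrates the irq_start adjustment: align down to the
	 * start of the 32-interrupt block, then step past the 16 SGIs. */
	int main(void)
	{
		int irq_start = 29;			/* illustrative value */

		if ((irq_start & 31) > 0 && irq_start != -1)
			irq_start = (irq_start & ~31) + 16;

		printf("%d\n", irq_start);		/* prints 16 */
		return 0;
	}
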
diff --git a/arch/arm/configs/integrator_defconfig b/arch/arm/configs/integrator_defconfig
index 1103f62..a8314c3 100644
--- a/arch/arm/configs/integrator_defconfig
+++ b/arch/arm/configs/integrator_defconfig
@@ -57,18 +57,24 @@
 CONFIG_NET_ETHERNET=y
 CONFIG_NET_PCI=y
 CONFIG_E100=y
+CONFIG_SMC91X=y
 # CONFIG_KEYBOARD_ATKBD is not set
 # CONFIG_SERIO_SERPORT is not set
 CONFIG_SERIAL_AMBA_PL010=y
 CONFIG_SERIAL_AMBA_PL010_CONSOLE=y
 CONFIG_FB=y
 CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_ARMCLCD=y
 CONFIG_FB_MATROX=y
 CONFIG_FB_MATROX_MILLENIUM=y
 CONFIG_FB_MATROX_MYSTIQUE=y
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_MMC=y
+CONFIG_MMC_ARMMMCI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_PL030=y
 CONFIG_EXT2_FS=y
+CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
 CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=y
@@ -78,5 +84,7 @@
 CONFIG_NFSD=y
 CONFIG_NFSD_V3=y
 CONFIG_PARTITION_ADVANCED=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h
new file mode 100644
index 0000000..3dabd8dd
--- /dev/null
+++ b/arch/arm/include/asm/cp15.h
@@ -0,0 +1,87 @@
+#ifndef __ASM_ARM_CP15_H
+#define __ASM_ARM_CP15_H
+
+#include <asm/system.h>
+
+/*
+ * CR1 bits (CP#15 CR1)
+ */
+#define CR_M	(1 << 0)	/* MMU enable				*/
+#define CR_A	(1 << 1)	/* Alignment abort enable		*/
+#define CR_C	(1 << 2)	/* Dcache enable			*/
+#define CR_W	(1 << 3)	/* Write buffer enable			*/
+#define CR_P	(1 << 4)	/* 32-bit exception handler		*/
+#define CR_D	(1 << 5)	/* 32-bit data address range		*/
+#define CR_L	(1 << 6)	/* Implementation defined		*/
+#define CR_B	(1 << 7)	/* Big endian				*/
+#define CR_S	(1 << 8)	/* System MMU protection		*/
+#define CR_R	(1 << 9)	/* ROM MMU protection			*/
+#define CR_F	(1 << 10)	/* Implementation defined		*/
+#define CR_Z	(1 << 11)	/* Implementation defined		*/
+#define CR_I	(1 << 12)	/* Icache enable			*/
+#define CR_V	(1 << 13)	/* Vectors relocated to 0xffff0000	*/
+#define CR_RR	(1 << 14)	/* Round Robin cache replacement	*/
+#define CR_L4	(1 << 15)	/* LDR pc can set T bit			*/
+#define CR_DT	(1 << 16)
+#define CR_IT	(1 << 18)
+#define CR_ST	(1 << 19)
+#define CR_FI	(1 << 21)	/* Fast interrupt (lower latency mode)	*/
+#define CR_U	(1 << 22)	/* Unaligned access operation		*/
+#define CR_XP	(1 << 23)	/* Extended page tables			*/
+#define CR_VE	(1 << 24)	/* Vectored interrupts			*/
+#define CR_EE	(1 << 25)	/* Exception (Big) Endian		*/
+#define CR_TRE	(1 << 28)	/* TEX remap enable			*/
+#define CR_AFE	(1 << 29)	/* Access flag enable			*/
+#define CR_TE	(1 << 30)	/* Thumb exception enable		*/
+
+#ifndef __ASSEMBLY__
+
+#if __LINUX_ARM_ARCH__ >= 4
+#define vectors_high()	(cr_alignment & CR_V)
+#else
+#define vectors_high()	(0)
+#endif
+
+extern unsigned long cr_no_alignment;	/* defined in entry-armv.S */
+extern unsigned long cr_alignment;	/* defined in entry-armv.S */
+
+static inline unsigned int get_cr(void)
+{
+	unsigned int val;
+	asm("mrc p15, 0, %0, c1, c0, 0	@ get CR" : "=r" (val) : : "cc");
+	return val;
+}
+
+static inline void set_cr(unsigned int val)
+{
+	asm volatile("mcr p15, 0, %0, c1, c0, 0	@ set CR"
+	  : : "r" (val) : "cc");
+	isb();
+}
+
+#ifndef CONFIG_SMP
+extern void adjust_cr(unsigned long mask, unsigned long set);
+#endif
+
+#define CPACC_FULL(n)		(3 << (n * 2))
+#define CPACC_SVC(n)		(1 << (n * 2))
+#define CPACC_DISABLE(n)	(0 << (n * 2))
+
+static inline unsigned int get_copro_access(void)
+{
+	unsigned int val;
+	asm("mrc p15, 0, %0, c1, c0, 2 @ get copro access"
+	  : "=r" (val) : : "cc");
+	return val;
+}
+
+static inline void set_copro_access(unsigned int val)
+{
+	asm volatile("mcr p15, 0, %0, c1, c0, 2 @ set copro access"
+	  : : "r" (val) : "cc");
+	isb();
+}
+
+#endif
+
+#endif
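
As a usage note, a typical consumer of the accessors moved into this header does a read-modify-write of the control register. A hedged sketch (the helper name is hypothetical, and unlike adjust_cr() it skips the cr_alignment bookkeeping):

	#include <asm/cp15.h>

	/* Hypothetical helper: enable alignment fault checking by
	 * setting CR_A in the control register. set_cr() already
	 * issues the isb(). */
	static void enable_alignment_faults(void)
	{
		unsigned int cr = get_cr();

		set_cr(cr | CR_A);
	}
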
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index 0e9ce8d..38050b1 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -130,8 +130,4 @@
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
 #endif
diff --git a/arch/arm/include/asm/hardware/cache-l2x0.h b/arch/arm/include/asm/hardware/cache-l2x0.h
index 7df239b..c4c87bc 100644
--- a/arch/arm/include/asm/hardware/cache-l2x0.h
+++ b/arch/arm/include/asm/hardware/cache-l2x0.h
@@ -103,11 +103,11 @@
 #define L2X0_ADDR_FILTER_EN		1
 
 #ifndef __ASSEMBLY__
-extern void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask);
+extern void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask);
 #if defined(CONFIG_CACHE_L2X0) && defined(CONFIG_OF)
-extern int l2x0_of_init(__u32 aux_val, __u32 aux_mask);
+extern int l2x0_of_init(u32 aux_val, u32 aux_mask);
 #else
-static inline int l2x0_of_init(__u32 aux_val, __u32 aux_mask)
+static inline int l2x0_of_init(u32 aux_val, u32 aux_mask)
 {
 	return -ENODEV;
 }
diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h
new file mode 100644
index 0000000..5c5ca2e
--- /dev/null
+++ b/arch/arm/include/asm/jump_label.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_ARM_JUMP_LABEL_H
+#define _ASM_ARM_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/system.h>
+
+#define JUMP_LABEL_NOP_SIZE 4
+
+#ifdef CONFIG_THUMB2_KERNEL
+#define JUMP_LABEL_NOP	"nop.w"
+#else
+#define JUMP_LABEL_NOP	"nop"
+#endif
+
+static __always_inline bool arch_static_branch(struct jump_label_key *key)
+{
+	asm goto("1:\n\t"
+		 JUMP_LABEL_NOP "\n\t"
+		 ".pushsection __jump_table,  \"aw\"\n\t"
+		 ".word 1b, %l[l_yes], %c0\n\t"
+		 ".popsection\n\t"
+		 : :  "i" (key) :  : l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+#endif /* __KERNEL__ */
+
+typedef u32 jump_label_t;
+
+struct jump_entry {
+	jump_label_t code;
+	jump_label_t target;
+	jump_label_t key;
+};
+
+#endif
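
For context, a sketch of how arch_static_branch() is reached through the generic jump-label API of this era (names assumed from the contemporaneous <linux/jump_label.h>; they were later renamed to static_key_*):

	#include <linux/kernel.h>
	#include <linux/jump_label.h>

	static struct jump_label_key my_feature_key;	/* zero => disabled */

	static void hot_path(void)
	{
		/* Compiles to the single nop emitted above; patched to
		 * a real branch when some slow path enables the key via
		 * jump_label_inc(&my_feature_key). */
		if (static_branch(&my_feature_key))
			pr_info("feature enabled\n");
	}
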
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index a8997d7..fcb5757 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -116,6 +116,8 @@
 #define MODULES_END		(END_MEM)
 #define MODULES_VADDR		(PHYS_OFFSET)
 
+#define XIP_VIRT_ADDR(physaddr)  (physaddr)
+
 #endif /* !CONFIG_MMU */
 
 /*
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index 71605d9..a0b3cac 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -18,6 +18,7 @@
 #include <asm/cacheflush.h>
 #include <asm/cachetype.h>
 #include <asm/proc-fns.h>
+#include <asm-generic/mm_hooks.h>
 
 void __check_kvm_seq(struct mm_struct *mm);
 
@@ -133,32 +134,4 @@
 #define deactivate_mm(tsk,mm)	do { } while (0)
 #define activate_mm(prev,next)	switch_mm(prev, next, NULL)
 
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
-{
-}
-
 #endif
diff --git a/arch/arm/include/asm/opcodes.h b/arch/arm/include/asm/opcodes.h
index c0efdd6..19c48de 100644
--- a/arch/arm/include/asm/opcodes.h
+++ b/arch/arm/include/asm/opcodes.h
@@ -17,4 +17,63 @@
 #define ARM_OPCODE_CONDTEST_PASS   1
 #define ARM_OPCODE_CONDTEST_UNCOND 2
 
+
+/*
+ * Opcode byteswap helpers
+ *
+ * These macros help with converting instructions between a canonical integer
+ * format and in-memory representation, in an endianness-agnostic manner.
+ *
+ * __mem_to_opcode_*() convert from in-memory representation to canonical form.
+ * __opcode_to_mem_*() convert from canonical form to in-memory representation.
+ *
+ *
+ * Canonical instruction representation:
+ *
+ *	ARM:		0xKKLLMMNN
+ *	Thumb 16-bit:	0x0000KKLL, where KK < 0xE8
+ *	Thumb 32-bit:	0xKKLLMMNN, where KK >= 0xE8
+ *
+ * There is no way to distinguish an ARM instruction in canonical representation
+ * from a Thumb instruction (just as these cannot be distinguished in memory).
+ * Where this distinction is important, it needs to be tracked separately.
+ *
+ * Note that values in the range 0x0000E800..0xE7FFFFFF intentionally do not
+ * represent any valid Thumb-2 instruction.  For this range,
+ * __opcode_is_thumb32() and __opcode_is_thumb16() will both be false.
+ */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/swab.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define __opcode_to_mem_arm(x) swab32(x)
+#define __opcode_to_mem_thumb16(x) swab16(x)
+#define __opcode_to_mem_thumb32(x) swahb32(x)
+#else
+#define __opcode_to_mem_arm(x) ((u32)(x))
+#define __opcode_to_mem_thumb16(x) ((u16)(x))
+#define __opcode_to_mem_thumb32(x) swahw32(x)
+#endif
+
+#define __mem_to_opcode_arm(x) __opcode_to_mem_arm(x)
+#define __mem_to_opcode_thumb16(x) __opcode_to_mem_thumb16(x)
+#define __mem_to_opcode_thumb32(x) __opcode_to_mem_thumb32(x)
+
+/* Operations specific to Thumb opcodes */
+
+/* Instruction size checks: */
+#define __opcode_is_thumb32(x) ((u32)(x) >= 0xE8000000UL)
+#define __opcode_is_thumb16(x) ((u32)(x) < 0xE800UL)
+
+/* Operations to construct or split 32-bit Thumb instructions: */
+#define __opcode_thumb32_first(x) ((u16)((x) >> 16))
+#define __opcode_thumb32_second(x) ((u16)(x))
+#define __opcode_thumb32_compose(first, second) \
+	(((u32)(u16)(first) << 16) | (u32)(u16)(second))
+
+#endif /* __ASSEMBLY__ */
+
 #endif /* __ASM_ARM_OPCODES_H */
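
A short sketch exercising the new helpers; the instruction value is illustrative only, not claimed to be a meaningful encoding:

	#include <linux/bug.h>
	#include <asm/opcodes.h>

	static void opcode_helpers_demo(void)
	{
		u32 insn = 0xf000b800;				/* canonical Thumb32 form */
		u16 first = __opcode_thumb32_first(insn);	/* 0xf000 */
		u16 second = __opcode_thumb32_second(insn);	/* 0xb800 */

		BUG_ON(!__opcode_is_thumb32(insn));
		BUG_ON(__opcode_thumb32_compose(first, second) != insn);

		/* On a little-endian kernel the two halfwords are stored
		 * swapped: __opcode_to_mem_thumb32(insn) == 0xb800f000. */
	}
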
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 97b440c..5838361 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -151,6 +151,8 @@
 #define clear_page(page)	memset((void *)(page), 0, PAGE_SIZE)
 extern void copy_page(void *to, const void *from);
 
+#define __HAVE_ARCH_GATE_AREA 1
+
 #ifdef CONFIG_ARM_LPAE
 #include <asm/pgtable-3level-types.h>
 #else
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 99cfe36..ee7c056 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -26,6 +26,7 @@
 	ARM_PERF_PMU_ID_CA9,
 	ARM_PERF_PMU_ID_CA5,
 	ARM_PERF_PMU_ID_CA15,
+	ARM_PERF_PMU_ID_CA7,
 	ARM_NUM_PMU_IDS,
 };
 
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index ce280b8..d7038fa 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -55,7 +55,6 @@
 #define start_thread(regs,pc,sp)					\
 ({									\
 	unsigned long *stack = (unsigned long *)sp;			\
-	set_fs(USER_DS);						\
 	memset(regs->uregs, 0, sizeof(regs->uregs));			\
 	if (current->personality & ADDR_LIMIT_32BIT)			\
 		regs->ARM_cpsr = USR_MODE;				\
diff --git a/arch/arm/include/asm/prom.h b/arch/arm/include/asm/prom.h
index ee03633..aeae9c6 100644
--- a/arch/arm/include/asm/prom.h
+++ b/arch/arm/include/asm/prom.h
@@ -13,8 +13,6 @@
 
 #ifdef CONFIG_OF
 
-#include <asm/irq.h>
-
 extern struct machine_desc *setup_machine_fdt(unsigned int dt_phys);
 extern void arm_dt_memblock_reserve(void);
 
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index e4c96cc..774c41e 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -15,37 +15,6 @@
 #define CPU_ARCH_ARMv7		9
 
 /*
- * CR1 bits (CP#15 CR1)
- */
-#define CR_M	(1 << 0)	/* MMU enable				*/
-#define CR_A	(1 << 1)	/* Alignment abort enable		*/
-#define CR_C	(1 << 2)	/* Dcache enable			*/
-#define CR_W	(1 << 3)	/* Write buffer enable			*/
-#define CR_P	(1 << 4)	/* 32-bit exception handler		*/
-#define CR_D	(1 << 5)	/* 32-bit data address range		*/
-#define CR_L	(1 << 6)	/* Implementation defined		*/
-#define CR_B	(1 << 7)	/* Big endian				*/
-#define CR_S	(1 << 8)	/* System MMU protection		*/
-#define CR_R	(1 << 9)	/* ROM MMU protection			*/
-#define CR_F	(1 << 10)	/* Implementation defined		*/
-#define CR_Z	(1 << 11)	/* Implementation defined		*/
-#define CR_I	(1 << 12)	/* Icache enable			*/
-#define CR_V	(1 << 13)	/* Vectors relocated to 0xffff0000	*/
-#define CR_RR	(1 << 14)	/* Round Robin cache replacement	*/
-#define CR_L4	(1 << 15)	/* LDR pc can set T bit			*/
-#define CR_DT	(1 << 16)
-#define CR_IT	(1 << 18)
-#define CR_ST	(1 << 19)
-#define CR_FI	(1 << 21)	/* Fast interrupt (lower latency mode)	*/
-#define CR_U	(1 << 22)	/* Unaligned access operation		*/
-#define CR_XP	(1 << 23)	/* Extended page tables			*/
-#define CR_VE	(1 << 24)	/* Vectored interrupts			*/
-#define CR_EE	(1 << 25)	/* Exception (Big) Endian		*/
-#define CR_TRE	(1 << 28)	/* TEX remap enable			*/
-#define CR_AFE	(1 << 29)	/* Access flag enable			*/
-#define CR_TE	(1 << 30)	/* Thumb exception enable		*/
-
-/*
  * This is used to ensure the compiler did actually allocate the register we
  * asked it for some inline assembly sequences.  Apparently we can't trust
  * the compiler from one version to another so a bit of paranoia won't hurt.
@@ -119,12 +88,6 @@
 
 extern unsigned int user_debug;
 
-#if __LINUX_ARM_ARCH__ >= 4
-#define vectors_high()	(cr_alignment & CR_V)
-#else
-#define vectors_high()	(0)
-#endif
-
 #if __LINUX_ARM_ARCH__ >= 7 ||		\
 	(__LINUX_ARM_ARCH__ == 6 && defined(CONFIG_CPU_32v6K))
 #define sev()	__asm__ __volatile__ ("sev" : : : "memory")
@@ -185,46 +148,6 @@
 #define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
 #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t");
 
-extern unsigned long cr_no_alignment;	/* defined in entry-armv.S */
-extern unsigned long cr_alignment;	/* defined in entry-armv.S */
-
-static inline unsigned int get_cr(void)
-{
-	unsigned int val;
-	asm("mrc p15, 0, %0, c1, c0, 0	@ get CR" : "=r" (val) : : "cc");
-	return val;
-}
-
-static inline void set_cr(unsigned int val)
-{
-	asm volatile("mcr p15, 0, %0, c1, c0, 0	@ set CR"
-	  : : "r" (val) : "cc");
-	isb();
-}
-
-#ifndef CONFIG_SMP
-extern void adjust_cr(unsigned long mask, unsigned long set);
-#endif
-
-#define CPACC_FULL(n)		(3 << (n * 2))
-#define CPACC_SVC(n)		(1 << (n * 2))
-#define CPACC_DISABLE(n)	(0 << (n * 2))
-
-static inline unsigned int get_copro_access(void)
-{
-	unsigned int val;
-	asm("mrc p15, 0, %0, c1, c0, 2 @ get copro access"
-	  : "=r" (val) : : "cc");
-	return val;
-}
-
-static inline void set_copro_access(unsigned int val)
-{
-	asm volatile("mcr p15, 0, %0, c1, c0, 2 @ set copro access"
-	  : : "r" (val) : "cc");
-	isb();
-}
-
 /*
  * switch_mm() may do a full cache flush over the context switch,
  * so enable interrupts over the context switch to avoid high
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index 02b2f82..85fe61e 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -318,6 +318,21 @@
 
 #define tlb_flag(f)	((always_tlb_flags & (f)) || (__tlb_flag & possible_tlb_flags & (f)))
 
+#define __tlb_op(f, insnarg, arg)					\
+	do {								\
+		if (always_tlb_flags & (f))				\
+			asm("mcr " insnarg				\
+			    : : "r" (arg) : "cc");			\
+		else if (possible_tlb_flags & (f))			\
+			asm("tst %1, %2\n\t"				\
+			    "mcrne " insnarg				\
+			    : : "r" (arg), "r" (__tlb_flag), "Ir" (f)	\
+			    : "cc");					\
+	} while (0)
+
+#define tlb_op(f, regs, arg)	__tlb_op(f, "p15, 0, %0, " regs, arg)
+#define tlb_l2_op(f, regs, arg)	__tlb_op(f, "p15, 1, %0, " regs, arg)
+
 static inline void local_flush_tlb_all(void)
 {
 	const int zero = 0;
@@ -326,16 +341,11 @@
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (tlb_flag(TLB_V3_FULL))
-		asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_U_FULL | TLB_V6_U_FULL))
-		asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_D_FULL | TLB_V6_D_FULL))
-		asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V4_I_FULL | TLB_V6_I_FULL))
-		asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
-	if (tlb_flag(TLB_V7_UIS_FULL))
-		asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+	tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
+	tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
+	tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
+	tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
 
 	if (tlb_flag(TLB_BARRIER)) {
 		dsb();
@@ -352,29 +362,23 @@
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		if (tlb_flag(TLB_V3_FULL))
-			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_U_FULL))
-			asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_D_FULL))
-			asm("mcr p15, 0, %0, c8, c6, 0" : : "r" (zero) : "cc");
-		if (tlb_flag(TLB_V4_I_FULL))
-			asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
+	if (possible_tlb_flags & (TLB_V3_FULL|TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
+		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+			tlb_op(TLB_V3_FULL, "c6, c0, 0", zero);
+			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
+			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
+			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
+		}
+		put_cpu();
 	}
-	put_cpu();
 
-	if (tlb_flag(TLB_V6_U_ASID))
-		asm("mcr p15, 0, %0, c8, c7, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V6_D_ASID))
-		asm("mcr p15, 0, %0, c8, c6, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V6_I_ASID))
-		asm("mcr p15, 0, %0, c8, c5, 2" : : "r" (asid) : "cc");
-	if (tlb_flag(TLB_V7_UIS_ASID))
+	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
+	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
+	tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid);
 #ifdef CONFIG_ARM_ERRATA_720789
-		asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero);
 #else
-		asm("mcr p15, 0, %0, c8, c3, 2" : : "r" (asid) : "cc");
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -392,30 +396,23 @@
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
-		if (tlb_flag(TLB_V3_PAGE))
-			asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_U_PAGE))
-			asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_D_PAGE))
-			asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
-		if (tlb_flag(TLB_V4_I_PAGE))
-			asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
+	if (possible_tlb_flags & (TLB_V3_PAGE|TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) &&
+	    cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
+		tlb_op(TLB_V3_PAGE, "c6, c0, 0", uaddr);
+		tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr);
+		tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", uaddr);
+		tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", uaddr);
 		if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
 			asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
 	}
 
-	if (tlb_flag(TLB_V6_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V6_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V6_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
-	if (tlb_flag(TLB_V7_UIS_PAGE))
+	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
+	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
+	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
 #ifdef CONFIG_ARM_ERRATA_720789
-		asm("mcr p15, 0, %0, c8, c3, 3" : : "r" (uaddr & PAGE_MASK) : "cc");
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
 #else
-		asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (uaddr) : "cc");
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", uaddr);
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -432,25 +429,17 @@
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	if (tlb_flag(TLB_V3_PAGE))
-		asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V4_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
+	tlb_op(TLB_V3_PAGE, "c6, c0, 0", kaddr);
+	tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr);
+	tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr);
+	tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr);
 	if (!tlb_flag(TLB_V4_I_PAGE) && tlb_flag(TLB_V4_I_FULL))
 		asm("mcr p15, 0, %0, c8, c5, 0" : : "r" (zero) : "cc");
 
-	if (tlb_flag(TLB_V6_U_PAGE))
-		asm("mcr p15, 0, %0, c8, c7, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V6_D_PAGE))
-		asm("mcr p15, 0, %0, c8, c6, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V6_I_PAGE))
-		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (kaddr) : "cc");
-	if (tlb_flag(TLB_V7_UIS_PAGE))
-		asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (kaddr) : "cc");
+	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
+	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
+	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr);
 
 	if (tlb_flag(TLB_BARRIER)) {
 		dsb();
@@ -475,13 +464,8 @@
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-	if (tlb_flag(TLB_DCLEAN))
-		asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
-			: : "r" (pmd) : "cc");
-
-	if (tlb_flag(TLB_L2CLEAN_FR))
-		asm("mcr	p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
-			: : "r" (pmd) : "cc");
+	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
+	tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 
 	if (tlb_flag(TLB_WB))
 		dsb();
@@ -491,15 +475,11 @@
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-	if (tlb_flag(TLB_DCLEAN))
-		asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
-			: : "r" (pmd) : "cc");
-
-	if (tlb_flag(TLB_L2CLEAN_FR))
-		asm("mcr	p15, 1, %0, c15, c9, 1  @ L2 flush_pmd"
-			: : "r" (pmd) : "cc");
+	tlb_op(TLB_DCLEAN, "c7, c10, 1	@ flush_pmd", pmd);
+	tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
 }
 
+#undef tlb_op
 #undef tlb_flag
 #undef always_tlb_flags
 #undef possible_tlb_flags
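
To make the transformation above easier to review, here is the hand-written expansion of one converted call (roughly, modulo macro hygiene):

	/*
	 *	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
	 *
	 * expands to approximately:
	 *
	 *	if (always_tlb_flags & (TLB_V4_U_FULL | TLB_V6_U_FULL))
	 *		asm("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "cc");
	 *	else if (possible_tlb_flags & (TLB_V4_U_FULL | TLB_V6_U_FULL))
	 *		asm("tst %1, %2\n\t"
	 *		    "mcrne p15, 0, %0, c8, c7, 0"
	 *		    : : "r" (zero), "r" (__tlb_flag),
	 *		      "Ir" (TLB_V4_U_FULL | TLB_V6_U_FULL) : "cc");
	 *
	 * i.e. the flag test is resolved at compile time when the flag
	 * is in always_tlb_flags, and otherwise folded into a
	 * conditional mcrne on the runtime __tlb_flag, matching the
	 * open-coded sequences this macro replaces.
	 */
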
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index 5b29a66..f555bb3 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -46,7 +46,7 @@
 	return in ? : __in_irqentry_text(ptr);
 }
 
-extern void __init early_trap_init(void);
+extern void __init early_trap_init(void *);
 extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame);
 extern void ptrace_break(struct task_struct *tsk, struct pt_regs *regs);
 
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 43b740d..1b7d9a3 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -7,6 +7,8 @@
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_insn.o = -pg
+CFLAGS_REMOVE_patch.o = -pg
 endif
 
 CFLAGS_REMOVE_return_address.o = -pg
@@ -34,10 +36,11 @@
 obj-$(CONFIG_SMP)		+= smp.o smp_tlb.o
 obj-$(CONFIG_HAVE_ARM_SCU)	+= smp_scu.o
 obj-$(CONFIG_HAVE_ARM_TWD)	+= smp_twd.o
-obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o insn.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o insn.o
+obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o insn.o patch.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
-obj-$(CONFIG_KPROBES)		+= kprobes.o kprobes-common.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o kprobes-common.o patch.o
 ifdef CONFIG_THUMB2_KERNEL
 obj-$(CONFIG_KPROBES)		+= kprobes-thumb.o
 else
diff --git a/arch/arm/kernel/debug.S b/arch/arm/kernel/debug.S
index 204e216..e5a765c 100644
--- a/arch/arm/kernel/debug.S
+++ b/arch/arm/kernel/debug.S
@@ -100,7 +100,7 @@
 
 #endif	/* CONFIG_CPU_V6 */
 
-#else
+#elif !defined(CONFIG_DEBUG_SEMIHOSTING)
 #include <mach/debug-macro.S>
 #endif	/* CONFIG_DEBUG_ICEDCC */
 
@@ -155,6 +155,8 @@
 
 		.ltorg
 
+#ifndef CONFIG_DEBUG_SEMIHOSTING
+
 ENTRY(printascii)
 		addruart_current r3, r1, r2
 		b	2f
@@ -177,3 +179,24 @@
 		mov	r0, #0
 		b	1b
 ENDPROC(printch)
+
+#else
+
+ENTRY(printascii)
+		mov	r1, r0
+		mov	r0, #0x04		@ SYS_WRITE0
+	ARM(	svc	#0x123456	)
+	THUMB(	svc	#0xab		)
+		mov	pc, lr
+ENDPROC(printascii)
+
+ENTRY(printch)
+		adr	r1, hexbuf
+		strb	r0, [r1]
+		mov	r0, #0x03		@ SYS_WRITEC
+	ARM(	svc	#0x123456	)
+	THUMB(	svc	#0xab		)
+		mov	pc, lr
+ENDPROC(printch)
+
+#endif
diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c
index 4c164ec..c32f845 100644
--- a/arch/arm/kernel/fiq.c
+++ b/arch/arm/kernel/fiq.c
@@ -42,9 +42,9 @@
 #include <linux/seq_file.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 #include <asm/fiq.h>
 #include <asm/irq.h>
-#include <asm/system.h>
 #include <asm/traps.h>
 
 static unsigned long no_fiq_insn;
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index c0062ad..df0bf0c 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -16,10 +16,13 @@
 #include <linux/uaccess.h>
 
 #include <asm/cacheflush.h>
+#include <asm/opcodes.h>
 #include <asm/ftrace.h>
 
+#include "insn.h"
+
 #ifdef CONFIG_THUMB2_KERNEL
-#define	NOP		0xeb04f85d	/* pop.w {lr} */
+#define	NOP		0xf85deb04	/* pop.w {lr} */
 #else
 #define	NOP		0xe8bd4000	/* pop {lr} */
 #endif
@@ -60,76 +63,31 @@
 }
 #endif
 
-#ifdef CONFIG_THUMB2_KERNEL
-static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr,
-				       bool link)
-{
-	unsigned long s, j1, j2, i1, i2, imm10, imm11;
-	unsigned long first, second;
-	long offset;
-
-	offset = (long)addr - (long)(pc + 4);
-	if (offset < -16777216 || offset > 16777214) {
-		WARN_ON_ONCE(1);
-		return 0;
-	}
-
-	s	= (offset >> 24) & 0x1;
-	i1	= (offset >> 23) & 0x1;
-	i2	= (offset >> 22) & 0x1;
-	imm10	= (offset >> 12) & 0x3ff;
-	imm11	= (offset >>  1) & 0x7ff;
-
-	j1 = (!i1) ^ s;
-	j2 = (!i2) ^ s;
-
-	first = 0xf000 | (s << 10) | imm10;
-	second = 0x9000 | (j1 << 13) | (j2 << 11) | imm11;
-	if (link)
-		second |= 1 << 14;
-
-	return (second << 16) | first;
-}
-#else
-static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr,
-				       bool link)
-{
-	unsigned long opcode = 0xea000000;
-	long offset;
-
-	if (link)
-		opcode |= 1 << 24;
-
-	offset = (long)addr - (long)(pc + 8);
-	if (unlikely(offset < -33554432 || offset > 33554428)) {
-		/* Can't generate branches that far (from ARM ARM). Ftrace
-		 * doesn't generate branches outside of kernel text.
-		 */
-		WARN_ON_ONCE(1);
-		return 0;
-	}
-
-	offset = (offset >> 2) & 0x00ffffff;
-
-	return opcode | offset;
-}
-#endif
-
 static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
 {
-	return ftrace_gen_branch(pc, addr, true);
+	return arm_gen_branch_link(pc, addr);
 }
 
 static int ftrace_modify_code(unsigned long pc, unsigned long old,
-			      unsigned long new)
+			      unsigned long new, bool validate)
 {
 	unsigned long replaced;
 
-	if (probe_kernel_read(&replaced, (void *)pc, MCOUNT_INSN_SIZE))
-		return -EFAULT;
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		old = __opcode_to_mem_thumb32(old);
+		new = __opcode_to_mem_thumb32(new);
+	} else {
+		old = __opcode_to_mem_arm(old);
+		new = __opcode_to_mem_arm(new);
+	}
 
-	if (replaced != old)
-		return -EINVAL;
+	if (validate) {
+		if (probe_kernel_read(&replaced, (void *)pc, MCOUNT_INSN_SIZE))
+			return -EFAULT;
+
+		if (replaced != old)
+			return -EINVAL;
+	}
 
 	if (probe_kernel_write((void *)pc, &new, MCOUNT_INSN_SIZE))
 		return -EPERM;
@@ -141,23 +99,21 @@
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
-	unsigned long pc, old;
+	unsigned long pc;
 	unsigned long new;
 	int ret;
 
 	pc = (unsigned long)&ftrace_call;
-	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(pc, (unsigned long)func);
 
-	ret = ftrace_modify_code(pc, old, new);
+	ret = ftrace_modify_code(pc, 0, new, false);
 
 #ifdef CONFIG_OLD_MCOUNT
 	if (!ret) {
 		pc = (unsigned long)&ftrace_call_old;
-		memcpy(&old, &ftrace_call_old, MCOUNT_INSN_SIZE);
 		new = ftrace_call_replace(pc, (unsigned long)func);
 
-		ret = ftrace_modify_code(pc, old, new);
+		ret = ftrace_modify_code(pc, 0, new, false);
 	}
 #endif
 
@@ -172,7 +128,7 @@
 	old = ftrace_nop_replace(rec);
 	new = ftrace_call_replace(ip, adjust_address(rec, addr));
 
-	return ftrace_modify_code(rec->ip, old, new);
+	return ftrace_modify_code(rec->ip, old, new, true);
 }
 
 int ftrace_make_nop(struct module *mod,
@@ -185,7 +141,7 @@
 
 	old = ftrace_call_replace(ip, adjust_address(rec, addr));
 	new = ftrace_nop_replace(rec);
-	ret = ftrace_modify_code(ip, old, new);
+	ret = ftrace_modify_code(ip, old, new, true);
 
 #ifdef CONFIG_OLD_MCOUNT
 	if (ret == -EINVAL && addr == MCOUNT_ADDR) {
@@ -193,7 +149,7 @@
 
 		old = ftrace_call_replace(ip, adjust_address(rec, addr));
 		new = ftrace_nop_replace(rec);
-		ret = ftrace_modify_code(ip, old, new);
+		ret = ftrace_modify_code(ip, old, new, true);
 	}
 #endif
 
@@ -249,12 +205,12 @@
 {
 	unsigned long caller_fn = (unsigned long) func;
 	unsigned long pc = (unsigned long) callsite;
-	unsigned long branch = ftrace_gen_branch(pc, caller_fn, false);
+	unsigned long branch = arm_gen_branch(pc, caller_fn);
 	unsigned long nop = 0xe1a00000;	/* mov r0, r0 */
 	unsigned long old = enable ? nop : branch;
 	unsigned long new = enable ? branch : nop;
 
-	return ftrace_modify_code(pc, old, new);
+	return ftrace_modify_code(pc, old, new, true);
 }
 
 static int ftrace_modify_graph_caller(bool enable)
diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S
index d46f259..278cfc1 100644
--- a/arch/arm/kernel/head-nommu.S
+++ b/arch/arm/kernel/head-nommu.S
@@ -17,8 +17,8 @@
 #include <asm/assembler.h>
 #include <asm/ptrace.h>
 #include <asm/asm-offsets.h>
+#include <asm/cp15.h>
 #include <asm/thread_info.h>
-#include <asm/system.h>
 
 /*
  * Kernel startup entry point.
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 6d57911..3bf0c7f 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -15,12 +15,12 @@
 #include <linux/init.h>
 
 #include <asm/assembler.h>
+#include <asm/cp15.h>
 #include <asm/domain.h>
 #include <asm/ptrace.h>
 #include <asm/asm-offsets.h>
 #include <asm/memory.h>
 #include <asm/thread_info.h>
-#include <asm/system.h>
 #include <asm/pgtable.h>
 
 #ifdef CONFIG_DEBUG_LL
@@ -265,7 +265,7 @@
 	str	r6, [r3]
 
 #ifdef CONFIG_DEBUG_LL
-#ifndef CONFIG_DEBUG_ICEDCC
+#if !defined(CONFIG_DEBUG_ICEDCC) && !defined(CONFIG_DEBUG_SEMIHOSTING)
 	/*
 	 * Map in IO space for serial debugging.
 	 * This allows debug messages to be output
@@ -297,10 +297,10 @@
 	cmp	r0, r6
 	blo	1b
 
-#else /* CONFIG_DEBUG_ICEDCC */
-	/* we don't need any serial debugging mappings for ICEDCC */
+#else /* CONFIG_DEBUG_ICEDCC || CONFIG_DEBUG_SEMIHOSTING */
+	/* we don't need any serial debugging mappings */
 	ldr	r7, [r10, #PROCINFO_IO_MMUFLAGS] @ io_mmuflags
-#endif /* !CONFIG_DEBUG_ICEDCC */
+#endif
 
 #if defined(CONFIG_ARCH_NETWINDER) || defined(CONFIG_ARCH_CATS)
 	/*
diff --git a/arch/arm/kernel/insn.c b/arch/arm/kernel/insn.c
new file mode 100644
index 0000000..ab312e5
--- /dev/null
+++ b/arch/arm/kernel/insn.c
@@ -0,0 +1,61 @@
+#include <linux/kernel.h>
+#include <asm/opcodes.h>
+
+static unsigned long
+__arm_gen_branch_thumb2(unsigned long pc, unsigned long addr, bool link)
+{
+	unsigned long s, j1, j2, i1, i2, imm10, imm11;
+	unsigned long first, second;
+	long offset;
+
+	offset = (long)addr - (long)(pc + 4);
+	if (offset < -16777216 || offset > 16777214) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	s	= (offset >> 24) & 0x1;
+	i1	= (offset >> 23) & 0x1;
+	i2	= (offset >> 22) & 0x1;
+	imm10	= (offset >> 12) & 0x3ff;
+	imm11	= (offset >>  1) & 0x7ff;
+
+	j1 = (!i1) ^ s;
+	j2 = (!i2) ^ s;
+
+	first = 0xf000 | (s << 10) | imm10;
+	second = 0x9000 | (j1 << 13) | (j2 << 11) | imm11;
+	if (link)
+		second |= 1 << 14;
+
+	return __opcode_thumb32_compose(first, second);
+}
+
+static unsigned long
+__arm_gen_branch_arm(unsigned long pc, unsigned long addr, bool link)
+{
+	unsigned long opcode = 0xea000000;
+	long offset;
+
+	if (link)
+		opcode |= 1 << 24;
+
+	offset = (long)addr - (long)(pc + 8);
+	if (unlikely(offset < -33554432 || offset > 33554428)) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	offset = (offset >> 2) & 0x00ffffff;
+
+	return opcode | offset;
+}
+
+unsigned long
+__arm_gen_branch(unsigned long pc, unsigned long addr, bool link)
+{
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+		return __arm_gen_branch_thumb2(pc, addr, link);
+	else
+		return __arm_gen_branch_arm(pc, addr, link);
+}
diff --git a/arch/arm/kernel/insn.h b/arch/arm/kernel/insn.h
new file mode 100644
index 0000000..e96065d
--- /dev/null
+++ b/arch/arm/kernel/insn.h
@@ -0,0 +1,29 @@
+#ifndef __ASM_ARM_INSN_H
+#define __ASM_ARM_INSN_H
+
+static inline unsigned long
+arm_gen_nop(void)
+{
+#ifdef CONFIG_THUMB2_KERNEL
+	return 0xf3af8000; /* nop.w */
+#else
+	return 0xe1a00000; /* mov r0, r0 */
+#endif
+}
+
+unsigned long
+__arm_gen_branch(unsigned long pc, unsigned long addr, bool link);
+
+static inline unsigned long
+arm_gen_branch(unsigned long pc, unsigned long addr)
+{
+	return __arm_gen_branch(pc, addr, false);
+}
+
+static inline unsigned long
+arm_gen_branch_link(unsigned long pc, unsigned long addr)
+{
+	return __arm_gen_branch(pc, addr, true);
+}
+
+#endif
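
A sketch of the caller pattern these helpers expect; the function is hypothetical, but ftrace_call_replace() in ftrace.c follows the same shape:

	#include <linux/errno.h>
	#include "insn.h"

	/* Hypothetical caller: generate a BL from 'site' to 'target'.
	 * A zero return means the offset is out of branch range
	 * (+/-32MB for ARM, +/-16MB for Thumb-2). */
	static int gen_call(unsigned long site, unsigned long target,
			    unsigned long *insn)
	{
		*insn = arm_gen_branch_link(site, target);

		return *insn ? 0 : -EINVAL;
	}
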
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 3efd82c..3f86ee1 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -181,10 +181,7 @@
 	local_irq_save(flags);
 
 	for_each_irq_desc(i, desc) {
-		bool affinity_broken = false;
-
-		if (!desc)
-			continue;
+		bool affinity_broken;
 
 		raw_spin_lock(&desc->lock);
 		affinity_broken = migrate_one_irq(desc);
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
new file mode 100644
index 0000000..4ce4f78
--- /dev/null
+++ b/arch/arm/kernel/jump_label.c
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+
+#include "insn.h"
+#include "patch.h"
+
+#ifdef HAVE_JUMP_LABEL
+
+static void __arch_jump_label_transform(struct jump_entry *entry,
+					enum jump_label_type type,
+					bool is_static)
+{
+	void *addr = (void *)entry->code;
+	unsigned int insn;
+
+	if (type == JUMP_LABEL_ENABLE)
+		insn = arm_gen_branch(entry->code, entry->target);
+	else
+		insn = arm_gen_nop();
+
+	if (is_static)
+		__patch_text(addr, insn);
+	else
+		patch_text(addr, insn);
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	__arch_jump_label_transform(entry, type, false);
+}
+
+void arch_jump_label_transform_static(struct jump_entry *entry,
+				      enum jump_label_type type)
+{
+	__arch_jump_label_transform(entry, type, true);
+}
+
+#endif
diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c
index 129c116..ab1869d 100644
--- a/arch/arm/kernel/kprobes.c
+++ b/arch/arm/kernel/kprobes.c
@@ -29,6 +29,7 @@
 #include <asm/cacheflush.h>
 
 #include "kprobes.h"
+#include "patch.h"
 
 #define MIN_STACK_SIZE(addr) 				\
 	min((unsigned long)MAX_STACK_SIZE,		\
@@ -103,58 +104,34 @@
 	return 0;
 }
 
-#ifdef CONFIG_THUMB2_KERNEL
-
-/*
- * For a 32-bit Thumb breakpoint spanning two memory words we need to take
- * special precautions to insert the breakpoint atomically, especially on SMP
- * systems. This is achieved by calling this arming function using stop_machine.
- */
-static int __kprobes set_t32_breakpoint(void *addr)
-{
-	((u16 *)addr)[0] = KPROBE_THUMB32_BREAKPOINT_INSTRUCTION >> 16;
-	((u16 *)addr)[1] = KPROBE_THUMB32_BREAKPOINT_INSTRUCTION & 0xffff;
-	flush_insns(addr, 2*sizeof(u16));
-	return 0;
-}
-
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
-	uintptr_t addr = (uintptr_t)p->addr & ~1; /* Remove any Thumb flag */
+	unsigned int brkp;
+	void *addr;
 
-	if (!is_wide_instruction(p->opcode)) {
-		*(u16 *)addr = KPROBE_THUMB16_BREAKPOINT_INSTRUCTION;
-		flush_insns(addr, sizeof(u16));
-	} else if (addr & 2) {
-		/* A 32-bit instruction spanning two words needs special care */
-		stop_machine(set_t32_breakpoint, (void *)addr, &cpu_online_map);
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) {
+		/* Remove any Thumb flag */
+		addr = (void *)((uintptr_t)p->addr & ~1);
+
+		if (is_wide_instruction(p->opcode))
+			brkp = KPROBE_THUMB32_BREAKPOINT_INSTRUCTION;
+		else
+			brkp = KPROBE_THUMB16_BREAKPOINT_INSTRUCTION;
 	} else {
-		/* Word aligned 32-bit instruction can be written atomically */
-		u32 bkp = KPROBE_THUMB32_BREAKPOINT_INSTRUCTION;
-#ifndef __ARMEB__ /* Swap halfwords for little-endian */
-		bkp = (bkp >> 16) | (bkp << 16);
-#endif
-		*(u32 *)addr = bkp;
-		flush_insns(addr, sizeof(u32));
+		kprobe_opcode_t insn = p->opcode;
+
+		addr = p->addr;
+		brkp = KPROBE_ARM_BREAKPOINT_INSTRUCTION;
+
+		if (insn >= 0xe0000000)
+			brkp |= 0xe0000000;  /* Unconditional instruction */
+		else
+			brkp |= insn & 0xf0000000;  /* Copy condition from insn */
 	}
+
+	patch_text(addr, brkp);
 }
 
-#else /* !CONFIG_THUMB2_KERNEL */
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-	kprobe_opcode_t insn = p->opcode;
-	kprobe_opcode_t brkp = KPROBE_ARM_BREAKPOINT_INSTRUCTION;
-	if (insn >= 0xe0000000)
-		brkp |= 0xe0000000;  /* Unconditional instruction */
-	else
-		brkp |= insn & 0xf0000000;  /* Copy condition from insn */
-	*p->addr = brkp;
-	flush_insns(p->addr, sizeof(p->addr[0]));
-}
-
-#endif /* !CONFIG_THUMB2_KERNEL */
-
 /*
  * The actual disarming is done here on each CPU and synchronized using
  * stop_machine. This synchronization is necessary on SMP to avoid removing
@@ -166,25 +143,10 @@
 int __kprobes __arch_disarm_kprobe(void *p)
 {
 	struct kprobe *kp = p;
-#ifdef CONFIG_THUMB2_KERNEL
-	u16 *addr = (u16 *)((uintptr_t)kp->addr & ~1);
-	kprobe_opcode_t insn = kp->opcode;
-	unsigned int len;
+	void *addr = (void *)((uintptr_t)kp->addr & ~1);
 
-	if (is_wide_instruction(insn)) {
-		((u16 *)addr)[0] = insn>>16;
-		((u16 *)addr)[1] = insn;
-		len = 2*sizeof(u16);
-	} else {
-		((u16 *)addr)[0] = insn;
-		len = sizeof(u16);
-	}
-	flush_insns(addr, len);
+	__patch_text(addr, kp->opcode);
 
-#else /* !CONFIG_THUMB2_KERNEL */
-	*kp->addr = kp->opcode;
-	flush_insns(kp->addr, sizeof(kp->addr[0]));
-#endif
 	return 0;
 }
 
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 764bd45..a300044 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -7,6 +7,7 @@
 #include <linux/delay.h>
 #include <linux/reboot.h>
 #include <linux/io.h>
+#include <linux/irq.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
@@ -53,6 +54,29 @@
 		cpu_relax();
 }
 
+static void machine_kexec_mask_interrupts(void)
+{
+	unsigned int i;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(i, desc) {
+		struct irq_chip *chip;
+
+		chip = irq_desc_get_chip(desc);
+		if (!chip)
+			continue;
+
+		if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+			chip->irq_eoi(&desc->irq_data);
+
+		if (chip->irq_mask)
+			chip->irq_mask(&desc->irq_data);
+
+		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+			chip->irq_disable(&desc->irq_data);
+	}
+}
+
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 	unsigned long msecs;
@@ -70,6 +94,7 @@
 		printk(KERN_WARNING "Non-crashing CPUs did not react to IPI\n");
 
 	crash_save_cpu(regs, smp_processor_id());
+	machine_kexec_mask_interrupts();
 
 	printk(KERN_INFO "Loading crashdump kernel...\n");
 }
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
new file mode 100644
index 0000000..07314af
--- /dev/null
+++ b/arch/arm/kernel/patch.c
@@ -0,0 +1,75 @@
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cacheflush.h>
+#include <asm/smp_plat.h>
+#include <asm/opcodes.h>
+
+#include "patch.h"
+
+struct patch {
+	void *addr;
+	unsigned int insn;
+};
+
+void __kprobes __patch_text(void *addr, unsigned int insn)
+{
+	bool thumb2 = IS_ENABLED(CONFIG_THUMB2_KERNEL);
+	int size;
+
+	if (thumb2 && __opcode_is_thumb16(insn)) {
+		*(u16 *)addr = __opcode_to_mem_thumb16(insn);
+		size = sizeof(u16);
+	} else if (thumb2 && ((uintptr_t)addr & 2)) {
+		u16 first = __opcode_thumb32_first(insn);
+		u16 second = __opcode_thumb32_second(insn);
+		u16 *addrh = addr;
+
+		addrh[0] = __opcode_to_mem_thumb16(first);
+		addrh[1] = __opcode_to_mem_thumb16(second);
+
+		size = sizeof(u32);
+	} else {
+		if (thumb2)
+			insn = __opcode_to_mem_thumb32(insn);
+		else
+			insn = __opcode_to_mem_arm(insn);
+
+		*(u32 *)addr = insn;
+		size = sizeof(u32);
+	}
+
+	flush_icache_range((uintptr_t)(addr),
+			   (uintptr_t)(addr) + size);
+}
+
+static int __kprobes patch_text_stop_machine(void *data)
+{
+	struct patch *patch = data;
+
+	__patch_text(patch->addr, patch->insn);
+
+	return 0;
+}
+
+void __kprobes patch_text(void *addr, unsigned int insn)
+{
+	struct patch patch = {
+		.addr = addr,
+		.insn = insn,
+	};
+
+	if (cache_ops_need_broadcast()) {
+		stop_machine(patch_text_stop_machine, &patch, cpu_online_mask);
+	} else {
+		bool straddles_word = IS_ENABLED(CONFIG_THUMB2_KERNEL)
+				      && __opcode_is_thumb32(insn)
+				      && ((uintptr_t)addr & 2);
+
+		if (straddles_word)
+			stop_machine(patch_text_stop_machine, &patch, NULL);
+		else
+			__patch_text(addr, insn);
+	}
+}
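
A usage sketch tying the two new facilities together (hypothetical helper; this is effectively what the jump-label code does for the disable case):

	#include "insn.h"
	#include "patch.h"

	/* Hypothetical helper: overwrite the instruction at 'addr' with
	 * a nop. patch_text() picks the safe mechanism itself: a direct
	 * write plus icache flush, or stop_machine() when cache
	 * maintenance must be broadcast or a Thumb-2 instruction
	 * straddles a word boundary. */
	static void nop_out(void *addr)
	{
		patch_text(addr, arm_gen_nop());
	}
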
diff --git a/arch/arm/kernel/patch.h b/arch/arm/kernel/patch.h
new file mode 100644
index 0000000..b4731f2
--- /dev/null
+++ b/arch/arm/kernel/patch.h
@@ -0,0 +1,7 @@
+#ifndef _ARM_KERNEL_PATCH_H
+#define _ARM_KERNEL_PATCH_H
+
+void patch_text(void *addr, unsigned int insn);
+void __patch_text(void *addr, unsigned int insn);
+
+#endif
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 5bb91bf..ab59c3b 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -712,6 +712,9 @@
 		case 0xC0F0:	/* Cortex-A15 */
 			cpu_pmu = armv7_a15_pmu_init();
 			break;
+		case 0xC070:	/* Cortex-A7 */
+			cpu_pmu = armv7_a7_pmu_init();
+			break;
 		}
 	/* Intel CPUs [xscale]. */
 	} else if (0x69 == implementor) {
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 460bbbb..b2b764e 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -582,6 +582,130 @@
 };
 
 /*
+ * Cortex-A7 HW events mapping
+ */
+static const unsigned armv7_a7_perf_map[PERF_COUNT_HW_MAX] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_PERFCTR_PC_WRITE,
+	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV7_PERFCTR_BUS_CYCLES,
+	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= HW_OP_UNSUPPORTED,
+	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
+};
+
+static const unsigned armv7_a7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
+					[PERF_COUNT_HW_CACHE_OP_MAX]
+					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		/*
+		 * The performance counters don't differentiate between read
+		 * and write accesses/misses so this isn't strictly correct,
+		 * but it's the best we can do. Writes and reads get
+		 * combined.
+		 */
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(DTLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+	[C(NODE)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
+		},
+	},
+};
+
+/*
  * Perf Events' indices
  */
 #define	ARMV7_IDX_CYCLE_COUNTER	0
@@ -1067,6 +1191,12 @@
 				&armv7_a15_perf_cache_map, 0xFF);
 }
 
+static int armv7_a7_map_event(struct perf_event *event)
+{
+	return map_cpu_event(event, &armv7_a7_perf_map,
+				&armv7_a7_perf_cache_map, 0xFF);
+}
+
 static struct arm_pmu armv7pmu = {
 	.handle_irq		= armv7pmu_handle_irq,
 	.enable			= armv7pmu_enable_event,
@@ -1127,6 +1257,16 @@
 	armv7pmu.set_event_filter = armv7pmu_set_event_filter;
 	return &armv7pmu;
 }
+
+static struct arm_pmu *__init armv7_a7_pmu_init(void)
+{
+	armv7pmu.id		= ARM_PERF_PMU_ID_CA7;
+	armv7pmu.name		= "ARMv7 Cortex-A7";
+	armv7pmu.map_event	= armv7_a7_map_event;
+	armv7pmu.num_events	= armv7_read_num_pmnc_events();
+	armv7pmu.set_event_filter = armv7pmu_set_event_filter;
+	return &armv7pmu;
+}
 #else
 static struct arm_pmu *__init armv7_a8_pmu_init(void)
 {
@@ -1147,4 +1287,9 @@
 {
 	return NULL;
 }
+
+static struct arm_pmu *__init armv7_a7_pmu_init(void)
+{
+	return NULL;
+}
 #endif	/* CONFIG_CPU_V7 */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 971d65c..e11b523 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -526,22 +526,40 @@
 #ifdef CONFIG_MMU
 /*
  * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
+ * atomic helpers and the signal restart code. Insert it into the
+ * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
  */
+static struct vm_area_struct gate_vma;
 
-int vectors_user_mapping(void)
+static int __init gate_vma_init(void)
 {
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_ALWAYSDUMP | VM_RESERVED,
-				       NULL);
+	gate_vma.vm_start	= 0xffff0000;
+	gate_vma.vm_end		= 0xffff0000 + PAGE_SIZE;
+	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
+	gate_vma.vm_flags	= VM_READ | VM_EXEC |
+				  VM_MAYREAD | VM_MAYEXEC |
+				  VM_ALWAYSDUMP;
+	return 0;
+}
+arch_initcall(gate_vma_init);
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
+}
+
+int in_gate_area_no_mm(unsigned long addr)
+{
+	return in_gate_area(NULL, addr);
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
+	return (vma == &gate_vma) ? "[vectors]" : NULL;
 }
 #endif
diff --git a/arch/arm/kernel/sched_clock.c b/arch/arm/kernel/sched_clock.c
index 5416c7c..27d186a 100644
--- a/arch/arm/kernel/sched_clock.c
+++ b/arch/arm/kernel/sched_clock.c
@@ -10,6 +10,7 @@
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/syscore_ops.h>
 #include <linux/timer.h>
 
 #include <asm/sched_clock.h>
@@ -164,3 +165,20 @@
 
 	sched_clock_poll(sched_clock_timer.data);
 }
+
+static int sched_clock_suspend(void)
+{
+	sched_clock_poll(sched_clock_timer.data);
+	return 0;
+}
+
+static struct syscore_ops sched_clock_ops = {
+	.suspend = sched_clock_suspend,
+};
+
+static int __init sched_clock_syscore_init(void)
+{
+	register_syscore_ops(&sched_clock_ops);
+	return 0;
+}
+device_initcall(sched_clock_syscore_init);
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 129fbd5..e5c37fc 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -34,6 +34,7 @@
 #include <linux/sort.h>
 
 #include <asm/unified.h>
+#include <asm/cp15.h>
 #include <asm/cpu.h>
 #include <asm/cputype.h>
 #include <asm/elf.h>
@@ -45,7 +46,6 @@
 #include <asm/cacheflush.h>
 #include <asm/cachetype.h>
 #include <asm/tlbflush.h>
-#include <asm/system.h>
 
 #include <asm/prom.h>
 #include <asm/mach/arch.h>
@@ -961,7 +961,6 @@
 	conswitchp = &dummy_con;
 #endif
 #endif
-	early_trap_init();
 
 	if (mdesc->init_early)
 		mdesc->init_early();
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 0340224..d13e61a 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -66,12 +66,13 @@
  */
 asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
+	sigset_t blocked;
+
 	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+
+	mask &= _BLOCKABLE;
+	siginitset(&blocked, mask);
+	set_current_blocked(&blocked);
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
@@ -281,10 +282,7 @@
 	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
 	if (err == 0) {
 		sigdelsetmask(&set, ~_BLOCKABLE);
-		spin_lock_irq(&current->sighand->siglock);
-		current->blocked = set;
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
+		set_current_blocked(&set);
 	}
 
 	__get_user_error(regs->ARM_r0, &sf->uc.uc_mcontext.arm_r0, err);
@@ -637,13 +635,7 @@
 	/*
 	 * Block the signal if we were successful.
 	 */
-	spin_lock_irq(&tsk->sighand->siglock);
-	sigorsets(&tsk->blocked, &tsk->blocked,
-		  &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&tsk->blocked, sig);
-	recalc_sigpending();
-	spin_unlock_irq(&tsk->sighand->siglock);
+	block_sigmask(ka, sig);
 
 	return 0;
 }
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 57db122..2b26dca 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -58,6 +58,8 @@
 	IPI_CPU_STOP,
 };
 
+static DECLARE_COMPLETION(cpu_running);
+
 int __cpuinit __cpu_up(unsigned int cpu)
 {
 	struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu);
@@ -98,20 +100,12 @@
 	 */
 	ret = boot_secondary(cpu, idle);
 	if (ret == 0) {
-		unsigned long timeout;
-
 		/*
 		 * CPU was successfully started, wait for it
 		 * to come online or time out.
 		 */
-		timeout = jiffies + HZ;
-		while (time_before(jiffies, timeout)) {
-			if (cpu_online(cpu))
-				break;
-
-			udelay(10);
-			barrier();
-		}
+		wait_for_completion_timeout(&cpu_running,
+					    msecs_to_jiffies(1000));
 
 		if (!cpu_online(cpu)) {
 			pr_crit("CPU%u: failed to come online\n", cpu);
@@ -300,9 +294,10 @@
 	/*
 	 * OK, now it's safe to let the boot CPU continue.  Wait for
 	 * the CPU migration code to notice that the CPU is online
-	 * before we continue.
+	 * before we continue - which happens after __cpu_up returns.
 	 */
 	set_cpu_online(cpu, true);
+	complete(&cpu_running);
 
 	/*
 	 * Setup the percpu timer for this CPU.
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 99a5727..23377a3 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -227,6 +227,11 @@
 #else
 #define S_SMP ""
 #endif
+#ifdef CONFIG_THUMB2_KERNEL
+#define S_ISA " THUMB2"
+#else
+#define S_ISA " ARM"
+#endif
 
 static int __die(const char *str, int err, struct thread_info *thread, struct pt_regs *regs)
 {
@@ -234,8 +239,8 @@
 	static int die_counter;
 	int ret;
 
-	printk(KERN_EMERG "Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
-	       str, err, ++die_counter);
+	printk(KERN_EMERG "Internal error: %s: %x [#%d]" S_PREEMPT S_SMP
+	       S_ISA "\n", str, err, ++die_counter);
 
 	/* trap and error numbers are mostly meaningless on ARM */
 	ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV);
@@ -781,18 +786,16 @@
 		memcpy((void *)vectors + 0xfe0, (void *)vectors + 0xfe8, 4);
 }
 
-void __init early_trap_init(void)
+void __init early_trap_init(void *vectors_base)
 {
-#if defined(CONFIG_CPU_USE_DOMAINS)
-	unsigned long vectors = CONFIG_VECTORS_BASE;
-#else
-	unsigned long vectors = (unsigned long)vectors_page;
-#endif
+	unsigned long vectors = (unsigned long)vectors_base;
 	extern char __stubs_start[], __stubs_end[];
 	extern char __vectors_start[], __vectors_end[];
 	extern char __kuser_helper_start[], __kuser_helper_end[];
 	int kuser_sz = __kuser_helper_end - __kuser_helper_start;
 
+	vectors_page = vectors_base;
+
 	/*
 	 * Copy the vectors, stubs and kuser helpers (in entry-armv.S)
 	 * into the vector page, mapped at 0xffff0000, and ensure these
diff --git a/arch/arm/mach-exynos/hotplug.c b/arch/arm/mach-exynos/hotplug.c
index da70e7e..c9146fa7 100644
--- a/arch/arm/mach-exynos/hotplug.c
+++ b/arch/arm/mach-exynos/hotplug.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 
 #include <mach/regs-pmu.h>
 
diff --git a/arch/arm/mach-integrator/Kconfig b/arch/arm/mach-integrator/Kconfig
index 350e266..5a6148a 100644
--- a/arch/arm/mach-integrator/Kconfig
+++ b/arch/arm/mach-integrator/Kconfig
@@ -5,6 +5,7 @@
 config ARCH_INTEGRATOR_AP
 	bool "Support Integrator/AP and Integrator/PP2 platforms"
 	select CLKSRC_MMIO
+	select HAVE_SCHED_CLOCK
 	select MIGHT_HAVE_PCI
 	select SERIAL_AMBA_PL010
 	select SERIAL_AMBA_PL010_CONSOLE
diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c
index 019f0ab..cba1907 100644
--- a/arch/arm/mach-integrator/core.c
+++ b/arch/arm/mach-integrator/core.c
@@ -25,8 +25,9 @@
 
 #include <mach/hardware.h>
 #include <mach/platform.h>
-#include <asm/irq.h>
 #include <mach/cm.h>
+#include <mach/irqs.h>
+
 #include <asm/system.h>
 #include <asm/leds.h>
 #include <asm/mach-types.h>
diff --git a/arch/arm/mach-integrator/include/mach/irqs.h b/arch/arm/mach-integrator/include/mach/irqs.h
index 1fbe6d1..a19a1a2 100644
--- a/arch/arm/mach-integrator/include/mach/irqs.h
+++ b/arch/arm/mach-integrator/include/mach/irqs.h
@@ -78,5 +78,6 @@
 #define IRQ_SIC_CP_LMINT7		46
 #define IRQ_SIC_END			46
 
-#define NR_IRQS                         47
+#define NR_IRQS_INTEGRATOR_AP		34
+#define NR_IRQS_INTEGRATOR_CP		47
 
diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c
index 21a1d6c..871f148 100644
--- a/arch/arm/mach-integrator/integrator_ap.c
+++ b/arch/arm/mach-integrator/integrator_ap.c
@@ -38,12 +38,13 @@
 #include <mach/hardware.h>
 #include <mach/platform.h>
 #include <asm/hardware/arm_timer.h>
-#include <asm/irq.h>
 #include <asm/setup.h>
 #include <asm/param.h>		/* HZ */
 #include <asm/mach-types.h>
+#include <asm/sched_clock.h>
 
 #include <mach/lm.h>
+#include <mach/irqs.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
@@ -325,6 +326,11 @@
 
 static unsigned long timer_reload;
 
+static u32 notrace integrator_read_sched_clock(void)
+{
+	return -readl((void __iomem *) TIMER2_VA_BASE + TIMER_VALUE);
+}
+
 static void integrator_clocksource_init(unsigned long inrate)
 {
 	void __iomem *base = (void __iomem *)TIMER2_VA_BASE;
@@ -341,6 +347,7 @@
 
 	clocksource_mmio_init(base + TIMER_VALUE, "timer2",
 			rate, 200, 16, clocksource_mmio_readl_down);
+	setup_sched_clock(integrator_read_sched_clock, 16, rate);
 }
 
 static void __iomem * const clkevt_base = (void __iomem *)TIMER1_VA_BASE;
@@ -468,6 +475,7 @@
 	.atag_offset	= 0x100,
 	.reserve	= integrator_reserve,
 	.map_io		= ap_map_io,
+	.nr_irqs	= NR_IRQS_INTEGRATOR_AP,
 	.init_early	= integrator_init_early,
 	.init_irq	= ap_init_irq,
 	.timer		= &ap_timer,
diff --git a/arch/arm/mach-integrator/integrator_cp.c b/arch/arm/mach-integrator/integrator_cp.c
index a8b6aa6..38d997d 100644
--- a/arch/arm/mach-integrator/integrator_cp.c
+++ b/arch/arm/mach-integrator/integrator_cp.c
@@ -26,7 +26,6 @@
 
 #include <mach/hardware.h>
 #include <mach/platform.h>
-#include <asm/irq.h>
 #include <asm/setup.h>
 #include <asm/mach-types.h>
 #include <asm/hardware/arm_timer.h>
@@ -34,6 +33,7 @@
 
 #include <mach/cm.h>
 #include <mach/lm.h>
+#include <mach/irqs.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
@@ -495,6 +495,7 @@
 	.atag_offset	= 0x100,
 	.reserve	= integrator_reserve,
 	.map_io		= intcp_map_io,
+	.nr_irqs	= NR_IRQS_INTEGRATOR_CP,
 	.init_early	= intcp_init_early,
 	.init_irq	= intcp_init_irq,
 	.timer		= &cp_timer,
diff --git a/arch/arm/mach-integrator/pci.c b/arch/arm/mach-integrator/pci.c
index 520b6bf..e15aa43 100644
--- a/arch/arm/mach-integrator/pci.c
+++ b/arch/arm/mach-integrator/pci.c
@@ -26,11 +26,12 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 
-#include <asm/irq.h>
 #include <asm/system.h>
 #include <asm/mach/pci.h>
 #include <asm/mach-types.h>
 
+#include <mach/irqs.h>
+
 /* 
  * A small note about bridges and interrupts.  The DECchip 21050 (and
  * later) adheres to the PCI-PCI bridge specification.  This says that
diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c
index 3c82566..65e5896 100644
--- a/arch/arm/mach-integrator/pci_v3.c
+++ b/arch/arm/mach-integrator/pci_v3.c
@@ -30,7 +30,8 @@
 
 #include <mach/hardware.h>
 #include <mach/platform.h>
-#include <asm/irq.h>
+#include <mach/irqs.h>
+
 #include <asm/signal.h>
 #include <asm/system.h>
 #include <asm/mach/pci.h>
diff --git a/arch/arm/mach-realview/hotplug.c b/arch/arm/mach-realview/hotplug.c
index ac1aed2..5e64fbf 100644
--- a/arch/arm/mach-realview/hotplug.c
+++ b/arch/arm/mach-realview/hotplug.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 
 extern volatile int pen_release;
 
diff --git a/arch/arm/mach-tegra/hotplug.c b/arch/arm/mach-tegra/hotplug.c
index f329404..d8dc9dd 100644
--- a/arch/arm/mach-tegra/hotplug.c
+++ b/arch/arm/mach-tegra/hotplug.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 
 static inline void cpu_enter_lowpower(void)
 {
diff --git a/arch/arm/mach-vexpress/hotplug.c b/arch/arm/mach-vexpress/hotplug.c
index 813ee08..7a05548 100644
--- a/arch/arm/mach-vexpress/hotplug.c
+++ b/arch/arm/mach-vexpress/hotplug.c
@@ -13,7 +13,7 @@
 #include <linux/smp.h>
 
 #include <asm/cacheflush.h>
-#include <asm/system.h>
+#include <asm/cp15.h>
 
 extern volatile int pen_release;
 
diff --git a/arch/arm/mach-vexpress/include/mach/io.h b/arch/arm/mach-vexpress/include/mach/io.h
index 13522d8..0088cd3 100644
--- a/arch/arm/mach-vexpress/include/mach/io.h
+++ b/arch/arm/mach-vexpress/include/mach/io.h
@@ -20,7 +20,6 @@
 #ifndef __ASM_ARM_ARCH_IO_H
 #define __ASM_ARM_ARCH_IO_H
 
-#define __io(a)		__typesafe_io(a)
 #define __mem_pci(a)	(a)
 
 #endif
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index caf14dc..78459b8 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -22,7 +22,7 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 
-#include <asm/system.h>
+#include <asm/cp15.h>
 #include <asm/unaligned.h>
 
 #include "fault.h"
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index e0b0e7a..dd3d591 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 #include <plat/cache-feroceon-l2.h>
 
 /*
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index b1e192b..a53fd2a 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -30,13 +30,13 @@
 
 static void __iomem *l2x0_base;
 static DEFINE_RAW_SPINLOCK(l2x0_lock);
-static uint32_t l2x0_way_mask;	/* Bitmask of active ways */
-static uint32_t l2x0_size;
+static u32 l2x0_way_mask;	/* Bitmask of active ways */
+static u32 l2x0_size;
 
 struct l2x0_regs l2x0_saved_regs;
 
 struct l2x0_of_data {
-	void (*setup)(const struct device_node *, __u32 *, __u32 *);
+	void (*setup)(const struct device_node *, u32 *, u32 *);
 	void (*save)(void);
 	void (*resume)(void);
 };
@@ -288,7 +288,7 @@
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2x0_unlock(__u32 cache_id)
+static void l2x0_unlock(u32 cache_id)
 {
 	int lockregs;
 	int i;
@@ -307,11 +307,11 @@
 	}
 }
 
-void __init l2x0_init(void __iomem *base, __u32 aux_val, __u32 aux_mask)
+void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 {
-	__u32 aux;
-	__u32 cache_id;
-	__u32 way_size = 0;
+	u32 aux;
+	u32 cache_id;
+	u32 way_size = 0;
 	int ways;
 	const char *type;
 
@@ -388,7 +388,7 @@
 
 #ifdef CONFIG_OF
 static void __init l2x0_of_setup(const struct device_node *np,
-				 __u32 *aux_val, __u32 *aux_mask)
+				 u32 *aux_val, u32 *aux_mask)
 {
 	u32 data[2] = { 0, 0 };
 	u32 tag = 0;
@@ -422,7 +422,7 @@
 }
 
 static void __init pl310_of_setup(const struct device_node *np,
-				  __u32 *aux_val, __u32 *aux_mask)
+				  u32 *aux_val, u32 *aux_mask)
 {
 	u32 data[3] = { 0, 0, 0 };
 	u32 tag[3] = { 0, 0, 0 };
@@ -548,7 +548,7 @@
 	{}
 };
 
-int __init l2x0_of_init(__u32 aux_val, __u32 aux_mask)
+int __init l2x0_of_init(u32 aux_val, u32 aux_mask)
 {
 	struct device_node *np;
 	struct l2x0_of_data *data;
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index 5086865..1fbca05 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -16,6 +16,7 @@
 
 #include <linux/init.h>
 #include <asm/cacheflush.h>
+#include <asm/cp15.h>
 #include <asm/hardware/cache-tauros2.h>
 
 
diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c
index 5a32020..6c3edeb 100644
--- a/arch/arm/mm/cache-xsc3l2.c
+++ b/arch/arm/mm/cache-xsc3l2.c
@@ -18,7 +18,7 @@
  */
 #include <linux/init.h>
 #include <linux/highmem.h>
-#include <asm/system.h>
+#include <asm/cp15.h>
 #include <asm/cputype.h>
 #include <asm/cacheflush.h>
 
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 1aa664a..db23ae4 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -214,7 +214,8 @@
 core_initcall(consistent_init);
 
 static void *
-__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot)
+__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot,
+	const void *caller)
 {
 	struct arm_vmregion *c;
 	size_t align;
@@ -241,7 +242,7 @@
 	 * Allocate a virtual address in the consistent mapping region.
 	 */
 	c = arm_vmregion_alloc(&consistent_head, align, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM), caller);
 	if (c) {
 		pte_t *pte;
 		int idx = CONSISTENT_PTE_INDEX(c->vm_start);
@@ -320,14 +321,14 @@
 
 #else	/* !CONFIG_MMU */
 
-#define __dma_alloc_remap(page, size, gfp, prot)	page_address(page)
+#define __dma_alloc_remap(page, size, gfp, prot, c)	page_address(page)
 #define __dma_free_remap(addr, size)			do { } while (0)
 
 #endif	/* CONFIG_MMU */
 
 static void *
 __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
-	    pgprot_t prot)
+	    pgprot_t prot, const void *caller)
 {
 	struct page *page;
 	void *addr;
@@ -349,7 +350,7 @@
 		return NULL;
 
 	if (!arch_is_coherent())
-		addr = __dma_alloc_remap(page, size, gfp, prot);
+		addr = __dma_alloc_remap(page, size, gfp, prot, caller);
 	else
 		addr = page_address(page);
 
@@ -374,7 +375,8 @@
 		return memory;
 
 	return __dma_alloc(dev, size, handle, gfp,
-			   pgprot_dmacoherent(pgprot_kernel));
+			   pgprot_dmacoherent(pgprot_kernel),
+			   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(dma_alloc_coherent);
 
@@ -386,7 +388,8 @@
 dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
 {
 	return __dma_alloc(dev, size, handle, gfp,
-			   pgprot_writecombine(pgprot_kernel));
+			   pgprot_writecombine(pgprot_kernel),
+			   __builtin_return_address(0));
 }
 EXPORT_SYMBOL(dma_alloc_writecombine);
 
@@ -723,6 +726,9 @@
 
 static int __init dma_debug_do_init(void)
 {
+#ifdef CONFIG_MMU
+	arm_vmregion_create_proc("dma-mappings", &consistent_head);
+#endif
 	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 	return 0;
 }
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index bb7eac3..40c43a9 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -164,7 +164,8 @@
 	struct siginfo si;
 
 #ifdef CONFIG_DEBUG_USER
-	if (user_debug & UDBG_SEGV) {
+	if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
+	    ((user_debug & UDBG_BUS)  && (sig == SIGBUS))) {
 		printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
 		       tsk->comm, sig, addr, fsr);
 		show_pte(tsk->mm, addr);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 6ec1226..42d906f 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -659,7 +659,9 @@
 #ifdef CONFIG_HIGHMEM
 			"    pkmap   : 0x%08lx - 0x%08lx   (%4ld MB)\n"
 #endif
+#ifdef CONFIG_MODULES
 			"    modules : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+#endif
 			"      .text : 0x%p" " - 0x%p" "   (%4d kB)\n"
 			"      .init : 0x%p" " - 0x%p" "   (%4d kB)\n"
 			"      .data : 0x%p" " - 0x%p" "   (%4d kB)\n"
@@ -678,7 +680,9 @@
 			MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
 				(PAGE_SIZE)),
 #endif
+#ifdef CONFIG_MODULES
 			MLM(MODULES_VADDR, MODULES_END),
+#endif
 
 			MLK_ROUNDUP(_text, _etext),
 			MLK_ROUNDUP(__init_begin, __init_end),
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 80632e8..66daf17 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -26,6 +26,7 @@
 #include <linux/vmalloc.h>
 #include <linux/io.h>
 
+#include <asm/cp15.h>
 #include <asm/cputype.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 94c5a0c..f77f1db 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 
+#include <asm/cp15.h>
 #include <asm/cputype.h>
 #include <asm/sections.h>
 #include <asm/cachetype.h>
@@ -997,11 +998,14 @@
 {
 	struct map_desc map;
 	unsigned long addr;
+	void *vectors;
 
 	/*
 	 * Allocate the vector page early.
 	 */
-	vectors_page = early_alloc(PAGE_SIZE);
+	vectors = early_alloc(PAGE_SIZE);
+
+	early_trap_init(vectors);
 
 	for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
 		pmd_clear(pmd_off_k(addr));
@@ -1041,7 +1045,7 @@
 	 * location (0xffff0000).  If we aren't using high-vectors, also
 	 * create a mapping at the low-vectors virtual address.
 	 */
-	map.pfn = __phys_to_pfn(virt_to_phys(vectors_page));
+	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
 	map.virtual = 0xffff0000;
 	map.length = PAGE_SIZE;
 	map.type = MT_HIGH_VECTORS;
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index a3e78cc..0acb089 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -12,6 +12,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 
+#include <asm/cp15.h>
 #include <asm/pgalloc.h>
 #include <asm/page.h>
 #include <asm/tlbflush.h>
diff --git a/arch/arm/mm/vmregion.c b/arch/arm/mm/vmregion.c
index 036fdbf..a631016 100644
--- a/arch/arm/mm/vmregion.c
+++ b/arch/arm/mm/vmregion.c
@@ -1,5 +1,8 @@
+#include <linux/fs.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 
 #include "vmregion.h"
@@ -36,7 +39,7 @@
 
 struct arm_vmregion *
 arm_vmregion_alloc(struct arm_vmregion_head *head, size_t align,
-		   size_t size, gfp_t gfp)
+		   size_t size, gfp_t gfp, const void *caller)
 {
 	unsigned long start = head->vm_start, addr = head->vm_end;
 	unsigned long flags;
@@ -52,6 +55,8 @@
 	if (!new)
 		goto out;
 
+	new->caller = caller;
+
 	spin_lock_irqsave(&head->vm_lock, flags);
 
 	addr = rounddown(addr - size, align);
@@ -129,3 +134,72 @@
 
 	kfree(c);
 }
+
+#ifdef CONFIG_PROC_FS
+static int arm_vmregion_show(struct seq_file *m, void *p)
+{
+	struct arm_vmregion *c = list_entry(p, struct arm_vmregion, vm_list);
+
+	seq_printf(m, "0x%08lx-0x%08lx %7lu", c->vm_start, c->vm_end,
+		c->vm_end - c->vm_start);
+	if (c->caller)
+		seq_printf(m, " %pS", (void *)c->caller);
+	seq_putc(m, '\n');
+	return 0;
+}
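+/*
+ * Each line of the resulting proc file then reads roughly as follows
+ * (illustrative values; the symbol is whatever caller was recorded at
+ * allocation time):
+ *
+ *	0xffc00000-0xffc01000    4096 dma_alloc_coherent+0x1c/0x4c
+ */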
+
+static void *arm_vmregion_start(struct seq_file *m, loff_t *pos)
+{
+	struct arm_vmregion_head *h = m->private;
+	spin_lock_irq(&h->vm_lock);
+	return seq_list_start(&h->vm_list, *pos);
+}
+
+static void *arm_vmregion_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct arm_vmregion_head *h = m->private;
+	return seq_list_next(p, &h->vm_list, pos);
+}
+
+static void arm_vmregion_stop(struct seq_file *m, void *p)
+{
+	struct arm_vmregion_head *h = m->private;
+	spin_unlock_irq(&h->vm_lock);
+}
+
+static const struct seq_operations arm_vmregion_ops = {
+	.start	= arm_vmregion_start,
+	.stop	= arm_vmregion_stop,
+	.next	= arm_vmregion_next,
+	.show	= arm_vmregion_show,
+};
+
+static int arm_vmregion_open(struct inode *inode, struct file *file)
+{
+	struct arm_vmregion_head *h = PDE(inode)->data;
+	int ret = seq_open(file, &arm_vmregion_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = h;
+	}
+	return ret;
+}
+
+static const struct file_operations arm_vmregion_fops = {
+	.open	= arm_vmregion_open,
+	.read	= seq_read,
+	.llseek	= seq_lseek,
+	.release = seq_release,
+};
+
+int arm_vmregion_create_proc(const char *path, struct arm_vmregion_head *h)
+{
+	proc_create_data(path, S_IRUSR, NULL, &arm_vmregion_fops, h);
+	return 0;
+}
+#else
+int arm_vmregion_create_proc(const char *path, struct arm_vmregion_head *h)
+{
+	return 0;
+}
+#endif
diff --git a/arch/arm/mm/vmregion.h b/arch/arm/mm/vmregion.h
index 15e9f04..162be66 100644
--- a/arch/arm/mm/vmregion.h
+++ b/arch/arm/mm/vmregion.h
@@ -19,11 +19,14 @@
 	unsigned long		vm_end;
 	struct page		*vm_pages;
 	int			vm_active;
+	const void		*caller;
 };
 
-struct arm_vmregion *arm_vmregion_alloc(struct arm_vmregion_head *, size_t, size_t, gfp_t);
+struct arm_vmregion *arm_vmregion_alloc(struct arm_vmregion_head *, size_t, size_t, gfp_t, const void *);
 struct arm_vmregion *arm_vmregion_find(struct arm_vmregion_head *, unsigned long);
 struct arm_vmregion *arm_vmregion_find_remove(struct arm_vmregion_head *, unsigned long);
 void arm_vmregion_free(struct arm_vmregion_head *, struct arm_vmregion *);
 
+int arm_vmregion_create_proc(const char *, struct arm_vmregion_head *);
+
 #endif
diff --git a/arch/arm/net/Makefile b/arch/arm/net/Makefile
new file mode 100644
index 0000000..c2c1084
--- /dev/null
+++ b/arch/arm/net/Makefile
@@ -0,0 +1,3 @@
+# ARM-specific networking code
+
+obj-$(CONFIG_BPF_JIT) += bpf_jit_32.o
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
new file mode 100644
index 0000000..62135849
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.c
@@ -0,0 +1,915 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/filter.h>
+#include <linux/moduleloader.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/hwcap.h>
+
+#include "bpf_jit_32.h"
+
+/*
+ * ABI:
+ *
+ * r0	scratch register
+ * r4	BPF register A
+ * r5	BPF register X
+ * r6	pointer to the skb
+ * r7	skb->data
+ * r8	skb_headlen(skb)
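+ *
+ * r4-r8 are callee-saved in the AAPCS, so this state survives the
+ * calls back out to the C load helpers and to jit_udiv().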
+ */
+
+#define r_scratch	ARM_R0
+/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */
+#define r_off		ARM_R1
+#define r_A		ARM_R4
+#define r_X		ARM_R5
+#define r_skb		ARM_R6
+#define r_skb_data	ARM_R7
+#define r_skb_hl	ARM_R8
+
+#define SCRATCH_SP_OFFSET	0
+#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + (k))
+
+#define SEEN_MEM		((1 << BPF_MEMWORDS) - 1)
+#define SEEN_MEM_WORD(k)	(1 << (k))
+#define SEEN_X			(1 << BPF_MEMWORDS)
+#define SEEN_CALL		(1 << (BPF_MEMWORDS + 1))
+#define SEEN_SKB		(1 << (BPF_MEMWORDS + 2))
+#define SEEN_DATA		(1 << (BPF_MEMWORDS + 3))
+
+#define FLAG_NEED_X_RESET	(1 << 0)
+
+struct jit_ctx {
+	const struct sk_filter *skf;
+	unsigned idx;
+	unsigned prologue_bytes;
+	int ret0_fp_idx;
+	u32 seen;
+	u32 flags;
+	u32 *offsets;
+	u32 *target;
+#if __LINUX_ARM_ARCH__ < 7
+	u16 epilogue_bytes;
+	u16 imm_count;
+	u32 *imms;
+#endif
+};
+
+int bpf_jit_enable __read_mostly;
+
+static u64 jit_get_skb_b(struct sk_buff *skb, unsigned offset)
+{
+	u8 ret;
+	int err;
+
+	err = skb_copy_bits(skb, offset, &ret, 1);
+
+	return (u64)err << 32 | ret;
+}
+
+static u64 jit_get_skb_h(struct sk_buff *skb, unsigned offset)
+{
+	u16 ret;
+	int err;
+
+	err = skb_copy_bits(skb, offset, &ret, 2);
+
+	return (u64)err << 32 | ntohs(ret);
+}
+
+static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset)
+{
+	u32 ret;
+	int err;
+
+	err = skb_copy_bits(skb, offset, &ret, 4);
+
+	return (u64)err << 32 | ntohl(ret);
+}
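+
+/*
+ * The helpers above pack the skb_copy_bits() error code into the upper
+ * 32 bits and the loaded (host-order) value into the lower 32 bits.
+ * With the EABI returning 64-bit values in r0/r1, the JITed code can
+ * test r1 for the error and consume the value straight from r0:
+ *
+ *	u64 r = jit_get_skb_h(skb, 12);
+ *	u32 val = (u32)r;		// low word: the loaded value
+ *	int err = (int)(r >> 32);	// high word: 0 or -EFAULT
+ */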
+
+/*
+ * Wrapper that handles both OABI and EABI and ensures Thumb2 interworking
+ * (where assembly routines such as __aeabi_uidiv could cause problems).
+ */
+static u32 jit_udiv(u32 dividend, u32 divisor)
+{
+	return dividend / divisor;
+}
+
+static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
+{
+	if (ctx->target != NULL)
+		ctx->target[ctx->idx] = inst | (cond << 28);
+
+	ctx->idx++;
+}
+
+/*
+ * Emit an instruction that will be executed unconditionally.
+ */
+static inline void emit(u32 inst, struct jit_ctx *ctx)
+{
+	_emit(ARM_COND_AL, inst, ctx);
+}
+
+static u16 saved_regs(struct jit_ctx *ctx)
+{
+	u16 ret = 0;
+
+	if ((ctx->skf->len > 1) ||
+	    (ctx->skf->insns[0].code == BPF_S_RET_A))
+		ret |= 1 << r_A;
+
+#ifdef CONFIG_FRAME_POINTER
+	ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC);
+#else
+	if (ctx->seen & SEEN_CALL)
+		ret |= 1 << ARM_LR;
+#endif
+	if (ctx->seen & (SEEN_DATA | SEEN_SKB))
+		ret |= 1 << r_skb;
+	if (ctx->seen & SEEN_DATA)
+		ret |= (1 << r_skb_data) | (1 << r_skb_hl);
+	if (ctx->seen & SEEN_X)
+		ret |= 1 << r_X;
+
+	return ret;
+}
+
+static inline int mem_words_used(struct jit_ctx *ctx)
+{
+	/* yes, we do waste some stack space if there are "holes" in the set */
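+	/* e.g. words {0, 3} used -> SEEN_MEM bits 0b1001 -> fls() == 4 */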
+	return fls(ctx->seen & SEEN_MEM);
+}
+
+static inline bool is_load_to_a(u16 inst)
+{
+	switch (inst) {
+	case BPF_S_LD_W_LEN:
+	case BPF_S_LD_W_ABS:
+	case BPF_S_LD_H_ABS:
+	case BPF_S_LD_B_ABS:
+	case BPF_S_ANC_CPU:
+	case BPF_S_ANC_IFINDEX:
+	case BPF_S_ANC_MARK:
+	case BPF_S_ANC_PROTOCOL:
+	case BPF_S_ANC_RXHASH:
+	case BPF_S_ANC_QUEUE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void build_prologue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+	u16 first_inst = ctx->skf->insns[0].code;
+	u16 off;
+
+#ifdef CONFIG_FRAME_POINTER
+	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
+	emit(ARM_PUSH(reg_set), ctx);
+	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
+#else
+	if (reg_set)
+		emit(ARM_PUSH(reg_set), ctx);
+#endif
+
+	if (ctx->seen & (SEEN_DATA | SEEN_SKB))
+		emit(ARM_MOV_R(r_skb, ARM_R0), ctx);
+
+	if (ctx->seen & SEEN_DATA) {
+		off = offsetof(struct sk_buff, data);
+		emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx);
+		/* headlen = len - data_len */
+		off = offsetof(struct sk_buff, len);
+		emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx);
+		off = offsetof(struct sk_buff, data_len);
+		emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+		emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx);
+	}
+
+	if (ctx->flags & FLAG_NEED_X_RESET)
+		emit(ARM_MOV_I(r_X, 0), ctx);
+
+	/* do not leak kernel data to userspace */
+	if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst)))
+		emit(ARM_MOV_I(r_A, 0), ctx);
+
+	/* stack space for the BPF_MEM words */
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+}
+
+static void build_epilogue(struct jit_ctx *ctx)
+{
+	u16 reg_set = saved_regs(ctx);
+
+	if (ctx->seen & SEEN_MEM)
+		emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx);
+
+	reg_set &= ~(1 << ARM_LR);
+
+#ifdef CONFIG_FRAME_POINTER
+	/* the first instruction of the prologue was: mov ip, sp */
+	reg_set &= ~(1 << ARM_IP);
+	reg_set |= (1 << ARM_SP);
+	emit(ARM_LDM(ARM_SP, reg_set), ctx);
+#else
+	if (reg_set) {
+		if (ctx->seen & SEEN_CALL)
+			reg_set |= 1 << ARM_PC;
+		emit(ARM_POP(reg_set), ctx);
+	}
+
+	if (!(ctx->seen & SEEN_CALL))
+		emit(ARM_BX(ARM_LR), ctx);
+#endif
+}
+
+static int16_t imm8m(u32 x)
+{
+	u32 rot;
+
+	for (rot = 0; rot < 16; rot++)
+		if ((x & ~ror32(0xff, 2 * rot)) == 0)
+			return rol32(x, 2 * rot) | (rot << 8);
+
+	return -1;
+}
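+
+/*
+ * For example, 0xff000000 is 0xff rotated right by 8 bits (rot == 4),
+ * so imm8m(0xff000000) == 0x4ff: the 8-bit value 0xff plus a rotation
+ * field of 4.  A value such as 0x101 spans nine bits, has no such
+ * encoding, and yields -1, forcing a load via emit_mov_i_no8m().
+ */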
+
+#if __LINUX_ARM_ARCH__ < 7
+
+static u16 imm_offset(u32 k, struct jit_ctx *ctx)
+{
+	unsigned i = 0, offset;
+	u16 imm;
+
+	/* on the "fake" run we just count them (duplicates included) */
+	if (ctx->target == NULL) {
+		ctx->imm_count++;
+		return 0;
+	}
+
+	while ((i < ctx->imm_count) && ctx->imms[i]) {
+		if (ctx->imms[i] == k)
+			break;
+		i++;
+	}
+
+	if (ctx->imms[i] == 0)
+		ctx->imms[i] = k;
+
+	/* constants go just after the epilogue */
+	offset =  ctx->offsets[ctx->skf->len];
+	offset += ctx->prologue_bytes;
+	offset += ctx->epilogue_bytes;
+	offset += i * 4;
+
+	ctx->target[offset / 4] = k;
+
+	/* PC in ARM mode == address of the instruction + 8 */
+	imm = offset - (8 + ctx->idx * 4);
+
+	return imm;
+}
+
+#endif /* __LINUX_ARM_ARCH__ */
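+
+/*
+ * On pre-v7 cores the generated image is thus laid out as
+ *
+ *	[prologue][body][epilogue][literal pool]
+ *
+ * and imm_offset() returns a PC-relative offset into the pool, so an
+ * arbitrary constant costs a single "ldr rd, [pc, #off]".
+ */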
+
+/*
+ * Move an immediate that's not an imm8m to a core register.
+ */
+static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 7
+	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
+#else
+	emit(ARM_MOVW(rd, val & 0xffff), ctx);
+	if (val > 0xffff)
+		emit(ARM_MOVT(rd, val >> 16), ctx);
+#endif
+}
+
+static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx)
+{
+	int imm12 = imm8m(val);
+
+	if (imm12 >= 0)
+		emit(ARM_MOV_I(rd, imm12), ctx);
+	else
+		emit_mov_i_no8m(rd, val, ctx);
+}
+
+#if __LINUX_ARM_ARCH__ < 6
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx);
+	_emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx);
+	_emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx);
+	_emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx);
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx);
+	_emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx);
+	_emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx);
+}
+
+static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx)
+{
+	emit(ARM_LSL_R(ARM_R1, r_src, 8), ctx);
+	emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSL, 8), ctx);
+	emit(ARM_LSL_I(r_dst, r_dst, 8), ctx);
+	emit(ARM_LSL_R(r_dst, r_dst, 8), ctx);
+}
+
+#else  /* ARMv6+ */
+
+static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV(r_res, r_res), ctx);
+#endif
+}
+
+static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx)
+{
+	_emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx);
+#ifdef __LITTLE_ENDIAN
+	_emit(cond, ARM_REV16(r_res, r_res), ctx);
+#endif
+}
+
+static inline void emit_swap16(u8 r_dst __maybe_unused,
+			       u8 r_src __maybe_unused,
+			       struct jit_ctx *ctx __maybe_unused)
+{
+#ifdef __LITTLE_ENDIAN
+	emit(ARM_REV16(r_dst, r_src), ctx);
+#endif
+}
+
+#endif /* __LINUX_ARM_ARCH__ < 6 */
+
+
+/* Compute the immediate value for a PC-relative branch. */
+static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx)
+{
+	u32 imm;
+
+	if (ctx->target == NULL)
+		return 0;
+	/*
+	 * BPF allows only forward jumps and the offset of the target is
+	 * still the one computed during the first pass.
+	 */
+	imm  = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8);
+
+	return imm >> 2;
+}
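+
+/*
+ * Concretely: B encodes a signed word offset relative to the current
+ * instruction plus 8 (the architectural PC).  offsets[] and idx are
+ * both relative to the end of the prologue, so the byte distance is
+ * offsets[tgt] + prologue_bytes - (idx * 4 + 8), and ">> 2" converts
+ * it to the word offset the instruction expects.
+ */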
+
+#define OP_IMM3(op, r1, r2, imm_val, ctx)				\
+	do {								\
+		imm12 = imm8m(imm_val);					\
+		if (imm12 < 0) {					\
+			emit_mov_i_no8m(r_scratch, imm_val, ctx);	\
+			emit(op ## _R((r1), (r2), r_scratch), ctx);	\
+		} else {						\
+			emit(op ## _I((r1), (r2), imm12), ctx);		\
+		}							\
+	} while (0)
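+
+/*
+ * Note that OP_IMM3() relies on an "int imm12" being in scope at the
+ * call site (build_body() declares one); constants with no 8-bit
+ * rotated encoding are first materialised in r_scratch.
+ */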
+
+static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
+{
+	if (ctx->ret0_fp_idx >= 0) {
+		_emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx);
+		/* NOP to keep the size constant between passes */
+		emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
+	} else {
+		_emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
+		_emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+	}
+}
+
+static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ < 5
+	emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
+
+	if (elf_hwcap & HWCAP_THUMB)
+		emit(ARM_BX(tgt_reg), ctx);
+	else
+		emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
+#else
+	emit(ARM_BLX_R(tgt_reg), ctx);
+#endif
+}
+
+static inline void emit_udiv(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx)
+{
+#if __LINUX_ARM_ARCH__ == 7
+	if (elf_hwcap & HWCAP_IDIVA) {
+		emit(ARM_UDIV(rd, rm, rn), ctx);
+		return;
+	}
+#endif
+	if (rm != ARM_R0)
+		emit(ARM_MOV_R(ARM_R0, rm), ctx);
+	if (rn != ARM_R1)
+		emit(ARM_MOV_R(ARM_R1, rn), ctx);
+
+	ctx->seen |= SEEN_CALL;
+	emit_mov_i(ARM_R3, (u32)jit_udiv, ctx);
+	emit_blx_r(ARM_R3, ctx);
+
+	if (rd != ARM_R0)
+		emit(ARM_MOV_R(rd, ARM_R0), ctx);
+}
+
+static inline void update_on_xread(struct jit_ctx *ctx)
+{
+	if (!(ctx->seen & SEEN_X))
+		ctx->flags |= FLAG_NEED_X_RESET;
+
+	ctx->seen |= SEEN_X;
+}
+
+static int build_body(struct jit_ctx *ctx)
+{
+	void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w};
+	const struct sk_filter *prog = ctx->skf;
+	const struct sock_filter *inst;
+	unsigned i, load_order, off, condt;
+	int imm12;
+	u32 k;
+
+	for (i = 0; i < prog->len; i++) {
+		inst = &(prog->insns[i]);
+		/* K as an immediate value operand */
+		k = inst->k;
+
+		/* compute offsets only in the fake pass */
+		if (ctx->target == NULL)
+			ctx->offsets[i] = ctx->idx * 4;
+
+		switch (inst->code) {
+		case BPF_S_LD_IMM:
+			emit_mov_i(r_A, k, ctx);
+			break;
+		case BPF_S_LD_W_LEN:
+			ctx->seen |= SEEN_SKB;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+			emit(ARM_LDR_I(r_A, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LD_MEM:
+			/* A = scratch[k] */
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LD_W_ABS:
+			load_order = 2;
+			goto load;
+		case BPF_S_LD_H_ABS:
+			load_order = 1;
+			goto load;
+		case BPF_S_LD_B_ABS:
+			load_order = 0;
+load:
+			/* the interpreter will deal with the negative K */
+			if ((int)k < 0)
+				return -ENOTSUPP;
+			emit_mov_i(r_off, k, ctx);
+load_common:
+			ctx->seen |= SEEN_DATA | SEEN_CALL;
+
+			if (load_order > 0) {
+				emit(ARM_SUB_I(r_scratch, r_skb_hl,
+					       1 << load_order), ctx);
+				emit(ARM_CMP_R(r_scratch, r_off), ctx);
+				condt = ARM_COND_HS;
+			} else {
+				emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+				condt = ARM_COND_HI;
+			}
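+			/*
+			 * i.e. take the fastpath when the whole access
+			 * fits in the linear head: HS checks
+			 * off + (1 << load_order) <= skb_headlen() via
+			 * the adjusted compare, HI checks off <
+			 * skb_headlen() for a single byte.
+			 */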
+
+			_emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data),
+			      ctx);
+
+			if (load_order == 0)
+				_emit(condt, ARM_LDRB_I(r_A, r_scratch, 0),
+				      ctx);
+			else if (load_order == 1)
+				emit_load_be16(condt, r_A, r_scratch, ctx);
+			else if (load_order == 2)
+				emit_load_be32(condt, r_A, r_scratch, ctx);
+
+			_emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx);
+
+			/* the slowpath */
+			emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx);
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* the offset is already in R1 */
+			emit_blx_r(ARM_R3, ctx);
+			/* check the result of skb_copy_bits */
+			emit(ARM_CMP_I(ARM_R1, 0), ctx);
+			emit_err_ret(ARM_COND_NE, ctx);
+			emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+			break;
+		case BPF_S_LD_W_IND:
+			load_order = 2;
+			goto load_ind;
+		case BPF_S_LD_H_IND:
+			load_order = 1;
+			goto load_ind;
+		case BPF_S_LD_B_IND:
+			load_order = 0;
+load_ind:
+			OP_IMM3(ARM_ADD, r_off, r_X, k, ctx);
+			goto load_common;
+		case BPF_S_LDX_IMM:
+			ctx->seen |= SEEN_X;
+			emit_mov_i(r_X, k, ctx);
+			break;
+		case BPF_S_LDX_W_LEN:
+			ctx->seen |= SEEN_X | SEEN_SKB;
+			emit(ARM_LDR_I(r_X, r_skb,
+				       offsetof(struct sk_buff, len)), ctx);
+			break;
+		case BPF_S_LDX_MEM:
+			ctx->seen |= SEEN_X | SEEN_MEM_WORD(k);
+			emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_LDX_B_MSH:
+			/* x = ((*(frame + k)) & 0xf) << 2; */
+			ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL;
+			/* the interpreter should deal with the negative K */
+			if ((int)k < 0)
+				return -1;
+			/* offset in r1: we might have to take the slow path */
+			emit_mov_i(r_off, k, ctx);
+			emit(ARM_CMP_R(r_skb_hl, r_off), ctx);
+
+			/* load in r0: common with the slowpath */
+			_emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data,
+						      ARM_R1), ctx);
+			/*
+			 * emit_mov_i() might generate one or two instructions,
+			 * the same holds for emit_blx_r()
+			 */
+			_emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx);
+
+			emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+			/* r_off is r1 */
+			emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx);
+			emit_blx_r(ARM_R3, ctx);
+			/* check the return value of skb_copy_bits */
+			emit(ARM_CMP_I(ARM_R1, 0), ctx);
+			emit_err_ret(ARM_COND_NE, ctx);
+
+			emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx);
+			emit(ARM_LSL_I(r_X, r_X, 2), ctx);
+			break;
+		case BPF_S_ST:
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_STX:
+			update_on_xread(ctx);
+			ctx->seen |= SEEN_MEM_WORD(k);
+			emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx);
+			break;
+		case BPF_S_ALU_ADD_K:
+			/* A += K */
+			OP_IMM3(ARM_ADD, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_ADD_X:
+			update_on_xread(ctx);
+			emit(ARM_ADD_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_SUB_K:
+			/* A -= K */
+			OP_IMM3(ARM_SUB, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_SUB_X:
+			update_on_xread(ctx);
+			emit(ARM_SUB_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_MUL_K:
+			/* A *= K */
+			emit_mov_i(r_scratch, k, ctx);
+			emit(ARM_MUL(r_A, r_A, r_scratch), ctx);
+			break;
+		case BPF_S_ALU_MUL_X:
+			update_on_xread(ctx);
+			emit(ARM_MUL(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_DIV_K:
+			/* current k == reciprocal_value(userspace k) */
+			emit_mov_i(r_scratch, k, ctx);
+			/* A = top 32 bits of the product */
+			emit(ARM_UMULL(r_scratch, r_A, r_A, r_scratch), ctx);
+			break;
+		case BPF_S_ALU_DIV_X:
+			update_on_xread(ctx);
+			emit(ARM_CMP_I(r_X, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+			emit_udiv(r_A, r_A, r_X, ctx);
+			break;
+		case BPF_S_ALU_OR_K:
+			/* A |= K */
+			OP_IMM3(ARM_ORR, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_OR_X:
+			update_on_xread(ctx);
+			emit(ARM_ORR_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_AND_K:
+			/* A &= K */
+			OP_IMM3(ARM_AND, r_A, r_A, k, ctx);
+			break;
+		case BPF_S_ALU_AND_X:
+			update_on_xread(ctx);
+			emit(ARM_AND_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_LSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSL_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_LSH_X:
+			update_on_xread(ctx);
+			emit(ARM_LSL_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_RSH_K:
+			if (unlikely(k > 31))
+				return -1;
+			emit(ARM_LSR_I(r_A, r_A, k), ctx);
+			break;
+		case BPF_S_ALU_RSH_X:
+			update_on_xread(ctx);
+			emit(ARM_LSR_R(r_A, r_A, r_X), ctx);
+			break;
+		case BPF_S_ALU_NEG:
+			/* A = -A */
+			emit(ARM_RSB_I(r_A, r_A, 0), ctx);
+			break;
+		case BPF_S_JMP_JA:
+			/* pc += K */
+			emit(ARM_B(b_imm(i + k + 1, ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_K:
+			/* pc += (A == K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_EQ;
+			goto cmp_imm;
+		case BPF_S_JMP_JGT_K:
+			/* pc += (A > K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_HI;
+			goto cmp_imm;
+		case BPF_S_JMP_JGE_K:
+			/* pc += (A >= K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_HS;
+cmp_imm:
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_CMP_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_CMP_I(r_A, imm12), ctx);
+			}
+cond_jump:
+			if (inst->jt)
+				_emit(condt, ARM_B(b_imm(i + inst->jt + 1,
+						   ctx)), ctx);
+			if (inst->jf)
+				_emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1,
+							     ctx)), ctx);
+			break;
+		case BPF_S_JMP_JEQ_X:
+			/* pc += (A == X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_EQ;
+			goto cmp_x;
+		case BPF_S_JMP_JGT_X:
+			/* pc += (A > X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_HI;
+			goto cmp_x;
+		case BPF_S_JMP_JGE_X:
+			/* pc += (A >= X) ? pc->jt : pc->jf */
+			condt   = ARM_COND_CS;
+cmp_x:
+			update_on_xread(ctx);
+			emit(ARM_CMP_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_JMP_JSET_K:
+			/* pc += (A & K) ? pc->jt : pc->jf */
+			condt  = ARM_COND_NE;
+			/* not set iff all zeroes iff Z==1 iff EQ */
+
+			imm12 = imm8m(k);
+			if (imm12 < 0) {
+				emit_mov_i_no8m(r_scratch, k, ctx);
+				emit(ARM_TST_R(r_A, r_scratch), ctx);
+			} else {
+				emit(ARM_TST_I(r_A, imm12), ctx);
+			}
+			goto cond_jump;
+		case BPF_S_JMP_JSET_X:
+			/* pc += (A & X) ? pc->jt : pc->jf */
+			update_on_xread(ctx);
+			condt  = ARM_COND_NE;
+			emit(ARM_TST_R(r_A, r_X), ctx);
+			goto cond_jump;
+		case BPF_S_RET_A:
+			emit(ARM_MOV_R(ARM_R0, r_A), ctx);
+			goto b_epilogue;
+		case BPF_S_RET_K:
+			if ((k == 0) && (ctx->ret0_fp_idx < 0))
+				ctx->ret0_fp_idx = i;
+			emit_mov_i(ARM_R0, k, ctx);
+b_epilogue:
+			if (i != ctx->skf->len - 1)
+				emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+			break;
+		case BPF_S_MISC_TAX:
+			/* X = A */
+			ctx->seen |= SEEN_X;
+			emit(ARM_MOV_R(r_X, r_A), ctx);
+			break;
+		case BPF_S_MISC_TXA:
+			/* A = X */
+			update_on_xread(ctx);
+			emit(ARM_MOV_R(r_A, r_X), ctx);
+			break;
+		case BPF_S_ANC_PROTOCOL:
+			/* A = ntohs(skb->protocol) */
+			ctx->seen |= SEEN_SKB;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  protocol) != 2);
+			off = offsetof(struct sk_buff, protocol);
+			emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx);
+			emit_swap16(r_A, r_scratch, ctx);
+			break;
+		case BPF_S_ANC_CPU:
+			/* r_scratch = current_thread_info() */
+			OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx);
+			/* A = current_thread_info()->cpu */
+			BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4);
+			off = offsetof(struct thread_info, cpu);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_IFINDEX:
+			/* A = skb->dev->ifindex */
+			ctx->seen |= SEEN_SKB;
+			off = offsetof(struct sk_buff, dev);
+			emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
+
+			emit(ARM_CMP_I(r_scratch, 0), ctx);
+			emit_err_ret(ARM_COND_EQ, ctx);
+
+			BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
+						  ifindex) != 4);
+			off = offsetof(struct net_device, ifindex);
+			emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+			break;
+		case BPF_S_ANC_MARK:
+			ctx->seen |= SEEN_SKB;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+			off = offsetof(struct sk_buff, mark);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_RXHASH:
+			ctx->seen |= SEEN_SKB;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+			off = offsetof(struct sk_buff, rxhash);
+			emit(ARM_LDR_I(r_A, r_skb, off), ctx);
+			break;
+		case BPF_S_ANC_QUEUE:
+			ctx->seen |= SEEN_SKB;
+			BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+						  queue_mapping) != 2);
+			BUILD_BUG_ON(offsetof(struct sk_buff,
+					      queue_mapping) > 0xff);
+			off = offsetof(struct sk_buff, queue_mapping);
+			emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	/* compute offsets only during the first pass */
+	if (ctx->target == NULL)
+		ctx->offsets[i] = ctx->idx * 4;
+
+	return 0;
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+	struct jit_ctx ctx;
+	unsigned tmp_idx;
+	unsigned alloc_size;
+
+	if (!bpf_jit_enable)
+		return;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.skf		= fp;
+	ctx.ret0_fp_idx = -1;
+
+	ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL);
+	if (ctx.offsets == NULL)
+		return;
+
+	/* fake pass to fill in the ctx->seen */
+	if (unlikely(build_body(&ctx)))
+		goto out;
+
+	tmp_idx = ctx.idx;
+	build_prologue(&ctx);
+	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
+
+#if __LINUX_ARM_ARCH__ < 7
+	tmp_idx = ctx.idx;
+	build_epilogue(&ctx);
+	ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
+
+	ctx.idx += ctx.imm_count;
+	if (ctx.imm_count) {
+		ctx.imms = kzalloc(4 * ctx.imm_count, GFP_KERNEL);
+		if (ctx.imms == NULL)
+			goto out;
+	}
+#else
+	/* there's nothing after the epilogue on ARMv7 */
+	build_epilogue(&ctx);
+#endif
+
+	alloc_size = 4 * ctx.idx;
+	ctx.target = module_alloc(max(sizeof(struct work_struct),
+				      alloc_size));
+	if (unlikely(ctx.target == NULL))
+		goto out;
+
+	ctx.idx = 0;
+	build_prologue(&ctx);
+	build_body(&ctx);
+	build_epilogue(&ctx);
+
+	flush_icache_range((u32)ctx.target, (u32)(ctx.target + ctx.idx));
+
+#if __LINUX_ARM_ARCH__ < 7
+	if (ctx.imm_count)
+		kfree(ctx.imms);
+#endif
+
+	if (bpf_jit_enable > 1)
+		print_hex_dump(KERN_INFO, "BPF JIT code: ",
+			       DUMP_PREFIX_ADDRESS, 16, 4, ctx.target,
+			       alloc_size, false);
+
+	fp->bpf_func = (void *)ctx.target;
+out:
+	kfree(ctx.offsets);
+	return;
+}
+
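+/*
+ * module_free() may sleep, so defer the free to process context by
+ * reusing the start of the (no longer executed) JIT image as a
+ * work_struct; this is why bpf_jit_compile() passes
+ * max(sizeof(struct work_struct), alloc_size) to module_alloc().
+ */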
+static void bpf_jit_free_worker(struct work_struct *work)
+{
+	module_free(NULL, work);
+}
+
+void bpf_jit_free(struct sk_filter *fp)
+{
+	struct work_struct *work;
+
+	if (fp->bpf_func != sk_run_filter) {
+		work = (struct work_struct *)fp->bpf_func;
+
+		INIT_WORK(work, bpf_jit_free_worker);
+		schedule_work(work);
+	}
+}
diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h
new file mode 100644
index 0000000..99ae5e3f
--- /dev/null
+++ b/arch/arm/net/bpf_jit_32.h
@@ -0,0 +1,190 @@
+/*
+ * Just-In-Time compiler for BPF filters on 32bit ARM
+ *
+ * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#ifndef PFILTER_OPCODES_ARM_H
+#define PFILTER_OPCODES_ARM_H
+
+#define ARM_R0	0
+#define ARM_R1	1
+#define ARM_R2	2
+#define ARM_R3	3
+#define ARM_R4	4
+#define ARM_R5	5
+#define ARM_R6	6
+#define ARM_R7	7
+#define ARM_R8	8
+#define ARM_R9	9
+#define ARM_R10	10
+#define ARM_FP	11
+#define ARM_IP	12
+#define ARM_SP	13
+#define ARM_LR	14
+#define ARM_PC	15
+
+#define ARM_COND_EQ		0x0
+#define ARM_COND_NE		0x1
+#define ARM_COND_CS		0x2
+#define ARM_COND_HS		ARM_COND_CS
+#define ARM_COND_CC		0x3
+#define ARM_COND_LO		ARM_COND_CC
+#define ARM_COND_MI		0x4
+#define ARM_COND_PL		0x5
+#define ARM_COND_VS		0x6
+#define ARM_COND_VC		0x7
+#define ARM_COND_HI		0x8
+#define ARM_COND_LS		0x9
+#define ARM_COND_GE		0xa
+#define ARM_COND_LT		0xb
+#define ARM_COND_GT		0xc
+#define ARM_COND_LE		0xd
+#define ARM_COND_AL		0xe
+
+/* register shift types */
+#define SRTYPE_LSL		0
+#define SRTYPE_LSR		1
+#define SRTYPE_ASR		2
+#define SRTYPE_ROR		3
+
+#define ARM_INST_ADD_R		0x00800000
+#define ARM_INST_ADD_I		0x02800000
+
+#define ARM_INST_AND_R		0x00000000
+#define ARM_INST_AND_I		0x02000000
+
+#define ARM_INST_BIC_R		0x01c00000
+#define ARM_INST_BIC_I		0x03c00000
+
+#define ARM_INST_B		0x0a000000
+#define ARM_INST_BX		0x012FFF10
+#define ARM_INST_BLX_R		0x012fff30
+
+#define ARM_INST_CMP_R		0x01500000
+#define ARM_INST_CMP_I		0x03500000
+
+#define ARM_INST_LDRB_I		0x05d00000
+#define ARM_INST_LDRB_R		0x07d00000
+#define ARM_INST_LDRH_I		0x01d000b0
+#define ARM_INST_LDR_I		0x05900000
+
+#define ARM_INST_LDM		0x08900000
+
+#define ARM_INST_LSL_I		0x01a00000
+#define ARM_INST_LSL_R		0x01a00010
+
+#define ARM_INST_LSR_I		0x01a00020
+#define ARM_INST_LSR_R		0x01a00030
+
+#define ARM_INST_MOV_R		0x01a00000
+#define ARM_INST_MOV_I		0x03a00000
+#define ARM_INST_MOVW		0x03000000
+#define ARM_INST_MOVT		0x03400000
+
+#define ARM_INST_MUL		0x00000090
+
+#define ARM_INST_POP		0x08bd0000
+#define ARM_INST_PUSH		0x092d0000
+
+#define ARM_INST_ORR_R		0x01800000
+#define ARM_INST_ORR_I		0x03800000
+
+#define ARM_INST_REV		0x06bf0f30
+#define ARM_INST_REV16		0x06bf0fb0
+
+#define ARM_INST_RSB_I		0x02600000
+
+#define ARM_INST_SUB_R		0x00400000
+#define ARM_INST_SUB_I		0x02400000
+
+#define ARM_INST_STR_I		0x05800000
+
+#define ARM_INST_TST_R		0x01100000
+#define ARM_INST_TST_I		0x03100000
+
+#define ARM_INST_UDIV		0x0730f010
+
+#define ARM_INST_UMULL		0x00800090
+
+/* register */
+#define _AL3_R(op, rd, rn, rm)	((op ## _R) | (rd) << 12 | (rn) << 16 | (rm))
+/* immediate */
+#define _AL3_I(op, rd, rn, imm)	((op ## _I) | (rd) << 12 | (rn) << 16 | (imm))
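+
+/*
+ * Example: ARM_ADD_R(ARM_R0, ARM_R1, ARM_R2) builds
+ * 0x00800000 | (0 << 12) | (1 << 16) | 2 == 0x00810002; once _emit()
+ * ORs in the AL condition this becomes 0xe0810002, i.e. the encoding
+ * of "add r0, r1, r2".
+ */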
+
+#define ARM_ADD_R(rd, rn, rm)	_AL3_R(ARM_INST_ADD, rd, rn, rm)
+#define ARM_ADD_I(rd, rn, imm)	_AL3_I(ARM_INST_ADD, rd, rn, imm)
+
+#define ARM_AND_R(rd, rn, rm)	_AL3_R(ARM_INST_AND, rd, rn, rm)
+#define ARM_AND_I(rd, rn, imm)	_AL3_I(ARM_INST_AND, rd, rn, imm)
+
+#define ARM_BIC_R(rd, rn, rm)	_AL3_R(ARM_INST_BIC, rd, rn, rm)
+#define ARM_BIC_I(rd, rn, imm)	_AL3_I(ARM_INST_BIC, rd, rn, imm)
+
+#define ARM_B(imm24)		(ARM_INST_B | ((imm24) & 0xffffff))
+#define ARM_BX(rm)		(ARM_INST_BX | (rm))
+#define ARM_BLX_R(rm)		(ARM_INST_BLX_R | (rm))
+
+#define ARM_CMP_R(rn, rm)	_AL3_R(ARM_INST_CMP, 0, rn, rm)
+#define ARM_CMP_I(rn, imm)	_AL3_I(ARM_INST_CMP, 0, rn, imm)
+
+#define ARM_LDR_I(rt, rn, off)	(ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_I(rt, rn, off)	(ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+#define ARM_LDRB_R(rt, rn, rm)	(ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \
+				 | (rm))
+#define ARM_LDRH_I(rt, rn, off)	(ARM_INST_LDRH_I | (rt) << 12 | (rn) << 16 \
+				 | (((off) & 0xf0) << 4) | ((off) & 0xf))
+
+#define ARM_LDM(rn, regs)	(ARM_INST_LDM | (rn) << 16 | (regs))
+
+#define ARM_LSL_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8)
+#define ARM_LSL_I(rd, rn, imm)	(_AL3_I(ARM_INST_LSL, rd, 0, rn) | (imm) << 7)
+
+#define ARM_LSR_R(rd, rn, rm)	(_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8)
+#define ARM_LSR_I(rd, rn, imm)	(_AL3_I(ARM_INST_LSR, rd, 0, rn) | (imm) << 7)
+
+#define ARM_MOV_R(rd, rm)	_AL3_R(ARM_INST_MOV, rd, 0, rm)
+#define ARM_MOV_I(rd, imm)	_AL3_I(ARM_INST_MOV, rd, 0, imm)
+
+#define ARM_MOVW(rd, imm)	\
+	(ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MOVT(rd, imm)	\
+	(ARM_INST_MOVT | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff))
+
+#define ARM_MUL(rd, rm, rn)	(ARM_INST_MUL | (rd) << 16 | (rm) << 8 | (rn))
+
+#define ARM_POP(regs)		(ARM_INST_POP | (regs))
+#define ARM_PUSH(regs)		(ARM_INST_PUSH | (regs))
+
+#define ARM_ORR_R(rd, rn, rm)	_AL3_R(ARM_INST_ORR, rd, rn, rm)
+#define ARM_ORR_I(rd, rn, imm)	_AL3_I(ARM_INST_ORR, rd, rn, imm)
+#define ARM_ORR_S(rd, rn, rm, type, rs)	\
+	(ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7)
+
+#define ARM_REV(rd, rm)		(ARM_INST_REV | (rd) << 12 | (rm))
+#define ARM_REV16(rd, rm)	(ARM_INST_REV16 | (rd) << 12 | (rm))
+
+#define ARM_RSB_I(rd, rn, imm)	_AL3_I(ARM_INST_RSB, rd, rn, imm)
+
+#define ARM_SUB_R(rd, rn, rm)	_AL3_R(ARM_INST_SUB, rd, rn, rm)
+#define ARM_SUB_I(rd, rn, imm)	_AL3_I(ARM_INST_SUB, rd, rn, imm)
+
+#define ARM_STR_I(rt, rn, off)	(ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \
+				 | (off))
+
+#define ARM_TST_R(rn, rm)	_AL3_R(ARM_INST_TST, 0, rn, rm)
+#define ARM_TST_I(rn, imm)	_AL3_I(ARM_INST_TST, 0, rn, imm)
+
+#define ARM_UDIV(rd, rn, rm)	(ARM_INST_UDIV | (rd) << 16 | (rn) | (rm) << 8)
+
+#define ARM_UMULL(rd_lo, rd_hi, rn, rm)	(ARM_INST_UMULL | (rd_hi) << 16 \
+					 | (rd_lo) << 12 | (rm) << 8 | (rn))
+
+#endif /* PFILTER_OPCODES_ARM_H */
diff --git a/arch/arm/plat-versatile/Kconfig b/arch/arm/plat-versatile/Kconfig
index 52353be..aa63f38 100644
--- a/arch/arm/plat-versatile/Kconfig
+++ b/arch/arm/plat-versatile/Kconfig
@@ -11,7 +11,7 @@
 	depends on ARCH_REALVIEW || ARCH_VERSATILE
 
 config PLAT_VERSATILE_SCHED_CLOCK
-	def_bool y if !ARCH_INTEGRATOR_AP
+	def_bool y
 	select HAVE_SCHED_CLOCK
 
 endif
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 8f3ccdd..d89068f 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -18,6 +18,7 @@
 #include <linux/smp.h>
 #include <linux/init.h>
 
+#include <asm/cp15.h>
 #include <asm/cputype.h>
 #include <asm/thread_notify.h>
 #include <asm/vfp.h>
diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh
index 98cffcb9..a2af2e8 100644
--- a/scripts/gcc-goto.sh
+++ b/scripts/gcc-goto.sh
@@ -2,4 +2,20 @@
 # Test for gcc 'asm goto' support
 # Copyright (C) 2010, Jason Baron <jbaron@redhat.com>
 
-echo "int main(void) { entry: asm goto (\"\"::::entry); return 0; }" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
+cat << "END" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
+int main(void)
+{
+#ifdef __arm__
+	/*
+	 * Not related to asm goto, but used by jump label
+	 * and broken on some ARM GCC versions (see GCC Bug 48637).
+	 */
+	static struct { int dummy; int state; } tp;
+	asm (".long %c0" :: "i" (&tp.state));
+#endif
+
+entry:
+	asm goto ("" :::: entry);
+	return 0;
+}
+END