Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
 "On the x86 side, there are some optimizations and documentation
  updates.  The big ARM/KVM change for 3.11, support for AArch64, will
  come through Catalin Marinas's tree.  s390 and PPC have misc cleanups
  and bugfixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits)
  KVM: PPC: Ignore PIR writes
  KVM: PPC: Book3S PR: Invalidate SLB entries properly
  KVM: PPC: Book3S PR: Allow guest to use 1TB segments
  KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match
  KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry
  KVM: PPC: Book3S PR: Fix proto-VSID calculations
  KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL
  KVM: Fix RTC interrupt coalescing tracking
  kvm: Add a tracepoint write_tsc_offset
  KVM: MMU: Inform users of mmio generation wraparound
  KVM: MMU: document fast invalidate all mmio sptes
  KVM: MMU: document fast invalidate all pages
  KVM: MMU: document fast page fault
  KVM: MMU: document mmio page fault
  KVM: MMU: document write_flooding_count
  KVM: MMU: document clear_spte_count
  KVM: MMU: drop kvm_mmu_zap_mmio_sptes
  KVM: MMU: init kvm generation close to mmio wrap-around value
  KVM: MMU: add tracepoint for check_mmio_spte
  KVM: MMU: fast invalidate all mmio sptes
  ...
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9bfadeb..66dd2aa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2278,7 +2278,7 @@
 indicate that the attribute can be read or written in the device's
 current state.  "addr" is ignored.
 
-4.77 KVM_ARM_VCPU_INIT
+4.82 KVM_ARM_VCPU_INIT
 
 Capability: basic
 Architectures: arm, arm64
@@ -2304,7 +2304,7 @@
 	  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 
 
-4.78 KVM_GET_REG_LIST
+4.83 KVM_GET_REG_LIST
 
 Capability: basic
 Architectures: arm, arm64
@@ -2324,7 +2324,7 @@
 KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
 
 
-4.80 KVM_ARM_SET_DEVICE_ADDR
+4.84 KVM_ARM_SET_DEVICE_ADDR
 
 Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
 Architectures: arm, arm64
@@ -2362,7 +2362,7 @@
 KVM_RUN on any of the VCPUs.  Calling this ioctl twice for any of the
 base addresses will return -EEXIST.
 
-4.82 KVM_PPC_RTAS_DEFINE_TOKEN
+4.85 KVM_PPC_RTAS_DEFINE_TOKEN
 
 Capability: KVM_CAP_PPC_RTAS
 Architectures: ppc
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 43fcb76..2908941 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -191,12 +191,12 @@
     A counter keeping track of how many hardware registers (guest cr3 or
     pdptrs) are now pointing at the page.  While this counter is nonzero, the
     page cannot be destroyed.  See role.invalid.
-  multimapped:
-    Whether there exist multiple sptes pointing at this page.
-  parent_pte/parent_ptes:
-    If multimapped is zero, parent_pte points at the single spte that points at
-    this page's spt.  Otherwise, parent_ptes points at a data structure
-    with a list of parent_ptes.
+  parent_ptes:
+    The reverse mapping for the pte/ptes pointing at this page's spt. If
+    parent_ptes bit 0 is zero, only one spte points at this page and
+    parent_ptes points at this single spte; otherwise, multiple sptes point
+    at this page and (parent_ptes & ~0x1) points at a data structure with
+    a list of parent_ptes.
   unsync:
     If true, then the translations in this page may not match the guest's
     translation.  This is equivalent to the state of the tlb when a pte is
@@ -210,6 +210,24 @@
     A bitmap indicating which sptes in spt point (directly or indirectly) at
     pages that may be unsynchronized.  Used to quickly locate all unsynchronized
     pages reachable from a given page.
+  mmu_valid_gen:
+    Generation number of the page.  It is compared with kvm->arch.mmu_valid_gen
+    during hash table lookup, and used to skip invalidated shadow pages (see
+    "Zapping all pages" below.)
+  clear_spte_count:
+    Only present on 32-bit hosts, where a 64-bit spte cannot be written
+    atomically.  A reader running without holding the MMU lock uses this
+    counter to detect in-progress updates and retries the read until the
+    writer has finished the write (a sketch of this retry loop follows
+    the field list).
+  write_flooding_count:
+    A guest may write to a page table many times, causing a lot of
+    emulations if the page needs to be write-protected (see "Synchronized
+    and unsynchronized pages" below).  Leaf pages can be unsynchronized
+    so that they do not trigger frequent emulation, but this is not
+    possible for non-leaf pages.  This field counts the number of emulations
+    since the last time the page table was actually used; if emulation
+    is triggered too frequently on this page, KVM will unmap the page
+    to avoid emulation in the future.
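As a minimal illustration of the clear_spte_count idea above, here is a
sketch of a lockless retry read of a 64-bit spte on a 32-bit host.  The
struct layout, the function name and the omission of memory barriers are
simplifications for clarity; they do not reproduce the actual kvm/mmu.c
code.

    #include <stdint.h>

    /* Hypothetical shadow-page bookkeeping: the writer bumps the counter
     * around any non-atomic update of the two 32-bit spte halves. */
    struct sp_sketch {
            volatile uint32_t clear_spte_count;
            volatile uint32_t spte_low;
            volatile uint32_t spte_high;
    };

    static uint64_t read_spte_lockless(struct sp_sketch *sp)
    {
            uint32_t count, low, high;

            do {
                    count = sp->clear_spte_count;   /* snapshot the counter */
                    low   = sp->spte_low;
                    high  = sp->spte_high;
                    /* if the writer bumped the counter meanwhile, the two
                     * halves may belong to different values: read again */
            } while (count != sp->clear_spte_count);

            return ((uint64_t)high << 32) | low;
    }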
 
 Reverse map
 ===========
@@ -258,14 +276,26 @@
 
 Handling a page fault is performed as follows:
 
+ - if the RSV bit of the error code is set, the page fault is caused by the
+   guest accessing MMIO and cached MMIO information is available.
+   - walk shadow page table
+   - check for valid generation number in the spte (see "Fast invalidation of
+     MMIO sptes" below)
+   - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and
+     vcpu->arch.mmio_gfn, and call the emulator
+ - If both the P bit and the R/W bit of the error code are set, the fault may
+   be handled as a "fast page fault" (fixed without taking the MMU lock).  See
+   the description in Documentation/virtual/kvm/locking.txt.  (A condensed
+   sketch of this dispatch follows the list.)
  - if needed, walk the guest page tables to determine the guest translation
    (gva->gpa or ngpa->gpa)
    - if permissions are insufficient, reflect the fault back to the guest
  - determine the host page
-   - if this is an mmio request, there is no host page; call the emulator
-     to emulate the instruction instead
+   - if this is an mmio request, there is no host page; cache the info to
+     vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn
  - walk the shadow page table to find the spte for the translation,
    instantiating missing intermediate page tables as necessary
+   - If this is an mmio request, cache the mmio info to the spte and set some
+     reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask)
  - try to unsynchronize the page
    - if successful, we can let the guest continue and modify the gpte
  - emulate the instruction
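To make the dispatch at the top of this list concrete, the following is a
condensed sketch of how the error-code bits select a path (the RSV-bit MMIO
shortcut, the lock-free fast page fault, or the full slow path).  The macro
and enum names are local to this sketch; only the x86 error-code bit
positions (P = bit 0, R/W = bit 1, RSV = bit 3) are taken as given.

    #include <stdint.h>

    #define PF_P    (1u << 0)   /* page was present */
    #define PF_RW   (1u << 1)   /* fault caused by a write */
    #define PF_RSVD (1u << 3)   /* reserved bit set in a paging structure */

    enum fault_path { MMIO_CACHED, FAST_PAGE_FAULT, SLOW_PATH };

    static enum fault_path classify_fault(uint32_t error_code)
    {
            if (error_code & PF_RSVD)
                    return MMIO_CACHED;     /* walk sptes, check generation,
                                             * cache mmio info, emulate */
            if ((error_code & PF_P) && (error_code & PF_RW))
                    return FAST_PAGE_FAULT; /* may be fixed without MMU lock */
            return SLOW_PATH;               /* guest walk, host page, spte */
    }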
@@ -351,6 +381,51 @@
 a large spte.  The frames at the end of an unaligned memory slot have
 artificially inflated ->write_counts so they can never be instantiated.
 
+Zapping all pages (page generation count)
+=========================================
+
+For large memory guests, walking and zapping all pages is really slow
+(because there are a lot of pages), and it also blocks memory accesses of
+all VCPUs because it needs to hold the MMU lock.
+
+To make this more scalable, kvm maintains a global generation number
+which is stored in kvm->arch.mmu_valid_gen.  Every shadow page stores
+the current global generation number into sp->mmu_valid_gen when it
+is created.  Pages with a mismatching generation number are "obsolete".
+
+When KVM needs to zap all shadow page sptes, it simply increases the global
+generation number and then reloads the root shadow pages on all vcpus.  As
+the VCPUs create new shadow page tables, the old pages are not used because
+of their mismatching generation number.
+
+KVM then walks through all pages and zaps obsolete pages.  While the zap
+operation needs to take the MMU lock, the lock can be released periodically
+so that the VCPUs can make progress.
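The scheme boils down to one counter per VM and one per shadow page; the
sketch below shows the core comparison, with illustrative structure names
that do not match the kernel sources.

    /* Per-VM and per-shadow-page generation counters (illustrative). */
    struct vm_mmu_sketch { unsigned long mmu_valid_gen; };
    struct sp_gen_sketch { unsigned long mmu_valid_gen; };

    static void fast_zap_all(struct vm_mmu_sketch *mmu)
    {
            /* A single increment obsoletes every existing shadow page.
             * VCPUs then reload their roots (not shown) and obsolete
             * pages are reclaimed lazily, with the MMU lock released
             * periodically so that VCPUs can make progress. */
            mmu->mmu_valid_gen++;
    }

    static int sp_is_obsolete(const struct vm_mmu_sketch *mmu,
                              const struct sp_gen_sketch *sp)
    {
            return sp->mmu_valid_gen != mmu->mmu_valid_gen;
    }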
+
+Fast invalidation of MMIO sptes
+===============================
+
+As mentioned in "Reaction to events" above, kvm will cache MMIO
+information in leaf sptes.  When a new memslot is added or an existing
+memslot is changed, this information may become stale and needs to be
+invalidated.  This also needs to hold the MMU lock while walking all
+shadow pages, and is made more scalable with a similar technique.
+
+MMIO sptes have a few spare bits, which are used to store a
+generation number.  The global generation number is stored in
+kvm_memslots(kvm)->generation, and increased whenever guest memory info
+changes.  This generation number is distinct from the one described in
+the previous section.
+
+When KVM finds an MMIO spte, it checks the generation number of the spte.
+If the generation number of the spte does not equal the global generation
+number, it will ignore the cached MMIO information and handle the page
+fault through the slow path.
+
+Since only 19 bits are available to store the generation number in an MMIO
+spte, all pages are zapped when the generation number overflows.
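A rough sketch of how such a generation tag can be stored in and checked
against an MMIO spte is shown below; the bit position and helper names are
hypothetical, chosen only to illustrate the 19-bit comparison.

    #include <stdint.h>

    #define MMIO_GEN_BITS  19
    #define MMIO_GEN_MASK  ((1u << MMIO_GEN_BITS) - 1)
    #define MMIO_GEN_SHIFT 45   /* hypothetical position of the spare bits */

    /* Tag an MMIO spte with the low 19 bits of the memslot generation. */
    static uint64_t mark_mmio_spte(uint64_t mmio_info, unsigned int slots_gen)
    {
            return mmio_info |
                   ((uint64_t)(slots_gen & MMIO_GEN_MASK) << MMIO_GEN_SHIFT);
    }

    /* A cached MMIO spte is honoured only while its generation matches;
     * otherwise the fault falls back to the slow path. */
    static int mmio_spte_is_stale(uint64_t spte, unsigned int slots_gen)
    {
            unsigned int spte_gen = (spte >> MMIO_GEN_SHIFT) & MMIO_GEN_MASK;

            return spte_gen != (slots_gen & MMIO_GEN_MASK);
    }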
+
+
 Further reading
 ===============
 
diff --git a/MAINTAINERS b/MAINTAINERS
index c85bf69..60c68fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4733,10 +4733,10 @@
 F:	drivers/s390/kvm/
 
 KERNEL VIRTUAL MACHINE (KVM) FOR ARM
-M:	Christoffer Dall <cdall@cs.columbia.edu>
+M:	Christoffer Dall <christoffer.dall@linaro.org>
 L:	kvmarm@lists.cs.columbia.edu
 W:	http://systems.cs.columbia.edu/projects/kvm-arm
-S:	Maintained
+S:	Supported
 F:	arch/arm/include/uapi/asm/kvm*
 F:	arch/arm/include/asm/kvm*
 F:	arch/arm/kvm/
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 124623e..64e9696 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -135,7 +135,6 @@
 #define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1ULL)
 #define PTRS_PER_S2_PGD	(1ULL << (KVM_PHYS_SHIFT - 30))
 #define S2_PGD_ORDER	get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
-#define S2_PGD_SIZE	(1 << S2_PGD_ORDER)
 
 /* Virtualization Translation Control Register (VTCR) bits */
 #define VTCR_SH0	(3 << 12)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 18d5032..a2f43dd 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -37,16 +37,18 @@
 #define c5_AIFSR	15	/* Auxiliary Instruction Fault Status R */
 #define c6_DFAR		16	/* Data Fault Address Register */
 #define c6_IFAR		17	/* Instruction Fault Address Register */
-#define c9_L2CTLR	18	/* Cortex A15 L2 Control Register */
-#define c10_PRRR	19	/* Primary Region Remap Register */
-#define c10_NMRR	20	/* Normal Memory Remap Register */
-#define c12_VBAR	21	/* Vector Base Address Register */
-#define c13_CID		22	/* Context ID Register */
-#define c13_TID_URW	23	/* Thread ID, User R/W */
-#define c13_TID_URO	24	/* Thread ID, User R/O */
-#define c13_TID_PRIV	25	/* Thread ID, Privileged */
-#define c14_CNTKCTL	26	/* Timer Control Register (PL1) */
-#define NR_CP15_REGS	27	/* Number of regs (incl. invalid) */
+#define c7_PAR		18	/* Physical Address Register */
+#define c7_PAR_high	19	/* PAR top 32 bits */
+#define c9_L2CTLR	20	/* Cortex A15 L2 Control Register */
+#define c10_PRRR	21	/* Primary Region Remap Register */
+#define c10_NMRR	22	/* Normal Memory Remap Register */
+#define c12_VBAR	23	/* Vector Base Address Register */
+#define c13_CID		24	/* Context ID Register */
+#define c13_TID_URW	25	/* Thread ID, User R/W */
+#define c13_TID_URO	26	/* Thread ID, User R/O */
+#define c13_TID_PRIV	27	/* Thread ID, Privileged */
+#define c14_CNTKCTL	28	/* Timer Control Register (PL1) */
+#define NR_CP15_REGS	29	/* Number of regs (incl. invalid) */
 
 #define ARM_EXCEPTION_RESET	  0
 #define ARM_EXCEPTION_UNDEFINED   1
@@ -72,8 +74,6 @@
 extern char __kvm_hyp_code_start[];
 extern char __kvm_hyp_code_end[];
 
-extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 82b4bab..a464e8d 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -65,11 +65,6 @@
 	return cpsr_mode > USR_MODE;
 }
 
-static inline bool kvm_vcpu_reg_is_pc(struct kvm_vcpu *vcpu, int reg)
-{
-	return reg == 15;
-}
-
 static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.fault.hsr;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 57cb786..7d22517 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -23,9 +23,14 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmio.h>
 #include <asm/fpstate.h>
-#include <asm/kvm_arch_timer.h>
+#include <kvm/arm_arch_timer.h>
 
+#if defined(CONFIG_KVM_ARM_MAX_VCPUS)
 #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
+#else
+#define KVM_MAX_VCPUS 0
+#endif
+
 #define KVM_USER_MEM_SLOTS 32
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -38,7 +43,7 @@
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
-#include <asm/kvm_vgic.h>
+#include <kvm/arm_vgic.h>
 
 struct kvm_vcpu;
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
@@ -190,8 +195,8 @@
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
 
-static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr,
-				       unsigned long long pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
+				       phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
 {
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 370e1a8..ebf5015 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -41,9 +41,9 @@
 	  Provides host support for ARM processors.
 
 config KVM_ARM_MAX_VCPUS
-	int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
-	default 4 if KVM_ARM_HOST
-	default 0
+	int "Number maximum supported virtual CPUs per VM"
+	depends on KVM_ARM_HOST
+	default 4
 	help
 	  Static number of max supported virtual CPUs per VM.
 
@@ -67,6 +67,4 @@
 	---help---
 	  Adds support for the Architected Timers in virtual machines
 
-source drivers/virtio/Kconfig
-
 endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 53c5ed8..d99bee4 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -14,10 +14,11 @@
 AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 
-kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+KVM := ../../../virt/kvm
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
-obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
-obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
+obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
+obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index ef1703b..741f66a 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -800,8 +800,8 @@
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-	unsigned long long boot_pgd_ptr;
-	unsigned long long pgd_ptr;
+	phys_addr_t boot_pgd_ptr;
+	phys_addr_t pgd_ptr;
 	unsigned long hyp_stack_ptr;
 	unsigned long stack_page;
 	unsigned long vector_ptr;
@@ -809,8 +809,8 @@
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
 
-	boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
-	pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
+	boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+	pgd_ptr = kvm_mmu_get_httbr();
 	stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 8eea97b..4a51990 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -180,6 +180,10 @@
 			NULL, reset_unknown, c6_DFAR },
 	{ CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
 			NULL, reset_unknown, c6_IFAR },
+
+	/* PAR swapped by interrupts.S */
+	{ CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
+
 	/*
 	 * DC{C,I,CI}SW operations:
 	 */
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 3d74a0b..df4c82d 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -52,9 +52,6 @@
 
 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	if (kvm_psci_call(vcpu))
-		return 1;
-
 	kvm_inject_undefined(vcpu);
 	return 1;
 }
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index f7793df..16cd4ba 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -49,6 +49,7 @@
 ENTRY(__kvm_tlb_flush_vmid_ipa)
 	push	{r2, r3}
 
+	dsb	ishst
 	add	r0, r0, #KVM_VTTBR
 	ldrd	r2, r3, [r0]
 	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
@@ -291,6 +292,7 @@
 	ldr	r2, =BSYM(panic)
 	msr	ELR_hyp, r2
 	ldr	r0, =\panic_str
+	clrex				@ Clear exclusive monitor
 	eret
 .endm
 
@@ -414,6 +416,10 @@
 	mrcne	p15, 4, r2, c6, c0, 4	@ HPFAR
 	bne	3f
 
+	/* Preserve PAR */
+	mrrc	p15, 0, r0, r1, c7	@ PAR
+	push	{r0, r1}
+
 	/* Resolve IPA using the xFAR */
 	mcr	p15, 0, r2, c7, c8, 0	@ ATS1CPR
 	isb
@@ -424,13 +430,20 @@
 	lsl	r2, r2, #4
 	orr	r2, r2, r1, lsl #24
 
+	/* Restore PAR */
+	pop	{r0, r1}
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+
 3:	load_vcpu			@ Load VCPU pointer to r0
 	str	r2, [r0, #VCPU_HPFAR]
 
 1:	mov	r1, #ARM_EXCEPTION_HVC
 	b	__kvm_vcpu_return
 
-4:	pop	{r0, r1, r2}		@ Failed translation, return to guest
+4:	pop	{r0, r1}		@ Failed translation, return to guest
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+	clrex
+	pop	{r0, r1, r2}
 	eret
 
 /*
@@ -456,6 +469,7 @@
 
 	pop	{r3-r7}
 	pop	{r0-r2}
+	clrex
 	eret
 #endif
 
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index d43cfb5..6f18695 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -302,11 +302,14 @@
 	.endif
 
 	mrc	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mrrc	p15, 0, r4, r5, c7	@ PAR
 
 	.if \store_to_vcpu == 0
-	push	{r2}
+	push	{r2,r4-r5}
 	.else
 	str	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	strd	r4, r5, [r12]
 	.endif
 .endm
 
@@ -319,12 +322,15 @@
  */
 .macro write_cp15_state read_from_vcpu
 	.if \read_from_vcpu == 0
-	pop	{r2}
+	pop	{r2,r4-r5}
 	.else
 	ldr	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	ldrd	r4, r5, [r12]
 	.endif
 
 	mcr	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mcrr	p15, 0, r4, r5, c7	@ PAR
 
 	.if \read_from_vcpu == 0
 	pop	{r2-r12}
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 72a12f2..b8e06b7 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -86,12 +86,6 @@
 	sign_extend = kvm_vcpu_dabt_issext(vcpu);
 	rt = kvm_vcpu_dabt_get_rd(vcpu);
 
-	if (kvm_vcpu_reg_is_pc(vcpu, rt)) {
-		/* IO memory trying to read/write pc */
-		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
 	mmio->is_write = is_write;
 	mmio->phys_addr = fault_ipa;
 	mmio->len = len;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 84ba67b..ca6bea4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -382,9 +382,6 @@
 	if (!pgd)
 		return -ENOMEM;
 
-	/* stage-2 pgd must be aligned to its size */
-	VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1));
-
 	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 7ee5bb7..86a693a 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -75,7 +75,7 @@
  * kvm_psci_call - handle PSCI call if r0 value is in range
  * @vcpu: Pointer to the VCPU struct
  *
- * Handle PSCI calls from guests through traps from HVC or SMC instructions.
+ * Handle PSCI calls from guests through traps from HVC instructions.
  * The calling convention is similar to SMC calls to the secure world where
  * the function number is placed in r0 and this function returns true if the
  * function number specified in r0 is within the PSCI range, and false
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b80256b..b7840e7 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -27,6 +27,8 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_coproc.h>
 
+#include <kvm/arm_arch_timer.h>
+
 /******************************************************************************
  * Cortex-A15 Reset Values
  */
@@ -37,6 +39,11 @@
 	.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
 
+static const struct kvm_irq_level a15_vtimer_irq = {
+	.irq = 27,
+	.level = 1,
+};
+
 
 /*******************************************************************************
  * Exported reset function
@@ -52,6 +59,7 @@
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_regs *cpu_reset;
+	const struct kvm_irq_level *cpu_vtimer_irq;
 
 	switch (vcpu->arch.target) {
 	case KVM_ARM_TARGET_CORTEX_A15:
@@ -59,6 +67,7 @@
 			return -EINVAL;
 		cpu_reset = &a15_regs_reset;
 		vcpu->arch.midr = read_cpuid_id();
+		cpu_vtimer_irq = &a15_vtimer_irq;
 		break;
 	default:
 		return -ENODEV;
@@ -70,5 +79,8 @@
 	/* Reset CP15 registers */
 	kvm_reset_coprocs(vcpu);
 
+	/* Reset arch_timer context */
+	kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
+
 	return 0;
 }
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 1a40537..18e45ec 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -47,12 +47,13 @@
 
 ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
+KVM := ../../../virt/kvm
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o irq_comm.o)
+common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+		$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o
 
 ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
-common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o)
+common-objs += $(KVM)/assigned-dev.o $(KVM)/iommu.o
 endif
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 349ed85..08891d0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -107,8 +107,9 @@
 #define CONTEXT_GUEST		1
 #define CONTEXT_GUEST_END	2
 
-#define VSID_REAL	0x1fffffffffc00000ULL
-#define VSID_BAT	0x1fffffffffb00000ULL
+#define VSID_REAL	0x0fffffffffc00000ULL
+#define VSID_BAT	0x0fffffffffb00000ULL
+#define VSID_1T		0x1000000000000000ULL
 #define VSID_REAL_DR	0x2000000000000000ULL
 #define VSID_REAL_IR	0x4000000000000000ULL
 #define VSID_PR		0x8000000000000000ULL
@@ -123,6 +124,7 @@
 extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
+extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
 extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
 			struct kvm_vcpu *vcpu, unsigned long addr,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 422de3f..008cd85 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -5,9 +5,10 @@
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
+KVM := ../../../virt/kvm
 
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
-						eventfd.o)
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
+		$(KVM)/eventfd.o
 
 CFLAGS_44x_tlb.o  := -I.
 CFLAGS_e500_mmu.o := -I.
@@ -53,7 +54,7 @@
 kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
 
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
-	../../../virt/kvm/coalesced_mmio.o \
+	$(KVM)/coalesced_mmio.o \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s_pr.o \
@@ -86,8 +87,8 @@
 	book3s_xics.o
 
 kvm-book3s_64-module-objs := \
-	../../../virt/kvm/kvm_main.o \
-	../../../virt/kvm/eventfd.o \
+	$(KVM)/kvm_main.o \
+	$(KVM)/eventfd.o \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
@@ -111,7 +112,7 @@
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
 kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
-kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b871721..739bfba 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -26,6 +26,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
 
 /* #define DEBUG_MMU */
 
@@ -76,6 +77,24 @@
 	return NULL;
 }
 
+static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe)
+{
+	return slbe->tb ? SID_SHIFT_1T : SID_SHIFT;
+}
+
+static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe)
+{
+	return (1ul << kvmppc_slb_sid_shift(slbe)) - 1;
+}
+
+static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr)
+{
+	eaddr &= kvmppc_slb_offset_mask(slb);
+
+	return (eaddr >> VPN_SHIFT) |
+		((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT));
+}
+
 static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 					 bool data)
 {
@@ -85,11 +104,7 @@
 	if (!slb)
 		return 0;
 
-	if (slb->tb)
-		return (((u64)eaddr >> 12) & 0xfffffff) |
-		       (((u64)slb->vsid) << 28);
-
-	return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16);
+	return kvmppc_slb_calc_vpn(slb, eaddr);
 }
 
 static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
@@ -100,7 +115,8 @@
 static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
 {
 	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
-	return ((eaddr & 0xfffffff) >> p);
+
+	return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
 }
 
 static hva_t kvmppc_mmu_book3s_64_get_pteg(
@@ -109,13 +125,15 @@
 				bool second)
 {
 	u64 hash, pteg, htabsize;
-	u32 page;
+	u32 ssize;
 	hva_t r;
+	u64 vpn;
 
-	page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
 	htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
 
-	hash = slbe->vsid ^ page;
+	vpn = kvmppc_slb_calc_vpn(slbe, eaddr);
+	ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M;
+	hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);
 	if (second)
 		hash = ~hash;
 	hash &= ((1ULL << 39ULL) - 1ULL);
@@ -146,7 +164,7 @@
 	u64 avpn;
 
 	avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
-	avpn |= slbe->vsid << (28 - p);
+	avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
 
 	if (p < 24)
 		avpn >>= ((80 - p) - 56) - 8;
@@ -167,7 +185,6 @@
 	int i;
 	u8 key = 0;
 	bool found = false;
-	bool perm_err = false;
 	int second = 0;
 	ulong mp_ea = vcpu->arch.magic_page_ea;
 
@@ -190,13 +207,15 @@
 	if (!slbe)
 		goto no_seg_found;
 
+	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+	if (slbe->tb)
+		avpn |= SLB_VSID_B_1T;
+
 do_second:
 	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
 	if (kvm_is_error_hva(ptegp))
 		goto no_page_found;
 
-	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
-
 	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
 		printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
 		goto no_page_found;
@@ -219,7 +238,7 @@
 			continue;
 
 		/* AVPN compare */
-		if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) {
+		if (HPTE_V_COMPARE(avpn, v)) {
 			u8 pp = (r & HPTE_R_PP) | key;
 			int eaddr_mask = 0xFFF;
 
@@ -248,11 +267,6 @@
 				break;
 			}
 
-			if (!gpte->may_read) {
-				perm_err = true;
-				continue;
-			}
-
 			dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
 				"-> 0x%lx\n",
 				eaddr, avpn, gpte->vpage, gpte->raddr);
@@ -281,6 +295,8 @@
 		if (pteg[i+1] != oldr)
 			copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
 
+		if (!gpte->may_read)
+			return -EPERM;
 		return 0;
 	} else {
 		dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
@@ -296,13 +312,7 @@
 		}
 	}
 
-
 no_page_found:
-
-
-	if (perm_err)
-		return -EPERM;
-
 	return -ENOENT;
 
 no_seg_found:
@@ -334,7 +344,7 @@
 	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
 	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
 	slbe->esid  = slbe->tb ? esid_1t : esid;
-	slbe->vsid  = rs >> 12;
+	slbe->vsid  = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);
 	slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
 	slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;
 	slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;
@@ -375,6 +385,7 @@
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
 	struct kvmppc_slb *slbe;
+	u64 seg_size;
 
 	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
@@ -386,8 +397,11 @@
 	dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
 
 	slbe->valid = false;
+	slbe->orige = 0;
+	slbe->origv = 0;
 
-	kvmppc_mmu_map_segment(vcpu, ea);
+	seg_size = 1ull << kvmppc_slb_sid_shift(slbe);
+	kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);
 }
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
@@ -396,8 +410,11 @@
 
 	dprintk("KVM MMU: slbia()\n");
 
-	for (i = 1; i < vcpu->arch.slb_nr; i++)
+	for (i = 1; i < vcpu->arch.slb_nr; i++) {
 		vcpu->arch.slb[i].valid = false;
+		vcpu->arch.slb[i].orige = 0;
+		vcpu->arch.slb[i].origv = 0;
+	}
 
 	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
@@ -467,8 +484,14 @@
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
-		if (slb)
+		if (slb) {
 			gvsid = slb->vsid;
+			if (slb->tb) {
+				gvsid <<= SID_SHIFT_1T - SID_SHIFT;
+				gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
+				gvsid |= VSID_1T;
+			}
+		}
 	}
 
 	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 3a9a1ac..b350d94 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -301,6 +301,23 @@
 	return r;
 }
 
+void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong seg_mask = -seg_size;
+	int i;
+
+	for (i = 1; i < svcpu->slb_max; i++) {
+		if ((svcpu->slb[i].esid & SLB_ESID_V) &&
+		    (svcpu->slb[i].esid & seg_mask) == ea) {
+			/* Invalidate this entry */
+			svcpu->slb[i].esid = 0;
+		}
+	}
+
+	svcpu_put(svcpu);
+}
+
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -325,9 +342,9 @@
 		return -1;
 	vcpu3s->context_id[0] = err;
 
-	vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1)
+	vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1)
 				  << ESID_BITS) - 1;
-	vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << ESID_BITS;
+	vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS;
 	vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;
 
 	kvmppc_mmu_hpte_init(vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 56b983e..4f0caec 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -66,10 +66,6 @@
 
 	ld	r12, PACA_SLBSHADOWPTR(r13)
 
-	/* Save off the first entry so we can slbie it later */
-	ld	r10, SHADOW_SLB_ESID(0)(r12)
-	ld	r11, SHADOW_SLB_VSID(0)(r12)
-
 	/* Remove bolted entries */
 	UNBOLT_SLB_ENTRY(0)
 	UNBOLT_SLB_ENTRY(1)
@@ -81,15 +77,10 @@
 
 	/* Flush SLB */
 
+	li	r10, 0
+	slbmte	r10, r10
 	slbia
 
-	/* r0 = esid & ESID_MASK */
-	rldicr  r10, r10, 0, 35
-	/* r0 |= CLASS_BIT(VSID) */
-	rldic   r12, r11, 56 - 36, 36
-	or      r10, r10, r12
-	slbie	r10
-
 	/* Fill SLB with our shadow */
 
 	lbz	r12, SVCPU_SLB_MAX(r3)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index bdc40b8..19498a5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1239,8 +1239,7 @@
 #ifdef CONFIG_PPC64
 int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 {
-	/* No flags */
-	info->flags = 0;
+	info->flags = KVM_PPC_1T_SEGMENTS;
 
 	/* SLB is always 64 entries */
 	info->slb_size = 64;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1a1b511..dcc94f0 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -796,7 +796,7 @@
 		kvmppc_fill_pt_regs(&regs);
 		timer_interrupt(&regs);
 		break;
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+#if defined(CONFIG_PPC_DOORBELL)
 	case BOOKE_INTERRUPT_DOORBELL:
 		kvmppc_fill_pt_regs(&regs);
 		doorbell_exception(&regs);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 631a265..2c52ada 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -169,6 +169,9 @@
 		vcpu->arch.shared->sprg3 = spr_val;
 		break;
 
+	/* PIR can legally be written, but we ignore it */
+	case SPRN_PIR: break;
+
 	default:
 		emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
 						     spr_val);
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 16bd5d1..3238d40 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -62,13 +62,20 @@
 #define CPUSTAT_MCDS       0x00000100
 #define CPUSTAT_SM         0x00000080
 #define CPUSTAT_G          0x00000008
+#define CPUSTAT_GED        0x00000004
 #define CPUSTAT_J          0x00000002
 #define CPUSTAT_P          0x00000001
 
 struct kvm_s390_sie_block {
 	atomic_t cpuflags;		/* 0x0000 */
 	__u32	prefix;			/* 0x0004 */
-	__u8	reserved8[32];		/* 0x0008 */
+	__u8	reserved08[4];		/* 0x0008 */
+#define PROG_IN_SIE (1<<0)
+	__u32	prog0c;			/* 0x000c */
+	__u8	reserved10[16];		/* 0x0010 */
+#define PROG_BLOCK_SIE 0x00000001
+	atomic_t prog20;		/* 0x0020 */
+	__u8	reserved24[4];		/* 0x0024 */
 	__u64	cputm;			/* 0x0028 */
 	__u64	ckc;			/* 0x0030 */
 	__u64	epoch;			/* 0x0038 */
@@ -90,7 +97,8 @@
 	__u32	scaoh;			/* 0x005c */
 	__u8	reserved60;		/* 0x0060 */
 	__u8	ecb;			/* 0x0061 */
-	__u8	reserved62[2];		/* 0x0062 */
+	__u8    ecb2;                   /* 0x0062 */
+	__u8    reserved63[1];          /* 0x0063 */
 	__u32	scaol;			/* 0x0064 */
 	__u8	reserved68[4];		/* 0x0068 */
 	__u32	todpr;			/* 0x006c */
@@ -130,6 +138,7 @@
 	u32 deliver_program_int;
 	u32 deliver_io_int;
 	u32 exit_wait_state;
+	u32 instruction_pfmf;
 	u32 instruction_stidp;
 	u32 instruction_spx;
 	u32 instruction_stpx;
@@ -166,7 +175,7 @@
 };
 
 #define PGM_OPERATION            0x01
-#define PGM_PRIVILEGED_OPERATION 0x02
+#define PGM_PRIVILEGED_OP	 0x02
 #define PGM_EXECUTE              0x03
 #define PGM_PROTECTION           0x04
 #define PGM_ADDRESSING           0x05
@@ -219,7 +228,7 @@
 	atomic_t active;
 	struct kvm_s390_float_interrupt *float_int;
 	int timer_due; /* event indicator for waitqueue below */
-	wait_queue_head_t wq;
+	wait_queue_head_t *wq;
 	atomic_t *cpuflags;
 	unsigned int action_bits;
 };
@@ -266,4 +275,5 @@
 };
 
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
+extern char sie_exit;
 #endif
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 5f0173a..1141fb3 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -14,3 +14,13 @@
 /* Per-CPU flags for PMU states */
 #define PMU_F_RESERVED			0x1000
 #define PMU_F_ENABLED			0x2000
+
+#ifdef CONFIG_64BIT
+
+/* Perf callbacks */
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
+#endif /* CONFIG_64BIT */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9aefa3c..0ea4e59 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -296,18 +296,16 @@
 #define _SEGMENT_ENTRY_EMPTY	(_SEGMENT_ENTRY_INV)
 
 /* Page status table bits for virtualization */
-#define RCP_ACC_BITS	0xf0000000UL
-#define RCP_FP_BIT	0x08000000UL
-#define RCP_PCL_BIT	0x00800000UL
-#define RCP_HR_BIT	0x00400000UL
-#define RCP_HC_BIT	0x00200000UL
-#define RCP_GR_BIT	0x00040000UL
-#define RCP_GC_BIT	0x00020000UL
-#define RCP_IN_BIT	0x00002000UL	/* IPTE notify bit */
-
-/* User dirty / referenced bit for KVM's migration feature */
-#define KVM_UR_BIT	0x00008000UL
-#define KVM_UC_BIT	0x00004000UL
+#define PGSTE_ACC_BITS	0xf0000000UL
+#define PGSTE_FP_BIT	0x08000000UL
+#define PGSTE_PCL_BIT	0x00800000UL
+#define PGSTE_HR_BIT	0x00400000UL
+#define PGSTE_HC_BIT	0x00200000UL
+#define PGSTE_GR_BIT	0x00040000UL
+#define PGSTE_GC_BIT	0x00020000UL
+#define PGSTE_UR_BIT	0x00008000UL
+#define PGSTE_UC_BIT	0x00004000UL	/* user dirty (migration) */
+#define PGSTE_IN_BIT	0x00002000UL	/* IPTE notify bit */
 
 #else /* CONFIG_64BIT */
 
@@ -364,18 +362,16 @@
 				 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
 
 /* Page status table bits for virtualization */
-#define RCP_ACC_BITS	0xf000000000000000UL
-#define RCP_FP_BIT	0x0800000000000000UL
-#define RCP_PCL_BIT	0x0080000000000000UL
-#define RCP_HR_BIT	0x0040000000000000UL
-#define RCP_HC_BIT	0x0020000000000000UL
-#define RCP_GR_BIT	0x0004000000000000UL
-#define RCP_GC_BIT	0x0002000000000000UL
-#define RCP_IN_BIT	0x0000200000000000UL	/* IPTE notify bit */
-
-/* User dirty / referenced bit for KVM's migration feature */
-#define KVM_UR_BIT	0x0000800000000000UL
-#define KVM_UC_BIT	0x0000400000000000UL
+#define PGSTE_ACC_BITS	0xf000000000000000UL
+#define PGSTE_FP_BIT	0x0800000000000000UL
+#define PGSTE_PCL_BIT	0x0080000000000000UL
+#define PGSTE_HR_BIT	0x0040000000000000UL
+#define PGSTE_HC_BIT	0x0020000000000000UL
+#define PGSTE_GR_BIT	0x0004000000000000UL
+#define PGSTE_GC_BIT	0x0002000000000000UL
+#define PGSTE_UR_BIT	0x0000800000000000UL
+#define PGSTE_UC_BIT	0x0000400000000000UL	/* user dirty (migration) */
+#define PGSTE_IN_BIT	0x0000200000000000UL	/* IPTE notify bit */
 
 #endif /* CONFIG_64BIT */
 
@@ -615,8 +611,8 @@
 	asm(
 		"	lg	%0,%2\n"
 		"0:	lgr	%1,%0\n"
-		"	nihh	%0,0xff7f\n"	/* clear RCP_PCL_BIT in old */
-		"	oihh	%1,0x0080\n"	/* set RCP_PCL_BIT in new */
+		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
+		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
 		"	csg	%0,%1,%2\n"
 		"	jl	0b\n"
 		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
@@ -629,7 +625,7 @@
 {
 #ifdef CONFIG_PGSTE
 	asm(
-		"	nihh	%1,0xff7f\n"	/* clear RCP_PCL_BIT */
+		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
 		"	stg	%1,%0\n"
 		: "=Q" (ptep[PTRS_PER_PTE])
 		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
@@ -662,14 +658,14 @@
 	else if (bits)
 		page_reset_referenced(address);
 	/* Transfer page changed & referenced bit to guest bits in pgste */
-	pgste_val(pgste) |= bits << 48;		/* RCP_GR_BIT & RCP_GC_BIT */
+	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
 	/* Get host changed & referenced bits from pgste */
-	bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52;
+	bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52;
 	/* Transfer page changed & referenced bit to kvm user bits */
-	pgste_val(pgste) |= bits << 45;		/* KVM_UR_BIT & KVM_UC_BIT */
+	pgste_val(pgste) |= bits << 45;		/* PGSTE_UR_BIT & PGSTE_UC_BIT */
 	/* Clear relevant host bits in pgste. */
-	pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT);
-	pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT);
+	pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT);
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
 	/* Copy page access key and fetch protection bit to pgste */
 	pgste_val(pgste) |=
 		(unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
@@ -690,15 +686,15 @@
 	/* Get referenced bit from storage key */
 	young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
 	if (young)
-		pgste_val(pgste) |= RCP_GR_BIT;
+		pgste_val(pgste) |= PGSTE_GR_BIT;
 	/* Get host referenced bit from pgste */
-	if (pgste_val(pgste) & RCP_HR_BIT) {
-		pgste_val(pgste) &= ~RCP_HR_BIT;
+	if (pgste_val(pgste) & PGSTE_HR_BIT) {
+		pgste_val(pgste) &= ~PGSTE_HR_BIT;
 		young = 1;
 	}
 	/* Transfer referenced bit to kvm user bits and pte */
 	if (young) {
-		pgste_val(pgste) |= KVM_UR_BIT;
+		pgste_val(pgste) |= PGSTE_UR_BIT;
 		pte_val(*ptep) |= _PAGE_SWR;
 	}
 #endif
@@ -720,7 +716,7 @@
 	 * The guest C/R information is still in the PGSTE, set real
 	 * key C/R to 0.
 	 */
-	nkey = (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56;
+	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
 	page_set_storage_key(address, nkey, 0);
 #endif
 }
@@ -750,6 +746,7 @@
 	struct mm_struct *mm;
 	unsigned long *table;
 	unsigned long asce;
+	void *private;
 	struct list_head crst_list;
 };
 
@@ -808,8 +805,8 @@
 					pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-	if (pgste_val(pgste) & RCP_IN_BIT) {
-		pgste_val(pgste) &= ~RCP_IN_BIT;
+	if (pgste_val(pgste) & PGSTE_IN_BIT) {
+		pgste_val(pgste) &= ~PGSTE_IN_BIT;
 		gmap_do_ipte_notify(mm, addr, ptep);
 	}
 #endif
@@ -977,8 +974,8 @@
 	if (mm_has_pgste(mm)) {
 		pgste = pgste_get_lock(ptep);
 		pgste = pgste_update_all(ptep, pgste);
-		dirty = !!(pgste_val(pgste) & KVM_UC_BIT);
-		pgste_val(pgste) &= ~KVM_UC_BIT;
+		dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+		pgste_val(pgste) &= ~PGSTE_UC_BIT;
 		pgste_set_unlock(ptep, pgste);
 		return dirty;
 	}
@@ -997,8 +994,8 @@
 	if (mm_has_pgste(mm)) {
 		pgste = pgste_get_lock(ptep);
 		pgste = pgste_update_young(ptep, pgste);
-		young = !!(pgste_val(pgste) & KVM_UR_BIT);
-		pgste_val(pgste) &= ~KVM_UR_BIT;
+		young = !!(pgste_val(pgste) & PGSTE_UR_BIT);
+		pgste_val(pgste) &= ~PGSTE_UR_BIT;
 		pgste_set_unlock(ptep, pgste);
 	}
 	return young;
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index d6de844..2416138 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
 #define ASM_OFFSETS_C
 
 #include <linux/kbuild.h>
+#include <linux/kvm_host.h>
 #include <linux/sched.h>
 #include <asm/cputime.h>
 #include <asm/vdso.h>
@@ -162,6 +163,8 @@
 	DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb));
 	DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb));
 	DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce));
+	DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c));
+	DEFINE(__SIE_PROG20, offsetof(struct kvm_s390_sie_block, prog20));
 #endif /* CONFIG_32BIT */
 	return 0;
 }
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index bc5864c..1c039d0 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -47,7 +47,6 @@
 		 _TIF_MCCK_PENDING)
 _TIF_TRACE    = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
 		 _TIF_SYSCALL_TRACEPOINT)
-_TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
 
 #define BASED(name) name-system_call(%r13)
 
@@ -81,23 +80,27 @@
 #endif
 	.endm
 
-	.macro	HANDLE_SIE_INTERCEPT scratch,pgmcheck
+	.macro	HANDLE_SIE_INTERCEPT scratch,reason
 #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
 	tmhh	%r8,0x0001		# interrupting from user ?
-	jnz	.+42
+	jnz	.+62
 	lgr	\scratch,%r9
-	slg	\scratch,BASED(.Lsie_loop)
-	clg	\scratch,BASED(.Lsie_length)
-	.if	\pgmcheck
+	slg	\scratch,BASED(.Lsie_critical)
+	clg	\scratch,BASED(.Lsie_critical_length)
+	.if	\reason==1
 	# Some program interrupts are suppressing (e.g. protection).
 	# We must also check the instruction after SIE in that case.
 	# do_protection_exception will rewind to rewind_pad
-	jh	.+22
+	jh	.+42
 	.else
-	jhe	.+22
+	jhe	.+42
 	.endif
-	lg	%r9,BASED(.Lsie_loop)
-	LPP	BASED(.Lhost_id)	# set host id
+	lg	%r14,__SF_EMPTY(%r15)		# get control block pointer
+	LPP	__SF_EMPTY+16(%r15)		# set host id
+	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
+	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
+	larl	%r9,sie_exit			# skip forward to sie_exit
+	mvi	__SF_EMPTY+31(%r15),\reason	# set exit reason
 #endif
 	.endm
 
@@ -450,7 +453,7 @@
 	lg	%r12,__LC_THREAD_INFO
 	larl	%r13,system_call
 	lmg	%r8,%r9,__LC_IO_OLD_PSW
-	HANDLE_SIE_INTERCEPT %r14,0
+	HANDLE_SIE_INTERCEPT %r14,2
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
 	tmhh	%r8,0x0001		# interrupting from user?
 	jz	io_skip
@@ -603,7 +606,7 @@
 	lg	%r12,__LC_THREAD_INFO
 	larl	%r13,system_call
 	lmg	%r8,%r9,__LC_EXT_OLD_PSW
-	HANDLE_SIE_INTERCEPT %r14,0
+	HANDLE_SIE_INTERCEPT %r14,3
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
 	tmhh	%r8,0x0001		# interrupting from user ?
 	jz	ext_skip
@@ -651,7 +654,7 @@
 	lg	%r12,__LC_THREAD_INFO
 	larl	%r13,system_call
 	lmg	%r8,%r9,__LC_MCK_OLD_PSW
-	HANDLE_SIE_INTERCEPT %r14,0
+	HANDLE_SIE_INTERCEPT %r14,4
 	tm	__LC_MCCK_CODE,0x80	# system damage?
 	jo	mcck_panic		# yes -> rest of mcck code invalid
 	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
@@ -945,56 +948,50 @@
 	stmg	%r6,%r14,__SF_GPRS(%r15)	# save kernel registers
 	stg	%r2,__SF_EMPTY(%r15)		# save control block pointer
 	stg	%r3,__SF_EMPTY+8(%r15)		# save guest register save area
-	xc	__SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # host id == 0
+	xc	__SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
 	lmg	%r0,%r13,0(%r3)			# load guest gprs 0-13
-# some program checks are suppressing. C code (e.g. do_protection_exception)
-# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
-# instructions in the sie_loop should not cause program interrupts. So
-# lets use a nop (47 00 00 00) as a landing pad.
-# See also HANDLE_SIE_INTERCEPT
-rewind_pad:
-	nop	0
-sie_loop:
-	lg	%r14,__LC_THREAD_INFO		# pointer thread_info struct
-	tm	__TI_flags+7(%r14),_TIF_EXIT_SIE
-	jnz	sie_exit
 	lg	%r14,__LC_GMAP			# get gmap pointer
 	ltgr	%r14,%r14
 	jz	sie_gmap
 	lctlg	%c1,%c1,__GMAP_ASCE(%r14)	# load primary asce
 sie_gmap:
 	lg	%r14,__SF_EMPTY(%r15)		# get control block pointer
+	oi	__SIE_PROG0C+3(%r14),1		# we are going into SIE now
+	tm	__SIE_PROG20+3(%r14),1		# last exit...
+	jnz	sie_done
 	LPP	__SF_EMPTY(%r15)		# set guest id
 	sie	0(%r14)
 sie_done:
 	LPP	__SF_EMPTY+16(%r15)		# set host id
-	lg	%r14,__LC_THREAD_INFO		# pointer thread_info struct
-sie_exit:
+	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
+# some program checks are suppressing. C code (e.g. do_protection_exception)
+# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
+# instructions between sie64a and sie_done should not cause program
+# interrupts. So let's use a nop (47 00 00 00) as a landing pad.
+# See also HANDLE_SIE_INTERCEPT
+rewind_pad:
+	nop	0
+	.globl sie_exit
+sie_exit:
 	lg	%r14,__SF_EMPTY+8(%r15)		# load guest register save area
 	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
 	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
-	lghi	%r2,0
+	lg	%r2,__SF_EMPTY+24(%r15)		# return exit reason code
 	br	%r14
 sie_fault:
-	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
-	lg	%r14,__LC_THREAD_INFO		# pointer thread_info struct
-	lg	%r14,__SF_EMPTY+8(%r15)		# load guest register save area
-	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
-	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
-	lghi	%r2,-EFAULT
-	br	%r14
+	lghi	%r14,-EFAULT
+	stg	%r14,__SF_EMPTY+24(%r15)	# set exit reason code
+	j	sie_exit
 
 	.align	8
-.Lsie_loop:
-	.quad	sie_loop
-.Lsie_length:
-	.quad	sie_done - sie_loop
-.Lhost_id:
-	.quad	0
+.Lsie_critical:
+	.quad	sie_gmap
+.Lsie_critical_length:
+	.quad	sie_done - sie_gmap
 
 	EX_TABLE(rewind_pad,sie_fault)
-	EX_TABLE(sie_loop,sie_fault)
+	EX_TABLE(sie_exit,sie_fault)
 #endif
 
 		.section .rodata, "a"
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index f58f37f..a6fc037 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/perf_event.h>
+#include <linux/kvm_host.h>
 #include <linux/percpu.h>
 #include <linux/export.h>
 #include <asm/irq.h>
@@ -39,6 +40,57 @@
 }
 EXPORT_SYMBOL(perf_num_counters);
 
+static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
+{
+	struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
+
+	if (!stack)
+		return NULL;
+
+	return (struct kvm_s390_sie_block *) stack->empty1[0];
+}
+
+static bool is_in_guest(struct pt_regs *regs)
+{
+	unsigned long ip = instruction_pointer(regs);
+
+	if (user_mode(regs))
+		return false;
+
+	return ip == (unsigned long) &sie_exit;
+}
+
+static unsigned long guest_is_user_mode(struct pt_regs *regs)
+{
+	return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE;
+}
+
+static unsigned long instruction_pointer_guest(struct pt_regs *regs)
+{
+	return sie_block(regs)->gpsw.addr & PSW_ADDR_INSN;
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	return is_in_guest(regs) ? instruction_pointer_guest(regs)
+				 : instruction_pointer(regs);
+}
+
+static unsigned long perf_misc_guest_flags(struct pt_regs *regs)
+{
+	return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER
+					: PERF_RECORD_MISC_GUEST_KERNEL;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	if (is_in_guest(regs))
+		return perf_misc_guest_flags(regs);
+
+	return user_mode(regs) ? PERF_RECORD_MISC_USER
+			       : PERF_RECORD_MISC_KERNEL;
+}
+
 void perf_event_print_debug(void)
 {
 	struct cpumf_ctr_info cf_info;
diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c
index 9bdbcef..3bac589 100644
--- a/arch/s390/kernel/s390_ksyms.c
+++ b/arch/s390/kernel/s390_ksyms.c
@@ -7,6 +7,7 @@
 #endif
 #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
 EXPORT_SYMBOL(sie64a);
+EXPORT_SYMBOL(sie_exit);
 #endif
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 8fe9d65..40b4c64 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -6,7 +6,8 @@
 # it under the terms of the GNU General Public License (version 2 only)
 # as published by the Free Software Foundation.
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o)
+KVM := ../../../virt/kvm
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1c01a99..3074475 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -132,6 +132,9 @@
 {
 	int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	trace_kvm_s390_handle_diag(vcpu, code);
 	switch (code) {
 	case 0x10:
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index b7d1b2e..5ee56e5 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -22,87 +22,6 @@
 #include "trace.h"
 #include "trace-s390.h"
 
-static int handle_lctlg(struct kvm_vcpu *vcpu)
-{
-	int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
-	int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
-	u64 useraddr;
-	int reg, rc;
-
-	vcpu->stat.instruction_lctlg++;
-
-	useraddr = kvm_s390_get_base_disp_rsy(vcpu);
-
-	if (useraddr & 7)
-		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-
-	reg = reg1;
-
-	VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
-		   useraddr);
-	trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
-
-	do {
-		rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
-			       (u64 __user *) useraddr);
-		if (rc)
-			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		useraddr += 8;
-		if (reg == reg3)
-			break;
-		reg = (reg + 1) % 16;
-	} while (1);
-	return 0;
-}
-
-static int handle_lctl(struct kvm_vcpu *vcpu)
-{
-	int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
-	int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
-	u64 useraddr;
-	u32 val = 0;
-	int reg, rc;
-
-	vcpu->stat.instruction_lctl++;
-
-	useraddr = kvm_s390_get_base_disp_rs(vcpu);
-
-	if (useraddr & 3)
-		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-
-	VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
-		   useraddr);
-	trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
-
-	reg = reg1;
-	do {
-		rc = get_guest(vcpu, val, (u32 __user *) useraddr);
-		if (rc)
-			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
-		vcpu->arch.sie_block->gcr[reg] |= val;
-		useraddr += 4;
-		if (reg == reg3)
-			break;
-		reg = (reg + 1) % 16;
-	} while (1);
-	return 0;
-}
-
-static const intercept_handler_t eb_handlers[256] = {
-	[0x2f] = handle_lctlg,
-	[0x8a] = kvm_s390_handle_priv_eb,
-};
-
-static int handle_eb(struct kvm_vcpu *vcpu)
-{
-	intercept_handler_t handler;
-
-	handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
-	if (handler)
-		return handler(vcpu);
-	return -EOPNOTSUPP;
-}
 
 static const intercept_handler_t instruction_handlers[256] = {
 	[0x01] = kvm_s390_handle_01,
@@ -110,10 +29,10 @@
 	[0x83] = kvm_s390_handle_diag,
 	[0xae] = kvm_s390_handle_sigp,
 	[0xb2] = kvm_s390_handle_b2,
-	[0xb7] = handle_lctl,
+	[0xb7] = kvm_s390_handle_lctl,
 	[0xb9] = kvm_s390_handle_b9,
 	[0xe5] = kvm_s390_handle_e5,
-	[0xeb] = handle_eb,
+	[0xeb] = kvm_s390_handle_eb,
 };
 
 static int handle_noop(struct kvm_vcpu *vcpu)
@@ -174,47 +93,12 @@
 
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
-	unsigned long vmaddr;
 	int viwhy = vcpu->arch.sie_block->ipb >> 16;
-	int rc;
 
 	vcpu->stat.exit_validity++;
 	trace_kvm_s390_intercept_validity(vcpu, viwhy);
-	if (viwhy == 0x37) {
-		vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
-				    vcpu->arch.gmap);
-		if (IS_ERR_VALUE(vmaddr)) {
-			rc = -EOPNOTSUPP;
-			goto out;
-		}
-		rc = fault_in_pages_writeable((char __user *) vmaddr,
-			 PAGE_SIZE);
-		if (rc) {
-			/* user will receive sigsegv, exit to user */
-			rc = -EOPNOTSUPP;
-			goto out;
-		}
-		vmaddr = gmap_fault(vcpu->arch.sie_block->prefix + PAGE_SIZE,
-				    vcpu->arch.gmap);
-		if (IS_ERR_VALUE(vmaddr)) {
-			rc = -EOPNOTSUPP;
-			goto out;
-		}
-		rc = fault_in_pages_writeable((char __user *) vmaddr,
-			 PAGE_SIZE);
-		if (rc) {
-			/* user will receive sigsegv, exit to user */
-			rc = -EOPNOTSUPP;
-			goto out;
-		}
-	} else
-		rc = -EOPNOTSUPP;
-
-out:
-	if (rc)
-		VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
-			   viwhy);
-	return rc;
+	WARN_ONCE(true, "kvm: unhandled validity intercept 0x%x\n", viwhy);
+	return -EOPNOTSUPP;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5c94817..7f35cb3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -438,7 +438,7 @@
 no_timer:
 	spin_lock(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
-	add_wait_queue(&vcpu->arch.local_int.wq, &wait);
+	add_wait_queue(&vcpu->wq, &wait);
 	while (list_empty(&vcpu->arch.local_int.list) &&
 		list_empty(&vcpu->arch.local_int.float_int->list) &&
 		(!vcpu->arch.local_int.timer_due) &&
@@ -452,7 +452,7 @@
 	}
 	__unset_cpu_idle(vcpu);
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&vcpu->arch.local_int.wq, &wait);
+	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
 	spin_unlock(&vcpu->arch.local_int.float_int->lock);
 	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
@@ -465,8 +465,8 @@
 
 	spin_lock(&vcpu->arch.local_int.lock);
 	vcpu->arch.local_int.timer_due = 1;
-	if (waitqueue_active(&vcpu->arch.local_int.wq))
-		wake_up_interruptible(&vcpu->arch.local_int.wq);
+	if (waitqueue_active(&vcpu->wq))
+		wake_up_interruptible(&vcpu->wq);
 	spin_unlock(&vcpu->arch.local_int.lock);
 }
 
@@ -613,7 +613,7 @@
 	spin_lock_bh(&li->lock);
 	list_add(&inti->list, &li->list);
 	atomic_set(&li->active, 1);
-	BUG_ON(waitqueue_active(&li->wq));
+	BUG_ON(waitqueue_active(li->wq));
 	spin_unlock_bh(&li->lock);
 	return 0;
 }
@@ -746,8 +746,8 @@
 	li = fi->local_int[sigcpu];
 	spin_lock_bh(&li->lock);
 	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&li->wq);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
 	spin_unlock_bh(&li->lock);
 	spin_unlock(&fi->lock);
 	mutex_unlock(&kvm->lock);
@@ -832,8 +832,8 @@
 	if (inti->type == KVM_S390_SIGP_STOP)
 		li->action_bits |= ACTION_STOP_ON_STOP;
 	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&vcpu->arch.local_int.wq);
+	if (waitqueue_active(&vcpu->wq))
+		wake_up_interruptible(&vcpu->wq);
 	spin_unlock_bh(&li->lock);
 	mutex_unlock(&vcpu->kvm->lock);
 	return 0;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c1c7c68..ba694d2 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -59,6 +59,7 @@
 	{ "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
 	{ "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
 	{ "exit_wait_state", VCPU_STAT(exit_wait_state) },
+	{ "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
 	{ "instruction_stidp", VCPU_STAT(instruction_stidp) },
 	{ "instruction_spx", VCPU_STAT(instruction_spx) },
 	{ "instruction_stpx", VCPU_STAT(instruction_stpx) },
@@ -84,6 +85,7 @@
 };
 
 static unsigned long long *facilities;
+static struct gmap_notifier gmap_notifier;
 
 /* Section: not file related */
 int kvm_arch_hardware_enable(void *garbage)
@@ -96,13 +98,18 @@
 {
 }
 
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+
 int kvm_arch_hardware_setup(void)
 {
+	gmap_notifier.notifier_call = kvm_gmap_notifier;
+	gmap_register_ipte_notifier(&gmap_notifier);
 	return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)
 {
+	gmap_unregister_ipte_notifier(&gmap_notifier);
 }
 
 void kvm_arch_check_processor_compat(void *rtn)
@@ -239,6 +246,7 @@
 		kvm->arch.gmap = gmap_alloc(current->mm);
 		if (!kvm->arch.gmap)
 			goto out_nogmap;
+		kvm->arch.gmap->private = kvm;
 	}
 
 	kvm->arch.css_support = 0;
@@ -270,7 +278,7 @@
 
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	kvm_vcpu_uninit(vcpu);
-	kfree(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
 static void kvm_free_vcpus(struct kvm *kvm)
@@ -309,6 +317,7 @@
 		vcpu->arch.gmap = gmap_alloc(current->mm);
 		if (!vcpu->arch.gmap)
 			return -ENOMEM;
+		vcpu->arch.gmap->private = vcpu->kvm;
 		return 0;
 	}
 
@@ -373,8 +382,10 @@
 {
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
 						    CPUSTAT_SM |
-						    CPUSTAT_STOPPED);
+						    CPUSTAT_STOPPED |
+						    CPUSTAT_GED);
 	vcpu->arch.sie_block->ecb   = 6;
+	vcpu->arch.sie_block->ecb2  = 8;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
 	vcpu->arch.sie_block->fac   = (int) (long) facilities;
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -397,7 +408,7 @@
 
 	rc = -ENOMEM;
 
-	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	if (!vcpu)
 		goto out;
 
@@ -427,7 +438,7 @@
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
 	spin_lock(&kvm->arch.float_int.lock);
 	kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
-	init_waitqueue_head(&vcpu->arch.local_int.wq);
+	vcpu->arch.local_int.wq = &vcpu->wq;
 	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
 	spin_unlock(&kvm->arch.float_int.lock);
 
@@ -442,7 +453,7 @@
 out_free_sie_block:
 	free_page((unsigned long)(vcpu->arch.sie_block));
 out_free_cpu:
-	kfree(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
 out:
 	return ERR_PTR(rc);
 }
@@ -454,6 +465,50 @@
 	return 0;
 }
 
+void s390_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
+}
+
+void s390_vcpu_unblock(struct kvm_vcpu *vcpu)
+{
+	atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
+}
+
+/*
+ * Kick a guest cpu out of SIE and wait until SIE is not running.
+ * If the CPU is not running (e.g. waiting as idle) the function will
+ * return immediately.
+ */
+void exit_sie(struct kvm_vcpu *vcpu)
+{
+	atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
+	while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
+		cpu_relax();
+}
+
+/* Kick a guest cpu out of SIE and prevent SIE-reentry */
+void exit_sie_sync(struct kvm_vcpu *vcpu)
+{
+	s390_vcpu_block(vcpu);
+	exit_sie(vcpu);
+}
+
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+{
+	int i;
+	struct kvm *kvm = gmap->private;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/* match against both prefix pages */
+		if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) {
+			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+			kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+			exit_sie_sync(vcpu);
+		}
+	}
+}
+
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
 	/* kvm common code refers to this, but never calls it */
@@ -606,6 +661,27 @@
 	return -EINVAL; /* not implemented yet */
 }
 
+static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
+	 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+	 * This ensures that the ipte instruction for this request has
+	 * already finished. We might race against a second unmapper that
+	 * wants to set the blocking bit. Let's just retry the request loop.
+	 */
+	while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
+		int rc;
+		rc = gmap_ipte_notify(vcpu->arch.gmap,
+				      vcpu->arch.sie_block->prefix,
+				      PAGE_SIZE * 2);
+		if (rc)
+			return rc;
+		s390_vcpu_unblock(vcpu);
+	}
+	return 0;
+}
+
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
 	int rc;
@@ -621,6 +697,10 @@
 	if (!kvm_is_ucontrol(vcpu->kvm))
 		kvm_s390_deliver_pending_interrupts(vcpu);
 
+	rc = kvm_s390_handle_requests(vcpu);
+	if (rc)
+		return rc;
+
 	vcpu->arch.sie_block->icptcode = 0;
 	preempt_disable();
 	kvm_guest_enter();
@@ -630,7 +710,9 @@
 	trace_kvm_s390_sie_enter(vcpu,
 				 atomic_read(&vcpu->arch.sie_block->cpuflags));
 	rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
-	if (rc) {
+	if (rc > 0)
+		rc = 0;
+	if (rc < 0) {
 		if (kvm_is_ucontrol(vcpu->kvm)) {
 			rc = SIE_INTERCEPT_UCONTROL;
 		} else {
@@ -1046,7 +1128,7 @@
 		return -ENOMEM;
 	}
 	memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
-	facilities[0] &= 0xff00fff3f47c0000ULL;
+	facilities[0] &= 0xff82fff3f47c0000ULL;
 	facilities[1] &= 0x001c000000000000ULL;
 	return 0;
 }
@@ -1059,3 +1141,12 @@
 
 module_init(kvm_s390_init);
 module_exit(kvm_s390_exit);
+
+/*
+ * Enable autoloading of the kvm module.
+ * Note that we add the module alias here instead of virt/kvm/kvm_main.c
+ * since x86 takes a different approach.
+ */
+#include <linux/miscdevice.h>
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index efc14f6..028ca9f 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -63,6 +63,7 @@
 {
 	vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
 	vcpu->arch.sie_block->ihcpu  = 0xffff;
+	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 }
 
 static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
@@ -85,6 +86,12 @@
 	*address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }
 
+static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2)
+{
+	*r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
+	*r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
+}
+
 static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
 {
 	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
@@ -125,7 +132,8 @@
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
-int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -133,6 +141,10 @@
 /* implemented in kvm-s390.c */
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
 				 unsigned long addr);
+void s390_vcpu_block(struct kvm_vcpu *vcpu);
+void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
+void exit_sie(struct kvm_vcpu *vcpu);
+void exit_sie_sync(struct kvm_vcpu *vcpu);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 6bbd7b5..0da3e6e 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1,7 +1,7 @@
 /*
  * handling privileged instructions
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008, 2013
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -20,6 +20,9 @@
 #include <asm/debug.h>
 #include <asm/ebcdic.h>
 #include <asm/sysinfo.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/compat.h>
 #include "gaccess.h"
@@ -34,6 +37,9 @@
 
 	vcpu->stat.instruction_spx++;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	operand2 = kvm_s390_get_base_disp_s(vcpu);
 
 	/* must be word boundary */
@@ -65,6 +71,9 @@
 
 	vcpu->stat.instruction_stpx++;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	operand2 = kvm_s390_get_base_disp_s(vcpu);
 
 	/* must be word boundary */
@@ -89,6 +98,9 @@
 
 	vcpu->stat.instruction_stap++;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	useraddr = kvm_s390_get_base_disp_s(vcpu);
 
 	if (useraddr & 1)
@@ -105,7 +117,12 @@
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
 	vcpu->stat.instruction_storage_key++;
-	vcpu->arch.sie_block->gpsw.addr -= 4;
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	vcpu->arch.sie_block->gpsw.addr =
+		__rewind_psw(vcpu->arch.sie_block->gpsw, 4);
 	VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
 	return 0;
 }
@@ -129,9 +146,10 @@
 		 * Store the two-word I/O interruption code into the
 		 * provided area.
 		 */
-		put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr);
-		put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2));
-		put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4));
+		if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr)
+		    || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2))
+		    || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4)))
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 	} else {
 		/*
 		 * Store the three-word I/O interruption code into
@@ -182,6 +200,9 @@
 {
 	VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	if (vcpu->kvm->arch.css_support) {
 		/*
 		 * Most I/O instructions will be handled by userspace.
@@ -210,8 +231,12 @@
 	int rc;
 
 	vcpu->stat.instruction_stfl++;
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	/* only pass the facility bits, which we can handle */
-	facility_list = S390_lowcore.stfl_fac_list & 0xff00fff3;
+	facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
 
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
 			   &facility_list, sizeof(facility_list));
@@ -255,8 +280,8 @@
 	u64 addr;
 
 	if (gpsw->mask & PSW_MASK_PSTATE)
-		return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	addr = kvm_s390_get_base_disp_s(vcpu);
 	if (addr & 7)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -278,6 +303,9 @@
 	psw_t new_psw;
 	u64 addr;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	addr = kvm_s390_get_base_disp_s(vcpu);
 	if (addr & 7)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -296,6 +324,9 @@
 
 	vcpu->stat.instruction_stidp++;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	operand2 = kvm_s390_get_base_disp_s(vcpu);
 
 	if (operand2 & 7)
@@ -351,16 +382,30 @@
 	vcpu->stat.instruction_stsi++;
 	VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	if (fc > 3) {
+		vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;	  /* cc 3 */
+		return 0;
+	}
+
+	if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
+	    || vcpu->run->s.regs.gprs[1] & 0xffff0000)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	if (fc == 0) {
+		vcpu->run->s.regs.gprs[0] = 3 << 28;
+		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);  /* cc 0 */
+		return 0;
+	}
+
 	operand2 = kvm_s390_get_base_disp_s(vcpu);
 
-	if (operand2 & 0xfff && fc > 0)
+	if (operand2 & 0xfff)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 	switch (fc) {
-	case 0:
-		vcpu->run->s.regs.gprs[0] = 3 << 28;
-		vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-		return 0;
 	case 1: /* same handling for 1 and 2 */
 	case 2:
 		mem = get_zeroed_page(GFP_KERNEL);
@@ -377,8 +422,6 @@
 			goto out_no_data;
 		handle_stsi_3_2_2(vcpu, (void *) mem);
 		break;
-	default:
-		goto out_no_data;
 	}
 
 	if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
@@ -432,20 +475,14 @@
 	intercept_handler_t handler;
 
 	/*
-	 * a lot of B2 instructions are priviledged. We first check for
-	 * the privileged ones, that we can handle in the kernel. If the
-	 * kernel can handle this instruction, we check for the problem
-	 * state bit and (a) handle the instruction or (b) send a code 2
-	 * program check.
-	 * Anything else goes to userspace.*/
+	 * A lot of B2 instructions are privileged. Here we check for
+	 * the privileged ones, that we can handle in the kernel.
+	 * Anything else goes to userspace.
+	 */
 	handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
-	if (handler) {
-		if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
-			return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
-		else
-			return handler(vcpu);
-	}
+	if (handler)
+		return handler(vcpu);
+
 	return -EOPNOTSUPP;
 }
 
@@ -453,8 +490,7 @@
 {
 	int reg1, reg2;
 
-	reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24;
-	reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
 
 	/* This basically extracts the mask half of the psw. */
 	vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
@@ -467,9 +503,88 @@
 	return 0;
 }
 
+#define PFMF_RESERVED   0xfffc0101UL
+#define PFMF_SK         0x00020000UL
+#define PFMF_CF         0x00010000UL
+#define PFMF_UI         0x00008000UL
+#define PFMF_FSC        0x00007000UL
+#define PFMF_NQ         0x00000800UL
+#define PFMF_MR         0x00000400UL
+#define PFMF_MC         0x00000200UL
+#define PFMF_KEY        0x000000feUL
+
+static int handle_pfmf(struct kvm_vcpu *vcpu)
+{
+	int reg1, reg2;
+	unsigned long start, end;
+
+	vcpu->stat.instruction_pfmf++;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+	if (!MACHINE_HAS_PFMF)
+		return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	/* Only provide non-quiescing support if the host supports it */
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ &&
+	    S390_lowcore.stfl_fac_list & 0x00020000)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	/* No support for conditional-SSKE */
+	if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+	case 0x00000000:
+		end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
+		break;
+	case 0x00001000:
+		end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+		break;
+	/* We don't support EDAT2
+	case 0x00002000:
+		end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
+		break;*/
+	default:
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	}
+	while (start < end) {
+		unsigned long useraddr;
+
+		useraddr = gmap_translate(start, vcpu->arch.gmap);
+		if (IS_ERR((void *)useraddr))
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+		if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+			if (clear_user((void __user *)useraddr, PAGE_SIZE))
+				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		}
+
+		if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
+			if (set_guest_storage_key(current->mm, useraddr,
+					vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
+					vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		}
+
+		start += PAGE_SIZE;
+	}
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
+		vcpu->run->s.regs.gprs[reg2] = end;
+	return 0;
+}
+
 static const intercept_handler_t b9_handlers[256] = {
 	[0x8d] = handle_epsw,
 	[0x9c] = handle_io_inst,
+	[0xaf] = handle_pfmf,
 };
 
 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
@@ -478,29 +593,96 @@
 
 	/* This is handled just as for the B2 instructions. */
 	handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
-	if (handler) {
-		if ((handler != handle_epsw) &&
-		    (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE))
-			return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
-		else
-			return handler(vcpu);
-	}
+	if (handler)
+		return handler(vcpu);
+
 	return -EOPNOTSUPP;
 }
 
+int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
+{
+	int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+	int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+	u64 useraddr;
+	u32 val = 0;
+	int reg, rc;
+
+	vcpu->stat.instruction_lctl++;
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	useraddr = kvm_s390_get_base_disp_rs(vcpu);
+
+	if (useraddr & 3)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
+		   useraddr);
+	trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
+
+	reg = reg1;
+	do {
+		rc = get_guest(vcpu, val, (u32 __user *) useraddr);
+		if (rc)
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
+		vcpu->arch.sie_block->gcr[reg] |= val;
+		useraddr += 4;
+		if (reg == reg3)
+			break;
+		reg = (reg + 1) % 16;
+	} while (1);
+
+	return 0;
+}
+
+static int handle_lctlg(struct kvm_vcpu *vcpu)
+{
+	int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
+	int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
+	u64 useraddr;
+	int reg, rc;
+
+	vcpu->stat.instruction_lctlg++;
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	useraddr = kvm_s390_get_base_disp_rsy(vcpu);
+
+	if (useraddr & 7)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	reg = reg1;
+
+	VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
+		   useraddr);
+	trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
+
+	do {
+		rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
+			       (u64 __user *) useraddr);
+		if (rc)
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		useraddr += 8;
+		if (reg == reg3)
+			break;
+		reg = (reg + 1) % 16;
+	} while (1);
+
+	return 0;
+}
+
 static const intercept_handler_t eb_handlers[256] = {
+	[0x2f] = handle_lctlg,
 	[0x8a] = handle_io_inst,
 };
 
-int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu)
+int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
 {
 	intercept_handler_t handler;
 
-	/* All eb instructions that end up here are privileged. */
-	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
-		return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
 	handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
 	if (handler)
 		return handler(vcpu);
@@ -515,6 +697,9 @@
 
 	vcpu->stat.instruction_tprot++;
 
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
 	kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
 
 	/* we only handle the Linux memory detection case:
@@ -560,8 +745,7 @@
 	u32 value;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
-		return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
 	if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000)
 		return kvm_s390_inject_program_int(vcpu,
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 1c48ab2..bec398c 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -79,8 +79,8 @@
 	list_add_tail(&inti->list, &li->list);
 	atomic_set(&li->active, 1);
 	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&li->wq);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
 	spin_unlock_bh(&li->lock);
 	rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
@@ -117,8 +117,8 @@
 	list_add_tail(&inti->list, &li->list);
 	atomic_set(&li->active, 1);
 	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&li->wq);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
 	spin_unlock_bh(&li->lock);
 	rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 	VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
@@ -145,8 +145,8 @@
 	atomic_set(&li->active, 1);
 	atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
 	li->action_bits |= action;
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&li->wq);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
 out:
 	spin_unlock_bh(&li->lock);
 
@@ -250,8 +250,8 @@
 
 	list_add_tail(&inti->list, &li->list);
 	atomic_set(&li->active, 1);
-	if (waitqueue_active(&li->wq))
-		wake_up_interruptible(&li->wq);
+	if (waitqueue_active(li->wq))
+		wake_up_interruptible(li->wq);
 	rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 
 	VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
@@ -333,8 +333,7 @@
 
 	/* sigp in userspace can exit */
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
-		return kvm_s390_inject_program_int(vcpu,
-						   PGM_PRIVILEGED_OPERATION);
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
 	order_code = kvm_s390_get_base_disp_rs(vcpu);
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 74c29d9..17bf4d3 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -689,7 +689,7 @@
 		entry = *ptep;
 		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
 			pgste = pgste_get_lock(ptep);
-			pgste_val(pgste) |= RCP_IN_BIT;
+			pgste_val(pgste) |= PGSTE_IN_BIT;
 			pgste_set_unlock(ptep, pgste);
 			start += PAGE_SIZE;
 			len -= PAGE_SIZE;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af9c552..f87f7fc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,14 +222,22 @@
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
+
+	/* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen.  */
+	unsigned long mmu_valid_gen;
+
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
+	/*
+	 * Used outside of the mmu-lock to avoid reading spte values while an
+	 * update is in progress; see the comments in __get_spte_lockless().
+	 */
 	int clear_spte_count;
 #endif
 
+	/* Number of writes since the last time a traversal visited this page.  */
 	int write_flooding_count;
-	bool mmio_cached;
 };
 
 struct kvm_pio_request {
@@ -529,11 +537,14 @@
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	struct list_head zapped_obsolete_pages;
+
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
@@ -769,7 +780,7 @@
 				     struct kvm_memory_slot *slot,
 				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d..bf4fb04 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@
 CFLAGS_svm.o := -I.
 CFLAGS_vmx.o := -I.
 
-kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-				coalesced_mmio.o irq_comm.o eventfd.o \
-				irqchip.o)
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(addprefix ../../../virt/kvm/, \
-				assigned-dev.o iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
+KVM := ../../../virt/kvm
+
+kvm-y			+= $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+				$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+				$(KVM)/eventfd.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(KVM)/assigned-dev.o $(KVM)/iommu.o
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dce..2bc1e81 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@
 #define OpMem8            26ull  /* 8-bit zero extended memory operand */
 #define OpImm64           27ull  /* Sign extended 16/32/64-bit immediate */
 #define OpXLat            28ull  /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo           29ull  /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi           30ull  /* High part of extended acc (-/DX/EDX/RDX) */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
 #define DstMem64    (OpMem64 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX       (OpDX << DstShift)
+#define DstAccLo    (OpAccLo << DstShift)
 #define DstMask     (OpMask << DstShift)
 /* Source operand type. */
 #define SrcShift    6
@@ -108,6 +111,7 @@
 #define SrcImm64    (OpImm64 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
 #define SrcMem8     (OpMem8 << SrcShift)
+#define SrcAccHi    (OpAccHi << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
@@ -138,6 +142,7 @@
 /* Source 2 operand type */
 #define Src2Shift   (31)
 #define Src2None    (OpNone << Src2Shift)
+#define Src2Mem     (OpMem << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
 #define Src2One     (OpOne << Src2Shift)
@@ -155,6 +160,9 @@
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
+#define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
+
+#define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -171,10 +179,11 @@
 /*
  * fastop functions have a special calling convention:
  *
- * dst:    [rdx]:rax  (in/out)
- * src:    rbx        (in/out)
+ * dst:    rax        (in/out)
+ * src:    rdx        (in/out)
  * src2:   rcx        (in)
  * flags:  rflags     (in/out)
+ * ex:     rsi        (in:fastop pointer, out:zero if exception)
  *
  * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
  * different operand sizes can be reached by calculation, rather than a jump
@@ -276,174 +285,17 @@
 }
 
 /*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"		/* force 32-bit operand */
-#define _STK  "%%rsp"		/* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""		/* force 32-bit operand */
-#define _STK  "%%esp"		/* stack pointer */
-#endif
-
-/*
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
 #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
 
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp)					\
-	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
-	"movl %"_sav",%"_LO32 _tmp"; "                                  \
-	"push %"_tmp"; "                                                \
-	"push %"_tmp"; "                                                \
-	"movl %"_msk",%"_LO32 _tmp"; "                                  \
-	"andl %"_LO32 _tmp",("_STK"); "                                 \
-	"pushf; "                                                       \
-	"notl %"_LO32 _tmp"; "                                          \
-	"andl %"_LO32 _tmp",("_STK"); "                                 \
-	"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "	\
-	"pop  %"_tmp"; "                                                \
-	"orl  %"_LO32 _tmp",("_STK"); "                                 \
-	"popf; "                                                        \
-	"pop  %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-	/* _sav |= EFLAGS & _msk; */		\
-	"pushf; "				\
-	"pop  %"_tmp"; "			\
-	"andl %"_msk",%"_LO32 _tmp"; "		\
-	"orl  %"_LO32 _tmp",%"_sav"; "
-
 #ifdef CONFIG_X86_64
 #define ON64(x) x
 #else
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype)	\
-	do {								\
-		__asm__ __volatile__ (					\
-			_PRE_EFLAGS("0", "4", "2")			\
-			_op _suffix " %"_x"3,%1; "			\
-			_POST_EFLAGS("0", "4", "2")			\
-			: "=m" ((ctxt)->eflags),			\
-			  "+q" (*(_dsttype*)&(ctxt)->dst.val),		\
-			  "=&r" (_tmp)					\
-			: _y ((ctxt)->src.val), "i" (EFLAGS_MASK));	\
-	} while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy)		\
-	do {								\
-		unsigned long _tmp;					\
-									\
-		switch ((ctxt)->dst.bytes) {				\
-		case 2:							\
-			____emulate_2op(ctxt,_op,_wx,_wy,"w",u16);	\
-			break;						\
-		case 4:							\
-			____emulate_2op(ctxt,_op,_lx,_ly,"l",u32);	\
-			break;						\
-		case 8:							\
-			ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
-			break;						\
-		}							\
-	} while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)		     \
-	do {								     \
-		unsigned long _tmp;					     \
-		switch ((ctxt)->dst.bytes) {				     \
-		case 1:							     \
-			____emulate_2op(ctxt,_op,_bx,_by,"b",u8);	     \
-			break;						     \
-		default:						     \
-			__emulate_2op_nobyte(ctxt, _op,			     \
-					     _wx, _wy, _lx, _ly, _qx, _qy);  \
-			break;						     \
-		}							     \
-	} while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op)					\
-	__emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op)					\
-	__emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op)				\
-	__emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type)		\
-	do {								\
-		unsigned long _tmp;					\
-		_type _clv  = (ctxt)->src2.val;				\
-		_type _srcv = (ctxt)->src.val;				\
-		_type _dstv = (ctxt)->dst.val;				\
-									\
-		__asm__ __volatile__ (					\
-			_PRE_EFLAGS("0", "5", "2")			\
-			_op _suffix " %4,%1 \n"				\
-			_POST_EFLAGS("0", "5", "2")			\
-			: "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
-			: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)	\
-			);						\
-									\
-		(ctxt)->src2.val  = (unsigned long) _clv;		\
-		(ctxt)->src2.val = (unsigned long) _srcv;		\
-		(ctxt)->dst.val = (unsigned long) _dstv;		\
-	} while (0)
-
-#define emulate_2op_cl(ctxt, _op)					\
-	do {								\
-		switch ((ctxt)->dst.bytes) {				\
-		case 2:							\
-			__emulate_2op_cl(ctxt, _op, "w", u16);		\
-			break;						\
-		case 4:							\
-			__emulate_2op_cl(ctxt, _op, "l", u32);		\
-			break;						\
-		case 8:							\
-			ON64(__emulate_2op_cl(ctxt, _op, "q", ulong));	\
-			break;						\
-		}							\
-	} while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix)				\
-	do {								\
-		unsigned long _tmp;					\
-									\
-		__asm__ __volatile__ (					\
-			_PRE_EFLAGS("0", "3", "2")			\
-			_op _suffix " %1; "				\
-			_POST_EFLAGS("0", "3", "2")			\
-			: "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
-			  "=&r" (_tmp)					\
-			: "i" (EFLAGS_MASK));				\
-	} while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op)						\
-	do {								\
-		switch ((ctxt)->dst.bytes) {				\
-		case 1:	__emulate_1op(ctxt, _op, "b"); break;		\
-		case 2:	__emulate_1op(ctxt, _op, "w"); break;		\
-		case 4:	__emulate_1op(ctxt, _op, "l"); break;		\
-		case 8:	ON64(__emulate_1op(ctxt, _op, "q")); break;	\
-		}							\
-	} while (0)
-
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 
 #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@
 #define FOPNOP() FOP_ALIGN FOP_RET
 
 #define FOP1E(op,  dst) \
-	FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+	FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op,  dst) \
+	FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
 
 #define FASTOP1(op) \
 	FOP_START(op) \
@@ -472,24 +327,42 @@
 	ON64(FOP1E(op##q, rax))	\
 	FOP_END
 
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+	FOP_START(name) \
+	FOP1E(op, cl) \
+	FOP1E(op, cx) \
+	FOP1E(op, ecx) \
+	ON64(FOP1E(op, rcx)) \
+	FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+	FOP_START(name) \
+	FOP1EEX(op, cl) \
+	FOP1EEX(op, cx) \
+	FOP1EEX(op, ecx) \
+	ON64(FOP1EEX(op, rcx)) \
+	FOP_END
+
 #define FOP2E(op,  dst, src)	   \
 	FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
 
 #define FASTOP2(op) \
 	FOP_START(op) \
-	FOP2E(op##b, al, bl) \
-	FOP2E(op##w, ax, bx) \
-	FOP2E(op##l, eax, ebx) \
-	ON64(FOP2E(op##q, rax, rbx)) \
+	FOP2E(op##b, al, dl) \
+	FOP2E(op##w, ax, dx) \
+	FOP2E(op##l, eax, edx) \
+	ON64(FOP2E(op##q, rax, rdx)) \
 	FOP_END
 
 /* 2 operand, word only */
 #define FASTOP2W(op) \
 	FOP_START(op) \
 	FOPNOP() \
-	FOP2E(op##w, ax, bx) \
-	FOP2E(op##l, eax, ebx) \
-	ON64(FOP2E(op##q, rax, rbx)) \
+	FOP2E(op##w, ax, dx) \
+	FOP2E(op##l, eax, edx) \
+	ON64(FOP2E(op##q, rax, rdx)) \
 	FOP_END
 
 /* 2 operand, src is CL */
@@ -508,14 +381,17 @@
 #define FASTOP3WCL(op) \
 	FOP_START(op) \
 	FOPNOP() \
-	FOP3E(op##w, ax, bx, cl) \
-	FOP3E(op##l, eax, ebx, cl) \
-	ON64(FOP3E(op##q, rax, rbx, cl)) \
+	FOP3E(op##w, ax, dx, cl) \
+	FOP3E(op##l, eax, edx, cl) \
+	ON64(FOP3E(op##q, rax, rdx, cl)) \
 	FOP_END
 
 /* Special case for SETcc - 1 instruction per cc */
 #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
 
+asm(".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret");
+
 FOP_START(setcc)
 FOP_SETCC(seto)
 FOP_SETCC(setno)
@@ -538,47 +414,6 @@
 FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
 FOP_END;
 
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)			\
-	do {								\
-		unsigned long _tmp;					\
-		ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX);		\
-		ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX);		\
-									\
-		__asm__ __volatile__ (					\
-			_PRE_EFLAGS("0", "5", "1")			\
-			"1: \n\t"					\
-			_op _suffix " %6; "				\
-			"2: \n\t"					\
-			_POST_EFLAGS("0", "5", "1")			\
-			".pushsection .fixup,\"ax\" \n\t"		\
-			"3: movb $1, %4 \n\t"				\
-			"jmp 2b \n\t"					\
-			".popsection \n\t"				\
-			_ASM_EXTABLE(1b, 3b)				\
-			: "=m" ((ctxt)->eflags), "=&r" (_tmp),		\
-			  "+a" (*rax), "+d" (*rdx), "+qm"(_ex)		\
-			: "i" (EFLAGS_MASK), "m" ((ctxt)->src.val));	\
-	} while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex)	\
-	do {								\
-		switch((ctxt)->src.bytes) {				\
-		case 1:							\
-			__emulate_1op_rax_rdx(ctxt, _op, "b", _ex);	\
-			break;						\
-		case 2:							\
-			__emulate_1op_rax_rdx(ctxt, _op, "w", _ex);	\
-			break;						\
-		case 4:							\
-			__emulate_1op_rax_rdx(ctxt, _op, "l", _ex);	\
-			break;						\
-		case 8: ON64(						\
-			__emulate_1op_rax_rdx(ctxt, _op, "q", _ex));	\
-			break;						\
-		}							\
-	} while (0)
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@
 FASTOP2(cmp);
 FASTOP2(test);
 
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
 FASTOP3WCL(shld);
 FASTOP3WCL(shrd);
 
@@ -1013,6 +853,8 @@
 FASTOP2W(btr);
 FASTOP2W(btc);
 
+FASTOP2(xadd);
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
 	u8 rc;
@@ -1726,45 +1568,42 @@
 	}
 }
 
-static int writeback(struct x86_emulate_ctxt *ctxt)
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
 {
 	int rc;
 
-	if (ctxt->d & NoWrite)
-		return X86EMUL_CONTINUE;
-
-	switch (ctxt->dst.type) {
+	switch (op->type) {
 	case OP_REG:
-		write_register_operand(&ctxt->dst);
+		write_register_operand(op);
 		break;
 	case OP_MEM:
 		if (ctxt->lock_prefix)
 			rc = segmented_cmpxchg(ctxt,
-					       ctxt->dst.addr.mem,
-					       &ctxt->dst.orig_val,
-					       &ctxt->dst.val,
-					       ctxt->dst.bytes);
+					       op->addr.mem,
+					       &op->orig_val,
+					       &op->val,
+					       op->bytes);
 		else
 			rc = segmented_write(ctxt,
-					     ctxt->dst.addr.mem,
-					     &ctxt->dst.val,
-					     ctxt->dst.bytes);
+					     op->addr.mem,
+					     &op->val,
+					     op->bytes);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_MEM_STR:
 		rc = segmented_write(ctxt,
-				ctxt->dst.addr.mem,
-				ctxt->dst.data,
-				ctxt->dst.bytes * ctxt->dst.count);
+				op->addr.mem,
+				op->data,
+				op->bytes * op->count);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_XMM:
-		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+		write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
 		break;
 	case OP_MM:
-		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+		write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
 		break;
 	case OP_NONE:
 		/* no writeback */
@@ -2117,42 +1956,6 @@
 	return X86EMUL_CONTINUE;
 }
 
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 ex = 0;
-
-	emulate_1op_rax_rdx(ctxt, "mul", ex);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 ex = 0;
-
-	emulate_1op_rax_rdx(ctxt, "imul", ex);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 de = 0;
-
-	emulate_1op_rax_rdx(ctxt, "div", de);
-	if (de)
-		return emulate_de(ctxt);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 de = 0;
-
-	emulate_1op_rax_rdx(ctxt, "idiv", de);
-	if (de)
-		return emulate_de(ctxt);
-	return X86EMUL_CONTINUE;
-}
-
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
 	int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@
 	F(DstMem | SrcImm | NoWrite, em_test),
 	F(DstMem | SrcNone | Lock, em_not),
 	F(DstMem | SrcNone | Lock, em_neg),
-	I(SrcMem, em_mul_ex),
-	I(SrcMem, em_imul_ex),
-	I(SrcMem, em_div_ex),
-	I(SrcMem, em_idiv_ex),
+	F(DstXacc | Src2Mem, em_mul_ex),
+	F(DstXacc | Src2Mem, em_imul_ex),
+	F(DstXacc | Src2Mem, em_div_ex),
+	F(DstXacc | Src2Mem, em_idiv_ex),
 };
 
 static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@
 	F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
 	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xC7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
 	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
 	/* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@
 		fetch_register_operand(op);
 		op->orig_val = op->val;
 		break;
+	case OpAccLo:
+		op->type = OP_REG;
+		op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		fetch_register_operand(op);
+		op->orig_val = op->val;
+		break;
+	case OpAccHi:
+		if (ctxt->d & ByteOp) {
+			op->type = OP_NONE;
+			break;
+		}
+		op->type = OP_REG;
+		op->bytes = ctxt->op_bytes;
+		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+		fetch_register_operand(op);
+		op->orig_val = op->val;
+		break;
 	case OpDI:
 		op->type = OP_MEM;
 		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+	if (!(ctxt->d & ByteOp))
+		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
-	    : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
-	: "c"(ctxt->src2.val), [fastop]"S"(fop));
+	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+	      [fastop]"+S"(fop)
+	    : "c"(ctxt->src2.val));
 	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+	if (!fop) /* exception is returned in fop variable */
+		return emulate_de(ctxt);
 	return X86EMUL_CONTINUE;
 }
 
@@ -4773,9 +4598,17 @@
 		goto done;
 
 writeback:
-	rc = writeback(ctxt);
-	if (rc != X86EMUL_CONTINUE)
-		goto done;
+	if (!(ctxt->d & NoWrite)) {
+		rc = writeback(ctxt, &ctxt->dst);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
+	if (ctxt->d & SrcWrite) {
+		BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+		rc = writeback(ctxt, &ctxt->src);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
 
 	/*
 	 * restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@
 		ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
 							(s16) ctxt->src.val;
 		break;
-	case 0xc0 ... 0xc1:	/* xadd */
-		fastop(ctxt, em_add);
-		/* Write back the register source. */
-		ctxt->src.val = ctxt->dst.orig_val;
-		write_register_operand(&ctxt->src);
-		break;
 	case 0xc3:		/* movnti */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8..afc1124 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@
 		return;
 
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
-		if (kvm_apic_local_deliver(apic, APIC_LVTT))
-			atomic_dec(&apic->lapic_timer.pending);
+		kvm_apic_local_deliver(apic, APIC_LVTT);
+		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87..0d094da 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits 3 ~ 11 are used as the low 9 bits of the generation number,
+ * and bits 52 ~ 61 are used as the high 10 bits of the generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT		3
+#define MMIO_SPTE_GEN_HIGH_SHIFT	52
+
+#define MMIO_GEN_SHIFT			19
+#define MMIO_GEN_LOW_SHIFT		9
+#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
-	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+	u64 mask;
+
+	WARN_ON(gen > MMIO_MAX_GEN);
+
+	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+	return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+	unsigned int gen;
+
+	spte &= ~shadow_mmio_mask;
+
+	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+	return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+	/*
+	 * Init the kvm generation close to MMIO_MAX_GEN to easily test the
+	 * code that handles generation-number wrap-around.
+	 */
+	return (kvm_memslots(kvm)->generation +
+		      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+			   unsigned access)
+{
+	unsigned int gen = kvm_current_mmio_generation(kvm);
+	u64 mask = generation_mmio_spte_mask(gen);
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
-	sp->mmio_cached = true;
-	trace_mark_mmio_spte(sptep, gfn, access);
-	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+	trace_mark_mmio_spte(sptep, gfn, access, gen);
+	mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+	return (spte & ~mask) >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
-	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+	return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+			  pfn_t pfn, unsigned access)
 {
 	if (unlikely(is_noslot_pfn(pfn))) {
-		mark_mmio_spte(sptep, gfn, access);
+		mark_mmio_spte(kvm, sptep, gfn, access);
 		return true;
 	}
 
 	return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+	unsigned int kvm_gen, spte_gen;
+
+	kvm_gen = kvm_current_mmio_generation(kvm);
+	spte_gen = get_mmio_spte_generation(spte);
+
+	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+	return likely(kvm_gen == spte_gen);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
 	return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@
 /*
  * The idea of using a lightweight way to get the spte on x86_32 guests is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
  */
 static u64 __get_spte_lockless(u64 *sptep)
 {
@@ -1511,6 +1584,12 @@
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	/*
+	 * active_mmu_pages is a FIFO list; do not move the
+	 * page until it is zapped. kvm_zap_obsolete_pages depends on
+	 * this ordering. See the comments in kvm_zap_obsolete_pages().
+	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
+/*
+ * NOTE: pay extra attention to zapped-obsolete pages
+ * (is_obsolete_sp(sp) && sp->role.invalid) when walking the hash list,
+ * since such a page has been deleted from active_mmu_pages but can still
+ * be found on the hash list.
+ *
+ * for_each_gfn_indirect_valid_sp skips that kind of page, and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips
+ * all obsolete pages.
+ */
 #define for_each_gfn_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@
 	__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1864,6 +1958,9 @@
 		role.quadrant = quadrant;
 	}
 	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+		if (is_obsolete_sp(vcpu->kvm, sp))
+			continue;
+
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -1900,6 +1997,7 @@
 
 		account_shadowed(vcpu->kvm, gfn);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	init_shadow_page_table(sp);
 	trace_kvm_mmu_get_page(sp, true);
 	return sp;
@@ -2070,8 +2168,10 @@
 	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
+
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
+
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
@@ -2081,7 +2181,13 @@
 		kvm_mod_used_mmu_pages(kvm, -1);
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
-		kvm_reload_remote_mmus(kvm);
+
+		/*
+		 * Obsolete pages cannot be used on any vcpus.
+		 * See the comments in kvm_mmu_invalidate_zap_all_pages().
+		 */
+		if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+			kvm_reload_remote_mmus(kvm);
 	}
 
 	sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@
 	u64 spte;
 	int ret = 0;
 
-	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+	if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
 		return 0;
 
 	spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	spin_lock(&vcpu->kvm->mmu_lock);
+
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
 	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
 	     vcpu->arch.mmu.direct_map)) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = page_header(root);
 		--sp->root_count;
 		if (!sp->root_count && sp->role.invalid) {
 			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		}
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		return;
 	}
+
+	spin_lock(&vcpu->kvm->mmu_lock);
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3148,17 +3257,12 @@
 	return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
 	u64 spte;
 
 	if (quickly_check_mmio_pf(vcpu, addr, direct))
-		return 1;
+		return RET_MMIO_PF_EMULATE;
 
 	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3166,12 +3270,15 @@
 		gfn_t gfn = get_mmio_spte_gfn(spte);
 		unsigned access = get_mmio_spte_access(spte);
 
+		if (!check_mmio_spte(vcpu->kvm, spte))
+			return RET_MMIO_PF_INVALID;
+
 		if (direct)
 			addr = 0;
 
 		trace_handle_mmio_page_fault(addr, gfn, access);
 		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-		return 1;
+		return RET_MMIO_PF_EMULATE;
 	}
 
 	/*
@@ -3179,13 +3286,13 @@
 	 * it's a BUG if the gfn is not a mmio page.
 	 */
 	if (direct && !check_direct_spte_mmio_pf(spte))
-		return -1;
+		return RET_MMIO_PF_BUG;
 
 	/*
 	 * If the page table is zapped by other cpus, let CPU fault again on
 	 * the address.
 	 */
-	return 0;
+	return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
@@ -3195,7 +3302,7 @@
 	int ret;
 
 	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-	WARN_ON(ret < 0);
+	WARN_ON(ret == RET_MMIO_PF_BUG);
 	return ret;
 }
 
@@ -3207,8 +3314,12 @@
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3284,8 +3395,12 @@
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3391,8 +3506,8 @@
 	*access &= mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-			   int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+			   unsigned access, int *nr_present)
 {
 	if (unlikely(is_mmio_spte(*sptep))) {
 		if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@
 		}
 
 		(*nr_present)++;
-		mark_mmio_spte(sptep, gfn, access);
+		mark_mmio_spte(kvm, sptep, gfn, access);
 		return true;
 	}
 
@@ -3764,9 +3879,7 @@
 	if (r)
 		goto out;
 	r = mmu_alloc_roots(vcpu);
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_roots(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_sync_roots(vcpu);
 	if (r)
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@
 	spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES	10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
+	int batch = 0;
 
-	spin_lock(&kvm->mmu_lock);
 restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
+	list_for_each_entry_safe_reverse(sp, node,
+	      &kvm->arch.active_mmu_pages, link) {
+		int ret;
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	spin_unlock(&kvm->mmu_lock);
-}
+		/*
+		 * No obsolete page exists before a newly created page, since
+		 * active_mmu_pages is a FIFO list.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
+			break;
 
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
-{
-	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
-
-	spin_lock(&kvm->mmu_lock);
-restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-		if (!sp->mmio_cached)
+		/*
+		 * Since we are walking the list in reverse and invalid
+		 * pages are moved to the head, skipping invalid pages
+		 * keeps us from walking the list forever.
+		 */
+		if (sp->role.invalid)
 			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+
+		/*
+		 * No need to flush the TLB since we only zap sps with an
+		 * invalid generation number.
+		 */
+		if (batch >= BATCH_ZAP_PAGES &&
+		      cond_resched_lock(&kvm->mmu_lock)) {
+			batch = 0;
+			goto restart;
+		}
+
+		ret = kvm_mmu_prepare_zap_page(kvm, sp,
+				&kvm->arch.zapped_obsolete_pages);
+		batch += ret;
+
+		if (ret)
 			goto restart;
 	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	/*
+	 * The TLB should be flushed before the page tables are freed, since
+	 * lockless walkers may still be using the pages.
+	 */
+	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+}
+
+/*
+ * Fast-invalidate all shadow pages, using a lock-break technique
+ * to zap obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases, we must ensure that the KVM MMU does
+ * not use any resource of the slot being deleted (or of any slot)
+ * after the function has been called.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
+{
+	spin_lock(&kvm->mmu_lock);
+	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+	kvm->arch.mmu_valid_gen++;
+
+	/*
+	 * Notify all vcpus to reload their shadow page tables
+	 * and flush the TLB. All vcpus will then switch to new
+	 * shadow page tables with the new mmu_valid_gen.
+	 *
+	 * Note: we should do this under the protection of the
+	 * mmu-lock; otherwise, a vcpu could purge its shadow pages
+	 * but miss the TLB flush.
+	 */
+	kvm_reload_remote_mmus(kvm);
+
+	kvm_zap_obsolete_pages(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+	/*
+	 * The very rare case: the generation number has wrapped
+	 * around, so zap all shadow pages.
+	 *
+	 * The maximum value is MMIO_MAX_GEN - 1, since this is not
+	 * called when a memslot is marked invalid.
+	 */
+	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+		printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+		kvm_mmu_invalidate_zap_all_pages(kvm);
+	}
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
@@ -4240,15 +4421,23 @@
 		 * want to shrink a VM that only started to populate its MMU
 		 * anyway.
 		 */
-		if (!kvm->arch.n_used_mmu_pages)
+		if (!kvm->arch.n_used_mmu_pages &&
+		      !kvm_has_zapped_obsolete_pages(kvm))
 			continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
+		if (kvm_has_zapped_obsolete_pages(kvm)) {
+			kvm_mmu_commit_zap_page(kvm,
+			      &kvm->arch.zapped_obsolete_pages);
+			goto unlock;
+		}
+
 		prepare_zap_oldest_mmu_page(kvm, &invalid_list);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..5b59c57 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,23 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+
+/*
+ * Return values of handle_mmio_page_fault_common:
+ * RET_MMIO_PF_EMULATE: it is a real MMIO page fault, emulate the instruction
+ *			directly.
+ * RET_MMIO_PF_INVALID: an invalid spte was detected; let the real page
+ *			fault path update the MMIO spte.
+ * RET_MMIO_PF_RETRY: let the CPU fault again on the address.
+ * RET_MMIO_PF_BUG: a bug was detected.
+ */
+enum {
+	RET_MMIO_PF_EMULATE = 1,
+	RET_MMIO_PF_INVALID = 2,
+	RET_MMIO_PF_RETRY = 0,
+	RET_MMIO_PF_BUG = -1
+};
+
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
@@ -97,4 +114,5 @@
 	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 #endif
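
Callers are expected to act on these codes roughly as sketched below; the real
dispatch sites are the page-fault handlers in mmu.c/paging_tmpl.h and the
EPT-misconfig handler in the vmx.c hunk further down.  The helper names here
(emulate_mmio, page_fault_path, report_bug, dispatch) are placeholders, not
KVM functions.

#include <stdio.h>

enum {
	RET_MMIO_PF_EMULATE = 1,
	RET_MMIO_PF_INVALID = 2,
	RET_MMIO_PF_RETRY = 0,
	RET_MMIO_PF_BUG = -1
};

/* Placeholder actions standing in for the real emulation/fault paths. */
static int emulate_mmio(void)    { puts("emulate the MMIO access"); return 1; }
static int page_fault_path(void) { puts("refresh the stale MMIO spte"); return 1; }
static int report_bug(void)      { puts("report a bug"); return -1; }

static int dispatch(int ret)
{
	switch (ret) {
	case RET_MMIO_PF_EMULATE:
		return emulate_mmio();		/* a real MMIO access */
	case RET_MMIO_PF_INVALID:
		return page_fault_path();	/* spte generation is stale */
	case RET_MMIO_PF_RETRY:
		return 1;			/* let the CPU fault again */
	default:
		return report_bug();		/* RET_MMIO_PF_BUG */
	}
}

int main(void)
{
	dispatch(RET_MMIO_PF_INVALID);
	return 0;
}
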
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172..9d2e0ff 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-	__field(__u64, gfn) \
-	__field(__u32, role) \
-	__field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS			\
+	__field(unsigned long, mmu_valid_gen)	\
+	__field(__u64, gfn)			\
+	__field(__u32, role)			\
+	__field(__u32, root_count)		\
 	__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp)			     \
-	__entry->gfn = sp->gfn;			     \
-	__entry->role = sp->role.word;		     \
-	__entry->root_count = sp->root_count;        \
+#define KVM_MMU_PAGE_ASSIGN(sp)				\
+	__entry->mmu_valid_gen = sp->mmu_valid_gen;	\
+	__entry->gfn = sp->gfn;				\
+	__entry->role = sp->role.word;			\
+	__entry->root_count = sp->root_count;		\
 	__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({				        \
@@ -28,8 +30,8 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"		\
-			 " %snxe root %u %s%c",				\
+	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"	\
+			 " %snxe root %u %s%c",	__entry->mmu_valid_gen,	\
 			 __entry->gfn, role.level,			\
 			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
@@ -197,23 +199,25 @@
 
 TRACE_EVENT(
 	mark_mmio_spte,
-	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
-	TP_ARGS(sptep, gfn, access),
+	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+	TP_ARGS(sptep, gfn, access, gen),
 
 	TP_STRUCT__entry(
 		__field(void *, sptep)
 		__field(gfn_t, gfn)
 		__field(unsigned, access)
+		__field(unsigned int, gen)
 	),
 
 	TP_fast_assign(
 		__entry->sptep = sptep;
 		__entry->gfn = gfn;
 		__entry->access = access;
+		__entry->gen = gen;
 	),
 
-	TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
-		  __entry->access)
+	TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+		  __entry->gfn, __entry->access, __entry->gen)
 );
 
 TRACE_EVENT(
@@ -274,6 +278,50 @@
 		  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
 	)
 );
+
+TRACE_EVENT(
+	kvm_mmu_invalidate_zap_all_pages,
+	TP_PROTO(struct kvm *kvm),
+	TP_ARGS(kvm),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, mmu_valid_gen)
+		__field(unsigned int, mmu_used_pages)
+	),
+
+	TP_fast_assign(
+		__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+		__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+	),
+
+	TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+		  __entry->mmu_valid_gen, __entry->mmu_used_pages
+	)
+);
+
+
+TRACE_EVENT(
+	check_mmio_spte,
+	TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+	TP_ARGS(spte, kvm_gen, spte_gen),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, kvm_gen)
+		__field(unsigned int, spte_gen)
+		__field(u64, spte)
+	),
+
+	TP_fast_assign(
+		__entry->kvm_gen = kvm_gen;
+		__entry->spte_gen = spte_gen;
+		__entry->spte = spte;
+	),
+
+	TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+		  __entry->kvm_gen, __entry->spte_gen,
+		  __entry->kvm_gen == __entry->spte_gen
+	)
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860..7769699 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -552,9 +552,12 @@
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, addr, error_code,
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, addr, error_code,
 					      mmu_is_nested(vcpu));
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -792,7 +795,8 @@
 		pte_access &= gpte_access(vcpu, gpte);
 		protect_clean_gpte(&pte_access, gpte);
 
-		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+		      &nr_present))
 			continue;
 
 		if (gfn != sp->gfns[i]) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6ea..c0bc803 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1026,7 +1026,10 @@
 		g_tsc_offset = svm->vmcb->control.tsc_offset -
 			       svm->nested.hsave->control.tsc_offset;
 		svm->nested.hsave->control.tsc_offset = offset;
-	}
+	} else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+					   svm->vmcb->control.tsc_offset,
+					   offset);
 
 	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 
@@ -1044,6 +1047,11 @@
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
+	else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+				     svm->vmcb->control.tsc_offset - adjustment,
+				     svm->vmcb->control.tsc_offset);
+
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fe5e00e..545245d 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -756,6 +756,27 @@
 		  __entry->gpa_match ? "GPA" : "GVA")
 );
 
+TRACE_EVENT(kvm_write_tsc_offset,
+	TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+		 __u64 next_tsc_offset),
+	TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+	TP_STRUCT__entry(
+		__field( unsigned int,	vcpu_id				)
+		__field(	__u64,	previous_tsc_offset		)
+		__field(	__u64,	next_tsc_offset			)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+		__entry->previous_tsc_offset	= previous_tsc_offset;
+		__entry->next_tsc_offset	= next_tsc_offset;
+	),
+
+	TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+		  __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
 #ifdef CONFIG_X86_64
 
 #define host_clocks					\
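
On SVM, when a nested guest is running, the value finally programmed into the
VMCB preserves the existing L1->L2 delta on top of the new L1 offset, as in
the svm.c hunk above (svm_write_tsc_offset()).  A small arithmetic sketch of
that combination; all numbers are made up for illustration and this is not
KVM code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t vmcb_offset  = 5000;	/* offset currently seen by L2 */
	int64_t hsave_offset = 2000;	/* L1's saved offset */
	int64_t new_offset   = 3000;	/* offset being written for L1 */

	/* L1->L2 delta, preserved across the update. */
	int64_t g_tsc_offset = vmcb_offset - hsave_offset;

	printf("delta=%lld programmed=%lld\n",
	       (long long)g_tsc_offset,
	       (long long)(new_offset + g_tsc_offset));
	return 0;
}
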
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b30f5a5..a7e1855 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2096,6 +2096,8 @@
 			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
 			 vmcs12->tsc_offset : 0));
 	} else {
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+					   vmcs_read64(TSC_OFFSET), offset);
 		vmcs_write64(TSC_OFFSET, offset);
 	}
 }
@@ -2103,11 +2105,14 @@
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
+
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
 	if (is_guest_mode(vcpu)) {
 		/* Even when running L2, the adjustment needs to apply to L1 */
 		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
-	}
+	} else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
+					   offset + adjustment);
 }
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -4176,10 +4181,10 @@
 	/*
 	 * EPT Misconfigurations can be generated if the value of bits 2:0
 	 * of an EPT paging-structure entry is 110b (write/execute).
-	 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+	 * Also, magic bits (0x3ull << 62) are set to quickly identify mmio
 	 * spte.
 	 */
-	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
 /*
@@ -5366,10 +5371,14 @@
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
 	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
-	if (likely(ret == 1))
+	if (likely(ret == RET_MMIO_PF_EMULATE))
 		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
 					      EMULATE_DONE;
-	if (unlikely(!ret))
+
+	if (unlikely(ret == RET_MMIO_PF_INVALID))
+		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
+
+	if (unlikely(ret == RET_MMIO_PF_RETRY))
 		return 1;
 
 	/* It is the real ept misconfig */
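
The mask installed by the vmx.c hunk above marks MMIO sptes with the two top
bits plus a write/execute-only low pattern (110b), which is what generates the
EPT misconfiguration; moving the magic bits up from bits 49+ presumably frees
the middle bits for the MMIO generation number this series stores in the spte.
A quick, purely illustrative check of the constants:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* The constant from the vmx.c hunk above. */
	uint64_t mmio_mask = (0x3ull << 62) | 0x6ull;

	printf("mmio spte mask     = %#018llx\n",
	       (unsigned long long)mmio_mask);
	printf("bits 2:0           = %llx  (110b: write/execute, no read)\n",
	       (unsigned long long)(mmio_mask & 0x7));
	printf("top bits 63:62 set = %llx\n",
	       (unsigned long long)(mmio_mask >> 62));
	return 0;
}
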
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 292e6ca..d21bce5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1193,20 +1193,37 @@
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
+		int faulted = 0;
+
 		/* n.b - signed multiplication and division required */
 		usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
 		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
 		/* do_div() only does unsigned */
-		asm("idivl %2; xor %%edx, %%edx"
-		: "=A"(usdiff)
-		: "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+		asm("1: idivl %[divisor]\n"
+		    "2: xor %%edx, %%edx\n"
+		    "   movl $0, %[faulted]\n"
+		    "3:\n"
+		    ".section .fixup,\"ax\"\n"
+		    "4: movl $1, %[faulted]\n"
+		    "   jmp  3b\n"
+		    ".previous\n"
+
+		_ASM_EXTABLE(1b, 4b)
+
+		: "=A"(usdiff), [faulted] "=r" (faulted)
+		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
+
 #endif
 		do_div(elapsed, 1000);
 		usdiff -= elapsed;
 		if (usdiff < 0)
 			usdiff = -usdiff;
+
+		/* idivl overflow => difference is larger than USEC_PER_SEC */
+		if (faulted)
+			usdiff = USEC_PER_SEC;
 	} else
 		usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
@@ -1587,6 +1604,30 @@
 	return 0;
 }
 
+/*
+ * kvmclock updates that are isolated to a given vcpu, such as a
+ * vcpu->cpu migration, must not leave system_timestamp on the
+ * remaining vcpus static: otherwise NTP frequency correction
+ * would apply to one vcpu's system_timestamp but not to the
+ * others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * The worst-case delay for a remote vcpu to update its kvmclock
+ * is then bounded by the maximum nohz sleep latency.
+ */
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+	int i;
+	struct kvm *kvm = v->kvm;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+		kvm_vcpu_kick(vcpu);
+	}
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1984,7 +2025,7 @@
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -2701,7 +2742,7 @@
 		 * kvmclock on vcpu->cpu migration
 		 */
 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
@@ -5238,7 +5279,13 @@
 	 * Set the reserved bits and the present bit of an paging-structure
 	 * entry to generate page fault with PFER.RSV = 1.
 	 */
-	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	/* Mask the reserved physical address bits. */
+	mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+	/* Bit 62 is always reserved for 32bit host. */
+	mask |= 0x3ull << 62;
+
+	/* Set the present bit. */
 	mask |= 1ull;
 
 #ifdef CONFIG_X86_64
@@ -5498,13 +5545,6 @@
 	char instruction[3];
 	unsigned long rip = kvm_rip_read(vcpu);
 
-	/*
-	 * Blow out the MMU to ensure that no other VCPU has an active mapping
-	 * to ensure that the updated hypercall appears atomically across all
-	 * VCPUs.
-	 */
-	kvm_mmu_zap_all(vcpu->kvm);
-
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
 	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
@@ -5702,6 +5742,8 @@
 			__kvm_migrate_timers(vcpu);
 		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
 			kvm_gen_update_masterclock(vcpu->kvm);
+		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+			kvm_gen_kvmclock_update(vcpu);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6812,6 +6854,7 @@
 		return -EINVAL;
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -7040,22 +7083,18 @@
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
-		kvm_mmu_zap_mmio_sptes(kvm);
-		kvm_reload_remote_mmus(kvm);
-	}
+	kvm_mmu_invalidate_mmio_sptes(kvm);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_mmu_zap_all(kvm);
-	kvm_reload_remote_mmus(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -7263,3 +7302,4 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
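
The 32-bit fixup in the TSC write path (first x86.c hunk above) exists because
idivl divides a 64-bit dividend by a 32-bit divisor and raises #DE when the
quotient does not fit in 32 bits; such a quotient is necessarily far larger
than USEC_PER_SEC, so clamping usdiff to USEC_PER_SEC is enough to disable the
TSC-match window.  A portable sketch of the same clamp; scaled_usdiff and the
inputs are made up for illustration and this is not KVM code.

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define USEC_PER_SEC 1000000LL

/*
 * Portable model of the 32-bit path: if the scaled difference does not
 * fit in 32 bits (the case where idivl would fault), report "at least
 * one second", which is all the caller compares against.
 */
static int64_t scaled_usdiff(int64_t tsc_diff, uint32_t virtual_tsc_khz)
{
	int64_t usdiff = (tsc_diff * 1000) / virtual_tsc_khz;

	if (usdiff > INT32_MAX || usdiff < INT32_MIN)
		usdiff = USEC_PER_SEC;
	return usdiff;
}

int main(void)
{
	/* 2.5 GHz guest TSC, i.e. virtual_tsc_khz = 2500000. */
	printf("small diff: %lld us\n",
	       (long long)scaled_usdiff(123456, 2500000));
	printf("huge diff:  %lld us\n",
	       (long long)scaled_usdiff(1LL << 50, 2500000));
	return 0;
}
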
diff --git a/arch/arm/include/asm/kvm_arch_timer.h b/include/kvm/arm_arch_timer.h
similarity index 91%
rename from arch/arm/include/asm/kvm_arch_timer.h
rename to include/kvm/arm_arch_timer.h
index 68cb9e1..6d9aedd 100644
--- a/arch/arm/include/asm/kvm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -61,6 +61,8 @@
 #ifdef CONFIG_KVM_ARM_TIMER
 int kvm_timer_hyp_init(void);
 int kvm_timer_init(struct kvm *kvm);
+void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+			  const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -76,6 +78,8 @@
 	return 0;
 }
 
+static inline void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+					const struct kvm_irq_level *irq) {}
 static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {}
 static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {}
 static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm/include/asm/kvm_vgic.h b/include/kvm/arm_vgic.h
similarity index 100%
rename from arch/arm/include/asm/kvm_vgic.h
rename to include/kvm/arm_vgic.h
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8db53cf..a63d83e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@
 #define KVM_REQ_MCLOCK_INPROGRESS 19
 #define KVM_REQ_EPR_EXIT          20
 #define KVM_REQ_SCAN_IOAPIC       21
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -145,7 +146,8 @@
 #define NR_IOBUS_DEVS 1000
 
 struct kvm_io_bus {
-	int                   dev_count;
+	int dev_count;
+	int ioeventfd_count;
 	struct kvm_io_range range[];
 };
 
diff --git a/arch/arm/kvm/arch_timer.c b/virt/kvm/arm/arch_timer.c
similarity index 90%
rename from arch/arm/kvm/arch_timer.c
rename to virt/kvm/arm/arch_timer.c
index 49a7516..c2e1ef4 100644
--- a/arch/arm/kvm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -25,14 +25,12 @@
 #include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
 
-#include <asm/kvm_vgic.h>
-#include <asm/kvm_arch_timer.h>
+#include <kvm/arm_vgic.h>
+#include <kvm/arm_arch_timer.h>
 
 static struct timecounter *timecounter;
 static struct workqueue_struct *wqueue;
-static struct kvm_irq_level timer_irq = {
-	.level	= 1,
-};
+static unsigned int host_vtimer_irq;
 
 static cycle_t kvm_phys_timer_read(void)
 {
@@ -67,8 +65,8 @@
 
 	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
 	kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-			    vcpu->arch.timer_cpu.irq->irq,
-			    vcpu->arch.timer_cpu.irq->level);
+			    timer->irq->irq,
+			    timer->irq->level);
 }
 
 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@@ -156,6 +154,20 @@
 	timer_arm(timer, ns);
 }
 
+void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+			  const struct kvm_irq_level *irq)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	/*
+	 * The vcpu timer irq number cannot be determined in
+	 * kvm_timer_vcpu_init() because it is called long before
+	 * kvm_vcpu_set_target(). To handle this, we determine the
+	 * vcpu timer irq number when the vcpu is reset.
+	 */
+	timer->irq = irq;
+}
+
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -163,12 +175,11 @@
 	INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
 	hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	timer->timer.function = kvm_timer_expire;
-	timer->irq = &timer_irq;
 }
 
 static void kvm_timer_init_interrupt(void *info)
 {
-	enable_percpu_irq(timer_irq.irq, 0);
+	enable_percpu_irq(host_vtimer_irq, 0);
 }
 
 
@@ -182,7 +193,7 @@
 		break;
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
-		disable_percpu_irq(timer_irq.irq);
+		disable_percpu_irq(host_vtimer_irq);
 		break;
 	}
 
@@ -230,7 +241,7 @@
 		goto out;
 	}
 
-	timer_irq.irq = ppi;
+	host_vtimer_irq = ppi;
 
 	err = register_cpu_notifier(&kvm_timer_cpu_nb);
 	if (err) {
diff --git a/arch/arm/kvm/vgic.c b/virt/kvm/arm/vgic.c
similarity index 100%
rename from arch/arm/kvm/vgic.c
rename to virt/kvm/arm/vgic.c
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 64ee720..1550637 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -753,6 +753,7 @@
 	if (ret < 0)
 		goto unlock_fail;
 
+	kvm->buses[bus_idx]->ioeventfd_count++;
 	list_add_tail(&p->list, &kvm->ioeventfds);
 
 	mutex_unlock(&kvm->slots_lock);
@@ -798,6 +799,7 @@
 			continue;
 
 		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+		kvm->buses[bus_idx]->ioeventfd_count--;
 		ioeventfd_release(p);
 		ret = 0;
 		break;
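
The ioeventfd_count increments and decrements above feed the relaxed capacity
check in the kvm_main.c hunk below: ioeventfd devices no longer count against
NR_IOBUS_DEVS, since their number is already bounded by the file-descriptor
limit.  A small sketch of that check; bus_has_room and the numbers are made up
for illustration.

#include <stdio.h>
#include <stdbool.h>

#define NR_IOBUS_DEVS 1000	/* same limit as include/linux/kvm_host.h */

/*
 * Model of the test in kvm_io_bus_register_dev() below:
 * only non-ioeventfd devices count against NR_IOBUS_DEVS.
 */
static bool bus_has_room(int dev_count, int ioeventfd_count)
{
	return dev_count - ioeventfd_count <= NR_IOBUS_DEVS - 1;
}

int main(void)
{
	/* 1200 devices, 900 of them ioeventfds: registration proceeds. */
	printf("room: %d\n", bus_has_room(1200, 900));
	/* 1200 ordinary devices: the real code returns -ENOSPC. */
	printf("room: %d\n", bus_has_room(1200, 0));
	return 0;
}
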
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 302681c..1580dd4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2926,7 +2926,8 @@
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
-	if (bus->dev_count > NR_IOBUS_DEVS - 1)
+	/* exclude ioeventfds, which are already limited by the max fd count */
+	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
 
 	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
@@ -3181,6 +3182,7 @@
 
 out_undebugfs:
 	unregister_syscore_ops(&kvm_syscore_ops);
+	misc_deregister(&kvm_dev);
 out_unreg:
 	kvm_async_pf_deinit();
 out_free: