GIT 5b644f6246905396f05ebe0f570c50bd81fce850 git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git#master

commit 5b644f6246905396f05ebe0f570c50bd81fce850
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jan 16 12:49:30 2008 +0200

    KVM: Move apic timer migration away from critical section
    
    Migrating the apic timer in the critical section is not very nice, and is
    absolutely horrible with the real-time port.  Move migration to the regular
    vcpu execution path, triggered by a new bitflag.
    
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cf953c92a6a89b2256d92cb969df34e00f5d78df
Author: Glauber de Oliveira Costa <gcosta@redhat.com>
Date:   Tue Jan 15 13:10:15 2008 -0200

    KVM: Put kvm_para.h include outside __KERNEL__
    
    kvm_para.h potentially contains definitions that are to be used by userspace,
    so it should not be included inside the __KERNEL__ block. To protect its own
    data structures, kvm_para.h already includes its own __KERNEL__ block.
    
    Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
    Acked-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e523e00f4e2fe71b123bb102a8847f86b5b4bd8b
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jan 15 18:27:32 2008 +0200

    KVM: Fix unbounded preemption latency
    
    When preparing to enter the guest, if an interrupt comes in while
    preemption is disabled but interrupts are still enabled, we miss a
    preemption point.  Fix by explicitly checking whether we need to
    reschedule.
    
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9e007c2364364e4243870a61c7e8ed9f8527fa5b
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jan 13 13:23:56 2008 +0200

    KVM: Initialize the mmu caches only after verifying cpu support
    
    Otherwise we re-initialize the mmu caches, which will fail since the
    caches are already registered, which will cause us to deinitialize said caches.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b790d90bb55c3733ec9de6bf60c84ec15e10362e
Author: Izik Eidus <izike@qumranet.com>
Date:   Sat Jan 12 23:49:09 2008 +0200

    KVM: MMU: Fix dirty page setting for pages removed from rmap
    
    Right now rmap_remove won't set the page as dirty if the shadow pte
    pointing to this page had write access and then became readonly.
    This patch fixes that by setting the page as dirty for spte changes from
    write to readonly access.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ae544b92020577f556c5f7dd18e6a652f6f03a53
Author: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date:   Tue Jan 8 08:04:50 2008 +0100

    KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
    
    This patch moves kvm_fpu to asm-x86/kvm.h to allow every architecture to
    define its own representation used for KVM_GET_FPU/KVM_SET_FPU.
    
    Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4ce97c46387e0ea72dfe0b52fd63345f2805a785
Author: Dong, Eddie <eddie.dong@intel.com>
Date:   Mon Jan 7 13:20:25 2008 +0200

    KVM: MMU: Simplify hash table indexing
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 94ed639ccce45f3233d30c84a2b136736e0e216b
Author: Dong, Eddie <eddie.dong@intel.com>
Date:   Mon Jan 7 11:14:20 2008 +0200

    KVM: MMU: Update shadow ptes on partial guest pte writes
    
    A partial guest pte write will leave shadow_trap_nonpresent_pte
    in spte, which generates a vmexit at the next guest access through that pte.
    
    This patch improves this by reading the full guest pte in advance and thus
    being able to update the spte and eliminate the vmexit.
    
    This helps pae guests which use two 32-bit writes to set a single 64-bit pte.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5ed49953ef3749de1198bc07cdf11339d8f74432
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Wed Jan 2 14:49:22 2008 +0800

    KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
    
    When executing a test program called "crashme", we found the KVM guest cannot
    survive more than ten seconds before encountering a kernel panic. The basic
    concept of "crashme" is generating random assembly code and trying to execute it.
    
    After some fixes to the emulator's insn validity checks, we found it's hard to
    get the current emulator to handle the invalid instructions correctly, for the
    #UD trap for hypercall patching caused troubles. The problem is, if the opcode
    itself was OK, but combination of opcode and modrm_reg was invalid, and one
    operand of the opcode was memory (SrcMem or DstMem), the emulator will fetch
    the memory operand first rather than checking the validity, and may encounter
    an error there. For example, ".byte 0xfe, 0x34, 0xcd" has this problem.
    
    In the patch, we simply check that if the invalid opcode wasn't vmcall/vmmcall,
    then return from emulate_instruction() and inject a #UD to guest. With the
    patch, the guest had been running for more than 12 hours.
    
    Signed-off-by: Feng (Eric) Liu <eric.e.liu@intel.com>
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 749e8e0460335f6699984cd7ba041320a9ba3c56
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jan 3 18:30:30 2008 +0200

    KVM: Mark vapic page as dirty for save/restore/migrate
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e7706133a12f0172b1a5ce4467277abd598155b6
Author: Dong, Eddie <eddie.dong@intel.com>
Date:   Wed Jan 2 14:29:08 2008 +0800

    KVM: MMU: Merge shadow level check in FNAME(fetch)
    
    Remove the redundant level check when fetching
    shadow pte for present & non-present spte.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 91051a4955255317cab0d09d8849ec12db85f924
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jan 1 14:19:48 2008 +0200

    KVM: Fix unbalanced mmap_sem operations in cmpxchg8b emulation
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit dcfbe05b0a370341c2ef72ee314eacfb924b6de6
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jan 1 14:19:00 2008 +0200

    KVM: MMU: Fix recursive locking of mmap_sem()
    
    Can cause nasty deadlocks.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a089279c866ddf3a69c4fa13fb9935b143853f97
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Dec 31 17:10:22 2007 +0200

    KVM: MMU: Broaden scope of mmap_sem to include actual mapping
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 91f0ad972111addf2d694b1f2237e5eae72ca454
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Dec 31 15:27:49 2007 +0200

    KVM: MMU: Move kvm_free_some_pages() into critical section
    
    If some other cpu steals mmu pages between our check and an attempt to
    allocate, we can run out of mmu pages.  Fix by moving the check into the
    same critical section as the allocation.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 03f6646f123b7afa1dc8595a6dc96f01d71c765b
Author: Marcelo Tosatti <mtosatti@redhat.com>
Date:   Thu Dec 20 19:18:26 2007 -0500

    KVM: MMU: Switch to mmu spinlock
    
    Convert the synchronization of the shadow handling to a separate mmu_lock
    spinlock.
    
    Also guard fetch() by mmap_sem in read-mode to protect against alias
    and memslot changes.
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6e44a9063005c435bf62da3bf3c131ed4e54bced
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 30 12:29:05 2007 +0200

    KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
    
    Since gfn_to_page() is a sleeping function, and we want to make the core mmu
    spinlocked, we need to pass the page from the walker context (which can sleep)
    to the shadow context (which cannot).
    
    [marcelo: avoid recursive locking of mmap_sem]
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a23041c096c4c024351bc1f758807d66471b283f
Author: Marcelo Tosatti <mtosatti@redhat.com>
Date:   Thu Dec 20 19:18:23 2007 -0500

    KVM: Add kvm_read_guest_atomic()
    
    In preparation for a mmu spinlock, add kvm_read_guest_atomic()
    and use it in fetch() and prefetch_page().
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9bad222c14f721daa7e0bc2863cf62caaecf14e6
Author: Marcelo Tosatti <mtosatti@redhat.com>
Date:   Thu Dec 20 19:18:22 2007 -0500

    KVM: MMU: Concurrent guest walkers
    
    Do not hold kvm->lock mutex across the entire pagefault code,
    only acquire it in places where it is necessary, such as mmu
    hash list, active list, rmap and parent pte handling.
    
    Allow concurrent guest walkers by switching walk_addr() to use
    mmap_sem in read-mode.
    
    And get rid of the lockless __gfn_to_page.
    
    [avi: move kvm_mmu_pte_write() locking inside the function]
    [avi: add locking for real mode]
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1477b89f565e78edbcd5d195fb72ab4682f63b2a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Dec 26 13:57:04 2007 +0200

    KVM: Disable vapic support on Intel machines with FlexPriority
    
    FlexPriority accelerates the tpr without any patching.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1d536bc26aec043a102f0fd02899985fd5f77653
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 25 16:52:32 2007 +0200

    KVM: Accelerated apic support
    
    This adds a mechanism for exposing the virtual apic tpr to the guest, and a
    protocol for letting the guest update the tpr without causing a vmexit if
    conditions allow (e.g. there is no interrupt pending with a higher priority
    than the new tpr).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b7a5f24a96a5501f59caa165428a928ee2652af6
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Oct 22 16:50:39 2007 +0200

    KVM: local APIC TPR access reporting facility
    
    Add a facility to report on accesses to the local apic tpr even if the
    local apic is emulated in the kernel.  This is basically a hack that
    allows userspace to patch Windows which tends to bang on the tpr a lot.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7885a4fbebcbae1f8594445a46aefb9d97353594
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Dec 20 10:41:39 2007 +0200

    KVM: Ensure pages are copied on write
    
    Fix a userspace memory handling bug related to COW: previously we called
    get_user_pages without the force flag, and therefore it didn't break COW
    shared pages.  This caused host memory corruption when host userspace fork()ed.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 567f9bfa0a7461945dcd6a50d0b3efb44714400d
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Dec 19 12:02:40 2007 +0200

    KVM: Print data for unimplemented wrmsr
    
    This can help diagnosing what the guest is trying to do.  In many cases
    we can get away with partial emulation of msrs.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ed918da99a1377481cf8d40b75b2cbb35b802f3e
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Dec 18 19:47:18 2007 +0200

    KVM: MMU: Add cache miss statistic
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0350cb07b74834bce356d0a819be2b239b94d080
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Tue Dec 18 06:08:27 2007 +0800

    KVM: MMU: Coalesce remote tlb flushes
    
    Host side TLB flush can be merged together if multiple
    spte need to be write-protected.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4d95c0f3eea3b89221f3b35a7e01119973ae7190
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 20:27:27 2007 +0800

    KVM: Expose ioapic to ia64 save/restore APIs
    
    IA64 also needs to see ioapic structure in irqchip.
    
    Signed-off-by: xiantao.zhang@intel.com <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2042dde72d5f8663237efb5e8a2797ba9e1c42e4
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 14:21:40 2007 +0800

    KVM: Move kvm_vcpu_kick() to x86.c
    
    Moving kvm_vcpu_kick() to x86.c. Since it should be
    common for all archs, put its declarations in <linux/kvm_host.h>
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 50d50e54371d313a8bb1827a05ccca41ab6761aa
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 14:16:14 2007 +0800

    KVM: Move ioapic code to common directory.
    
    Move ioapic code to common, since IA64 also needs it.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d924af23f843578480c35d6e2dd7cf61f63187b2
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 13:59:56 2007 +0800

    KVM: Move irqchip declarations into new ioapic.h and lapic.h
    
    This allows reuse of ioapic in ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c366747d21ea71d89f32b530037285f7bbfabaa3
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 16 11:13:16 2007 +0200

    KVM: Move drivers/kvm/* to virt/kvm/
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 85c6290a3c5345016358e4e604fba470a62fc392
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 16 11:02:48 2007 +0200

    KVM: Move arch dependent files to new directory arch/x86/kvm/
    
    This paves the way for multiple architecture support.  Note that while
    ioapic.c could potentially be shared with ia64, it is also moved.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a3848478c4b265d6b9789a891cefc8dce4ed9c69
Author: Ryan Harper <ryanh@us.ibm.com>
Date:   Thu Dec 13 10:21:10 2007 -0600

    KVM: VMX: Add printk_ratelimit in vmx_intr_assist
    
    Add printk_ratelimit check in front of printk.  This prevents spamming
    of the message during 32-bit ubuntu 6.06server install.  Previously, it
    would hang during the partition formatting stage.
    
    Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 58bb8349b808e2ac08bf9aa0580330bcb461e696
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:23:23 2007 +0800

    KVM: Portability: Move kvm_vm_stat to x86.h
    
    This patch moves kvm_vm_stat to x86.h, and every arch
    can define its own kvm_vm_stat in $arch.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f51835e0d03b8cd97eaad80dece87e32f7896ed6
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:20:16 2007 +0800

    KVM: Portability: Move round_robin_prev_vcpu and tss_addr to kvm_arch
    
    This patch moves two fields, round_robin_prev_vcpu and tss_addr, to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 024c0d7cd4348dcb412e4b49a4dc3bcdbb46bb5d
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:17:34 2007 +0800

    KVM: Portability: move vpic and vioapic to kvm_arch
    
    This patch moves two fields, vpic and vioapic, to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6a021262861ac4957bc6f558a7221e5127cd4cf7
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:01:48 2007 +0800

    KVM: Portability: Move mmu-related fields to kvm_arch
    
    This patch moves mmu-related fields to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 06087dc10debe5a74eaf7a6a37464c504973dd45
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:54:20 2007 +0800

    KVM: Portability: Move memslot aliases to new struct kvm_arch
    
    This patch creates kvm_arch to hold arch-specific kvm fields
    and moves the fields naliases and aliases to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c63eee73881d2db2ee7d2255dc2a0350d528e594
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:49:26 2007 +0800

    KVM: Portability: Move kvm_vcpu_stat to x86.h
    
    This patch moves kvm_vcpu_stat to x86.h, so every
    arch can define its own kvm_vcpu_stat structure.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6ed5f1928c66462ba81da72defb3d705feb73a86
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:45:31 2007 +0800

    KVM: Portability: Expand the KVM_VCPU_COMM in kvm_vcpu structure.
    
    This patch removes the KVM_VCPU_COMM macro, which originally held
    the common kvm_vcpu fields.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f6f92b4c782102cb0162613902b1513f9e7adf3f
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:41:22 2007 +0800

    KVM: Portability: Move kvm_vcpu definition back to kvm.h
    
    This patch moves the kvm_vcpu definition to kvm.h, and finally
    kvm.h includes x86.h.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b560069d516402f8cfc6c7f003454956953bcfb5
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:35:10 2007 +0800

    KVM: Portability: Split mmu-related static inline functions to mmu.h
    
    Since these functions need to know the details of kvm or kvm_vcpu structure,
    it can't be put in x86.h.  Create mmu.h to hold them.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f7888ffeef8ee06ef16c665733cd5f324056f014
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Dec 13 23:50:52 2007 +0800

    KVM: Portability: Introduce kvm_vcpu_arch
    
    Move all the architecture-specific fields in kvm_vcpu into a new struct
    kvm_vcpu_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a0ef3bd9b3bfb21400b8109db5423c6610145dfe
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Dec 11 20:36:00 2007 +0800

    KVM: Portability: Move kvm{pic,ioapic} accessors to x86 specific code
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0d2480f47ae5c357049eb7a49b48fe9ee5b05953
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Wed Dec 12 10:46:12 2007 -0500

    KVM: MMU: emulated cmpxchg8b should be atomic on i386
    
    Emulate cmpxchg8b atomically on i386. This is required to prevent a guest
    pte walker from seeing a split write.
    
    [avi: make it compile]
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cd8f3145348de3087607195cd221780f5da4f030
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Tue Dec 11 15:36:57 2007 +0100

    KVM: SVM: support writing 0 to K8 performance counter control registers
    
    This lets SVM ignore writes of the value 0 to the performance counter control
    registers.  Thus enabling them will still fail in the guest, but a write of 0
    which keeps them disabled is accepted.  This is required to boot Windows
    Vista 64bit.
    
    [avi: avoid fall-thru in switch statement]
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a30c90b493eec3e118d95e8057776b1c34bd9da8
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Wed Dec 12 12:37:24 2007 +0100

    KVM: LAPIC: minor debugging compile fix
    
    This patch fixes a compile error of the LAPIC code with APIC debugging enabled.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 58a96eecc801a6570264d5c66d26531d82470c33
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Tue Dec 11 19:12:27 2007 -0500

    KVM: MMU: Fix SMP shadow instantiation race
    
    There is a race where VCPU0 is shadowing a pagetable entry while VCPU1
    is updating it, which results in a stale shadow copy.
    
    Fix that by comparing the contents of the cached guest pte with the
    current guest pte after write-protecting the guest pagetable.
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 957ff6b0d9822fd24b294b0e2a66b6a48135b269
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Thu Dec 6 21:02:25 2007 +0100

    KVM: SVM: Exit to userspace if write to cr8 and not using in-kernel apic
    
    With this patch KVM on SVM will exit to userspace if the guest writes to CR8
    and the in-kernel APIC is disabled.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0210661321f78f69f98c593625f18a9ea686fdc1
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 18:43:00 2007 +0200

    KVM: MMU: Use mmu_set_spte() for real-mode shadows
    
    In addition to removing some duplicated code, this also handles the unlikely
    case of real-mode code updating a guest page table.  This can happen when
    one vcpu (in real mode) touches a second vcpu's (in protected mode) page
    tables, or if a vcpu switches to real mode, touches page tables, and switches
    back.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit be3aea0724666dcfd0b9b4c7a571a7f1fb2e0357
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 18:39:41 2007 +0200

    KVM: MMU: Adjust mmu_set_spte() debug code for gpte removal
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8c88b9de814d698c0b4ae37fea84bd32b70f786b
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:40:31 2007 +0200

    KVM: MMU: Move set_pte() into guest paging mode independent code
    
    As set_pte() no longer references either a gpte or the guest walker, we can
    move it out of paging mode dependent code (which compiles twice and is
    generally nasty).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit dc9f6110cf35805ed4e8c0a95c1be267c907e884
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:33:46 2007 +0200

    KVM: MMU: Remove walker argument to set_pte()
    
    Unused.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 33e0e3a2d4920eb2dce19c1e1a796ff6e0762457
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:32:30 2007 +0200

    KVM: MMU: Pass pte dirty flag to set_pte() instead of calculating it on-site
    
    This allows us to remove its dependency on pt_element_t.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ea8dea713f57429db7ff2fc647f1dd3c479a1498
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:27:52 2007 +0200

    KVM: MMU: No need to pick up nx bit from guest pte
    
    We already set it according to cumulative access permissions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aab175e49c49324310d7223be9a78ab2c199e74f
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:00:02 2007 +0200

    KVM: MMU: Fix inherited permissions for emulated guest pte updates
    
    When we emulate a guest pte write, we fail to apply the correct inherited
    permissions from the parent ptes.  Now that we store inherited permissions
    in the shadow page, we can use that to update the pte permissions correctly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 36f96df050f80f2632cfb6b4732d839a7977113a
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:52:56 2007 +0200

    KVM: MMU: Move pte access calculation into a helper function
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2472c041e66c73c9ad754f6b8d19c7fba374cb27
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:37:36 2007 +0200

    KVM: MMU: Set nx bit correctly on shadow ptes
    
    While the page table walker correctly generates a guest page fault
    if a guest tries to execute a non-executable page, the shadow code does
    not mark it non-executable.  This means that if a guest accesses an nx
    page first with a read access, then subsequent code fetch accesses will
    succeed.
    
    Fix by setting the nx bit on shadow ptes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c05635b462e90c381420667cffd025bcc87a081f
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:15:46 2007 +0200

    KVM: MMU: Simplify calculation of pte access
    
    The nx bit is awkwardly placed in the 63rd bit position; furthermore it
    has a reversed meaning compared to the other bits, which means we can't use
    a bitwise and to calculate compounded access masks.
    
    So, we simplify things by creating a new 3-bit exec/write/user access word,
    and doing all calculations in that.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eab1de2b85dcbc23f9fe7332623dffbbf6ddb419
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Fri Dec 7 07:56:58 2007 -0500

    KVM: MMU: Use cmpxchg for pte updates on walk_addr()
    
    In preparation for multi-threaded guest pte walking, use cmpxchg()
    when updating guest pte's. This guarantees that the assignment of the
    dirty bit can't be lost if two CPU's are faulting the same address
    simultaneously.
    
    [avi: fix kunmap_atomic() parameters]
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e3366a8a9cf43bc6a6f1d5b473cb6e58e80b7d5e
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 21:11:31 2007 +0200

    KVM: VMX: Fix cr8 exit optimization
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 620208ae8864882e4e5dd7f1b5996820caa575a6
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 19:50:00 2007 +0200

    KVM: SVM: Trap access to the cr8 register
    
    Later we may be able to use the virtual tpr feature, but for now,
    just trap it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c1185a68804451d912719e9b03716a909aeef6bc
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 18:14:14 2007 +0200

    KVM: x86 emulator: Fix stack instructions on 64-bit mode
    
    Stack instructions are always 64-bit on 64-bit mode; many of the
    emulated stack instructions did not take that into account.  Fix by
    adding a 'Stack' bitflag and setting the operand size appropriately
    during the decode stage (except for 'push r/m', which is in a group
    with a few other instructions, so it gets its own treatment).
    
    This fixes random crashes on Vista x64.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 495fdd4724fca0ccb0915816aefe693dd02fd034
Author: Joerg Roedel <jroedel@lemmy.amd.com>
Date:   Thu Dec 6 15:46:52 2007 +0100

    KVM: SVM: Emulate read/write access to cr8
    
    This patch adds code to emulate the access to the cr8 register to the x86
    instruction emulator in kvm.  This is needed on svm, where there is no
    hardware decode for control register access.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1a306332c4e88c8eaa6bb26a23965864e6e78bbb
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 16:32:45 2007 +0200

    KVM: VMX: Avoid exit when setting cr8 if the local apic is in the kernel
    
    With apic in userspace, we must exit to userspace after a cr8 write in order
    to update the tpr.  But if the apic is in the kernel, the exit is unnecessary.
    
    Noticed by Joerg Roedel.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1b15a0f3cc6c9ca6bfb6fe468156ed3603092878
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 16:15:02 2007 +0200

    KVM: x86 emulator: fix eflags preparation for emulation
    
    We prepare eflags for the emulated instruction, then clobber it with an 'andl'.
    Fix by popping eflags as the last thing in the sequence.
    
    Patch taken from Xen (16143:959b4b92b6bf)
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a95acbf60ba74e82ffa450a3c1de314ed10ea52d
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 15:22:50 2007 +0200

    KVM: Use generalized exception queue for injecting #UD
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7826d5f9510b29eac2fda1f16a410d85cda28c57
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 14:12:03 2007 +0200

    KVM: Replace #GP injection by the generalized exception queue
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 508945960872689acee683c2baec87b00d6af788
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 14:04:58 2007 +0200

    KVM: Replace page fault injection by the generalized exception queue
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9baf60dafabd46151b1be0ca9b787300a14870cd
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 13:41:11 2007 +0200

    KVM: Generalize exception injection mechanism
    
    Instead of each subarch doing its own thing, add an API for queuing an
    injection, and manage failed exception injection centrally (i.e., if
    an inject failed due to a shadow page fault, we need to requeue it).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0132b078ce22386d7a42714e2dde2c712a5c4207
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Tue Dec 4 13:42:16 2007 -0500

    KVM: MMU: Remove unused prev_shadow_ent variable from fetch()
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 312aaca14f07c92fc3d04825968b2cd7122d0508
Author: npiggin@suse.de <npiggin@suse.de>
Date:   Wed Dec 5 18:15:52 2007 +1100

    KVM: Convert KVM from ->nopage() to ->fault()
    
    Signed-off-by: Nick Piggin <npiggin@suse.de>
    Cc: kvm-devel@lists.sourceforge.net
    Cc: avi@qumranet.com
    Cc: linux-kernel@vger.kernel.org
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c4b2fe88ab89aa9a3d6d6248751806073c61ee22
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 16:15:26 2007 -0600

    KVM: Portability: Create kvm_arch_vcpu_runnable() function
    
    This abstracts the detail of x86 hlt and INIT modes into a function.
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d32f10db9060b2f5ba643008d70ea3e2604557f0
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:25 2007 -0600

    KVM: Portability: Stop including x86-specific headers in kvm_main.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5059097a8f86bc0a4e63451e9a4bd390bd5840b3
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:24 2007 -0600

    KVM: Portability: Move IO device definitions to its own header file
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c2aa443f394d90ab3b7b1e4236a1bf0dd070d41e
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:23 2007 -0600

    KVM: Portability: Move address types to their own header file
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6d1d70bb6121e10a84c394fd29af320e8625f0d6
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:53:07 2007 +0800

    KVM: Extend ioapic code to support iosapic
    
    iosapic supports an additional mmio EOI register compared to ioapic.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 11f4471ed4f5438439e041cbf66c7ed07d62cfa2
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:49:09 2007 +0800

    KVM: Replace dest_Lowest_Prio and dest_Fixed with self-defined macros
    
    Change
      dest_Lowest_Prio -> IOAPIC_LOWEST_PRIORITY
      dest_Fixed -> IOAPIC_FIXED
    
    the original names are x86 specific, while the ioapic code will be reused
    for ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b627248e01ab212ecfdab822248b24db6e93e5ae
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:35:57 2007 +0800

    KVM: Replace kvm_lapic with kvm_vcpu in ioapic/lapic interface
    
    This patch replaces lapic structure with kvm_vcpu in ioapic.c, making ioapic
    independent of the local apic, as required by ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a5d72858e076d865223900ddce084ab04a2907f9
Author: Carlo Marcelo Arenas Belon <carenas@sajinet.com.pe>
Date:   Sat Dec 1 06:17:11 2007 -0600

    KVM: SVM: Remove KVM specific defines for MSR_EFER
    
    This patch removes the KVM specific defines for MSR_EFER that were being used
    in the svm support file and migrates all references to use instead the ones
    from the kernel headers that are used everywhere else and that have the same
    values.
    
    Signed-off-by: Carlo Marcelo Arenas Belon <carenas@sajinet.com.pe>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 30f68f12b371e4105897fe2f2acbaaab667e8293
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 2 10:50:06 2007 +0200

    KVM: Export include/linux/kvm.h only if $ARCH actually supports KVM
    
    Currently, make headers_check barfs due to <asm/kvm.h>, which <linux/kvm.h>
    includes, not existing.  Rather than add a zillion <asm/kvm.h>s, export kvm.h
    only if the arch actually supports it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 248b1cea3a9ed4e997409dbc0d0397cc601eeb28
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 29 15:35:39 2007 +0800

    KVM: Correct kvm_init() error paths not freeing bad_pge.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 872b5d79d10518f90814ea8de95e5b4a2ef678cd
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 21 04:36:41 2007 +0800

    KVM: Portability: Move KVM_INTERRUPT vcpu ioctl to x86.c
    
    Other archs don't need it.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8f13a26579abdae8fc09bfaaeb778d5c66383648
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:30:56 2007 +0200

    KVM: x86 emulator: unify four switch statements into two
    
    Unify the special instruction switch with the regular instruction switch,
    and the two byte special instruction switch with the regular two byte
    instruction switch.  That makes it much easier to find an instruction or
    the place an instruction needs to be added in.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 52fb6a8bbfd208976296ec1c42ecf3c707df6693
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:14:21 2007 +0200

    KVM: x86 emulator: unify two switches
    
    The rep prefix cleanup left two switch () statements next to each other.
    Unify them.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9c4da7ae8602d4a3c6558efcb1a5f5d0c1aedae3
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:05:37 2007 +0200

    KVM: x86 emulator: Move rep processing before instruction execution
    
    Currently rep processing is handled somewhere in the middle of instruction
    processing.  Move it to a sensible place.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 334df02ff6c75a9a740838fbacebc9c590587617
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 26 18:30:07 2007 +0200

    KVM: Fix cpuid2 killing 32-bit guests on non-NX machines
    
    KVM_SET_CPUID fails to remove NX when the host doesn't support it, as previous
    versions do.  On the other hand, KVM_SET_CPUID2 removes the feature even
    though, since we tell userspace about it, it shouldn't be necessary.
    
    Swap the two cases.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 04546994156d8cee5417bd6a9b180476a405c20d
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 26 08:33:53 2007 -0600

    KVM: Add ifdef in irqchip struct for x86 only structures
    
    This patch fixes a small issue where structures:
    	kvm_pic_state
    	kvm_ioapic_state
    
    are defined inside x86 specific code and may or may not
    be defined in any way for other architectures. The problem
    caused is one cannot compile userspace apps (ex. libkvm)
    for other archs since a size cannot be determined for these
    structures.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ce63e51b866391404de2b7361146355b731ee5fb
Author: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date:   Mon Nov 26 13:49:09 2007 +0100

    KVM: x86 emulator: cmps instruction
    
    Add emulation for the cmps instruction.  This lets OpenBSD boot on kvm.
    
    Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b961c7a777e68aa16e26b415b93fdb3ad74743a7
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Fri Nov 16 16:29:15 2007 +0800

    KVM: x86 emulator: Rename 'cr2' to 'memop'
    
    Previous patches have removed the dependency on cr2; we can now stop passing
    it to the emulator and rename uses to 'memop'.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a90bcadc507d1507a6ad29ef87a22f311e542fb2
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 26 16:10:43 2007 +0200

    KVM: x86 emulator: rename REP_REPE_PREFIX
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fb90cbc3977a94bbd2b4e496de7afd3ce6d977d7
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 26 14:08:14 2007 +0200

    KVM: MMU: mark pages that were inserted to the shadow pages table as accessed
    
    Mark guest pages as accessed when removed from the shadow page tables for
    better lru processing.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d0cf28925a13945072ec05574d7b94f9fb95984e
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 17:45:31 2007 +0200

    KVM: Remove misleading check for mmio during event injection
    
    mmio was already handled in kvm_arch_vcpu_ioctl_run(), so no need to check
    again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f77171a55097f13656aea7ea7dfbc84c2afff918
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 14:16:12 2007 +0200

    KVM: x86 emulator: address size and operand size overrides are sticky
    
    Current implementation is to toggle, which is incorrect.  Patch ported from
    corresponding Xen code.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c8ba048f1a82c79276ef9146140bd5ed0dd6cd85
Author: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date:   Thu Nov 22 11:32:09 2007 +0100

    KVM: x86 emulator: Make a distinction between repeat prefixes F3 and F2
    
    cmps and scas instructions accept repeat prefixes F3 and F2. So in
    order to emulate those prefixed instructions we need to be able to know
    if prefixes are REP/REPE/REPZ or REPNE/REPNZ. Currently kvm doesn't make
    this distinction. This patch introduces this distinction.
    
    Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 97cf6a86e25f3cdf1f61a55e659d2e3df1c5ff8e
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 22 11:20:33 2007 +0800

    KVM: Portability: Move unalias_gfn to arch dependent file
    
    Non-x86 archs don't need this mechanism. Move it to arch, and
    keep its interface in common.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f442770e25f19e17a39f715dbd4328d77b68b2f7
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Wed Nov 21 14:33:25 2007 +0800

    KVM: VMX: Remove the secondary execute control dependency on irqchip
    
    The state of SECONDARY_VM_EXEC_CONTROL shouldn't depend on in-kernel IRQ chip,
    this patch fixes this.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a140b2cc4877b204c2e1548e6b66a9903813ac14
Author: Dan Kenigsberg <danken@qumranet.com>
Date:   Wed Nov 21 17:10:04 2007 +0200

    KVM: Enhance guest cpuid management
    
    The current cpuid management suffers from several problems, which inhibit
    passing through the host feature set to the guest:
    
     - No way to tell which features the host supports
    
      While some features can be supported with no changes to kvm, others
      need explicit support.  That means kvm needs to vet the feature set
      before it is passed to the guest.
    
     - No support for indexed or stateful cpuid entries
    
      Some cpuid entries depend on ecx as well as on eax, or on internal
      state in the processor (running cpuid multiple times with the same
      input returns different output).  The current cpuid machinery only
      supports keying on eax.
    
     - No support for save/restore/migrate
    
      The internal state above needs to be exposed to userspace so it can
      be saved or migrated.
    
    This patch adds extended cpuid support by means of three new ioctls:
    
     - KVM_GET_SUPPORTED_CPUID: get all cpuid entries the host (and kvm)
       supports
    
     - KVM_SET_CPUID2: sets the vcpu's cpuid table
    
     - KVM_GET_CPUID2: gets the vcpu's cpuid table, including hidden state
    
    Signed-off-by: Dan Kenigsberg <danken@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1a4c6de93b1c39030475a61d02e4df7f74f51e37
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 16:41:05 2007 +0200

    KVM: Disallow fork() and similar games when using a VM
    
    We don't want the meaning of guest userspace changing under our feet.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 718e1c897684f1362ce7c8a9b209e7e2c78e4c4c
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:32:41 2007 +0200

    KVM: MMU: Rename 'release_page'
    
    Rename the awkwardly named variable.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fb3dc55a5bdadd47a293c46620bef3032ff48426
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:28:32 2007 +0200

    KVM: MMU: Rename variables of type 'struct kvm_mmu_page *'
    
    These are traditionally named 'page', but even more traditionally, that name
    is reserved for variables that point to a 'struct page'.  Rename them to 'sp'
    (for "shadow page").
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f02512044420fe8e91f3cb0e2e56f189e74066a9
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:01:44 2007 +0200

    KVM: Remove gpa_to_hpa()
    
    Converting last uses along the way.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8aac4447a875e1f11d3543c039f09f825a4fa359
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:57:44 2007 +0200

    KVM: MMU: Remove gva_to_hpa()
    
    No longer used.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f1ce9a1d9c1ea0f68c6b894c992d633207b664ea
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:54:16 2007 +0200

    KVM: MMU: Simplify nonpaging_map()
    
    Instead of passing an hpa, pass a regular struct page.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b57925a773d4fc2a88184f4b31eb7aae4d9b7e2c
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:44:45 2007 +0200

    KVM: MMU: Introduce gfn_to_gpa()
    
    Converting a frame number to an address is tricky since the data type changes
    size.  Introduce a function to do it.  This fixes an actual bug when
    accessing guest ptes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5c3cbf2d407a80644241e6af9fcca51b3ed0aa5c
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:20:22 2007 +0200

    KVM: MMU: Adjust page_header_update_slot() to accept a gfn instead of a gpa
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ba8720265892f853b802198c47682bce89719f8c
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:16:30 2007 +0200

    KVM: MMU: Merge set_pte() and set_pte_common()
    
    Since set_pte() is now the only caller of set_pte_common(), merge the two
    functions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 12879fd11c07ae060c5ec918eaf8cc5c5e0bf5c8
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:11:49 2007 +0200

    KVM: MMU: Remove set_pde()
    
    It is now identical to set_pte().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 98d462d22b18d9433ee40bb0e7b9bfc2a4839d55
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:08:40 2007 +0200

    KVM: MMU: Remove extra gaddr parameter from set_pte_common()
    
    Similar information is available in the gfn parameter, so use that.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 194d7f3ad1e26815bfc1b934a73a4b74f9d5d018
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 13:54:47 2007 +0200

    KVM: MMU: Move pse36 handling to the guest walker
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 904b2b78058edc3abee7f0ea543ed107a580d6e7
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 12:35:07 2007 +0200

    KVM: MMU: Introduce and use gpte_to_gfn()
    
    Instead of repetitively open-coding this.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b6b934f10416dc5e2ee59442b564264492885b25
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Nov 20 12:02:12 2007 +0200

    KVM: MMU: Code cleanup
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 481af46d479b09a0a56606af7efc6a0a9368a43d
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 02:57:59 2007 +0200

    KVM: Don't bother the mmu if cr3 load doesn't change cr3
    
    If the guest requests just a tlb flush, don't take the vm lock and
    drop the mmu context pointlessly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9dd1731684fdfe4b07374b28c383e97c21ccd97a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 02:06:21 2007 +0200

    KVM: MMU: Avoid unnecessary remote tlb flushes when guest updates a pte
    
    If all we're doing is increasing permissions on a pte (typical for demand
    paging), then there's no need to flush remote tlbs.  Worst case they'll
    get a spurious page fault.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1c6d0c2de6c8f5b57971d554215a5991bb948884
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 23:01:14 2007 +0200

    KVM: Add statistic for remote tlb flushes
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 772086964f8aa002f57f61044a336aac667386d3
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 21:39:54 2007 +0200

    KVM: MMU: Implement guest page fault bypass for nonpae
    
    I spent an hour worrying why I see so many guest page faults on FC6 i386.
    Turns out bypass wasn't implemented for nonpae.  Implement it so it doesn't
    happen again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 91eb2376ace13756a5eb9068093bdaed1320e949
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 15:30:24 2007 +0200

    KVM: Split vcpu creation to avoid vcpu_load() before preemption setup
    
    Split kvm_arch_vcpu_create() into kvm_arch_vcpu_create() and
    kvm_arch_vcpu_setup(), enabling preemption notification between the two.
    This means that we can now do vcpu_load() within kvm_arch_vcpu_setup().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 821afccb303e4a5761b9b07a1e599dd99feecfe1
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Nov 20 16:25:04 2007 +0800

    KVM: Portability:  Split kvm_set_memory_region() to have an arch callout
    
    Moving !user_alloc case to kvm_arch to avoid unnecessary
    code logic in non-x86 platform.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e2d8cbc6b4589ae4c06cad9dcc29a5da58a0d761
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Nov 20 13:11:38 2007 +0800

    KVM: Recalculate mmu pages needed for every memory region change
    
    Instead of incrementally changing the mmu cache size for every memory slot
    operation, recalculate it from scratch.  This is simpler and safer.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8c62725b40aceb3d8004954d221215f8e1a4656f
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 13:15:52 2007 +0200

    KVM: x86 emulator: prefetch up to 15 bytes of the instruction executed
    
    Instead of fetching one byte at a time, prefetch 15 bytes (or until the next
    page boundary) to avoid guest page table walks.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 31c5dcecace6d0eed31d13e9d6e525a1d1b0d741
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 12:49:31 2007 +0200

    KVM: x86 emulator: retire ->write_std()
    
    Theoretically used to access memory known to be ordinary RAM, it was
    never implemented.  It is questionable whether it is possible to implement
    it correctly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c4cfa3cad0a5b24d6c09daf2b9837d60191af177
Author: Izik Eidus <izike@localhost.localdomain>
Date:   Tue Nov 20 11:49:33 2007 +0200

    KVM: MMU: Selectively set PageDirty when releasing guest memory
    
    Improve dirty bit setting for pages that kvm releases.  Until now, every
    page that we released was marked dirty; from now on, only pages that have
    the potential to get dirty are marked dirty.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c68426d13b5ee3607b40c9eb4db55bd46448c315
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Nov 20 11:30:04 2007 +0200

    KVM: MMU: Fix potential memory leak with smp real-mode
    
    When we map a page, we check whether some other vcpu mapped it for us and if
    so, bail out.  But we should decrease the refcount on the page as we do so.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9500583c5938b83134114f925f933813e8d0ee1a
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 11:45:14 2007 +0200

    KVM: Export include/asm-x86/kvm.h
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 29f516db0b9baf019b1c93a154244fbedd81d2ac
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:37 2007 -0600

    KVM: Portability: Move cpuid structures to <asm/kvm.h>
    
    This patch moves structures:
    	kvm_cpuid_entry
    	kvm_cpuid
    
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d8c2c60388c3e98e4386e086357f4e1b51bca1e
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:36 2007 -0600

    KVM: Portability: Move kvm_sregs and msr structures to <asm/kvm.h>
    
    Move structures:
    	kvm_sregs
    	kvm_msr_entry
    	kvm_msrs
    	kvm_msr_list
    
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e7841ac816a436cc71107ed3d91496b45c86a00f
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:35 2007 -0600

    KVM: Portability: Move kvm_segment & kvm_dtable structure to  <asm/kvm.h>
    
    This patch moves structures:
    	kvm_segment
    	kvm_dtable
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6b9fef5a4eb55e4da3cd39f720de76294bcd029c
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:34 2007 -0600

    KVM: Portability: Move structure lapic_state to <asm/kvm.h>
    
    This patch moves structure lapic_state from include/linux/kvm.h
    to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 93f93de2ff85e2fd20b2223415827499e5d5a05a
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:33 2007 -0600

    KVM: Portability: Move kvm_regs to <asm/kvm.h>
    
    This patch moves structure kvm_regs to include/asm-x86/kvm.h.
    Each architecture will need to create their own version of this
    structure.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9470242e09e9bc4cab3b516dc2d46632b62446ae
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:32 2007 -0600

    KVM: Portability: Move x86 pic structures
    
    This patch moves structures:
    	kvm_pic_state
    	kvm_ioapic_state
    
    to include/asm-x86/kvm.h.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 12e8f8d9a762f055ef4a8a58ebf4a2d68b1c9db6
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:31 2007 -0600

    KVM: Portability: Move kvm_memory_alias to asm/kvm.h
    
    This patch moves struct kvm_memory_alias from include/linux/kvm.h
    to include/asm-x86/kvm.h. Also have include/linux/kvm.h include
    include/asm/kvm.h.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d06cde663eaef859bafeb4cb63cf78bc0cd2fe87
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:44 2007 -0600

    KVM: Move misplaced comment
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c477869e9394cde6806adf277f1bcfbf99a853b9
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:43 2007 -0600

    KVM: Correct consistent typo: "destory" -> "destroy"
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ff24fe8b7bb10d101bc0f919ac7363ff1151cf22
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:45 2007 -0600

    KVM: Remove unused "rmap_overflow" variable
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1dd5587d4d986704098e1b3e5fcb3b6c4fa214b9
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 19 18:44:15 2007 +0200

    KVM: MMU: Remove unused variable
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c246677e2d16f91d82b2eaab1a0da04853c68d9e
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 19 18:28:09 2007 +0200

    KVM: Add missing #include <asm/pgtable.h>
    
    Needed for empty_zero_page.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3075271017054557698bd40c72759cf2fdfbdc5d
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 19 11:16:57 2007 +0200

    KVM: Simplify kvm_clear_guest_page()
    
    Use kvm_write_guest_page() with empty_zero_page, instead of doing
    kmap and memset.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit adc877cd04966fe7fa71b6e8204594c231db4877
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 19 11:28:19 2007 +0200

    KVM: MMU: Change guest pte access to kvm_{read,write}_guest()
    
    Things are simpler and more regular this way.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 28d8a8f15da3f54aec537ccb08330d9afac915f4
Author: Jan Kiszka <jan.kiszka@siemens.com>
Date:   Mon Nov 19 10:21:45 2007 +0100

    KVM: VMX: Force seg.base == (seg.sel << 4) in real  mode
    
    Ensure that segment.base == segment.selector << 4 when entering the real
    mode on Intel so that the CPU will not bark at us.  This fixes some old
    protected mode demo from http://www.x86.org/articles/pmbasics/tspec_a1_doc.htm.
    
    Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit afe6fba05620eacbebd533ef88903d27db5d4731
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 15:24:28 2007 +0800

    KVM: Portability: Move some function declarations to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1765bb911a30b3ca2b186797b467ef1d921eb7a7
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 15:08:31 2007 +0800

    KVM: Move some static inline functions out from kvm.h into x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 611a09879f95ba17968dc7072db40ae29b58831a
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:56:05 2007 +0800

    KVM: Portability: Move vcpu regs enumeration definition to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3a1fa229b664bc5c157e19bc7ae0dab9fc2ddb98
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:40:47 2007 +0800

    KVM: Portability: Move struct kvm_x86_ops definition to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 36c726e56c7bc40667ac5dcd21e517222da2b90d
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:33:37 2007 +0800

    KVM: Portability: Move some macro definitions from kvm.h to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 843c367908fe8e9d6b9d1c1625e710537ce028f7
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Nov 18 20:43:21 2007 +0800

    KVM: Portability: MMU initialization and teardown split
    
    Move out kvm_mmu init and exit functionality from kvm_main.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b2dafaa9527ae6845c8bb2d85eee79dd759d5c42
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Nov 18 20:29:43 2007 +0800

    KVM: Portability: Move kvm_vcpu_ioctl_get_dirty_log to arch-specific  file
    
    Meanwhile keep the interface in common, and leave as much logic in common
    as possible.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2171fec7f9df6cbd0f7793cf89d9ee3848ad6cd9
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Thu Nov 15 18:38:46 2007 +0200

    KVM: Make unloading of FPU state when putting vcpu arch-independent
    
    Instead of having each architecture do it individually, we
    do this in the arch-independent code (just x86 as of now).
    
    [avi: add svm to the mix, which was added to mainline during the
     2.6.24-rc process]
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9eb6db84384c50eba85eea360e921b314669b211
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 16:37:07 2007 +0200

    KVM: MMU: Add some mmu statistics
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 94551089d411b2ba50293210ef785ddbd8b14026
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 16:24:12 2007 +0200

    KVM: Extend stats support for VM stats
    
    This is in addition to the current virtual cpu statistics.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4f749da9e68b5ade494d68597ecf045c73cb4ec0
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 15:17:51 2007 +0200

    KVM: Add instruction emulation statistics

commit 26c36eca453443016f70ea85af46c6109eeea400
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 13:54:33 2007 +0200

    KVM: Add fpu_reload counter
    
    Measure the number of times we switch the fpu state.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8355eb8e0bd02de51f4d36fc4f3e4df1572584e3
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 13:50:24 2007 +0200

    KVM: Replace 'light_exits' stat with 'host_state_reload'
    
    This is a little more accurate (since it counts actual reloads, not potential
    reloads), and reverses the sense of the statistic to measure a bad event like
    most of the other stats (e.g. we want to minimize all counters).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4289f27f40412271daf71ae62be478b535a6836d
Author: Zhang Xiantao <xiantao@vtsmp-build32.los-vmm.org>
Date:   Sun Nov 18 18:43:45 2007 +0800

    KVM: Portability: Add two hooks to handle kvm_create and destroy vm
    
    Add two arch hooks to handle kvm_create_vm and kvm_destroy_vm. Now, just
    put io_bus init and destroy in common.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c4e513d12d14cf64612211f8d06af996cc5019a7
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Nov 16 14:38:21 2007 +0800

    KVM: Remove __init attributes for kvm_init_debug and kvm_init_msr_list
    
    Since their callers are not declared with __init.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cee77328482e5770d6df48212ecaece4d1750060
Author: Joe Perches <joe@perches.com>
Date:   Mon Nov 12 20:06:51 2007 -0800

    KVM: Remove ptr comparisons to 0
    
    Fix sparse warnings "Using plain integer as NULL pointer"
    
    Signed-off-by: Joe Perches <joe@perches.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 822e8a654b3ffd19b7bdcee3cb86498ac49e73f7
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Nov 16 13:05:55 2007 +0800

    KVM: Portability: Make kvm_vcpu_ioctl_translate arch dependent
    
    Move kvm_vcpu_ioctl_translate to arch, since mmu would be put under arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit da5b7f9653452b29c1389ed7c291422cd37c3312
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 15 18:06:18 2007 +0200

    KVM: VMX: Consolidate register usage in vmx_vcpu_run()
    
    We pass vcpu, vmx->fail, and vmx->launched to assembly code, but all three
    are fields within vmx.  Consolidate by only passing in vmx and offsets for
    the rest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e1392d7931e0092dd1a03a108ca77c63b6083ffb
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 15 23:07:47 2007 +0800

    KVM: Portability: move KVM_CHECK_EXTENSION
    
    Make KVM_CHECK_EXTENSION code into a function, all archs can define its
    capability independently.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7830eebf97a949620a0eb85a6d2d75eacc7e5df8
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Thu Nov 15 14:52:28 2007 +0800

    KVM: x86 emulator: modify 'lods', and 'stos' not to depend on CR2
    
    The current 'lods' and 'stos' emulation depends on the incoming CR2 rather
    than decoding the memory address from registers.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fee712e28103841adb5d4e3884f39f8859ec3005
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:40:21 2007 +0800

    KVM: Portability: Move x86 specific code from kvm_init() to kvm_arch()
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 97d05f12b4c2949210d671bab3168ed0055131ff
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:39:31 2007 +0800

    KVM: Portability: Combine kvm_init and kvm_init_x86
    
    Will be called once arch module registers itself.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3f9df31d3108ba0ffc587f163e66abb9ad8eb61a
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:38:21 2007 +0800

    KVM: Portability: Add vcpu and hardware management arch hooks
    
    Add the following hooks:
    
      void decache_vcpus_on_cpu(int cpu);
      int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
      void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
      struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
      void kvm_arch_vcpu_destory(struct kvm_vcpu *vcpu);
      int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
      void kvm_arch_hardware_enable(void *garbage);
      void kvm_arch_hardware_disable(void *garbage);
      int kvm_arch_hardware_setup(void);
      void kvm_arch_hardware_unsetup(void);
      void kvm_arch_check_processor_compat(void *rtn);
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 42ebda70312ee76ad23b90bbf02ddb480e511f63
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:09:30 2007 +0800

    KVM: Portability: Move kvm_x86_ops to x86.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1698b42863745205c29fc17dcc685a30dcd46fa0
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:08:51 2007 +0800

    KVM: Portability: Move some includes to x86.c
    
    Move some includes to x86.c from kvm_main.c, since the related functions
    have been moved to x86.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e76e5514079fb399197599c1b00dc7d5fa9fd9bf
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:10:22 2007 +0200

    KVM: Change kvm_{read,write}_guest() to use copy_{from,to}_user()
    
    This changes kvm_write_guest_page/kvm_read_guest_page to use
    copy_to_user/copy_from_user, as a result we get better speed
    and better dirty bit tracking.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2de9dba3097ebff65cc25496c32c0653b18dcf83
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:05:04 2007 +0200

    KVM: introduce gfn_to_hva()
    
    Convert a guest frame number to the corresponding host virtual address.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0893321f0d8ab0e87b71b13c07e50cba1f423464
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:02:22 2007 +0200

    KVM: add kvm_is_error_hva()
    
    Check for the "error hva", an address outside the user address space that
    signals a bad gfn.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b420a3a903f258a993bc5b33bc54a49126a8dbbf
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 11 18:37:32 2007 +0200

    KVM: Simplify CPU_TASKS_FROZEN cpu notifier handling
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e53135d7f031d90c9d470837937984b0cd8b7989
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 14:48:17 2007 +0200

    KVM: x86 emulator: remove 8 bytes operands emulator for call near instruction
    
    It is removed because it isn't supported on a real host.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c3ea2d8855148888f26bbfd71b530eb7396f2645
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun Nov 11 12:28:35 2007 +0200

    KVM: VMX: wbinvd exiting
    
    Add wbinvd VM Exit support to prepare for pass-through
    device cache emulation and also enhance real time
    responsiveness.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5dd65ebd34d0dcfe5db11ef0f43c262a9fc2a625
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun Nov 11 12:27:20 2007 +0200

    KVM: VMX: Comment VMX primary/secondary exec ctl definitions
    
    Add comments for secondary/primary Processor-Based VM-execution controls.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 65987f623c17e58096470092b800db660691c27e
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 11:42:59 2007 +0200

    KVM: Fix faults during injection of real-mode interrupts
    
    If vmx fails to inject a real-mode interrupt while fetching the interrupt
    redirection table, it fails to record this in the vectoring information
    field.  So we detect this condition and do it ourselves.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 313589267465b2521fcb9c72491c10b161f9b6c0
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 11:30:47 2007 +0200

    KVM: VMX: Read & store IDT_VECTORING_INFO_FIELD
    
    We'll want to write to it in order to fix real-mode irq injection problems,
    but it is a read-only field.  Storing it in a variable solves that issue.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 69d7c705a0c9bef0b21e2899ba86160804a900f2
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 8 18:19:20 2007 +0200

    KVM: VMX: Use vmx to inject real-mode interrupts
    
    Instead of injecting real-mode interrupts by writing the interrupt frame into
    guest memory, abuse vmx by injecting a software interrupt.  We need to
    pretend the software interrupt instruction had a length > 0, so we have to
    adjust rip backward.
    
    This lets us avoid writing guest memory, which is complex and also
    sleeps.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a7acc3f15961003585476e56c97aac12931db7ba
Author: Dor Laor <dor.laor@qumranet.com>
Date:   Wed Nov 7 16:20:06 2007 +0200

    KVM: Add make_page_dirty() to kvm_clear_guest_page()
    
    Every write access to guest pages should be tracked.
    
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e485187cb758cbe6c4172ac1c6cf1a02033640c2
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Thu Nov 1 14:16:10 2007 -0500

    KVM: Portability: Move x86 vcpu ioctl handlers to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bd80a35a904c82d5e3eb85483c59a4fcebcc0345
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:25 2007 -0500

    KVM: Portability: Move x86 FPU handling to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 96e9b0f7b0edf08c5b8b0de47d48826032677535
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:24 2007 -0500

    KVM: Portability: Move x86 instruction emulation code to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 291efa8f29b50660763daf712408289100a7f963
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:23 2007 -0500

    KVM: Portability: Make exported debugfs data architecture-specific
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b44df9ec4796d569bc8edf47619bdcfccf675ebf
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 1 06:31:28 2007 +0200

    KVM: x86 emulator: Hoist modrm and abs decoding into separate functions
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 99bcd32e7a046d42dab4a3bed03fc46f167cf0b6
Author: Uri Lublin <uril@qumranet.com>
Date:   Tue Oct 30 10:42:09 2007 +0200

    KVM: Make mark_page_dirty() work for aliased pages too.
    
    Recommended by Izik Eidus.
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 498a5bf184d75d7aad2af8e2f18ef57640980b15
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 11:21:06 2007 +0200

    KVM: Simplify decode_register_operand() calling convention
    
    Now that rex_prefix is part of the decode cache, there is no need to pass
    it along.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d24c2abd708a5e354197fc40f6562328ec0171e3
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 11:15:56 2007 +0200

    KVM: x86 emulator: centralize decoding of one-byte register access insns
    
    Instructions like 'inc reg' that have the register operand encoded
    in the opcode are currently specially decoded.  Extend
    decode_register_operand() to handle that case, indicated by having
    DstReg or SrcReg without ModRM.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 90128492364c458040934c1e0a4e69d986b8ce17
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 10:27:04 2007 +0200

    KVM: x86 emulator: Extract the common code of SrcReg and DstReg
    
    Share the common parts of SrcReg and DstReg decoding.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f065eb1c86c8f66252fb83e938c909301f107b02
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:25 2007 +0100

    KVM: Portability: Move pio emulation functions to x86.c
    
    This patch moves implementation of the following functions from
    kvm_main.c to x86.c:
    free_pio_guest_pages, vcpu_find_pio_dev, pio_copy_data, complete_pio,
    kernel_pio, pio_string_write, kvm_emulate_pio, kvm_emulate_pio_string
    
    The function inject_gp, which was duplicated by yesterday's patch
    series, is removed from kvm_main.c now because it is not needed anymore.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f2b682cefecc497e101372386a98705567362acc
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:21 2007 +0100

    KVM: Portability: Move x86 emulation and mmio device hook to x86.c
    
    This patch moves the following functions from kvm_main.c to x86.c:
    emulator_read/write_std, vcpu_find_pervcpu_dev, vcpu_find_mmio_dev,
    emulator_read/write_emulated, emulator_write_phys,
    emulator_write_emulated_onepage, emulator_cmpxchg_emulated,
    get_segment_base, emulate_invlpg, emulate_clts, emulator_get/set_dr,
    kvm_report_emulation_failure, emulate_instruction
    
    The following data type is moved to x86.c:
    struct x86_emulate_ops emulate_ops
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 49153e58bf8f07d6a9c8766f2da8c91805f83f82
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:17 2007 +0100

    KVM: Portability: Move kvm_get/set_msr[_common] to x86.c
    
    This patch moves the implementation of the functions of kvm_get/set_msr,
    kvm_get/set_msr_common, and set_efer from kvm_main.c to x86.c. The
    definition of EFER_RESERVED_BITS is moved too.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a5ade20c8056c92db199fc6515be7edb4fa7e63e
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Oct 29 15:15:20 2007 -0500

    KVM: Fix gfn_to_page() acquiring mmap_sem twice
    
    KVM's nopage handler calls gfn_to_page() which acquires the mmap_sem when
    calling out to get_user_pages().  nopage handlers are already invoked with the
    mmap_sem held though.  Introduce a __gfn_to_page() for use by the nopage
    handler which requires the lock to already be held.
    
    This was noticed by tglx.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 451287231375e512574477693ad310c981e7d922
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Mon Oct 29 09:40:42 2007 +0800

    KVM: VMX: Enable memory mapped TPR shadow (FlexPriority)
    
    This patch based on CR8/TPR patch, and enable the TPR shadow (FlexPriority)
    for 32bit Windows.  Since TPR is accessed very frequently by 32bit
    Windows, especially SMP guest, with FlexPriority enabled, we saw significant
    performance gain.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8348f5ecbfb3c8f5688a8c5cf04f120fe5ef4f68
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:09:35 2007 +0100

    KVM: Portability: Move control register helper functions to x86.c
    
    This patch moves the definitions of CR0_RESERVED_BITS,
    CR4_RESERVED_BITS, and CR8_RESERVED_BITS along with the following
    functions from kvm_main.c to x86.c:
    set_cr0(), set_cr3(), set_cr4(), set_cr8(), get_cr8(), lmsw(),
    load_pdptrs()
    The static function wrapper inject_gp is duplicated in kvm_main.c and
    x86.c for now, the version in kvm_main.c should disappear once the last
    user of it is gone too.
    The function load_pdptrs is no longer static, and now defined in x86.h
    for the time being, until the last user of it is gone from kvm_main.c.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 210301268dc7ce8aae9898f00a6cb498c2535ceb
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:09:10 2007 +0100

    KVM: Portability: move get/set_apic_base to x86.c
    
    This patch moves the implementation of get_apic_base and set_apic_base
    from kvm_main.c to x86.c
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eec9635aa34d88c986f37613408bb25b0da39d67
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:08:51 2007 +0100

    KVM: Portability: Move memory segmentation to x86.c
    
    This patch moves the definition of segment_descriptor_64 for AMD64 and
    EM64T from kvm_main.c to segment_descriptor.h. It also adds a proper
    #ifndef...#define...#endif around that header file.
    The implementation of segment_base is moved from kvm_main.c to x86.c.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 68486294e54c8c3ea3cc20818a2ffc258ae79064
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:08:35 2007 +0100

    KVM: Portability: Split kvm_vm_ioctl v3
    
    This patch splits kvm_vm_ioctl into architecture independent parts, and
    x86 specific parts which go to kvm_arch_vcpu_ioctl in x86.c.
    The patch is unchanged since last submission.
    
    Common ioctls for all architectures are:
    KVM_CREATE_VCPU, KVM_GET_DIRTY_LOG, KVM_SET_USER_MEMORY_REGION
    
    x86 specific ioctls are:
    KVM_SET_MEMORY_REGION,
    KVM_GET/SET_NR_MMU_PAGES, KVM_SET_MEMORY_ALIAS, KVM_CREATE_IRQCHIP,
    KVM_CREATE_IRQ_LINE, KVM_GET/SET_IRQCHIP
    KVM_SET_TSS_ADDR
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7e34e4d66cbbc04cc7a549da826e936860f9dcb2
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 18:52:05 2007 +0200

    KVM: MMU: Topup the mmu memory preallocation caches before emulating an insn
    
    Emulation may cause a shadow pte to be instantiated, which requires
    memory resources.  Make sure the caches are filled to avoid an oops.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 415ce32aab70971a6ec794605012a0bd451d03af
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 18:48:59 2007 +0200

    KVM: Move page fault processing to common code
    
    The code that dispatches the page fault and emulates if we failed to map
    is duplicated across vmx and svm.  Merge it to simplify further bugfixing.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e7facf3d1ac177110e431c7021111d7c10acf5e2
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 16:34:25 2007 +0200

    KVM: x86 emulator: don't depend on cr2 for mov abs emulation
    
    The 'mov abs' instruction family (opcodes 0xa0 - 0xa3) still depends on cr2
    provided by the page fault handler.  This is wrong for several reasons:
    
    - if an instruction accessed misaligned data that crosses a page boundary,
      and if the fault happened on the second page, cr2 will point at the
      second page, not the data itself.
    
    - if we're emulating in real mode, or due to a FlexPriority exit, there
      is no cr2 generated.
    
    So, this change adds decoding for this instruction form and drops reliance
    on cr2.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 39c06a59bf7ce412179c8d2ee681f7e6eae34c4c
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:54 2007 +0200

    KVM: SVM: Let gcc to choose which registers to save (i386)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of AMD i386
    
    * Original code saves following registers:
    
        ebx, ecx, edx, esi, edi, ebp
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        ebx, ecx, edx, esi, edi
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 75e7957bcd1ff7d86878f6104662d7aaeb1895f4
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:53 2007 +0200

    KVM: SVM: Let gcc to choose which registers to save (x86_64)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of AMD x86_64.
    
    * Original code saves following registers:
    
        rbx, rcx, rdx, rsi, rdi, rbp,
        r8, r9, r10, r11, r12, r13, r14, r15
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        rbx, rcx, rdx, rsi, rdi
        r8, r9, r10, r11, r12, r13, r14, r15
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 795533755652ddd1d60cd1a451337116a956539b
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:55 2007 +0200

    KVM: VMX: Let gcc to choose which registers to save (i386)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of intel i386.
    
    * Original code saves following registers:
    
        eax, ebx, ecx, edx, edi, esi, ebp (using popa)
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        ebx, edi, rsi
    
      - doesn't save eax because it is an output operand (vmx->fail)
    
      - cannot put ecx in clobber description because it is an input operand,
        but as we modify it and we want to keep its value (vcpu), we must
        save it (pop/push)
    
      - ebp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
      - edx is saved (pop/push) because it is reserved by GCC (REGPARM) and
        cannot be put in the clobber description.
    
      - line "mov (%%esp), %3 \n\t" has been removed because %3
        is ecx and ecx is restored just after.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 41141bde788afe7eb407e4ac46dee13f0bc30cac
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:52 2007 +0200

    KVM: VMX: Let gcc to choose which registers to save (x86_64)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of intel x86_64.
    
    * Original code saves following registers:
    
        rax, rbx, rcx, rdx, rsi, rdi, rbp,
        r8, r9, r10, r11, r12, r13, r14, r15
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        rbx, rdi, rsi,
        r8, r9, r10, r11, r12, r13, r14, r15
    
      - doesn't save rax because it is an output operand (vmx->fail)
    
      - cannot put rcx in clobber description because it is an input operand,
        but as we modify it and we want to keep its value (vcpu), we must
        save it (pop/push)
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
      - rdx is saved (pop/push) because it is reserved by GCC (REGPARM) and
        cannot be put in the clobber description.
    
      - line "mov (%%rsp), %3 \n\t" has been removed because %3
        is rcx and rcx is restored just after.
    
      - line ASM_VMX_VMWRITE_RSP_RDX() is moved out of the ifdef/else/endif
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 430dbe8a56da6b45d7ceb03ca71cc474a0620c4f
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Oct 25 00:29:55 2007 +0200

    KVM: Add ioctl to set tss address from userspace
    
    Currently kvm has a wart in that it requires three extra pages for use
    as a tss when emulating real mode on Intel.  This patch moves the allocation
    internally, only requiring userspace to tell us where in the physical address
    space we can place the tss.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6fb1e0b2a1e1a2b5b9729b187b13c870d58bf4b6
Author: Izik Eidus <izike@qumranet.com>
Date:   Wed Oct 24 23:57:46 2007 +0200

    KVM: Add kernel-internal memory slots
    
    Reserve a few memory slots for kernel internal use.  This is useful when
    you have to register a memory region and want to be sure it was not
    registered from userspace, and when you want to register a memory region
    that won't be seen from userspace.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 733f6415f3975478ed8fa383d89c7ae86b938cde
Author: Izik Eidus <izike@qumranet.com>
Date:   Wed Oct 24 23:52:57 2007 +0200

    KVM: Export memory slot allocation mechanism
    
    Remove kvm memory slot allocation mechanism from the ioctl
    and put it to exported function.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cb1a650958d37b941f6517adaa6e600b82978f91
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Oct 25 11:54:04 2007 +0200

    KVM: Unmap kernel-allocated memory on slot destruction
    
    kvm_vm_ioctl_set_memory_region() is able to remove memory in addition to
    adding it.  Therefore when using kernel swapping support for old userspaces,
    we need to munmap the memory if the user requests to remove it.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 11fbdcc2357e502f1db984398580f3e5f7b22ce1
Author: Christian Borntraeger <borntraeger@de.ibm.com>
Date:   Thu Oct 11 15:34:17 2007 +0200

    KVM: Per-architecture hypercall definitions
    
    Currently kvm provides hypercalls only for x86* architectures. To
    provide hypercall infrastructure for other kvm architectures I split
    kvm_para.h into a generic header file and architecture specific
    definitions.
    
    Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 882b84e0e7d033fdaf553bc5cdf535dfcd99ec18
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Oct 10 12:15:54 2007 +0200

    KVM: Split IOAPIC reset function and export for kernel RESET
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 221aabc8a9ddc0d6b1c5f35f556410ee5b4549c3
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Oct 10 12:14:25 2007 +0200

    KVM: Export PIC reset for kernel device reset
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 51a29f6879429e999ca7e40b4ee35c264102bcc6
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 11:03:36 2007 +0200

    KVM: Add a might_sleep() annotation to gfn_to_page()
    
    This will help trap accesses to guest memory in atomic context.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 38376823b92c1496c851464abc751a2d12ff5192
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 11:00:39 2007 +0200

    KVM: Move vmx_vcpu_reset() out of vmx_vcpu_setup()
    
    Split guest reset code out of vmx_vcpu_setup().  Besides being cleaner, this
    moves the realmode tss setup (which can sleep) outside vmx_vcpu_setup()
    (which is executed with preemption enabled).
    
    [izik: remove unused variable]
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 130452e6577d52fed09bbc3e0fd23d4e01dc6145
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sat Oct 20 15:34:38 2007 +0800

    KVM: Portability: Split kvm_vcpu into arch dependent and independent parts (part 1)
    
    First step to split kvm_vcpu.  Currently, we just use a macro to define
    the common fields in kvm_vcpu for all archs, and each arch needs to define
    its own kvm_vcpu struct.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b25d84aa1de2a0259e54415faef1c5bdef15cec3
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Thu Oct 18 09:59:34 2007 -0500

    KVM: Allocate userspace memory for older userspace
    
    Allocate a userspace buffer for older userspaces.  Also eliminate phys_mem
    buffer.  The memset() in kvmctl really kills initial memory usage but swapping
    works even with old userspaces.
    
    A side effect is that maximum guest size is reduced for older userspace on
    i386.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e6a9547518fdab65640a9f77bd285472dc26beb9
Author: Christian Borntraeger <borntraeger@de.ibm.com>
Date:   Thu Oct 18 14:39:10 2007 +0200

    KVM: Use virtual cpu accounting if available for guest times.
    
    ppc and s390 offer the possibility to track process times precisely
    by looking at cpu timer on every context switch, irq, softirq etc.
    We can use that infrastructure as well for guest time accounting.
    We need to account the used time before we change the state.
    This patch adds a call to account_system_vtime to kvm_guest_enter
    and kvm_guest_exit. If CONFIG_VIRT_CPU_ACCOUNTING is not set,
    account_system_vtime is defined in hardirq.h as an empty function,
    which means this patch does not change the behaviour on other
    platforms.
    
    I compile tested this patch on x86 and function tested the patch on
    s390.
    
    Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e0534e37e90b89331e0eb03cd4f3fa7a4164dcea
Author: Izik Eidus <avi@qumranet.com>
Date:   Thu Oct 18 11:09:33 2007 +0200

    KVM: MMU: Partial swapping of guest memory
    
    This allows guest memory to be swapped.  Pages which are currently mapped
    via shadow page tables are pinned into memory, but all other pages can
    be freely swapped.
    
    The patch makes gfn_to_page() elevate the page's reference count, and
    introduces kvm_release_page() that pairs with it.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 91775c12b13346db968b990b5c38dca83ebbd0c1
Author: Izik Eidus <avi@qumranet.com>
Date:   Wed Oct 17 19:17:48 2007 +0200

    KVM: MMU: Make gfn_to_page() always safe
    
    In case the page is not present in the guest memory map, return a dummy
    page the guest can scribble on.
    
    This simplifies error checking in its users.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c28b628fbe789e3cf279c53157eca599b3308a78
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 16 14:43:46 2007 +0200

    KVM: MMU: Keep a reverse mapping of non-writable translations
    
    The current kvm mmu only reverse maps writable translations.  This is used
    to write-protect a page in case it becomes a pagetable.
    
    But with swapping support, we need a reverse mapping of read-only pages as
    well:  when we evict a page, we need to remove any mapping to it, whether
    writable or not.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 13e17d387b510ee4d2ae7220c36d46a53d04b99d
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 16 14:42:30 2007 +0200

    KVM: MMU: Add rmap_next(), a helper for walking kvm rmaps
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6565476fe6a56189c6f7144b7be915be3579292a
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Oct 16 18:23:27 2007 -0700

    KVM: x86 emulator: cmc, clc, cli, sti
    
    Instruction: cmc, clc, cli, sti
    opcodes: 0xf5, 0xf8, 0xfa, 0xfb respectively.
    
    [avi: fix reference to EFLG_IF which is not defined anywhere]
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a30fcdc59c5a93457338fe4de27108543389106d
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 17 12:18:47 2007 +0200

    KVM: MMU: Simplify page table walker
    
    Simplify the walker level loop not to carry so much information from one
    loop to the next.  In addition to being complex, this made kmap_atomic()
    critical sections difficult to manage.
    
    As a result of this change, kmap_atomic() sections are limited to actually
    touching the guest pte, which allows the other functions called from the
    walker to do sleepy operations.  This will happen when we enable swapping.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cf386590baf3111a17f5a64e10328dbe8a60bcf7
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Fri Oct 12 17:40:33 2007 -0700

    KVM: x86 emulator: Implement emulation of instruction: inc & dec
    
    Instructions:
    	inc r16/r32 (opcode 0x40-0x47)
    	dec r16/r32 (opcode 0x48-0x4f)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bba43153582493fef707335cf1de525d12dbeeec
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 17:22:08 2007 +0200

    KVM: Rename KVM_TLB_FLUSH to KVM_REQ_TLB_FLUSH
    
    We now have a new namespace, KVM_REQ_*, for bits in vcpu->requests.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit daf24e56a39ce57002da560f6144065ce7c18450
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 16:23:22 2007 +0200

    KVM: Move apic timer interrupt backlog processing to common code
    
    Beside the obvious goodness of making code more common, this prevents
    a livelock with the next patch which moves interrupt injection out of the
    critical section.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1ac4f23de15bdcc8375a2ed67f54d63322746123
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Fri Oct 12 11:01:59 2007 +0200

    KVM: Add some \n in ioapic_debug()
    
    Add new-line at end of debug strings.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 32bc111f729a49d67f425568f1ac573b4577aa45
Author: Qing He <qing.he@intel.com>
Date:   Mon Sep 24 17:39:41 2007 +0800

    KVM: apic round robin cleanup
    
    If no apic is enabled in the bitmap of an interrupt delivery with delivery
    mode of lowest priority, a warning should be reported rather than selecting
    a fallback vcpu.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Eddie (Yaozu) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b2bd95573b0efa3ddbeb41d52fe5ddc140529324
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Thu Oct 11 19:16:52 2007 +0200

    KVM: Portability: split kvm_vcpu_ioctl
    
    This patch splits kvm_vcpu_ioctl into architecture independent parts, and
    x86 specific parts which go to kvm_arch_vcpu_ioctl in x86.c.
    
    Common ioctls for all architectures are:
    KVM_RUN, KVM_GET/SET_(S-)REGS, KVM_TRANSLATE, KVM_INTERRUPT,
    KVM_DEBUG_GUEST, KVM_SET_SIGNAL_MASK, KVM_GET/SET_FPU
    Note that some PPC chips don't have an FPU, so we might need an #ifdef
    around KVM_GET/SET_FPU one day.
    
    x86 specific ioctls are:
    KVM_GET/SET_LAPIC, KVM_SET_CPUID, KVM_GET/SET_MSRS
    
    An interesting aspect is vcpu_load/vcpu_put. We now have a common
    vcpu_load/put which does the preemption stuff, and an architecture
    specific kvm_arch_vcpu_load/put. In the x86 case, this one calls the
    vmx/svm function defined in kvm_x86_ops.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Reviewed-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 73b3b30fe93fd5923c101311a79e18404bad7e8c
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:30:21 2007 +0200

    KVM: MMU: When updating the dirty bit, inform the mmu about it
    
    Since the mmu uses different shadow pages for dirty large pages and clean
    large pages, this allows the mmu to drop ptes that are now invalid.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 99920ba10b1d5bb51d273e42c58f8767e5a5bde4
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:22:59 2007 +0200

    KVM: MMU: Move dirty bit updates to a separate function
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b1e01be00832901cb70690b468f3819b76ef8473
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:13:49 2007 +0200

    KVM: MMU: Instantiate real-mode shadows as user writable shadows
    
    This is consistent with real-mode permissions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 85399c5e0e8d2e6c5330dfb66a16aee59978fc52
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:12:24 2007 +0200

    KVM: MMU: Disable write access on clean large pages
    
    By forcing clean huge pages to be read-only, we have separate roles
    for the shadow of a clean large page and the shadow of a dirty large
    page.  This is necessary because different ptes will be instantiated
    for the two cases, even for read faults.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8635624404040a9565033bf26bb70a278e85652a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:08:41 2007 +0200

    KVM: MMU: Fix nx access bit for huge pages
    
    We must set the bit before the shift, otherwise the wrong bit gets set.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8e31a1895e0821947d4d1e7a533400481875eed5
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 12:32:30 2007 +0200

    KVM: Move guest pte dirty bit management to the guest pagetable walker
    
    This is more consistent with the accessed bit management, and makes the dirty
    bit available earlier for other purposes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c58d4160facaf0730cc548d6c18cf4250b70d960
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed Oct 10 20:08:41 2007 -0500

    KVM: MMU: More struct kvm_vcpu -> struct kvm cleanups
    
    This time, the biggest change is gpa_to_hpa. The translation of GPA to HPA does
    not depend on the VCPU state unlike GVA to GPA so there's no need to pass in
    the kvm_vcpu.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 55d32ac35e946d641573a5bc0ebc99e8644897e7
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed Oct 10 19:25:50 2007 -0500

    KVM: MMU: Clean up MMU functions to take struct kvm when appropriate
    
    Some of the MMU functions take a struct kvm_vcpu even though they affect all
    VCPUs.  This patch cleans up some of them to instead take a struct kvm.  This
    makes things a bit more clear.
    
    The main thing that was confusing me was whether certain functions need to be
    called on all VCPUs.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d45e5fa2beb3afbab77a1d63b335bc79ea624347
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Wed Oct 10 17:16:19 2007 +0200

    KVM: Move x86 msr handling to new files x86.[ch]
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d8c9c8421106deed341737833f7b1fe09ed8eae0
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 9 19:20:39 2007 +0200

    KVM: Support assigning userspace memory to the guest
    
    Instead of having the kernel allocate memory to the guest, let userspace
    allocate it and pass the address to the kernel.
    
    This is required for s390 support, but also enables features like memory
    sharing and using hugetlbfs backed memory.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 42db4f37fc99106f31db4ff1bebe4602aa01d436
Author: Mike Day <ncmike@ncultra.org>
Date:   Mon Oct 8 09:02:08 2007 -0400

    KVM: CodingStyle cleanup
    
    Signed-off-by: Mike D. Day <ncmike@ncultra.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b764fc6a6988905196cada4a2cc8b8d249a88182
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:55:29 2007 +1000

    KVM: Remove gratuitous casts from lapic.c
    
    Since vcpu->apic is of the correct type, there's no need to cast.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ed6929cf8066cbdf0e9576237d6665fe07b9f719
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:50:48 2007 +1000

    KVM: Hoist kvm_create_lapic() into kvm_vcpu_init()
    
    Move kvm_create_lapic() into kvm_vcpu_init(), rather than having svm
    and vmx do it.  And make it return the error rather than a fairly
    random -ENOMEM.
    
    This also solves the problem that neither svm.c nor vmx.c actually
    handles the error path properly.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 179576337126bd75c3937fc3ba9ecdbf40206ec1
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:48:30 2007 +1000

    KVM: Add kvm_free_lapic() to pair with kvm_create_lapic()
    
    Instead of the asymmetry of kvm_free_apic, implement kvm_free_lapic().
    And guess what?  I found a minor bug: we don't need to hrtimer_cancel()
    from kvm_main.c, because we do that in kvm_free_apic().
    
    Also:
    1) kvm_vcpu_uninit should be the reverse order from kvm_vcpu_init.
    2) Don't set apic->regs_page to zero before freeing apic.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f98bb805217e9f51356fe91e3b1a3675636b7b91
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 2 18:52:55 2007 +0200

    KVM: Allow dynamic allocation of the mmu shadow cache size
    
    The user is now able to set how many mmu pages will be allocated to the guest.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a3aa9d5f129928fbe12dac8d18a425cf7ddeb9b6
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Oct 1 22:14:18 2007 +0200

    KVM: Add general accessors to read and write guest memory
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1721129db9eb57da3526d5c35f0a3f4751092fbb
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Sep 27 14:11:22 2007 +0200

    KVM: Remove the usage of page->private field by rmap
    
    When kvm uses user-allocated pages in the future for the guest, we won't
    be able to use page->private for rmap, since page->private is reserved for
    the filesystem.  So we move the rmap base pointers to the memory slot.
    
    A side effect of this is that we need to store the gfn of each gpte in
    the shadow pages, since the memory slot is addressed by gfn, instead of
    hfn like struct page.
    
    Signed-off-by: Izik Eidus <izik@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cdd86a3587294db88cf38087ef86d8a8bd4078b4
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 30 11:02:53 2007 +0200

    KVM: VMX: Simplify vcpu_clear()
    
    Now that smp_call_function_single() knows how to call a function on the
    current cpu, there's no need to check explicitly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d094b4b639ef0d1f13efc4087e2cf7eed18b829
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 30 10:50:12 2007 +0200

    KVM: VMX: Don't clear the vmcs if the vcpu is not loaded on any processor
    
    Noted by Eddie Dong.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9baa931ce679cd86e6f4df65eafe75a624f462e6
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 25 13:36:40 2007 +0200

    KVM: x86 emulator: Any legacy prefix after a REX prefix nullifies its effect
    
    This patch modifies the management of the REX prefix according to the
    behavior I saw in Xen 3.1.  In Xen, this modification was introduced by
    Jan Beulich.
    
    http://lists.xensource.com/archives/html/xen-changelog/2007-01/msg00081.html
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 356706e476c0c070ab0136d82de40d7f2d5334b0
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 17:00:58 2007 +0200

    KVM: Purify x86_decode_insn() error case management
    
    The only valid case is on protected page access, other cases are errors.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cefc532968a7dda45f3f77a5e9c668d4fbb522e0
Author: Qing He <qing.he@intel.com>
Date:   Mon Sep 24 17:22:13 2007 +0800

    KVM: x86_emulator: no writeback for bt
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b2a7e15b33e396b90714542ff9da6a2e69faa1ca
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:56 2007 +0200

    KVM: x86 emulator: Remove no_wb, use dst.type = OP_NONE instead
    
    Remove no_wb, use dst.type = OP_NONE instead, idea stolen from xen-3.1
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b9b6d84d9d4e49ce3745114cfceb08169cc0d049
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:55 2007 +0200

    KVM: x86 emulator: remove _eflags and use directly ctxt->eflags.
    
    Remove _eflags and use directly ctxt->eflags. Caching eflags is not needed as
    it is restored to vcpu by kvm_main.c:emulate_instruction() from ctxt->eflags
    only if emulation doesn't fail.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a1d2a0157d8921eef92cbdb932619dc37b8a992b
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:54 2007 +0200

    KVM: x86 emulator: split some decoding into functions for readability
    
    To improve readability, move push, writeback, and grp 1a/2/3/4/5/9 emulation
    parts into functions.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 076e313e2e111b5fc3aa9f1f419d6344a5d4a6c6
Author: Ryan Harper <ryanh@us.ibm.com>
Date:   Tue Sep 18 14:05:16 2007 -0500

    KVM: MMU: Ignore reserved bits in cr3 in non-pae mode
    
    This patch removes the fault injected when the guest attempts to set reserved
    bits in cr3.  X86 hardware doesn't generate a fault when setting reserved bits.
    The result of this patch is that vmware-server, running within a kvm guest,
    boots and runs memtest from an iso.
    
    Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8fb4094e6c7715f7c1bf7c900cdc10c9544790db
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 23 14:10:49 2007 +0200

    KVM: MMU: Make flooding detection work when guest page faults are bypassed
    
    When we allow guest page faults to reach the guests directly, we lose
    the fault tracking which allows us to detect demand paging.  So we provide
    an alternate mechanism by clearing the accessed bit when we set a pte, and
    checking it later to see if the guest actually used it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 42105992041fbb1420e70440564e737d9217e5c8
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 16 18:58:32 2007 +0200

    KVM: Allow not-present guest page faults to bypass kvm
    
    There are two classes of page faults trapped by kvm:
     - host page faults, where the fault is needed to allow kvm to install
       the shadow pte or update the guest accessed and dirty bits
     - guest page faults, where the guest has faulted and kvm simply injects
       the fault back into the guest to handle
    
    The second class, guest page faults, is pure overhead.  We can eliminate
    some of it on vmx using the following evil trick:
     - when we set up a shadow page table entry, if the corresponding guest pte
       is not present, set up the shadow pte as not present
     - if the guest pte _is_ present, mark the shadow pte as present but also
       set one of the reserved bits in the shadow pte
     - tell the vmx hardware not to trap faults which have the present bit clear
    
    With this, normal page-not-present faults go directly to the guest,
    bypassing kvm entirely.
    
    Unfortunately, this trick only works on Intel hardware, as AMD lacks a
    way to discriminate among page faults based on error code.  It is also
    a little risky since it uses reserved bits which might become unreserved
    in the future, so a module parameter is provided to disable it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c7af157c7c3afb5b7ed9b90fa62f896b7348c7b4
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Aug 29 03:48:05 2007 +0300

    KVM: VMX: Further reduce efer reloads
    
    KVM avoids reloading the efer msr when the difference between the guest
    and host values consist of the long mode bits (which are switched by
    hardware) and the NX bit (which is emulated by the KVM MMU).
    
    This patch also allows KVM to ignore SCE (syscall enable) when the guest
    is running in 32-bit mode.  This is because the syscall instruction is
    not available in 32-bit mode on Intel processors, so the SCE bit is
    effectively meaningless.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a7a21db3747f42f0800fc9b36f593ded764eeb31
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:37 2007 +0200

    KVM: Call x86_decode_insn() only when needed
    
    Move emulate_ctxt to kvm_vcpu to keep emulate context when we exit from kvm
    module. Call x86_decode_insn() only when needed. Modify x86_emulate_insn() to
    not modify the context if it must be re-entered.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4b04b506860b2c93974cfecc1b42966dba9d40c9
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:27 2007 +0200

    KVM: emulate_instruction() calls now x86_decode_insn() and x86_emulate_insn()
    
    emulate_instruction() calls now x86_decode_insn() and x86_emulate_insn().
    x86_emulate_insn() is x86_emulate_memop() without the decoding part.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 768d86be259b4a27d6729bfae8687299b9482b85
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:19 2007 +0200

    KVM: x86 emulator: move all decoding process to function x86_decode_insn()
    
    Split the decoding process into a new function x86_decode_insn().
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1186b126c4b2dd19c4cafaf2fed0cf719f70e514
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:52:50 2007 +0200

    KVM: x86 emulator: move all x86_emulate_memop() to a structure
    
    Move all x86_emulate_memop() common variables between decode and execute to a
    structure decode_cache.  This will help in later separating decode and
    emulate.
    
                struct decode_cache {
                    u8 twobyte;
                    u8 b;
                    u8 lock_prefix;
                    u8 rep_prefix;
                    u8 op_bytes;
                    u8 ad_bytes;
                    struct operand src;
                    struct operand dst;
                    unsigned long *override_base;
                    unsigned int d;
                    unsigned long regs[NR_VCPU_REGS];
                    unsigned long eip;
                    /* modrm */
                    u8 modrm;
                    u8 modrm_mod;
                    u8 modrm_reg;
                    u8 modrm_rm;
                    u8 use_modrm_ea;
                    unsigned long modrm_ea;
                    unsigned long modrm_val;
               };
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b9faf10e0648cd7aef88e66d514fb25f539c752f
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:26:38 2007 +0200

    KVM: x86 emulator: remove unused functions
    
    Remove #ifdef functions never used
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eccc7af16b268302089c302da84f2f7d99c13164
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Sep 17 14:57:50 2007 -0500

    KVM: Refactor hypercall infrastructure (v3)
    
    This patch refactors the current hypercall infrastructure to better
    support live migration and SMP.  It eliminates the hypercall page by
    trapping the UD exception that would occur if you used the wrong hypercall
    instruction for the underlying architecture and replacing it with the right
    one lazily.
    
    A fall-out of this patch is that the unhandled hypercalls no longer trap to
    userspace.  There is very little reason though to use a hypercall to
    communicate with userspace as PIO or MMIO can be used.  There is no code
    in tree that uses userspace hypercalls.
    
    [avi: fix #ud injection on vmx]
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 32b9abd5cfe444dc5ebee1d5d0a560b72f7d17fe
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Sep 17 14:57:49 2007 -0500

    KVM: x86 emulator: Add vmmcall/vmcall to x86_emulate (v3)
    
    Add vmmcall/vmcall to x86_emulate.  Future patch will implement functionality
    for these instructions.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9bfdd08f0a65e67d0842ca35516669bec523fbe0
Author: Avi Kivity <avi@qumranet.com>
Date:   Sat Dec 22 22:14:10 2007 +0200

    KVM: MMU: Fix cmpxchg8b emulation on i386 (again)
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 433be519ccd864b860385a03a864b4024b5d2f40
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Dec 20 10:41:39 2007 +0200

    KVM: Ensure pages are copied on write
    
    Fix a userspace memory handling bug related to COW: previously we called
    get_user_pages without the force flag, and therefore it didn't break COW on
    shared pages.  This caused host memory corruption when host userspace fork()ed.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6fd19d01a523fd82db3867402ed1774589bcf6c1
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Dec 19 12:02:40 2007 +0200

    KVM: Print data for unimplemented wrmsr
    
    This can help diagnosing what the guest is trying to do.  In many cases
    we can get away with partial emulation of msrs.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 97dd1569ab4ba6844a08a65cd3bc4e79dc533d38
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Dec 18 19:47:18 2007 +0200

    KVM: MMU: Add cache miss statistic
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 89075253714a299236b8debfbfdcb670a3932a1b
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Tue Dec 18 06:08:27 2007 +0800

    KVM: MMU: Coalesce remote tlb flushes
    
    Host side TLB flush can be merged together if multiple
    spte need to be write-protected.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 04e6202e30ecd89c8e48802b64397ebaa55363e9
Author: xiantao.zhang@intel.com <xiantao.zhang@intel.com>
Date:   Mon Dec 17 20:27:27 2007 +0800

    KVM: Expose ioapic to ia64 save/restore APIs
    
    IA64 also needs to see ioapic structure in irqchip.
    
    Signed-off-by: xiantao.zhang@intel.com <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e4f8524a630ca33c3000263d29acb5c619ee94f0
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 14:21:40 2007 +0800

    KVM: Move kvm_vcpu_kick() to x86.c
    
    Moving kvm_vcpu_kick() to x86.c. Since it should be
    common for all archs, put its declarations in <linux/kvm_host.h>
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e9d06906038878357658d316b897d3105cec704e
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 14:16:14 2007 +0800

    KVM: Move ioapic code to common directory.
    
    Move ioapic code to common, since IA64 also needs it.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d4b72100521699e0d2c4295b7ec8fb53f3131c9
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Dec 17 13:59:56 2007 +0800

    KVM: Move irqchip declarations into new ioapic.h and lapic.h
    
    This allows reuse of ioapic in ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5ca24d96dbc31d7018bc71e355f824f25b70316e
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 16 14:09:36 2007 +0200

    KVM: Fix compile error in asm/kvm_host.h
    
    "types.h" refers to the wrong file.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 276db3794711c5aff691c75495d6a5327773271f
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 16 11:13:16 2007 +0200

    KVM: Move drivers/kvm/* to virt/kvm/
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 327f5d208b042f5228ff2d57054a32f57ab37c2c
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 16 11:02:48 2007 +0200

    KVM: Move arch dependent files to new directory arch/x86/kvm/
    
    This paves the way for multiple architecture support.  Note that while
    ioapic.c could potentially be shared with ia64, it is also moved.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1ca8b98d60891eea9e5083cd40db99bb3715a918
Author: Ryan Harper <ryanh@us.ibm.com>
Date:   Thu Dec 13 10:21:10 2007 -0600

    KVM: VMX: Add printk_ratelimit in vmx_intr_assist
    
    Add printk_ratelimit check in front of printk.  This prevents spamming
    of the message during 32-bit ubuntu 6.06server install.  Previously, it
    would hang during the partition formatting stage.
    
    Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c272edd56fd1201238dfe9b29f027249067ed318
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:23:23 2007 +0800

    KVM: Portability: Move kvm_vm_stat to x86.h
    
    This patch moves kvm_vm_stat to x86.h, and every arch
    can define its own kvm_vm_stat in $arch.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f0950596e4e420583b3b3e6bb168793a5cb483bd
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:20:16 2007 +0800

    KVM: Portability: Move round_robin_prev_vcpu and tss_addr to kvm_arch
    
    This patch moves two fields, round_robin_prev_vcpu and tss_addr, to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e1ff111628bb1cf49ddcaca105304e1cd83e577c
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:17:34 2007 +0800

    KVM: Portability: move vpic and vioapic to kvm_arch
    
    This patch moves two fields, vpic and vioapic, to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4bfe8599b11664e041ff1733cbe43aedf53e054d
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 10:01:48 2007 +0800

    KVM: Portability: Move mmu-related fields to kvm_arch
    
    This patch moves mmu-related fields to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f18af6836ed727e79873cf868a47cbec18e18914
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:54:20 2007 +0800

    KVM: Portability: Move memslot aliases to new struct kvm_arch
    
    This patch creates kvm_arch to hold arch-specific kvm fields
    and moves the fields naliases and aliases to kvm_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5104cf2ffdd47854a00e48fb3a089f107e7f30da
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:49:26 2007 +0800

    KVM: Portability: Move kvm_vcpu_stat to x86.h
    
    This patch moves kvm_vcpu_stat to x86.h, so every
    arch can define its own kvm_vcpu_stat structure.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9edf53496ee757f2c090ba73d49cfd18de3b5b14
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:45:31 2007 +0800

    KVM: Portability: Expand the KVM_VCPU_COMM in kvm_vcpu structure.
    
    This patch removes the KVM_VCPU_COMM macro, which originally held the
    common kvm_vcpu fields.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit acd488e9febb97780eab35d5af85fa59a72a295d
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:41:22 2007 +0800

    KVM: Portability: Move kvm_vcpu definition back to kvm.h
    
    This patch moves kvm_vcpu definition to kvm.h, and finally
    kvm.h includes x86.h.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 689ab57bf3245a685845f6906f4b47364f11af84
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Dec 14 09:35:10 2007 +0800

    KVM: Portability: Split mmu-related static inline functions to mmu.h
    
    Since these functions need to know the details of kvm or kvm_vcpu structure,
    they can't be put in x86.h.  Create mmu.h to hold them.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5b30a22d7c2ced107410de7b912916976e8d951d
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Dec 13 23:50:52 2007 +0800

    KVM: Portability: Introduce kvm_vcpu_arch
    
    Move all the architecture-specific fields in kvm_vcpu into a new struct
    kvm_vcpu_arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e315a28bf3e1e0f3d31231b106198009a152f499
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Dec 11 20:36:00 2007 +0800

    KVM: Portability: Move kvm{pic,ioapic} accessors to x86 specific code
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d5855a0ec163afd591f272e13e1f2c366794306f
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 13 14:44:12 2007 +0200

    KVM: Another cmpxchg emulation compile fix
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eeb8852f8e8588e77bde5dfbb20808e7ba70105f
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 13 14:44:12 2007 +0200

    KVM: Another cmpxchg emulation compile fix
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a5e8e8a4f7ddb211d856eb30e93e70af9d51b735
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 13 14:30:28 2007 +0200

    KVM: Another cmpxchg i386 compile fix
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eb9270cee7061e5a985eb9eac559270292b1d7ec
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 13 14:14:17 2007 +0200

    KVM: Make cmpxchg emulation compile on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 24a673eb377069509e8dad5ff7fa146e363df739
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 13 13:19:04 2007 +0200

    KVM: Fix bad kunmap_atomic() parameter in cmpxchg emulation
    
    Noticed by Uri Lublin
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9a5ee611ca79bec98e9d97e93f11df8083b8af6e
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Wed Dec 12 10:46:12 2007 -0500

    KVM: MMU: emulated cmpxchg8b should be atomic on i386
    
    Emulate cmpxchg8b atomically on i386. This is required to prevent a guest
    pte walker from seeing a split write.
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 158be9e5f1af9b04da7b7349328dc466e2220047
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Wed Dec 12 12:37:24 2007 +0100

    KVM: LAPIC: minor debugging compile fix
    
    This patch fixes a compile error of the LAPIC code with APIC debugging enabled.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d3a1ec98fe24e8693b794397e533a3d4770f8b09
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Tue Dec 11 19:12:27 2007 -0500

    KVM: MMU: Fix SMP shadow instantiation race
    
    There is a race where VCPU0 is shadowing a pagetable entry while VCPU1
    is updating it, which results in a stale shadow copy.
    
    Fix that by comparing the contents of the cached guest pte with the
    current guest pte after write-protecting the guest pagetable.
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2018f67db6923910e135b6a8aaf1c9167cfa7f5d
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Dec 11 19:23:11 2007 +0200

    KVM: MMU: Fix kunmap_atomic() call in cmpxchg_gpte()
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 51ef1ac7b23ee32bfcc61c229d634fdc1c68b38a
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Tue Dec 11 15:36:57 2007 +0100

    KVM: SVM: support writing 0 to K8 performance counter control registers
    
    This lets SVM ignore writes of the value 0 to the performance counter control
    registers.  Thus enabling them will still fail in the guest, but a write of 0
    which keeps them disabled is accepted.  This is required to boot Windows
    Vista 64bit.
    
    [avi: avoid fall-thru in switch statement]
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ced316df1d44ed441d6268e13500fef2b7ad9bfb
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Thu Dec 6 21:02:25 2007 +0100

    KVM: SVM: Exit to userspace if write to cr8 and not using in-kernel apic
    
    With this patch KVM on SVM will exit to userspace if the guest writes to CR8
    and the in-kernel APIC is disabled.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3ef4e62ae2a1c0109e20da39e7b42570c59cb3cc
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 18:43:00 2007 +0200

    KVM: MMU: Use mmu_set_spte() for real-mode shadows
    
    In addition to removing some duplicated code, this also handles the unlikely
    case of real-mode code updating a guest page table.  This can happen when
    one vcpu (in real mode) touches a second vcpu's (in protected mode) page
    tables, or if a vcpu switches to real mode, touches page tables, and switches
    back.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d1d1b5649e7e531fe5eac5e218c99289ce5a1a1d
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 18:39:41 2007 +0200

    KVM: MMU: Adjust mmu_set_spte() debug code for gpte removal
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 86a3d6593e8942de664d9999402f3fc28e83c4cd
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:40:31 2007 +0200

    KVM: MMU: Move set_pte() into guest paging mode independent code
    
    As set_pte() no longer references either a gpte or the guest walker, we can
    move it out of paging mode dependent code (which compiles twice and is
    generally nasty).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2deb9afd26d79c868f88ff9019f32e1498f1fe6c
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:33:46 2007 +0200

    KVM: MMU: Remove walker argument to set_pte()
    
    Unused.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 65d8730b56fc407c26ad725313f8c3acfc16d804
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:32:30 2007 +0200

    KVM: MMU: Pass pte dirty flag to set_pte() instead of calculating it on-site
    
    This allows us to remove its dependency on pt_element_t.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 49b2ab6e5ca086c3e3fa4abc922b296d38e59fb5
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:27:52 2007 +0200

    KVM: MMU: No need to pick up nx bit from guest pte
    
    We already set it according to cumulative access permissions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ddac3bf3e282a13d74cf61863ea100803cda671d
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 17:00:02 2007 +0200

    KVM: MMU: Fix inherited permissions for emulated guest pte updates
    
    When we emulate a guest pte write, we fail to apply the correct inherited
    permissions from the parent ptes.  Now that we store inherited permissions
    in the shadow page, we can use that to update the pte permissions correctly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 10b29d09ceec124d145977ae1e207931825fd755
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:52:56 2007 +0200

    KVM: MMU: Move pte access calculation into a helper function
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4e2cddf1e56ba13bb44fd035a669f2d7ce1874f7
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:37:36 2007 +0200

    KVM: MMU: Set nx bit correctly on shadow ptes
    
    While the page table walker correctly generates a guest page fault
    if a guest tries to execute a non-executable page, the shadow code does
    not mark it non-executable.  This means that if a guest accesses an nx
    page first with a read access, then subsequent code fetch accesses will
    succeed.
    
    Fix by setting the nx bit on shadow ptes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4347f30eaf3debcfd2b0d90573930b1b8a7de389
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 9 16:15:46 2007 +0200

    KVM: MMU: Simplify calculation of pte access
    
    The nx bit is awkwardly placed in the 63rd bit position; furthermore it
    has a reversed meaning compared to the other bits, which means we can't use
    a bitwise and to calculate compounded access masks.
    
    So, we simplify things by creating a new 3-bit exec/write/user access word,
    and doing all calculations in that.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 979158adc73ea573897a1d43f9a6cd4dc13e95d6
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Fri Dec 7 07:56:58 2007 -0500

    KVM: MMU: Use cmpxchg for pte updates on walk_addr()
    
    In preparation for multi-threaded guest pte walking, use cmpxchg()
    when updating guest pte's. This guarantees that the assignment of the
    dirty bit can't be lost if two CPU's are faulting the same address
    simultaneously.
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2066ea592f45f4bd2ad2cc0d78b7694b48976105
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 21:11:31 2007 +0200

    KVM: VMX: Fix cr8 exit optimization
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9581bd821d8ffe6c5cd4605e8c97e4bce352f1af
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 19:50:00 2007 +0200

    KVM: SVM: Trap access to the cr8 register
    
    Later we may be able to use the virtual tpr feature, but for now,
    just trap it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8893b9252a3256910c184153522cf77940f85cac
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 18:14:14 2007 +0200

    KVM: x86 emulator: Fix stack instructions on 64-bit mode
    
    Stack instructions are always 64-bit on 64-bit mode; many of the
    emulated stack instructions did not take that into account.  Fix by
    adding a 'Stack' bitflag and setting the operand size appropriately
    during the decode stage (except for 'push r/m', which is in a group
    with a few other instructions, so it gets its own treatment).
    
    This fixes random crashes on Vista x64.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c8926698e4ee6c5d7661a73aaf92e00d8c1c8d66
Author: Joerg Roedel <jroedel@lemmy.amd.com>
Date:   Thu Dec 6 15:46:52 2007 +0100

    KVM: SVM: Emulate read/write access to cr8
    
    This patch adds code to emulate the access to the cr8 register to the x86
    instruction emulator in kvm.  This is needed on svm, where there is no
    hardware decode for control register access.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3224ccab2ad15aa096fa4195a2d0f30722eb6c8c
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 16:32:45 2007 +0200

    KVM: VMX: Avoid exit when setting cr8 if the local apic is in the kernel
    
    With apic in userspace, we must exit to userspace after a cr8 write in order
    to update the tpr.  But if the apic is in the kernel, the exit is unnecessary.
    
    Noticed by Joerg Roedel.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 297a0bef8fbad4367abac85d1a976a1cc8d3c8bf
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Dec 6 16:15:02 2007 +0200

    KVM: x86 emulator: fix eflags preparation for emulation
    
    We prepare eflags for the emulated instruction, then clobber it with an 'andl'.
    Fix by popping eflags as the last thing in the sequence.
    
    Patch taken from Xen (16143:959b4b92b6bf)
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ac319e6300d31ccfe0b2a7767c5fcd9385028954
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 15:22:50 2007 +0200

    KVM: Use generalized exception queue for injecting #UD
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a78abaa8f90d60274ca818f45d4e1abe19da7bc4
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 14:12:03 2007 +0200

    KVM: Replace #GP injection by the generalized exception queue
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f93ec37e9625b2d60e2f58d0bcc123878b695260
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 14:04:58 2007 +0200

    KVM: Replace page fault injection by the generalized exception queue
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 39aaaf61fb37a66120928ad4a5b03393f5b8ffe1
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 13:41:11 2007 +0200

    KVM: Generalize exception injection mechanism
    
    Instead of each subarch doing its own thing, add an API for queuing an
    injection, and manage failed exception injection centrally (i.e., if
    an inject failed due to a shadow page fault, we need to requeue it).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d8bb219b4d69baa8f65e934ef830dd742f69c001
Author: Marcelo Tosatti <marcelo@kvack.org>
Date:   Tue Dec 4 13:42:16 2007 -0500

    KVM: MMU: Remove unused prev_shadow_ent variable from fetch()
    
    Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 83f9425dc148dbe0d373b7e07f221affff9e91cb
Author: npiggin@suse.de <npiggin@suse.de>
Date:   Wed Dec 5 18:15:52 2007 +1100

    KVM: Convert KVM from ->nopage() to ->fault()
    
    Signed-off-by: Nick Piggin <npiggin@suse.de>
    Cc: kvm-devel@lists.sourceforge.net
    Cc: avi@qumranet.com
    Cc: linux-kernel@vger.kernel.org
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fc424a7c4280adc9335ed92f31772b7df3f85745
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 16:15:26 2007 -0600

    KVM: Portability: Create kvm_arch_vcpu_runnable() function
    
    This abstracts the detail of x86 hlt and INIT modes into a function.
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 359306765f53cdc7736e0303c31658239f6df825
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:25 2007 -0600

    KVM: Portability: Stop including x86-specific headers in kvm_main.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3f3c54ebf4d6f3423571be332f7965a7efb327cc
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:24 2007 -0600

    KVM: Portability: Move IO device definitions to its own header file
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 871bc1ede128496f82409b34d94a9e8e977c3cf7
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Dec 3 15:30:23 2007 -0600

    KVM: Portability: Move address types to their own header file
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit efdeac066fbc73cbb4cc2d8c8975192454ea2a50
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:53:07 2007 +0800

    KVM: Extend ioapic code to support iosapic
    
    iosapic supports an additional mmio EOI register compared to ioapic.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f2601fd2874b94d14dfcb21a3e1b84d7f3262b17
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:49:09 2007 +0800

    KVM: Replace dest_Lowest_Prio and dest_Fixed with self-defined macros
    
    Change
      dest_Lowest_Prio -> IOAPIC_LOWEST_PRIORITY
      dest_Fixed -> IOAPIC_FIXED
    
    the original names are x86 specific, while the ioapic code will be reused
    for ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9a771bb114c3e37b232ed0e798c5c7a59e4f0c4b
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Dec 2 22:35:57 2007 +0800

    KVM: Replace kvm_lapic with kvm_vcpu in ioapic/lapic interface
    
    This patch replaces lapic structure with kvm_vcpu in ioapic.c, making ioapic
    independent of the local apic, as required by ia64.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e8d3621f9f0c209bdb888811a94168e25ea731cd
Author: Carlo Marcelo Arenas Belon <carenas@sajinet.com.pe>
Date:   Sat Dec 1 06:17:11 2007 -0600

    KVM: SVM: Remove KVM specific defines for MSR_EFER
    
    This patch removes the KVM specific defines for MSR_EFER that were being used
    in the svm support file and migrates all references to use instead the ones
    from the kernel headers that are used everywhere else and that have the same
    values.
    
    Signed-off-by: Carlo Marcelo Arenas Belon <carenas@sajinet.com.pe>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a393444c97f6d7355a6d7d6d7aeb80f1e72472b1
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Dec 2 10:50:06 2007 +0200

    KVM: Export include/linux/kvm.h only if $ARCH actually supports KVM
    
    Currently, make headers_check barfs due to <asm/kvm.h>, which <linux/kvm.h>
    includes, not existing.  Rather than add a zillion <asm/kvm.h>s, export kvm.h
    only if the arch actually supports it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d535dd00b9fc5b7aadc4229dac9e68fd36381c34
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 29 15:35:39 2007 +0800

    KVM: Correct kvm_init() error paths not freeing bad_page.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 16d0ba47ac043733d756e299d7c17b4e1ea72203
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 21 04:36:41 2007 +0800

    KVM: Portability: Move KVM_INTERRUPT vcpu ioctl to x86.c
    
    Other archs don't need it.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 71be592a14aa8d127315b2c47bf83cc0d810a341
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 28 18:14:43 2007 +0200

    KVM: Don't bypass the mmu if in pae and pdptrs changed
    
    Reloading cr3 with unchanged values usually means just a tlb flush, but if in
    pae mode and the pdptrs have changed, we have to reload them.
    
    This fixes a Solaris x86 regression.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ff378422888a8b3ff296e30297fcabf1c831ff83
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:30:56 2007 +0200

    KVM: x86 emulator: unify four switch statements into two
    
    Unify the special instruction switch with the regular instruction switch,
    and the two byte special instruction switch with the regular two byte
    instruction switch.  That makes it much easier to find an instruction or
    the place an instruction needs to be added in.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 924894fc582925140ed0648b3ca54af6df69d02e
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:14:21 2007 +0200

    KVM: x86 emulator: unify two switches
    
    The rep prefix cleanup left two switch () statements next to each other.
    Unify them.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 55915c4ddc358f4199f7c14650da1ec998832db8
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 27 19:05:37 2007 +0200

    KVM: x86 emulator: Move rep processing before instruction execution
    
    Currently rep processing is handled somewhere in the middle of instruction
    processing.  Move it to a sensible place.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0d84c92a54aff9ca6f9e25f4551d1af34eb59a8c
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 26 18:30:07 2007 +0200

    KVM: Fix cpuid2 killing 32-bit guests on non-NX machines
    
    KVM_SET_CPUID fails to remove NX when the host doesn't support it, as previous
    versions do.  On the other hand, KVM_SET_CPUID2 removes the feature even
    though, since we tell userspace about it, it shouldn't be necessary.
    
    Swap the two cases.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d1923b98c89ec7d5fbcd852d07bd987e16aea02a
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 26 08:33:53 2007 -0600

    KVM: Add ifdef in irqchip struct for x86 only structures
    
    This patch fixes a small issue where structures:
    	kvm_pic_state
    	kvm_ioapic_state
    
    are defined inside x86 specific code and may or may not
    be defined in any way for other architectures. The problem
    caused is one cannot compile userspace apps (ex. libkvm)
    for other archs since a size cannot be determined for these
    structures.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 282cb7a36798ae74de609538354d92ee9b704ca2
Author: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date:   Mon Nov 26 13:49:09 2007 +0100

    KVM: x86 emulator: cmps instruction
    
    Add emulation for the cmps instruction.  This lets OpenBSD boot on kvm.
    
    Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8fc822737d95c85ea1db8a71fd835db11c0bdaa7
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 26 16:10:43 2007 +0200

    KVM: x86 emulator: rename REP_REPE_PREFIX
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 259ad7f05da1720f7bf88a00bec795dd7b517810
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 26 14:08:14 2007 +0200

    KVM: MMU: mark pages that were inserted to the shadow pages table as accessed
    
    Mark guest pages as accessed when removed from the shadow page tables for
    better lru processing.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8a29e7fd6400c8c14f4d8f7650c00a7e514d2829
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 25 17:45:31 2007 +0200

    KVM: Remove misleading check for mmio during event injection
    
    mmio was already handled in kvm_arch_vcpu_ioctl_run(), so no need to check
    again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 694401697ccd822bb08019731c3ee1bb34323d8e
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Nov 23 09:29:11 2007 +0200

    KVM: Revert segment_descriptor.h removal
    
    Mainline isn't ready for it yet.
    
    This reverts commits 9f9d8a265d28fc36cfc4f08af0210d4f8da5b147,
    51727a110220681f6f43b005d069e28c58f5d151,
    7a819d50439ec2801d9aad5850e76f183360a37a, and
    d24618db16bd6d6d7ea6129e2de8433416f81823.
    
    Reported by Xiantao Zhang.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9f9d8a265d28fc36cfc4f08af0210d4f8da5b147
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Thu Nov 22 17:24:01 2007 +0100

    KVM: Remove desc.h include in kvm_main.c
    
    This patch removes the include of asm/desc.h in kvm_main.c, which is
    only available for x86 and not needed anymore.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8df313eea0f1db990da6798d86b60fc2c2f1b57a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 14:16:12 2007 +0200

    KVM: x86 emulator: address size and operand size overrides are sticky
    
    Current implementation is to toggle, which is incorrect.  Patch ported from
    corresponding Xen code.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 06ac686984864851df4556dcfe184238a66e71c2
Author: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date:   Thu Nov 22 11:32:09 2007 +0100

    KVM: x86 emulator: Make a distinction between repeat prefixes F3 and F2
    
    cmps and scas instructions accept repeat prefixes F3 and F2. So in
    order to emulate those prefixed instructions we need to be able to know
    if prefixes are REP/REPE/REPZ or REPNE/REPNZ. Currently kvm doesn't make
    this distinction. This patch introduces this distinction.
    
    Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 368170736f3fa12d2c4846f01c5c2aaac19a22d6
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 22 11:20:33 2007 +0800

    KVM: Portability: Move unalias_gfn to arch dependent file
    
    Non-x86 archs don't need this mechanism. Move it to arch, and
    keep its interface in common.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cad4dbe630b663d1d21f928984fd170595aef498
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Wed Nov 21 14:33:25 2007 +0800

    KVM: VMX: Remove the secondary execute control dependency on irqchip
    
    The state of SECONDARY_VM_EXEC_CONTROL shouldn't depend on in-kernel IRQ chip,
    this patch fixes this.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bf3cee1783e143e4d24423bea59caa5f874cfc92
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 12:18:10 2007 +0200

    KVM: x86 emulator: Fix instruction fetch cache hit check
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 316bd49570ab7b1812bdd954ccd59ad8bc9e9ddc
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 11:42:59 2007 +0200

    KVM: Fix faults during injection of real-mode interrupts
    
    If vmx fails to inject a real-mode interrupt while fetching the interrupt
    redirection table, it fails to record this in the vectoring information
    field.  So we detect this condition and do it ourselves.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8fd969cf9a7fca9d4599b839e1995a6403d5a69e
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 22 11:30:47 2007 +0200

    KVM: VMX: Read & store IDT_VECTORING_INFO_FIELD
    
    We'll want to write to it in order to fix real-mode irq injection problems,
    but it is a read-only field.  Storing it in a variable solves that issue.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 51727a110220681f6f43b005d069e28c58f5d151
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 19:17:33 2007 +0200

    KVM: Fix compile error on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7a819d50439ec2801d9aad5850e76f183360a37a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 18:35:55 2007 +0200

    KVM: Remove segment_descriptor, part 2
    
    Turns out there was another definition somewhere else.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d24618db16bd6d6d7ea6129e2de8433416f81823
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 17:34:40 2007 +0200

    KVM: Replace private 'struct segment descriptor' by x86's desc_struct
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 57d03e8414672b5730ef7e11bfc8412219ec1553
Author: Dan Kenigsberg <danken@qumranet.com>
Date:   Wed Nov 21 17:10:04 2007 +0200

    KVM: Enhance guest cpuid management
    
    The current cpuid management suffers from several problems, which inhibit
    passing through the host feature set to the guest:
    
     - No way to tell which features the host supports
    
      While some features can be supported with no changes to kvm, others
      need explicit support.  That means kvm needs to vet the feature set
      before it is passed to the guest.
    
     - No support for indexed or stateful cpuid entries
    
      Some cpuid entries depend on ecx as well as on eax, or on internal
      state in the processor (running cpuid multiple times with the same
      input returns different output).  The current cpuid machinery only
      supports keying on eax.
    
     - No support for save/restore/migrate
    
      The internal state above needs to be exposed to userspace so it can
      be saved or migrated.
    
    This patch adds extended cpuid support by means of three new ioctls:
    
     - KVM_GET_SUPPORTED_CPUID: get all cpuid entries the host (and kvm)
       supports
    
     - KVM_SET_CPUID2: sets the vcpu's cpuid table
    
     - KVM_GET_CPUID2: gets the vcpu's cpuid table, including hidden state
    
    Signed-off-by: Dan Kenigsberg <danken@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2a4922c0219e85b2acd92b69644864baa20b2e96
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 16:41:05 2007 +0200

    KVM: Disallow fork() and similar games when using a VM
    
    We don't want the meaning of guest userspace changing under our feet.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a96b251aa077206f06481c94f8a0791b3cebf3a8
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:32:41 2007 +0200

    KVM: MMU: Rename 'release_page'
    
    Rename the awkwardly named variable.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 000e8b3948c9e3eebc133e585381ec6819d545c5
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:28:32 2007 +0200

    KVM: MMU: Rename variable of type 'struct kvm_mmu_page *'
    
    These are traditionally named 'page', but even more traditionally, that name
    is reserved for variables that point to a 'struct page'.  Rename them to 'sp'
    (for "shadow page").
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit eff7b4c194c24bea98d1f3460ea99b57ffcd7708
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 15:01:44 2007 +0200

    KVM: Remove gpa_to_hpa()
    
    Converting last uses along the way.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ee4d8423330ef532cad2f7eb08e1c756eb8ba1c6
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:57:44 2007 +0200

    KVM: MMU: Remove gva_to_hpa()
    
    No longer used.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4a5a77187742a411111394d0ad4ddf463e8eb5f7
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:54:16 2007 +0200

    KVM: MMU: Simplify nonpaging_map()
    
    Instead of passing an hpa, pass a regular struct page.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2847f40d0f9ca46ce5a6e090f6ba73cec51082cf
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:44:45 2007 +0200

    KVM: MMU: Introduce gfn_to_gpa()
    
    Converting a frame number to an address is tricky since the data type changes
    size.  Introduce a function to do it.  This fixes an actual bug when
    accessing guest ptes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0c621b19fab7f7dcd859ec4f159accd5145461df
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:20:22 2007 +0200

    KVM: MMU: Adjust page_header_update_slot() to accept a gfn instead of a gpa
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0a2c38f403b2be6cfb1bd63eff7baa4004463c0f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:16:30 2007 +0200

    KVM: MMU: Merge set_pte() and set_pte_common()
    
    Since set_pte() is now the only caller of set_pte_common(), merge the two
    functions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f1336a6a39c8afb430e7673c31bf279186c12910
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:11:49 2007 +0200

    KVM: MMU: Remove set_pde()
    
    It is now identical to set_pte().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3aafe8270b660e947d6f7d5feba6f706bca96140
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 14:08:40 2007 +0200

    KVM: MMU: Remove extra gaddr parameter from set_pte_common()
    
    Similar information is available in the gfn parameter, so use that.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ef44f09e80ff7d68b0dbe2d58b210079fe8990a0
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 13:54:47 2007 +0200

    KVM: MMU: Move pse36 handling to the guest walker
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1916b5e84c3dfa5fd085f4c3e98a4a700693c4e4
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 12:35:07 2007 +0200

    KVM: MMU: Introduce and use gpte_to_gfn()
    
    Instead of repetitively open-coding this.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 45b905fe748de97264edf476b011fece80e3f343
Author: Izik Eidus <izike@localhost.localdomain>
Date:   Tue Nov 20 12:02:12 2007 +0200

    KVM: MMU: Code cleanup
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 76559d9ad5324f466289c493d473259cd095fea0
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 02:57:59 2007 +0200

    KVM: Don't bother the mmu if cr3 load doesn't change cr3
    
    If the guest requests just a tlb flush, don't take the vm lock and
    drop the mmu context pointlessly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 798d6e84e6936206bfd9db9426ffb3687673b3c3
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 02:07:27 2007 +0200

    KVM: Add parentheses to silence gcc
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d07badf24aea04516a84676c08d5e0d4073363e8
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 21 02:06:21 2007 +0200

    KVM: MMU: Avoid unnecessary remote tlb flushes when guest updates a pte
    
    If all we're doing is increasing permissions on a pte (typical for demand
    paging), then there's no need to flush remote tlbs.  Worst case they'll
    get a spurious page fault.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 277eec7b001e006fcc21058d9b54dd5eda5c6d06
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 23:01:14 2007 +0200

    KVM: Add statistic for remote tlb flushes
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c91b40158f97160c5595a9c3a8829e49c298e7dd
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 21:39:54 2007 +0200

    KVM: MMU: Implement guest page fault bypass for nonpae
    
    I spent an hour worrying why I see so many guest page faults on FC6 i386.
    Turns out bypass wasn't implemented for nonpae.  Implement it so it doesn't
    happen again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 99040d8a4d9e4e6a49263e71d1832d360561a4e6
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 15:30:24 2007 +0200

    KVM: Split vcpu creation to avoid vcpu_load() before preemption setup
    
    Split kvm_arch_vcpu_create() into kvm_arch_vcpu_create() and
    kvm_arch_vcpu_setup(), enabling preemption notification between the two.
    This means that we can now do vcpu_load() within kvm_arch_vcpu_setup().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fed1ca74c178299ab544795064d1c2636cbe78a9
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Nov 20 16:25:04 2007 +0800

    KVM: Portability:  Split kvm_set_memory_region() to have an arch callout
    
    Moving !user_alloc case to kvm_arch to avoid unnecessary
    code logic in non-x86 platform.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3e2a2eca6961927778bb69d559a40183a85910c3
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Tue Nov 20 13:11:38 2007 +0800

    KVM: Recalculate mmu pages needed for every memory region change
    
    Instead of incrementally changing the mmu cache size for every memory slot
    operation, recalculate it from scratch.  This is simpler and safer.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 90557b67ec617449792f4ca4181d874edce14bea
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Mon Nov 19 17:57:35 2007 +0200

    KVM: SVM: Fix FPU leak and re-enable lazy FPU switching
    
    The clts code didn't use set_cr0 properly, so our lazy FPU
    processing wasn't being done by the clts instruction at all.
    
    This fixes all the FPU leaks, so re-enabling lazy FPU
    optimization.
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 20046029f8b71bf0ed949542e5eecb3607619511
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 13:15:52 2007 +0200

    KVM: x86 emulator: prefetch up to 15 bytes of the instruction executed
    
    Instead of fetching one byte at a time, prefetch 15 bytes (or until the next
    page boundary) to avoid guest page table walks.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4df10721942bbdf93e4c7c168e775000a66fce00
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 12:49:31 2007 +0200

    KVM: x86 emulator: retire ->write_std()
    
    Theoretically used to access memory known to be ordinary RAM, it was
    never implemented.  It is questionable whether it is possible to implement
    it correctly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8afc7a0f22d7c5d8f5cbc3f0cfb0cd3d2bc66605
Author: Izik Eidus <izike@localhost.localdomain>
Date:   Tue Nov 20 11:49:33 2007 +0200

    KVM: MMU: Selectively set PageDirty when releasing guest memory
    
    Improve dirty bit setting for pages that kvm releases.  Until now, every
    page that we released was marked dirty; from now on, only pages that have
    the potential to get dirty are marked dirty.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 86faf46a83fdf65c90aa87ff3e27509436db97ed
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Nov 20 11:30:04 2007 +0200

    KVM: MMU: Fix potential memory leak with smp real-mode
    
    When we map a page, we check whether some other vcpu mapped it for us and if
    so, bail out.  But we should decrease the refcount on the page as we do so.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cc02b6a48c18507ab445000cfd2ef17724c39a7b
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 20 11:45:14 2007 +0200

    KVM: Export include/asm-x86/kvm.h
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b92527793d12ce0f412cfe67aff959b1047ff255
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:37 2007 -0600

    KVM: Portability: Move cpuid structures to <asm/kvm.h>
    
    This patch moves structures:
    	kvm_cpuid_entry
    	kvm_cpuid
    
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ea36fb9556c5cf3121fd8cfb30f273c384a44cb4
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:36 2007 -0600

    KVM: Portability: Move kvm_sregs and msr structures to <asm/kvm.h>
    
    Move structures:
    	kvm_sregs
    	kvm_msr_entry
    	kvm_msrs
    	kvm_msr_list
    
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7cf1d50b5293d7b505e2c4ecae4c52e42d08383e
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:35 2007 -0600

    KVM: Portability: Move kvm_segment & kvm_dtable structure to  <asm/kvm.h>
    
    This patch moves structures:
    	kvm_segment
    	kvm_dtable
    from include/linux/kvm.h to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bf1b9719c3cd2744e8e1c61c9f710339062e82b1
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:34 2007 -0600

    KVM: Portability: Move structure lapic_state to <asm/kvm.h>
    
    This patch moves structure lapic_state from include/linux/kvm.h
    to include/asm-x86/kvm.h
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 139a72a019300db50cc0bbff3f08fae182ad5295
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:33 2007 -0600

    KVM: Portability: Move kvm_regs to <asm/kvm.h>
    
    This patch moves structure kvm_regs to include/asm-x86/kvm.h.
    Each architecture will need to create their own version of this
    structure.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6d441a253607c9d753c002fa8384ab864d15493d
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:32 2007 -0600

    KVM: Portability: Move x86 pic structures
    
    This patch moves structures:
    	kvm_pic_state
    	kvm_ioapic_state
    
    to include/asm-x86/kvm.h.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7bb420c0b02bff8a555a55cf8f0c181f80ece640
Author: Jerone Young <jyoung5@us.ibm.com>
Date:   Mon Nov 19 17:06:31 2007 -0600

    KVM: Portability: Move kvm_memory_alias to asm/kvm.h
    
    This patch moves struct kvm_memory_alias from include/linux/kvm.h
    to include/asm-x86/kvm.h. Also have include/linux/kvm.h include
    include/asm/kvm.h.
    
    Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2f89106aa04dbc1ea5128734e5851eafb599570b
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:44 2007 -0600

    KVM: Move misplaced comment
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1ac3d1e5a5d87d0f92b324ae8e4022d1a79e5342
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:43 2007 -0600

    KVM: Correct consistent typo: "destory" -> "destroy"
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 15aab1f1b230180fa281861f4f3f502f97ec4ac1
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Mon Nov 19 14:04:45 2007 -0600

    KVM: Remove unused "rmap_overflow" variable
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e724f7e84ce40f4284f428e1766cc49b2712be4e
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 19 18:44:15 2007 +0200

    KVM: MMU: Remove unused variable
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1f4870283a5a27992a48a831bf4efcfea0f12c3f
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Nov 19 18:28:09 2007 +0200

    KVM: Add missing #include <asm/pgtable.h>
    
    Needed for empty_zero_page.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aa79833baa1491d6f8ed0158045eb92736194672
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 19 11:16:57 2007 +0200

    kvm: simplify kvm_clear_guest_page()
    
    Use kvm_write_guest_page() with empty_zero_page, instead of doing
    kmap and memset.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6589612b279ac8fc98dc32c2beaccaf73a123db3
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Nov 19 11:28:19 2007 +0200

    KVM: MMU: Change guest pte access to kvm_{read,write}_guest()
    
    Things are simpler and more regular this way.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9fbcc4a1b7cf873a5aa1a357320fb82d588aa316
Author: Jan Kiszka <jan.kiszka@siemens.com>
Date:   Mon Nov 19 10:21:45 2007 +0100

    KVM: VMX: Force seg.base == (seg.sel << 4) in real  mode
    
    Ensure that segment.base == segment.selector << 4 when entering the real
    mode on Intel so that the CPU will not bark at us.  This fixes some old
    protected mode demo from http://www.x86.org/articles/pmbasics/tspec_a1_doc.htm.
    
    Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3f3b5523cbea87a7f5fb956d6d9934db8906c720
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 15:24:28 2007 +0800

    KVM: Portability: Move some function declarations to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 52377f9e6e1b698819778db98c0c849becefacaa
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 15:08:31 2007 +0800

    KVM: Move some static inline functions out from kvm.h into x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f71f7978a42662d1334eab5e4247dd62f801c627
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:56:05 2007 +0800

    KVM: Portability: Move vcpu regs enumeration definition to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e165cb34cb45b847a091037d8063e215f0aab839
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:40:47 2007 +0800

    KVM: Portability: Move struct kvm_x86_ops definition to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5f45b1657504e8bc4c2a75ed4a320bb6ef230722
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Mon Nov 19 14:33:37 2007 +0800

    KVM: Portability: Move some macro definitions from kvm.h to x86.h
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 67935bd1f29066d71fcc90b95ca80317d4a61114
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Nov 18 20:43:21 2007 +0800

    KVM: Portability: MMU initialization and teardown split
    
    Move out kvm_mmu init and exit functionality from kvm_main.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 655f2b2de26803d94285ca632dbb7a65939a52e9
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sun Nov 18 20:29:43 2007 +0800

    KVM: Portability: Move kvm_vcpu_ioctl_get_dirty_log to arch-specific  file
    
    Meanwhile keep the interface in common, and leave as much logic in common
    as possible.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3fbf24b10ff70174876626caf67090a8b82d1c3d
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Sun Nov 18 22:42:47 2007 +0530

    KVM: SVM: Disable Lazy FPU optimization
    
    Host FPU state is leaked into the guest FPU state. This happens
    because of the lazy FPU optimization, so just reload the FPU
    each time there's a VM exit/entry.
    
    The real fix should follow soon.
    
    This is observed in cases where fonts in a guest aren't rendered
    correctly (bug 1807560).
    
    The test program, courtesy Avi is:
    
    double test_fpu_once()
    {
        int i;
        double f = 0;
    
        for (i = 0; i < 10000000; ++i)
            f += 1 / (1.0 + i);
        return f;
    }
    
    void test_fpu()
    {
        double a, b;
        int runs;
    
        runs = 0;
        a = test_fpu_once();
        while (1) {
            b = test_fpu_once();
            if (fabs(a - b) > 1e-9)
                printf("error: %20.16f -> %20.16f\n", a, b);
            a = b;
            if (++runs % 100 == 0)
                printf("runs: %8d\n", runs);
        }
    }
    
    int main(int ac, char **av)
    {
        test_fpu();
        return 0;
    }
    
    Run this on the host and the guest, both pinned to the same
    host CPU.
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 15aa6fbcd5908038d328eea250c282b2224cff1d
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Thu Nov 15 18:38:46 2007 +0200

    KVM: Make unloading of FPU state when putting vcpu arch-independent
    
    Instead of having each architecture do it individually, we
    do this in the arch-independent code (just x86 as of now).
    Turns out SVM did not do this at all.
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3fefa5b661688e3f82179047047b98a27b1255ac
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Sun Nov 18 22:25:40 2007 +0530

    KVM: x86 emulator: Use emulator_write_emulated and not emulator_write_std
    
    emulator_write_std() is not implemented, and calling write_emulated should
    work just as well in place of write_std.
    
    Fixes emulator failures with the push r/m instruction.
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 90af65e22b7ef28aa0dc6d16ff42dee1b5329a82
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 16:37:07 2007 +0200

    KVM: MMU: Add some mmu statistics
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 65bee3dea12864c90e59985802b38b5d1726e411
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 16:24:12 2007 +0200

    KVM: Extend stats support for VM stats
    
    This is in addition to the current virtual cpu statistics.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d600e85f82c9da9641dfd593c70706dc398a6a35
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 15:17:51 2007 +0200

    KVM: Add instruction emulation statistics

commit 5b355c536b29219696ebd62c05ed6b62a209e19d
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 13:54:33 2007 +0200

    KVM: Add fpu_reload counter
    
    Measure the number of times we switch the fpu state.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3ceafc84da4130359f38a6c9870e8d434eca75cb
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 18 13:50:24 2007 +0200

    KVM: Replace 'light_exits' stat with 'host_state_reload'
    
    This is a little more accurate (since it counts actual reloads, not potential
    reloads), and reverses the sense of the statistic to measure a bad event like
    most of the other stats (e.g. we want to minimize all counters).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 070b3d6d8c2d62af9331d64a26a7abb0bc215329
Author: Zhang Xiantao <xiantao@vtsmp-build32.los-vmm.org>
Date:   Sun Nov 18 18:43:45 2007 +0800

    KVM: Portability: Add two hooks to handle kvm_create and destroy vm
    
    Add two arch hooks to handle kvm_create_vm and kvm destroy_vm. Now, just
    put io_bus init and destroy in common.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b99467ab695205c63a0614dc3a33559a9dc98c60
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Nov 16 14:38:21 2007 +0800

    KVM: Remove __init attributes for kvm_init_debug and kvm_init_msr_list
    
    Since their callers are not declared with __init.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d0399b219b6ded177f7ddef7cf69ba3c0a94e5af
Author: Joe Perches <joe@perches.com>
Date:   Mon Nov 12 20:06:51 2007 -0800

    KVM: Remove ptr comparisons to 0
    
    Fix sparse warnings "Using plain integer as NULL pointer"
    
    Signed-off-by: Joe Perches <joe@perches.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ebc1a8eca279a2387248d1950cc66fff9f6f6e67
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Fri Nov 16 16:29:15 2007 +0800

    KVM: x86 emulator: Rename 'cr2' to 'memop'
    
    Previous patches have removed the dependency on cr2; we can now stop passing
    it to the emulator and rename uses to 'memop'.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c3763adbd51d15ac1a158c4d51314237ee5be6ff
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Fri Nov 16 13:05:55 2007 +0800

    KVM: Portability: Make kvm_vcpu_ioctl_translate arch dependent
    
    Move kvm_vcpu_ioctl_translate to arch, since mmu would be put under arch.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aed946861467c0ded1e10c6f18a0b39f503addf2
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 15 18:06:18 2007 +0200

    KVM: VMX: Consolidate register usage in vmx_vcpu_run()
    
    We pass vcpu, vmx->fail, and vmx->launched to assembly code, but all three
    are fields within vmx.  Consolidate by only passing in vmx and offsets for
    the rest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e42ff0b65e30b0b2f8c022e113d0c6fd607d4eef
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Thu Nov 15 23:07:47 2007 +0800

    KVM: Portability: move KVM_CHECK_EXTENSION
    
    Make KVM_CHECK_EXTENSION code into a function, all archs can define its
    capability independently.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8cdbd1f3f0b5d9599fd834de2000d2286ee05931
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Thu Nov 15 14:52:28 2007 +0800

    KVM: x86 emulator: modify 'lods', and 'stos' not to depend on CR2
    
    The current 'lods' and 'stos' depend on the incoming CR2 rather than decoding
    the memory address from registers.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e29c6434dcc64992a84e1a34a16ccfc9fd623621
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:40:21 2007 +0800

    KVM: Portability: Move x86 specific code from kvm_init() to kvm_arch()
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5b6ea43200c02c08b60123a5478a981e0ede7b99
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:39:31 2007 +0800

    KVM: Portability: Combine kvm_init and kvm_init_x86
    
    Will be called once arch module registers itself.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c981fc179788d3bd7e6ccdc285774f5fee569045
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:38:21 2007 +0800

    KVM: Portability: Add vcpu and hardware management arch hooks
    
    Add the following hooks:
    
      void decache_vcpus_on_cpu(int cpu);
      int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
      void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
      void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
      struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
      void kvm_arch_vcpu_destory(struct kvm_vcpu *vcpu);
      int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
      void kvm_arch_hardware_enable(void *garbage);
      void kvm_arch_hardware_disable(void *garbage);
      int kvm_arch_hardware_setup(void);
      void kvm_arch_hardware_unsetup(void);
      void kvm_arch_check_processor_compat(void *rtn);
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e402df0a544e9160ab68b1c666789f7007c66327
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:09:30 2007 +0800

    KVM: Portability: Move kvm_x86_ops to x86.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 744060c59e5ad694721a8503c9416cdeb5411cdc
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Wed Nov 14 20:08:51 2007 +0800

    KVM: Portability: Move some includes to x86.c
    
    Move some includes to x86.c from kvm_main.c, since the related functions
    have been moved to x86.c
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Acked-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7ee0b303e266990a54a26804d94bc24f649efcf9
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:10:22 2007 +0200

    KVM: Change kvm_{read,write}_guest() to use copy_{from,to}_user()
    
    This changes kvm_write_guest_page/kvm_read_guest_page to use
    copy_to_user/copy_from_user, as a result we get better speed
    and better dirty bit tracking.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e8906e9fe7bc376f7d89ad089a19339fe83838a4
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:05:04 2007 +0200

    KVM: introduce gfn_to_hva()
    
    Convert a guest frame number to the corresponding host virtual address.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c94fd678cc5aaa7ef62051e42199576c563c0118
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 22:02:22 2007 +0200

    KVM: add kvm_is_error_hva()
    
    Check for the "error hva", an address outside the user address space that
    signals a bad gfn.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f81fa4f8c6759d70f70cb93f802e8b5b7ee3b31c
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 11 18:37:32 2007 +0200

    KVM: Simplify CPU_TASKS_FROZEN cpu notifier handling
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a798c61d96780e45f8489f9b0dce93c313edb814
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 14:48:17 2007 +0200

    KVM: x86 emulator: remove 8 bytes operands emulator for call near instruction
    
    It is removed because it isn't supported on a real host.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d452efc6f43bbc9e945720ac8ea36461c514001a
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 14:46:34 2007 +0200

    KVM: x86 emulator: fix the saving of the eip value
    
    This makes sure that, no matter what the operand size is,
    the whole value of the eip will be saved.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f4e63bc2d9a7e0bad6209fe6ec95fda4ad2f3e9c
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Nov 11 14:40:48 2007 +0200

    KVM: x86 emulator: fix JMP_REL
    
    Change JMP_REL to call register_address_increment(): the operand size
    should not affect the calculation of the eip; instead, ad_bytes should
    affect it.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 495dcb4309e309d9ec42dc471b1548c09a359ed8
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun Nov 11 12:28:35 2007 +0200

    KVM: VMX: wbinvd exiting
    
    Add wbinvd VM Exit support to prepare for pass-through
    device cache emulation and also enhance real time
    responsiveness.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a5cfeab46e439b2610f264eaf200e3b01f3adf04
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun Nov 11 12:27:20 2007 +0200

    KVM: VMX: Comment VMX primary/secondary exec ctl definitions
    
    Add comments for secondary/primary Processor-Based VM-execution controls.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 64276997bc49442e0cfb6a8d857274cca5ad2d39
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Nov 11 11:29:26 2007 +0200

    KVM: Go back to atomically injecting interrupts
    
    Revert b622204087b5dbdc76b39cbfa288a41d325e3e9a and
    817b54a86b0a3e0e5955714b84577101ffff9c59, and inject
    interrupts atomically in the guest switch path.  There
    are unresolved issues with injecting interrupts while
    sleeping is enabled.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ab4e018dc95e1b32b7f25746eb9b2c1385aec93e
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 8 18:19:20 2007 +0200

    KVM: VMX: Use vmx to inject real interrupts
    
    Instead of injecting real-mode interrupts by writing the interrupt frame into
    guest memory, abuse vmx by injecting a software interrupt.  We need to
    pretend the software interrupt instruction had a length > 0, so we have to
    adjust rip backward.
    
    This lets us avoid writing guest memory, which is complex and also
    sleeps.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1805646bfeeea2365d49c46abcd61f2c0d21e677
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 6 11:29:56 2007 +0200

    KVM: VMX: Avoid reloading host efer on cpus that don't have it
    
    Some VT-capable processors (like the T2600) don't have an efer, so don't try
    to reload it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f3cb18371632dd89d4158319baaa4137cb0f1bbd
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 7 17:14:18 2007 +0200

    KVM: SVM: Defer nmi processing until switch to host state is complete
    
    If we stgi() too soon, nmis can reach the processor even though interrupts
    are disabled, catching it in a half-switched state.  Delay the stgi() until
    we're done switching.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 94ef4576bb78d429e4f25cdc09e348434c76c1a3
Author: Dor Laor <dor.laor@qumranet.com>
Date:   Wed Nov 7 16:20:06 2007 +0200

    KVM: Add make_page_dirty() to kvm_clear_guest_page()
    
    Every write access to guest pages should be tracked.
    
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 157117844e1d228db58dfe7fbd51c5a225ceafe4
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Nov 7 12:57:23 2007 +0200

    KVM: SVM: Fix SMP with kernel apic
    
    AP processor needs to reset to the SIPI vector, not normal INIT.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 35e0154eeb0ace014ecf4cc2bdb93a42ee762aca
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Tue Nov 6 11:37:44 2007 +0800

    KVM: VMX: Fix repeated allocation of apic access page on smp
    
    For SMP guest, alloc_apic_access_page() would be called more than once.
    So only the last one works, and SMP guests can't benefit from FlexPriority.
    
    This patch fixed this issue.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9c86738fc9c62e5e801720a1cea3d982f9e66ec1
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Nov 6 12:01:44 2007 +0200

    KVM: x86 emulator: Move one-byte insns with reg operand into one-byte section
    
    Previously, they were decoded as 'ImplicitOps', but now that we decode
    them as SrcReg or DstReg, they should be in the regular section.
    
    Thanks to Alexey Eremenko for a good bisect.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c388ba81841a1ed30a5dd4f6029db2263ad1aca3
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Thu Nov 1 14:16:10 2007 -0500

    KVM: Portability: Move x86 vcpu ioctl handlers to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a9acc2800d8676d8a9a91aeaedd16ae4f75c05df
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:25 2007 -0500

    KVM: Portability: Move x86 FPU handling to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f0bed919cd92ebc5f6456061c2a1e3936a9cf4bb
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:24 2007 -0500

    KVM: Portability: Move x86 instruction emulation code to x86.c
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 24c5e5dfaafbb5fe3fbb35f456dc5293c7efa2f9
Author: Hollis Blanchard <hollisb@us.ibm.com>
Date:   Wed Oct 31 17:24:23 2007 -0500

    KVM: Portability: Make exported debugfs data architecture-specific
    
    Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 92f0fc32feb5ed773272ca0a4275f9c3d91666b6
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Nov 1 06:31:28 2007 +0200

    KVM: x86 emulator: Hoist modrm and abs decoding into separate functions
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aadea67210c8b9e7a57744a1c2845501d2cdbac7
Author: Uri Lublin <uril@qumranet.com>
Date:   Tue Oct 30 10:42:09 2007 +0200

    KVM: Make mark_page_dirty() work for aliased pages too.
    
    Recommended by Izik Eidus.
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d49f75b8587e4078013124b76e89986201e31bd2
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 11:21:06 2007 +0200

    KVM: Simplify decode_register_operand() calling convention
    
    Now that rex_prefix is part of the decode cache, there is no need to pass
    it along.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e49ba82c02da109f272da66a7b11d07da24146e4
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 11:15:56 2007 +0200

    KVM: x86 emulator: centralize decoding of one-byte register access insns
    
    Instructions like 'inc reg' that have the register operand encoded
    in the opcode are currently specially decoded.  Extend
    decode_register_operand() to handle that case, indicated by having
    DstReg or SrcReg without ModRM.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cc528786cfc5567c8478f57e9ba27abbe899cfe1
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 31 10:27:04 2007 +0200

    KVM: x86 emulator: Extract the common code of SrcReg and DstReg
    
    Share the common parts of SrcReg and DstReg decoding.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit baade4336b89881dfde8664339c888cbad749e21
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:25 2007 +0100

    KVM: Portability: Move pio emulation functions to x86.c
    
    This patch moves implementation of the following functions from
    kvm_main.c to x86.c:
    free_pio_guest_pages, vcpu_find_pio_dev, pio_copy_data, complete_pio,
    kernel_pio, pio_string_write, kvm_emulate_pio, kvm_emulate_pio_string
    
    The function inject_gp, which was duplicated by yesterday's patch
    series, is removed from kvm_main.c now because it is not needed anymore.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e20bd3021d1709638c08ce946e7fe0f21a78107d
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:21 2007 +0100

    KVM: Portability: Move x86 emulation and mmio device hook to x86.c
    
    This patch moves the following functions from kvm_main.c to x86.c:
    emulator_read/write_std, vcpu_find_pervcpu_dev, vcpu_find_mmio_dev,
    emulator_read/write_emulated, emulator_write_phys,
    emulator_write_emulated_onepage, emulator_cmpxchg_emulated,
    get_segment_base, emulate_invlpg, emulate_clts, emulator_get/set_dr,
    kvm_report_emulation_failure, emulate_instruction
    
    The following data type is moved to x86.c:
    struct x86_emulate_ops emulate_ops
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d465b5f219d61e68f523dbe42f99fe7ec6787e2e
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Tue Oct 30 18:44:17 2007 +0100

    KVM: Portability: Move kvm_get/set_msr[_common] to x86.c
    
    This patch moves the implementation of the functions of kvm_get/set_msr,
    kvm_get/set_msr_common, and set_efer from kvm_main.c to x86.c. The
    definition of EFER_RESERVED_BITS is moved too.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 84397028b8a819ad3c102d0e55f9010e8a43875f
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Oct 29 15:15:20 2007 -0500

    KVM: Fix gfn_to_page() acquiring mmap_sem twice
    
    KVM's nopage handler calls gfn_to_page() which acquires the mmap_sem when
    calling out to get_user_pages().  nopage handlers are already invoked with the
    mmap_sem held though.  Introduce a __gfn_to_page() for use by the nopage
    handler which requires the lock to already be held.
    
    This was noticed by tglx.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a1ac91208be54463677c391b678f82a10b8f6b43
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Mon Oct 29 09:40:42 2007 +0800

    KVM: VMX: Enable memory mapped TPR shadow (FlexPriority)
    
    This patch is based on the CR8/TPR patch, and enables the TPR shadow (FlexPriority)
    for 32bit Windows.  Since TPR is accessed very frequently by 32bit
    Windows, especially SMP guest, with FlexPriority enabled, we saw significant
    performance gain.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 650d273133f478c40d1a5c720676d8a5687cbf7d
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:09:35 2007 +0100

    KVM: Portability: Move control register helper functions to x86.c
    
    This patch moves the definitions of CR0_RESERVED_BITS,
    CR4_RESERVED_BITS, and CR8_RESERVED_BITS along with the following
    functions from kvm_main.c to x86.c:
    set_cr0(), set_cr3(), set_cr4(), set_cr8(), get_cr8(), lmsw(),
    load_pdptrs()
    The static function wrapper inject_gp is duplicated in kvm_main.c and
    x86.c for now, the version in kvm_main.c should disappear once the last
    user of it is gone too.
    The function load_pdptrs is no longer static, and now defined in x86.h
    for the time being, until the last user of it is gone from kvm_main.c.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 77c9dcf8e969f79483a8fef6eee35a462b013954
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:09:10 2007 +0100

    KVM: Portability: move get/set_apic_base to x86.c
    
    This patch moves the implementation of get_apic_base and set_apic_base
    from kvm_main.c to x86.c
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fba9a282a95211407af2a00f500868ae9846c75e
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:08:51 2007 +0100

    KVM: Portability: Move memory segmentation to x86.c
    
    This patch moves the definition of segment_descriptor_64 for AMD64 and
    EM64T from kvm_main.c to segment_descriptor.h. It also adds a proper
    #ifndef...#define...#endif around that header file.
    The implementation of segment_base is moved from kvm_main.c to x86.c.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 13974a5a9c61c0c1732dfe42c02d703830b5db3a
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Mon Oct 29 16:08:35 2007 +0100

    KVM: Portability: Split kvm_vm_ioctl v3
    
    This patch splits kvm_vm_ioctl into architecture independent parts, and
    x86 specific parts which go to kvm_arch_vcpu_ioctl in x86.c.
    The patch is unchanged since last submission.
    
    Common ioctls for all architectures are:
    KVM_CREATE_VCPU, KVM_GET_DIRTY_LOG, KVM_SET_USER_MEMORY_REGION
    
    x86 specific ioctls are:
    KVM_SET_MEMORY_REGION,
    KVM_GET/SET_NR_MMU_PAGES, KVM_SET_MEMORY_ALIAS, KVM_CREATE_IRQCHIP,
    KVM_CREATE_IRQ_LINE, KVM_GET/SET_IRQCHIP
    KVM_SET_TSS_ADDR
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Hollis Blanchard <hollisb@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b95061aec006bc4c44e4b244e4ec15c009ab880a
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 18:52:05 2007 +0200

    KVM: MMU: Topup the mmu memory preallocation caches before emulating an insn
    
    Emulation may cause a shadow pte to be instantiated, which requires
    memory resources.  Make sure the caches are filled to avoid an oops.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e9e1741b896e474238422b3954eb05ff06075d61
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 18:48:59 2007 +0200

    KVM: Move page fault processing to common code
    
    The code that dispatches the page fault and emulates if we failed to map
    is duplicated across vmx and svm.  Merge it to simplify further bugfixing.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 302db0cf4a8a9eb286190abfec5d654cf61d6883
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 16:34:25 2007 +0200

    KVM: x86 emulator: don't depend on cr2 for mov abs emulation
    
    The 'mov abs' instruction family (opcodes 0xa0 - 0xa3) still depends on cr2
    provided by the page fault handler.  This is wrong for several reasons:
    
    - if an instruction accessed misaligned data that crosses a page boundary,
      and if the fault happened on the second page, cr2 will point at the
      second page, not the data itself.
    
    - if we're emulating in real mode, or due to a FlexPriority exit, there
      is no cr2 generated.
    
    So, this change adds decoding for this instruction form and drops reliance
    on cr2.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9471ec5bb21c235d14bbcd4ae244b838793c513f
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 16:11:58 2007 +0200

    KVM: SVM: Intercept the 'invd' and 'wbinvd' instructions
    
    'invd' can destroy host data, and 'wbinvd' allows the guest to induce
    long (milliseconds) latencies.
    
    Noted by Ben Serebrin.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b5a23e7d1161f1182188c462353c3314010dcb6a
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 28 16:09:18 2007 +0200

    KVM: x86 emulator: invd instruction
    
    Emulate the 'invd' instruction (opcode 0f 08).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d2d0414d8203a3598fe403c18f7f612e69128fa
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:54 2007 +0200

    KVM: SVM: Let gcc to choose which registers to save (i386)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of AMD i386
    
    * Original code saves following registers:
    
        ebx, ecx, edx, esi, edi, ebp
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        ebx, ecx, edx, esi, edi
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b8a500de75585cd6989d175b40421c04ea6b1f16
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:53 2007 +0200

    KVM: SVM: Let gcc to choose which registers to save (x86_64)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of AMD x86_64.
    
    * Original code saves following registers:
    
        rbx, rcx, rdx, rsi, rdi, rbp,
        r8, r9, r10, r11, r12, r13, r14, r15
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        rbx, rcx, rdx, rsi, rdi
        r8, r9, r10, r11, r12, r13, r14, r15
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 203ebca1065ff925cc35b31c85d160b2a3ebc473
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:55 2007 +0200

    KVM: VMX: Let gcc to choose which registers to save (i386)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of intel i386.
    
    * Original code saves following registers:
    
        eax, ebx, ecx, edx, edi, esi, ebp (using popa)
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        ebx, edi, rsi
    
      - doesn't save eax because it is an output operand (vmx->fail)
    
      - cannot put ecx in clobber description because it is an input operand,
        but as we modify it and we want to keep its value (vcpu), we must
        save it (pop/push)
    
      - ebp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
      - edx is saved (pop/push) because it is reserved by GCC (REGPARM) and
        cannot be put in the clobber description.
    
      - line "mov (%%esp), %3 \n\t" has been removed because %3
        is ecx and ecx is restored just after.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit acc3a1756feee379b35a913da2e7fd8e1deaeb98
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 25 14:18:52 2007 +0200

    KVM: VMX: Let gcc to choose which registers to save (x86_64)
    
    This patch lets GCC determine which registers to save when we
    switch to/from a VCPU in the case of intel x86_64.
    
    * Original code saves following registers:
    
        rax, rbx, rcx, rdx, rsi, rdi, rbp,
        r8, r9, r10, r11, r12, r13, r14, r15
    
    * Patched code:
    
      - informs GCC that we modify following registers
        using the clobber description:
    
        rbx, rdi, rsi,
        r8, r9, r10, r11, r12, r13, r14, r15
    
      - doesn't save rax because it is an output operand (vmx->fail)
    
      - cannot put rcx in clobber description because it is an input operand,
        but as we modify it and we want to keep its value (vcpu), we must
        save it (pop/push)
    
      - rbp is saved (pop/push) because GCC seems to ignore its use in the clobber
        description.
    
      - rdx is saved (pop/push) because it is reserved by GCC (REGPARM) and
        cannot be put in the clobber description.
    
      - line "mov (%%rsp), %3 \n\t" has been removed because %3
        is rcx and rcx is restored just after.
    
      - line ASM_VMX_VMWRITE_RSP_RDX() is moved out of the ifdef/else/endif
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ac484b81d644721fbdfe0f6203ea18d3d23ff060
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Oct 26 14:16:56 2007 +0200

    KVM: x86 emulator: fix 'push imm8' emulation
    
    'push imm8' found itself in the wrong switch somehow, so it is never executed.
    
    This fixes Windows 2003 installation.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 70ac85846fc8c818dd6ca0513954edeab8ef90d0
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Oct 25 00:29:55 2007 +0200

    KVM: Add ioctl to set tss address from userspace
    
    Currently kvm has a wart in that it requires three extra pages for use
    as a tss when emulating real mode on Intel.  This patch moves the allocation
    internally, only requiring userspace to tell us where in the physical address
    space we can place the tss.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 12360bf90e1bd1e4780ab8903eab675b458ceb16
Author: Izik Eidus <izike@qumranet.com>
Date:   Wed Oct 24 23:57:46 2007 +0200

    KVM: Add kernel-internal memory slots
    
    Reserve a few memory slots for kernel internal use.  This is useful when
    you have to register a memory region and want to be sure it was not
    registered from userspace, and when you want to register a memory region
    that won't be seen from userspace.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7de757f14b3e2e90a830ab0f2e42518ea1231c6c
Author: Izik Eidus <izike@qumranet.com>
Date:   Wed Oct 24 23:52:57 2007 +0200

    KVM: Export memory slot allocation mechanism
    
    Remove kvm memory slot allocation mechanism from the ioctl
    and put it to exported function.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2ddb29ee7761a7415b1172b77007dccb0cf49e39
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Oct 25 11:54:04 2007 +0200

    KVM: Unmap kernel-allocated memory on slot destruction
    
    kvm_vm_ioctl_set_memory_region() is able to remove memory in addition to
    adding it.  Therefore when using kernel swapping support for old userspaces,
    we need to munmap the memory if the user requests to remove it.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 031b69bbd4fabbd3cc4e93fe88cca3122498070a
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Oct 22 16:33:07 2007 +0200

    KVM: Use new smp_call_function_mask() in kvm_flush_remote_tlbs()
    
    In kvm_flush_remote_tlbs(), replace a loop using smp_call_function_single()
    by a single call to smp_call_function_mask() (which is new for x86_64).
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0d878c5ce8309856b2aff32783601b761db96dec
Author: Christian Borntraeger <borntraeger@de.ibm.com>
Date:   Thu Oct 11 15:34:17 2007 +0200

    KVM: Per-architecture hypercall definitions
    
    Currently kvm provides hypercalls only for x86* architectures. To
    provide hypercall infrastructure for other kvm architectures I split
    kvm_para.h into a generic header file and architecture specific
    definitions.
    
    Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b6afc446d80005ca125a58042a3a63e665f14110
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 18:34:54 2007 +0200

    KVM: VMX: vmx_vcpu_setup(): remove unused variable.
    
    Noticed by Izik Eidus.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a785ddf7b0c9dd5367aed014eab3819be0214d0e
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 11:03:36 2007 +0200

    KVM: Add a might_sleep() annotation to gfn_to_page()
    
    This will help trap accesses to guest memory in atomic context.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d9f17a13c2180569f82acb85d93970c85a42a855
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 11:00:39 2007 +0200

    KVM: Move vmx_vcpu_reset() out of vmx_vcpu_setup()
    
    Split guest reset code out of vmx_vcpu_setup().  Besides being cleaner, this
    moves the realmode tss setup (which can sleep) outside vmx_vcpu_setup()
    (which is executed with preemption enabled).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3b06c8ee1e0710c99c2b34a3c34c9758256a6248
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Oct 21 09:45:12 2007 +0200

    Revert "KVM: VMX: Initialize vcpu with preemption enabled"
    
    This reverts commit 341131670f028f8bec97db6121c018f53129b9c3.  It executes
    vcpu_load() before installing the preemption notifier, which leads to oopses.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aa89acdb280d63973b255b9cb4d11c73f195fd15
Author: Kevin Pedretti <kevin.pedretti@gmail.com>
Date:   Sun Oct 21 08:55:50 2007 +0200

    KVM: Improve local apic timer wraparound handling
    
    Better handle wrap-around cases when reading the APIC CCR
    (current count register).  Also, if ICR is 0, CCR should also
    be 0... previously reading CCR before setting ICR would result
    in a large kinda-random number.
    
    Signed-off-by: Kevin Pedretti <kevin.pedretti@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 654f22693683aff28c006eceb15eb16cd0b25ec8
Author: Kevin Pedretti <kevin.pedretti@gmail.com>
Date:   Sun Oct 21 08:54:53 2007 +0200

    KVM: Fix local apic timer divide by zero
    
    kvm_lapic_reset() was initializing apic->timer.divide_count to 0,
    which could potentially lead to a divide by zero error in
    apic_get_tmcct().  Any guest that reads the APIC's CCR (current count)
    register before setting DCR (divide configuration) would trigger a divide
    by zero exception in the host kernel, leading to a host-OS crash.
    
    This patch results in apic->timer.divide_count being initialized to
    2 at reset, eliminating the bug (DCR=0 at reset, meaning divide by 2).
    
    Signed-off-by: Kevin Pedretti <kevin.pedretti@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 933a3f772d28b532cd09098257e482694a795dfb
Author: Zhang Xiantao <xiantao.zhang@intel.com>
Date:   Sat Oct 20 15:34:38 2007 +0800

    KVM: Portability: Split kvm_vcpu into arch dependent and independent parts (part 1)
    
    First step to split kvm_vcpu.  Currently, we just use a macro to define
    the common fields in kvm_vcpu for all archs, and all archs need to define
    their own kvm_vcpu struct.
    
    Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 648fc91671f57b818d9501aab30765df75165fbf
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Thu Oct 18 09:59:34 2007 -0500

    KVM: Allocate userspace memory for older userspace
    
    Allocate a userspace buffer for older userspaces.  Also eliminate phys_mem
    buffer.  The memset() in kvmctl really kills initial memory usage but swapping
    works even with old userspaces.
    
    A side effect is that maximum guest size is reduced for older userspace on
    i386.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bb1d1fee77d97efba72dccec85d857378e3abea5
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 18 17:35:57 2007 +0200

    KVM: MMU: Fix dirty bit pte gpa calculation
    
    The host physical address has no business in there.
    
    Noticed by Izik Eidus.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e735785caa9c0a9134e1246bc8d5c5f73c2a3e65
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Oct 18 15:19:01 2007 +0200

    KVM: Move kvm_guest_exit() after local_irq_enable()
    
    We need to make sure that the timer interrupt happens before we clear
    PF_VCPU, so the accounting code actually sees guest mode.
    
    http://lkml.org/lkml/2007/10/15/114
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1de6a8de93320521092aa237156fd5a6ed0d0ef5
Author: Christian Borntraeger <borntraeger@de.ibm.com>
Date:   Thu Oct 18 14:39:10 2007 +0200

    KVM: Use virtual cpu accounting if available for guest times.
    
    ppc and s390 offer the possibility to track process times precisely
    by looking at cpu timer on every context switch, irq, softirq etc.
    We can use that infrastructure as well for guest time accounting.
    We need to account the used time before we change the state.
    This patch adds a call to account_system_vtime to kvm_guest_enter
    and kvm_guest_exit. If CONFIG_VIRT_CPU_ACCOUNTING is not set,
    account_system_vtime is defined in hardirq.h as an empty function,
    which means this patch does not change the behaviour on other
    platforms.
    
    I compile tested this patch on x86 and function tested the patch on
    s390.
    
    Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 341131670f028f8bec97db6121c018f53129b9c3
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 18 12:38:52 2007 +0200

    KVM: VMX: Initialize vcpu with preemption enabled
    
    vcpu initialization requires writes to memory (for the real mode tss), which
    is now a sleeping operation.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1253f76f37189f0bfda7e4a46bfe537f329dc0db
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Oct 18 11:09:33 2007 +0200

    KVM: Partial swapping of guest memory
    
    This allows guest memory to be swapped.  Pages which are currently mapped
    via shadow page tables are pinned into memory, but all other pages can
    be freely swapped.
    
    The patch makes gfn_to_page() elevate the page's reference count, and
    introduces kvm_release_page() that pairs with it.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 283c5eefd628c754df85434ed2aca079d35aa2aa
Author: Izik Eidus <izike@qumranet.com>
Date:   Wed Oct 17 19:17:48 2007 +0200

    KVM: MMU: Make gfn_to_page() always safe
    
    In case the page is not present in the guest memory map, return a dummy
    page the guest can scribble on.
    
    This simplifies error checking in its users.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ca335c8f08d88b4bb978afa02b86c206ecfd21d1
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 16 14:43:46 2007 +0200

    KVM: MMU: Keep a reverse mapping of non-writable translations
    
    The current kvm mmu only reverse maps writable translations.  This is used
    to write-protect a page in case it becomes a pagetable.
    
    But with swapping support, we need a reverse mapping of read-only pages as
    well:  when we evict a page, we need to remove any mapping to it, whether
    writable or not.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 751e23de45cd219956036db6d55b8685e74a7090
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 16 14:42:30 2007 +0200

    KVM: MMU: Add rmap_next(), a helper for walking kvm rmaps
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f83562246921d6a8a7de8b76853a6835ace3699d
Author: Aurelien Jarno <aurelien@aurel32.net>
Date:   Wed Oct 17 19:30:41 2007 +0200

    KVM: x86 emulator: fix access registers for instructions with ModR/M byte and Mod = 3
    
    The patch below changes the access type to register from memory for
    instructions that are declared as SrcMem or DstMem, but have a
    ModR/M byte with Mod = 3.
    
    It fixes (at least) the lmsw and smsw instructions on an AMD64 CPU,
    which are needed for FreeBSD.
    
    Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0863fb5961a612a5d930c5796b7b0188082f4b14
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 18 10:46:02 2007 +0200

    KVM: x86 emulator: use a defined flag definition
    
    EFLG_IF is not defined anywhere.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ad6c935c4963ee5577210ba47434c7c59aec81fa
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Oct 16 18:23:27 2007 -0700

    KVM: x86 emulator: cmc, clc, cli, sti
    
    Instruction: cmc, clc, cli, sti
    opcodes: 0xf5, 0xf8, 0xfa, 0xfb respectively.
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b622204087b5dbdc76b39cbfa288a41d325e3e9a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 17 12:20:21 2007 +0200

    KVM: Actually move the interrupt injection code out of the critical section
    
    Commit 817b54a86b0a3e0e5955714b84577101ffff9c59 claimed to do this, but
    actually didn't (despite laying all the infrastructure for it).  This completes
    the movement.
    
    Noticed by Eddie Dong.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f3a00bac6038e0a6ee519a6454ca2efc11805639
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 17 12:18:47 2007 +0200

    KVM: MMU: Simplify page table walker
    
    Simplify the walker level loop not to carry so much information from one
    loop to the next.  In addition to being complex, this made kmap_atomic()
    critical sections difficult to manage.
    
    As a result of this change, kmap_atomic() sections are limited to actually
    touching the guest pte, which allows the other functions called from the
    walker to do sleepy operations.  This will happen when we enable swapping.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9b6895d0e4bb62f46bdd05c7e4b4e99709462385
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 17 11:03:06 2007 +0200

    Revert "KVM: MMU: Call update_dirty_bit() without disabling preemption"
    
    This reverts commit 5a691bafbc79643d60f1925a581d1af5a9f0f60d.  It unmaps
    walker->table while leaving ptep pointing into it.  Fails spectacularly on
    i386.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d8a0c28def19005173b0e4edcaa120b865eaac6d
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Fri Oct 12 17:40:33 2007 -0700

    KVM: x86 emulator: Implement emulation of instruction: inc & dec
    
    Instructions:
    	inc r16/r32 (opcode 0x40-0x47)
    	dec r16/r32 (opcode 0x48-0x4f)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f2d7854fd4932dec136dec5115a6d8e6d4a8a5a5
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 19:06:15 2007 +0200

    KVM: VMX: Force vm86 mode if setting flags during real mode
    
    When resetting from userspace, we need to handle the flags being cleared
    even after we are in real mode.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 32262da4c1f9268a9c271360342821c114a8a8e7
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 17:22:08 2007 +0200

    KVM: Rename KVM_TLB_FLUSH to KVM_REQ_TLB_FLUSH
    
    We now have a new namespace, KVM_REQ_*, for bits in vcpu->requests.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 817b54a86b0a3e0e5955714b84577101ffff9c59
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 16:29:39 2007 +0200

    KVM: Move interrupt injection out of interrupt disabled section
    
    Instead of injecting interrupts while we're in the critical section during
    the guest switch, inject them earlier.  In case we had an irq raised between
    the injection point and the switch, we check a bit during guest switch and
    if needed we go back and redo the injection.
    
    This improves system latency, and allows sleeping during injection (which
    is needed when injecting real-mode interrupts on Intel).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1c37fdc33da77df7fbb0affe5c3896d15f6f2702
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 16 16:23:22 2007 +0200

    KVM: Move apic timer interrupt backlog processing to common code
    
    Beside the obvious goodness of making code more common, this prevents
    a livelock with the next patch which moves interrupt injection out of the
    critical section.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5a691bafbc79643d60f1925a581d1af5a9f0f60d
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Oct 15 19:24:52 2007 +0200

    KVM: MMU: Call update_dirty_bit() without disabling preemption
    
    update_dirty_bit() will want to touch memory, which will one day invoke
    sleeping operations.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0f4a1103e329eda5741eb548bc5746653471ce86
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Mon Oct 15 14:24:20 2007 +0800

    KVM: x86 emulator: implement 'movnti mem, reg'
    
    Implement emulation of instruction:
        movnti m32/m64, r32/r64
        opcode: 0x0f 0xc3
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3923231f506fb4e83f9fc5f8efce3607d696b33f
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Fri Oct 12 11:01:59 2007 +0200

    KVM: Add some \n in ioapic_debug()
    
    Add new-line at end of debug strings.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a279c26a8ad12130c347eddfd92de6f4a1c3b53b
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 19:46:26 2007 +0200

    KVM: Restore missing #include <linux/vmalloc.h>

commit 33aafecf3f106ba6aa8847dfdae033a73e5d1b50
Author: Carsten Otte <cotte@de.ibm.com>
Date:   Thu Oct 11 19:16:52 2007 +0200

    KVM: Portability: split kvm_vcpu_ioctl
    
    This patch splits kvm_vcpu_ioctl into architecture independent parts, and
    x86 specific parts which go to kvm_arch_vcpu_ioctl in x86.c.
    
    Common ioctls for all architectures are:
    KVM_RUN, KVM_GET/SET_(S-)REGS, KVM_TRANSLATE, KVM_INTERRUPT,
    KVM_DEBUG_GUEST, KVM_SET_SIGNAL_MASK, KVM_GET/SET_FPU
    Note that some PPC chips don't have an FPU, so we might need an #ifdef
    around KVM_GET/SET_FPU one day.
    
    x86 specific ioctls are:
    KVM_GET/SET_LAPIC, KVM_SET_CPUID, KVM_GET/SET_MSRS
    
    An interesting aspect is vcpu_load/vcpu_put. We now have a common
    vcpu_load/put which does the preemption stuff, and an architecture
    specific kvm_arch_vcpu_load/put. In the x86 case, this one calls the
    vmx/svm function defined in kvm_x86_ops.
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Reviewed-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 56161cd30f573c7aa24440df1f9f848c80c203da
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:30:21 2007 +0200

    KVM: MMU: When updating the dirty bit, inform the mmu about it
    
    Since the mmu uses different shadow pages for dirty large pages and clean
    large pages, this allows the mmu to drop ptes that are now invalid.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c507f33a3e7053fe31a9352b61e7bc76e4640c2f
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:22:59 2007 +0200

    KVM: MMU: Move dirty bit updates to a separate function
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7ccf679da43960a30336c5fcede3378f972763fb
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:13:49 2007 +0200

    KVM: MMU: Instantiate real-mode shadows as user writable shadows
    
    This is consistent with real-mode permissions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 636a2d88e5b03699095dcafbf754fd2b3d6c7f17
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:12:24 2007 +0200

    KVM: MMU: Disable write access on clean large pages
    
    By forcing clean huge pages to be read-only, we have separate roles
    for the shadow of a clean large page and the shadow of a dirty large
    page.  This is necessary because different ptes will be instantiated
    for the two cases, even for read faults.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 154833765b9edde8fbc40cd30b2f0270bd6734d2
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 15:08:41 2007 +0200

    KVM: MMU: Fix nx access bit for huge pages
    
    We must set the bit before the shift, otherwise the wrong bit gets set.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 14fe74643102320001eeb52f4fb940cc56eeea3a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Oct 11 12:32:30 2007 +0200

    KVM: Move guest pte dirty bit management to the guest pagetable walker
    
    This is more consistent with the accessed bit management, and makes the dirty
    bit available earlier for other purposes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4d7767d5d025a9ba5e230fd121aca7e1a064f969
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed Oct 10 20:08:41 2007 -0500

    KVM: MMU: More struct kvm_vcpu -> struct kvm cleanups
    
    This time, the biggest change is gpa_to_hpa. The translation of GPA to HPA does
    not depend on the VCPU state unlike GVA to GPA so there's no need to pass in
    the kvm_vcpu.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 35e19206b7601a5a7c922f6c2932a01de15a5cb3
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed Oct 10 19:25:50 2007 -0500

    KVM: MMU: Clean up MMU functions to take struct kvm when appropriate
    
    Some of the MMU functions take a struct kvm_vcpu even though they affect all
    VCPUs.  This patch cleans up some of them to instead take a struct kvm.  This
    makes things a bit more clear.
    
    The main thing that was confusing me was whether certain functions need to be
    called on all VCPUs.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ac413bb5e5ca60a3e12a99593d184577ff7518c4
Author: cotte@de.ibm.com <cotte@de.ibm.com>
Date:   Wed Oct 10 17:16:19 2007 +0200

    KVM: Move x86 msr handling to new files x86.[ch]
    
    Signed-off-by: Carsten Otte <cotte@de.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2a96e86bf672a4c056483de24f4f25c3a19b4c11
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Oct 10 14:03:16 2007 +0200

    KVM: Replace enum by #define
    
    Easier for existence test (#ifdef) in userspace.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ad23abbeab86af3156467163b01b25237f99e9cb
Author: Dong, Eddie <eddie.dong@intel.com>
Date:   Wed Oct 10 14:26:45 2007 +0800

    KVM: VMX: Reset mmu context when entering real mode
    
    Resetting an SMP guest will force AP enter real mode (RESET) with
    paging enabled in protected mode. While current enter_rmode() can
    only handle mode switch from nonpaging mode to real mode which leads
    to SMP reboot failure.
    
    Fix by reloading the mmu context on entering real mode.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 342a6bf85cd40a70fd382324085dc3b0a901c865
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Oct 10 12:15:54 2007 +0200

    KVM: Split IOAPIC reset function and export for kernel RESET
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ebac07d56f961752e209b662e2341aad3e85b408
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Oct 10 12:14:25 2007 +0200

    KVM: Export PIC reset for kernel device reset
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7faf3e45cc45eb2b5b0db3e3f4412b15784a38cc
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 9 19:20:39 2007 +0200

    KVM: Support assigning userspace memory to the guest
    
    Instead of having the kernel allocate memory to the guest, let userspace
    allocate it and pass the address to the kernel.
    
    This is required for s390 support, but also enables features like memory
    sharing and using hugetlbfs backed memory.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 49c32b24524c0b929bcf93ebcc3acc9c7edc87e9
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Oct 9 12:12:19 2007 +0200

    KVM: VMX: Handle NMIs before enabling interrupts and preemption
    
    This makes sure we handle NMI on the current cpu, and that we don't service
    maskable interrupts before non-maskable ones.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7060e1c92b504ac725e2ffbc91053c1dc684e685
Author: Mike Day <ncmike@ncultra.org>
Date:   Mon Oct 8 09:02:08 2007 -0400

    KVM: CodingStyle cleanup
    
    Signed-off-by: Mike D. Day <ncmike@ncultra.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 82da48c29aabfb4941154e399e2a781d6eb786ca
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:55:29 2007 +1000

    KVM: Remove gratuitous casts from lapic.c
    
    Since vcpu->apic is of the correct type, there's no need to cast.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a2230360fb019b080119126a9706c017a5cc96a4
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:50:48 2007 +1000

    KVM: Hoist kvm_create_lapic() into kvm_vcpu_init()
    
    Move kvm_create_lapic() into kvm_vcpu_init(), rather than having svm
    and vmx do it.  And make it return the error rather than a fairly
    random -ENOMEM.
    
    This also solves the problem that neither svm.c nor vmx.c actually
    handles the error path properly.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3bbc5ebb64c17319970ef283c35aba79ed7fa915
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Oct 8 10:48:30 2007 +1000

    KVM: Add kvm_free_lapic() to pair with kvm_create_lapic()
    
    Instead of the asymmetry of kvm_free_apic, implement kvm_free_lapic().
    And guess what?  I found a minor bug: we don't need to hrtimer_cancel()
    from kvm_main.c, because we do that in kvm_free_apic().
    
    Also:
    1) kvm_vcpu_uninit should be the reverse order from kvm_vcpu_init.
    2) Don't set apic->regs_page to zero before freeing apic.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f1b8c28f1886c9375361d7f2ebca1f742ea6bc5f
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Oct 8 10:01:45 2007 +0200

    KVM: Check I/O APIC indirect index before writing
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d2cfd9a5359db3b0fa8af0d07cf5801c3b22d9ae
Author: Izik Eidus <izike@qumranet.com>
Date:   Tue Oct 2 18:52:55 2007 +0200

    KVM: Allow dynamic allocation of the mmu shadow cache size
    
    The user is now able to set how many mmu pages will be allocated to the guest.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1842a5c0bf840e1a4f251f344136dd8ebc7c7974
Author: Izik Eidus <izik@Home1.(none)>
Date:   Mon Oct 1 22:14:18 2007 +0200

    KVM: Add general accessors to read and write guest memory
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 06ba8d55ba4052ded08eea5bed9519c6f6780d8c
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Oct 1 11:01:06 2007 +0200

    KVM: x86 emulator: Correct management of REP prefix
    
    This patch corrects some errors appearing when we have an emulation failure
    on an operation using REP prefix.
    
    When x86_emulate_insn() fails, saving EIP and ECX is not enough as emulation
    should have modified other registers like RSI or RDI. Moreover, the emulation
    can fail on the writeback, and in this case we are not able to restore
    registers.
    
    At beginning of x86_emulate_insn(), we restore registers from vcpu as they were
    not modified by x86_decode_insn() and we save EIP to be able to restore it
    in case of failure.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c78fc6fb953bbbd9e0b931759b03162768991132
Author: Izik Eidus <izike@qumranet.com>
Date:   Thu Sep 27 14:11:22 2007 +0200

    KVM: Remove the usage of page->private field by rmap
    
    When kvm uses user-allocated pages in the future for the guest, we won't
    be able to use page->private for rmap, since page->rmap is reserved for
    the filesystem.  So we move the rmap base pointers to the memory slot.
    
    A side effect of this is that we need to store the gfn of each gpte in
    the shadow pages, since the memory slot is addressed by gfn, instead of
    hfn like struct page.
    
    Signed-off-by: Izik Eidus <izik@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 08079fa606400c5594e91ac26dcfbabe63940507
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 30 11:02:53 2007 +0200

    KVM: VMX: Simplify vcpu_clear()
    
    Now that smp_call_function_single() knows how to call a function on the
    current cpu, there's no need to check explicitly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b595c624ca08b063ee5397a5cc881126a6570654
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 30 10:50:12 2007 +0200

    KVM: VMX: Don't clear the vmcs if the vcpu is not loaded on any processor
    
    Noted by Eddie Dong.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c0c8b920137e988bb4cde1bb41e0e409c8abb844
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 25 15:36:35 2007 +0200

    KVM: x86 emulator: remove unused variable
    
    Remove unused variable introduced by commit
    5ed6627ee96f0a6802d99e71879d98610ba17e01 (I missed it, sorry)
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9889cbb1bb752615ba1f2f533f7e1386ffeb29c3
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Sep 27 10:45:34 2007 +0200

    KVM: x86 emulator: On a pop instruction, don't restore ECX and EIP on error
    
    This patch corrects a mistake introduced by commit
    5d9b36eec8ca6abe03da91efdfc7b5861525bd43 and reported by Nitin A Kamble.
    
    The pop instruction restores ECX and EIP if read_std() fails and if we have
    a REP prefix, but at this level ECX and EIP are not saved (and not modified).
    We don't have to restore it.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ba483db9d1e6bf59501c5751399e8b7f966c01d6
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Sep 27 10:07:04 2007 +0200

    i386: Expose IOAPIC register definitions even if CONFIG_X86_IO_APIC is not set
    
    KVM reuses the IOAPIC register definitions, and needs them even if the
    host is not compiled with IOAPIC support.  Move the #ifdef below so that only
    the IOAPIC variables and functions are protected, and the register definitions
    are available to all.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5ed6627ee96f0a6802d99e71879d98610ba17e01
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 25 13:36:40 2007 +0200

    KVM: x86 emulator: Any legacy prefix after a REX prefix nullifies its effect
    
    This patch modifies the management of REX prefix according to behavior
    I saw in Xen 3.1.  In Xen, this modification has been introduced by
    Jan Beulich.
    
    http://lists.xensource.com/archives/html/xen-changelog/2007-01/msg00081.html
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6972c9253725255034d0f8d83f5bdbf70430a95b
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 17:00:58 2007 +0200

    KVM: Purify x86_decode_insn() error case management
    
    The only valid case is on protected page access, other cases are errors.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2cd2d1d11d1f67c4d660c0cf6758dd6e588c4dd6
Author: Qing He <qing.he@intel.com>
Date:   Mon Sep 24 17:39:41 2007 +0800

    KVM: apic round robin cleanup
    
    If no apic is enabled in the bitmap of an interrupt delivery with delivery
    mode of lowest priority, a warning should be reported rather than select
    a fallback vcpu
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Eddie (Yaozu) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 04817088a0e8d96587e4fb954d104d62f71df58d
Author: Qing He <qing.he@intel.com>
Date:   Mon Sep 24 17:22:13 2007 +0800

    KVM: x86_emulator: no writeback for bt
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a21855c2ed30a7a01468558bfc12a05722ef3771
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:56 2007 +0200

    KVM: x86 emulator: Remove no_wb, use dst.type = OP_NONE instead
    
    Remove no_wb, use dst.type = OP_NONE instead, idea stolen from xen-3.1
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d7f0f98414e3ab5259d54aa6ebd86a825af76980
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:55 2007 +0200

    KVM: x86 emulator: remove _eflags and use directly ctxt->eflags.
    
    Remove _eflags and use directly ctxt->eflags. Caching eflags is not needed as
    it is restored to vcpu by kvm_main.c:emulate_instruction() from ctxt->eflags
    only if emulation doesn't fail.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d98df34cc539942d8d5540ffa2425ca91056a7d3
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Sep 24 11:10:54 2007 +0200

    KVM: x86 emulator: split some decoding into functions for readability
    
    To improve readability, move push, writeback, and grp 1a/2/3/4/5/9 emulation
    parts into functions.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 62d1ea7fdcdb905072198e4cec8f724c8ad33092
Author: Ryan Harper <ryanh@us.ibm.com>
Date:   Tue Sep 18 14:05:16 2007 -0500

    KVM: MMU: Ignore reserved bits in cr3 in non-pae mode
    
    This patch removes the fault injected when the guest attempts to set reserved
    bits in cr3.  X86 hardware doesn't generate a fault when setting reserved bits.
    The result of this patch is that vmware-server, running within a kvm guest,
    boots and runs memtest from an iso.
    
    Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4acc535e64696fb09da6d2f41a5a8b8f60739c03
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 23 14:10:49 2007 +0200

    KVM: MMU: Make flooding detection work when guest page faults are bypassed
    
    When we allow guest page faults to reach the guests directly, we lose
    the fault tracking which allows us to detect demand paging.  So we provide
    an alternate mechanism by clearing the accessed bit when we set a pte, and
    checking it later to see if the guest actually used it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 30649900566e8e8785b814f4a40e6660d8086873
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Sep 16 18:58:32 2007 +0200

    KVM: Allow not-present guest page faults to bypass kvm
    
    There are two classes of page faults trapped by kvm:
     - host page faults, where the fault is needed to allow kvm to install
       the shadow pte or update the guest accessed and dirty bits
     - guest page faults, where the guest has faulted and kvm simply injects
       the fault back into the guest to handle
    
    The second class, guest page faults, is pure overhead.  We can eliminate
    some of it on vmx using the following evil trick:
     - when we set up a shadow page table entry, if the corresponding guest pte
       is not present, set up the shadow pte as not present
     - if the guest pte _is_ present, mark the shadow pte as present but also
       set one of the reserved bits in the shadow pte
     - tell the vmx hardware not to trap faults which have the present bit clear
    
    With this, normal page-not-present faults go directly to the guest,
    bypassing kvm entirely.
    
    Unfortunately, this trick only works on Intel hardware, as AMD lacks a
    way to discriminate among page faults based on error code.  It is also
    a little risky since it uses reserved bits which might become unreserved
    in the future, so a module parameter is provided to disable it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit afa232aeb1676c63c5c4000a6d865cdc9455b2b5
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Sep 23 12:30:19 2007 +0200

    KVM: MMU: Set shadow pte atomically in mmu_pte_write_zap_pte()
    
    A shadow page table entry should be set atomically using set_shadow_pte().
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 99f6c824362215f3038cfe54ddcd3c940281e9cd
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Sep 21 05:29:13 2007 +0200

    KVM: Fix ioapic edge-triggered interrupts
    
    - clear irr after service
    - do not service after unmasking; wait for a new edge
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5fdd2a196e7975d446fedf6973cbb20708f1359c
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Sep 20 12:27:28 2007 +0200

    KVM: Fix host oops due to guest changing efer
    
    If the guest changes efer from long mode with sce disabled to legacy mode,
    then load_transition_efer() zeros vmx->host_state.guest_efer_loaded, but
    the SCE-disabled efer remains in effect. So when we return to the host,
    we disable SCE and syscalls no longer work.
    
    Fix by (a) not touching vmx->host_state.guest_efer_loaded if we're not
    setting it, and instead (b) clearing it explicitly when we switch back.
    Also switch back when the guest writes to efer so we start from a clean
    slate.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e8ebaa91f96407a90c1cb81708a87a25f40ba8ab
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Sep 20 11:17:24 2007 +0200

    KVM: x86 emulator: fix repne/repnz decoding
    
    The repnz/repne instructions must set rep_prefix to 1 like rep/repe/repz.
    
    This patch corrects the disk probe problem met with OpenBSD.
    
    This issue appears with commit 091b206f6c56f2329e11bac2fa40d6f236ab0bc2
    because before it, the decoding was done internally to kvm and after it
    is done by x86_emulate.c (which doesn't do it correctly).
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0203e2d5d0d0cea6eed6e437d9456aad71135913
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Sep 19 16:08:53 2007 +0200

    KVM: Implement ioapic irq polarity bit
    
    Reverse the sense of the irq level if the polarity bit is set.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit faba110779451794f764a4802e740146e8efb93f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Sep 19 15:48:58 2007 +0200

    KVM: Avoid redelivery of edge-triggered irq if it is already in service
    
    Noticed by Eddie Dong.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3ddc321087b0083fec2eff1bc613410fdc2e8388
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Wed Sep 19 13:41:55 2007 +0200

    KVM: x86 emulator: Correct inconsistency between cr2 and ctxt->cr2.
    
    This patch corrects an inconsistency of cr2 introduced by the x86 emulator
    split.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 37d266e7330baedc52fab5cc699cff2c8cc2947e
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Sep 18 16:34:25 2007 -0700

    KVM: x86 emulator: fix merge screwup due to emulator split
    
    This code has gone to wrong place in the file. Moving it back to
    right location.
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1196dd4e2e6f09053335b0a91e3cc793808c00a7
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Sep 19 10:52:18 2007 +0200

    KVM: Fix ioapic.c compilation failure due to missing include
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 44a0469583ff93240acb76085a993a1d30202679
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Sep 19 10:44:58 2007 +0200

    KVM: VMX: Fix build on i386 due to EFER_LMA not defined

commit caba4b5c24f24bf003dd385e5658f0b682bdf0ac
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Aug 29 03:48:05 2007 +0300

    KVM: VMX: Further reduce efer reloads
    
    KVM avoids reloading the efer msr when the difference between the guest
    and host values consist of the long mode bits (which are switched by
    hardware) and the NX bit (which is emulated by the KVM MMU).
    
    This patch also allows KVM to ignore SCE (syscall enable) when the guest
    is running in 32-bit mode.  This is because the syscall instruction is
    not available in 32-bit mode on Intel processors, so the SCE bit is
    effectively meaningless.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 97594d420a09db38e3f2c8aa2c8481dc51c11e82
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Sep 18 15:26:30 2007 +0200

    KVM: Fix #UD exception delivery
    
    It doesn't have an error code, and it uses the #UD vector.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5e7a195fc4b1c0df577658e01a25b91d49bc68ee
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Sep 18 14:19:00 2007 +0200

    KVM: Fix ioapic level-triggered interrupt redelivery
    
    The ioapic failed to set the irr bit if a previous interrupt was already
    being serviced.  This caused interrupt loss fairly soon, leading to loss
    of level-triggered devices like pic networking.
    
    This patch fixes the problem by always setting irr when an irq is asserted.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5d9b36eec8ca6abe03da91efdfc7b5861525bd43
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:37 2007 +0200

    KVM: Call x86_decode_insn() only when needed
    
    Move emulate_ctxt to kvm_vcpu to keep emulate context when we exit from kvm
    module. Call x86_decode_insn() only when needed. Modify x86_emulate_insn() to
    not modify the context if it must be re-entered.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a00840cfcc2c18662e04ac94fcbe12266c403cad
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:27 2007 +0200

    KVM: emulate_instruction() calls now x86_decode_insn() and x86_emulate_insn()
    
    emulate_instruction() calls now x86_decode_insn() and x86_emulate_insn().
    x86_emulate_insn() is x86_emulate_memop() without the decoding part.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a40bf553563276cf3aff293b6ec36373bf3dc968
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:27:19 2007 +0200

    KVM: x86 emulator: move all decoding process to function x86_decode_insn()
    
    Split the decoding process into a new function x86_decode_insn().
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c18617e89f3a94fd74d55dde36b54c8ca23072f9
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:52:50 2007 +0200

    KVM: x86 emulator: move all x86_emulate_memop() to a structure
    
    Move all x86_emulate_memop() common variables between decode and execute to a
    structure decode_cache.  This will help in later separating decode and
    emulate.
    
                struct decode_cache {
                    u8 twobyte;
                    u8 b;
                    u8 lock_prefix;
                    u8 rep_prefix;
                    u8 op_bytes;
                    u8 ad_bytes;
                    struct operand src;
                    struct operand dst;
                    unsigned long *override_base;
                    unsigned int d;
                    unsigned long regs[NR_VCPU_REGS];
                    unsigned long eip;
                    /* modrm */
                    u8 modrm;
                    u8 modrm_mod;
                    u8 modrm_reg;
                    u8 modrm_rm;
                    u8 use_modrm_ea;
                    unsigned long modrm_ea;
                    unsigned long modrm_val;
               };
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d6a754e5ec1ae429e7bd22a2b54e0fea1d64e1d9
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Tue Sep 18 11:26:38 2007 +0200

    KVM: x86 emulator: remove unused functions
    
    Remove #ifdef functions never used
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8db6e3d54971e76019b02a6ad860c9df35218dfc
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Sep 17 14:57:50 2007 -0500

    KVM: Refactor hypercall infrastructure (v3)
    
    This patch refactors the current hypercall infrastructure to better
    support live migration and SMP.  It eliminates the hypercall page by
    trapping the UD exception that would occur if you used the wrong hypercall
    instruction for the underlying architecture and replacing it with the right
    one lazily.
    
    A fall-out of this patch is that the unhandled hypercalls no longer trap to
    userspace.  There is very little reason though to use a hypercall to
    communicate with userspace as PIO or MMIO can be used.  There is no code
    in tree that uses userspace hypercalls.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cd132dabf169ce61b5be58d42d9bd08984cba429
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Sep 17 14:57:49 2007 -0500

    KVM: x86 emulator: Add vmmcall/vmcall to x86_emulate (v3)
    
    Add vmmcall/vmcall to x86_emulate.  Future patch will implement functionality
    for these instructions.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 17f668f73876cb0a67404db12b843850a9426bbb
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 17 11:02:51 2007 +0200

    KVM: Fix virtualization menu help text
    
    What guest drivers?
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 23263a086e85f0065f95e3cb676dd96434da98d8
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 17 10:59:31 2007 +0200

    KVM: Remove errant printk() in kvm_vcpu_ioctl_get_sregs()
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 718a3339a903ea1935148eb095c2f8ce741a54a2
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 17 10:58:27 2007 +0200

    KVM: Fix kvm_vcpu_ioctl_get_sregs() warning on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 37fa44d29fb9d9c47126e40bfe266f8bf74de43d
Author: Qing He <qing.he@intel.com>
Date:   Mon Sep 17 14:47:13 2007 +0800

    KVM: fix PIC interrupt delivery on different APIC conditions
    
    This patch changes the PIC interrupts delivery. Now it is only delivered
    to vcpu0 when either condition is met (on vcpu0):
      1. local APIC is hardware disabled
      2. LVT0 is unmasked and configured to delivery mode ExtInt
    
    It fixes the 2x faster wall clock on x86_64 and SMP i386 Linux guests
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c408e4e8d9045d53c1d82c622a5756febd051ef9
Author: Avi Kivity <avi@qumranet.com>
Date:   Sat Sep 15 17:34:36 2007 +0300

    KVM: Skip pio instruction when it is emulated, not executed
    
    If we defer updating rip until pio instructions are executed, we have a
    problem with reset:  a pio reset updates rip, and when the instruction
    completes we skip the emulated instruction, pointing rip somewhere completely
    unrelated.
    
    Fix by updating rip when we decode the instruction, not after emulation.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 340bcebdee0382c3b1dd9f963e21e4217594467b
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:45:05 2007 +0300

    KVM: x86 emulator: popf
    
    Implement emulation of instruction:
        popf
        opcode:  0x9d
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 338be073d760fde05ee2f78ace4e9576dc3e6909
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:43:33 2007 +0300

    KVM: x86 emulator: fix src, dst value initialization
    
    Some operand fetches are less than the machine word size and can result in
    stale bits if used together with operands of different sizes.
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a5f988993b5b167d007c8c33d45ad8e0f849d22a
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:41:26 2007 +0300

    KVM: x86 emulator: jmp abs
    
    Implement emulation of instruction:
        jump absolute r/m
        opcode: 0xff /4
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c6aeb4632919c37317213c9b5a41bedbcc8b3416
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:35:36 2007 +0300

    KVM: x86 emulator: lea
    
    Implement emulation of instruction
        lea r16/r32, m
        opcode:  0x8d:
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c9aa71c7901df4fcf1eb33721bea8b581188f2bf
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:25:41 2007 +0300

    KVM: X86 emulator: jump conditional short
    
    Implement emulation of more jump conditional instructions
        jcc shortrel
        opcodes: 0x70 - 0x7f
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ef70803677eec7ab37d61e48b0d1cb628c3f991b
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:23:07 2007 +0300

    KVM: x86 emulator: implement jump conditional relative
    
    Implement emulation of instruction:
        jump conditional rel
        opcodes: 0x0f 0x80 - 0x0f 0x8f
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 021854e6f9566e2990c1dfee474d4d509f84e3fd
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sat Sep 15 10:13:07 2007 +0300

    KVM: x86 emulator: sort opcodes into ascending order
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 28621bdce24324e1f8b33fa25595cc0609153be6
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Wed Sep 12 18:03:11 2007 +0800

    KVM: VMX: Prevent setting CPU_BASED_TPR_SHADOW on i386 host
    
    Although the tpr shadow feature can be used on an i386 host, it needs support
    from the virtual apic access feature, which hasn't been implemented yet;
    without it, it will cause trouble on i386 machines which support this feature.
    
    This patch disables tpr shadow feature for i386 host now.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 58d8159c7264eee015ad0656afd018aecbb3c69f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Sep 12 13:21:09 2007 +0300

    KVM: Improve emulation failure reporting
    
    Report failed opcodes from all locations.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0fe149eb04e5e67f4d3ebc2ab9f2426356a308ba
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Aug 28 18:22:47 2007 -0700

    KVM: x86 emulator: pushf
    
    Implement emulation of instruction
    	pushf
    	opcode: 0x9c
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 917fca3116605cfa2db01d390ba94b0412c88eb3
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Aug 28 18:08:37 2007 -0700

    KVM: x86 emulator: call near
    
    Implement emulation of instruction
    	opcode: 0xe8
    	call (near)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2b15dbf376872112c832576cc3b3f618e0b85e2d
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Aug 28 17:58:52 2007 -0700

    KVM: x86 emulator: push imm8
    
    Implement the instruction
    
        	push imm8
        	opcode: 0x6a
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c7c2eaa2c305ed106da78afd7b95f42cec8d6dc8
Author: He, Qing <qing.he@intel.com>
Date:   Wed Sep 12 14:18:28 2007 +0800

    KVM: VMX: Fix exit qualification width on i386
    
    According to Intel Software Developer's Manual, Vol. 3B, Appendix H.4.2,
    exit qualification should be of natural width. However, current code
    uses u64 as the data type for this register, which occasionally
    introduces invalid value to VMExit handling logics. This patch fixes
    this bug.
    
    I have tested Windows and Linux guest on i386 host, and they can boot
    successfully with this patch.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit de60b339983ae64920b1bc58bb5c2c6b10db5d93
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Sep 12 10:58:04 2007 +0300

    KVM: Fix link error to "genapic"
    
    GET_APIC_ID may use genapic instance for some machine
    configuration in i386 architecture, but it is not exported
    for outside usage. This patch removes this reference.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0663a73e366dd1df52c0ce4fec32f47455575324
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 10 18:10:54 2007 +0300

    KVM: Move main vcpu loop into subarch independent code
    
    This simplifies adding new code as well as reducing overall code size.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ec1ff57323b7ce5022cae99e27ff8297ce2aaa27
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 10 17:27:03 2007 +0300

    KVM: VMX: Move vm entry failure handling to the exit handler
    
    This will help moving the main loop to subarch independent code.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2f6ad5e1fb93c0392e6cfaf2ef2bee3aaaa19244
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 10 15:20:59 2007 +0300

    KVM: Remove smp_processor_id() in kvm_vcpu_kick()
    
    The value is meaningless since it can change; instead call the function
    unconditionally.  It is a no-op on the same cpu anyway.  This removes
    annoying warning on runtime.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a40fa4c30f0883a3a4a1560e0174540d6594e0ca
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Sep 10 11:28:17 2007 +0300

    KVM: MMU: Don't do GFP_NOWAIT allocations
    
    Before preempt notifiers, kvm needed to allocate memory with GFP_NOWAIT so
    as not to have to enable preemption and take a heavyweight exit.  On oom, we'd
    fall back to a GFP_KERNEL allocation.
    
    With preemption notifiers, we can do a GFP_KERNEL allocation, and perform
    the heavyweight exit only if the kernel decides to put us to sleep.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5b25a47c1edb6ba9ac23e745260e7533be371c1d
Author: He, Qing <qing.he@intel.com>
Date:   Mon Sep 10 11:01:52 2007 +0300

    KVM: fix apic timer migration when inactive
    
    When local apic timer is inactive or is expired in one shot mode, it
    should not be restarted on vcpu and hrtimer migration. This patch fixes this.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e44af0f4ee99974ce40102e23784bc3cae7f4466
Author: Jindrich Makovicka <makovick@gmail.com>
Date:   Sun Sep 9 18:45:01 2007 +0300

    KVM: Fix lapic 64-bit division on 32-bit hosts
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c54c215e7e71b99c0a3270d7fc85668179bea67a
Author: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date:   Sun Sep 9 15:41:59 2007 +0300

    KVM: Rename kvm_arch_ops to kvm_x86_ops
    
    This patch just renames the current (misnamed) _arch namings to _x86 to
    ensure better readability when a real arch layer takes place.
    
    Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit beec957bd8205ebbd9dc2eecb166fe4ae06e31e4
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Aug 30 14:56:21 2007 +0200

    KVM: Simplify memory allocation
    
    The mutex->spinlock conversion allows us to make some code simplifications.
    As we can keep the lock longer, we don't have to release it and then
    have to check if the environment has not been modified before re-taking it. We
    can remove kvm->busy and kvm->memory_config_version.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f71c862171a7265085798d1aa8c43eadb6d85520
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Thu Sep 6 01:21:32 2007 +1000

    KVM: Hoist SVM's get_cs_db_l_bits into core code.
    
    SVM gets the DB and L bits for the cs by decoding the segment.  This
    is in fact the completely generic code, so hoist it for kvm-lite to use.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1cfd09dd0492b50376ed703f4252c489d91d1597
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Thu Sep 6 01:20:38 2007 +1000

    KVM: Keep control regs in sync
    
    We don't update the vcpu control registers in various places.  We
    should do so.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aa38840d3d2e0a804e628077df8d8879b496d741
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Sun Sep 9 14:12:54 2007 +0300

    KVM: Clean up unloved invlpg emulation
    
    invlpg shouldn't fetch the "src" address, since it may not be valid,
    however SVM's "solution" which neuters emulation of all group 7
    instruction is horrible and breaks kvm-lite.  The simplest fix is to
    put a special check in for invlpg.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8fa7178a4f0c96662bab31ff46e3bff1995ff14a
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Sun Sep 9 14:10:57 2007 +0300

    KVM: Remove the unused invlpg member of struct kvm_arch_ops.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f4ed61146e11e6f42500385feee51eb30ed324d0
Author: Amit Shah <amit.shah@qumranet.com>
Date:   Sat Aug 25 11:35:52 2007 +0300

    KVM: Set the ET flag in CR0 after initializing FX
    
    This was missed when moving stuff around in fbc4f2e
    
    Fixes Solaris guests and bug #1773613
    
    Signed-off-by: Amit Shah <amit.shah@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fe3f479c1a1564b3612a955f037a1905b85c9f6f
Author: He, Qing <qing.he@intel.com>
Date:   Mon Sep 3 17:07:41 2007 +0300

    KVM: enable in-kernel APIC INIT/SIPI handling
    
    This patch enables INIT/SIPI handling using in-kernel APIC by
    introducing a ->mp_state field to emulate the SMP state transition.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Xin Li <xin.b.li@intel.com>

commit a0c1343ffdac844fe659678928d1eb6c88e8aeb4
Author: He, Qing <qing.he@intel.com>
Date:   Mon Sep 3 17:01:36 2007 +0300

    KVM: round robin for APIC lowest priority delivery mode
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ca7d5e3ddce0d1483fbb28ba59d7677c8935d785
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Mon Sep 3 17:00:24 2007 +0300

    KVM: deliver PIC interrupt only to vcpu0
    
    Signed-off-by: Eddie (Yaozu) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1fef0a7c83cc8ce89c6ea25225898da57ea68a63
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Mon Sep 3 16:56:58 2007 +0300

    KVM: VMX: Fix tpr threshold updating
    
    The TPR threshold must be updated only after any irqs are injected.
    
    Signed-off-by: Eddie (Yaozu) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 43803341a9873c3955f10352b30f5d449aae70b5
Author: He, Qing <qing.he@intel.com>
Date:   Thu Aug 30 17:04:26 2007 +0800

    KVM: disable tpr/cr8 sync when in-kernel APIC is used
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 966b840d0d7f52fdf2062772a1477f4d2536ab8f
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Mon Sep 3 16:15:12 2007 +0300

    KVM: Migrate lapic hrtimer when vcpu moves to another cpu
    
    This reduces overhead by accessing cachelines from the wrong node, as well
    as simplifying locking.
    
    Signed-off-by: Yaozu (Eddie) Dong <Eddie.Dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 50587b4ba6352cb87212b581f3e6a4b21ee5ff7f
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sat Aug 25 18:00:41 2007 +0300

    KVM: Keep track of missed timer irq injections
    
    APIC timer IRQ is set every time when a certain period
    expires at host time, but the guest may be descheduled
    at that time and thus the irq be overwritten by later fire.
    This patch keeps track of firing irq numbers and decreases
    only when the IRQ is injected to guest or buffered in
    APIC.
    
    Signed-off-by: Yaozu (Eddie) Dong <Eddie.Dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 09fe5ff51331cc089d9d02b33ef0fc512dcd8f69
Author: Yang, Sheng <sheng.yang@intel.com>
Date:   Mon Sep 3 16:37:44 2007 +0300

    KVM: VMX: Use shadow TPR/cr8 for 64-bits guests
    
    This patch enables TPR shadow of VMX on CR8 access. 64bit Windows using
    CR8 access TPR frequently. The TPR shadow can improve the performance of
    access TPR by not causing vmexit.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b2954a203243fa837c8c867163b00d2dee278048
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Mon Aug 6 16:29:07 2007 +0300

    KVM: pending irq save/restore
    
    Add in kernel irqchip save/restore support for pending vectors.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6546954a502dedf79401c7e5f564573df78e2f61
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Thu Sep 6 12:22:56 2007 +0300

    KVM: in-kernel LAPIC save and restore support
    
    This patch adds a new vcpu-based IOCTL to save and restore the local
    apic registers for a single vcpu. The kernel only copies the apic page as
    a whole, extraction of registers is left to userspace side. On restore, the
    APIC timer is restarted from the initial count, this introduces a little
    delay, but works fine.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3ae9777fe4c0fdc9688a8976ed105f537b5e2aea
Author: He, Qing <qing.he@intel.com>
Date:   Sun Aug 5 10:49:16 2007 +0300

    KVM: in-kernel IOAPIC save and restore support
    
    This patch adds support for in-kernel ioapic save and restore (to
    and from userspace). It uses the same get/set_irqchip ioctl as
    in-kernel PIC.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8002f6c199d9c8535ee7848f55c35d012433e75a
Author: He, Qing <qing.he@intel.com>
Date:   Thu Aug 2 14:03:07 2007 +0300

    KVM: Bypass irq_pending get/set when using in kernel irqchip
    
    vcpu->irq_pending is saved in get/set_sreg IOCTL, but when in-kernel
    local APIC is used, doing this may occasionally overwrite vcpu->apic to
    an invalid value, as in the vm restore path.
    
    Signed-off-by: Qing He <qing.he@intel.com>

commit aae5fefba1de58a016d5c49c92c79a58ed989721
Author: He, Qing <qing.he@intel.com>
Date:   Thu Jul 26 11:05:18 2007 +0300

    KVM: Add get/set irqchip ioctls for in-kernel PIC live migration support
    
    This patch adds two new ioctls to dump and write kernel irqchips for
    save/restore and live migration. PIC s/r and l/m is implemented in this
    patch.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6e031384c47b825dd70a6c412f214d3379ed1f83
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun Jul 22 10:36:31 2007 +0300

    KVM: Protect in-kernel pio using kvm->lock
    
    pio operation and IRQ_LINE kvm_vm_ioctl is not kvm->lock
    protected.  Add lock to same with IOAPIC MMIO operations.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 38557cfd5156f87f7f75623600b6b8c30e4e1ace
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Jul 18 12:15:21 2007 +0300

    KVM: Emulate hlt in the kernel
    
    By sleeping in the kernel when hlt is executed, we simplify the in-kernel
    guest interrupt path considerably.
    
    Signed-off-by: Gregory Haskins <ghaskins@novell.com>
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1fd2279b67369669c95e4474a9a1e0b0b6fbb060
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Jul 18 12:03:39 2007 +0300

    KVM: In-kernel I/O APIC model
    
    This allows in-kernel host-side device drivers to raise guest interrupts
    without going to userspace.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d69018f178ceb0cbb93fdb4795b1b503c6899162
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Thu Sep 6 12:22:56 2007 +0300

    KVM: Emulate local APIC in kernel
    
    Because lightweight exits (exits which don't involve userspace) are many
    times faster than heavyweight exits, it makes sense to emulate high usage
    devices in the kernel.  The local APIC is one such device, especially for
    Windows and for SMP, so we add an APIC model to kvm.
    
    It also allows in-kernel host-side drivers to inject interrupts without
    going through userspace.
    
    Signed-off-by: Yaozu (Eddie) Dong <Eddie.Dong@intel.com>
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7ae590c3ab0dab8a6e94965ccc2e77f9c6309a8d
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Jul 18 11:34:57 2007 +0300

    KVM: Define and use cr8 access functions
    
    This patch is to wrap APIC base register and CR8 operation which can
    provide a unique API for user level irqchip and kernel irqchip.
    This is a preparation of merging lapic/ioapic patch.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 127992c791d57fa7646e5ee8de60360ea3c0bd59
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Fri Jul 6 12:20:49 2007 +0300

    KVM: Add support for in-kernel PIC emulation
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 263773f7a6606efda85f7b184a067b5f560ed39b
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Thu Aug 23 16:33:11 2007 +0200

    KVM: VMX: Split segments reload in vmx_load_host_state()
    
    vmx_load_host_state() bundles fs, gs, ldt, and tss reloading into
    one in the hope that it is infrequent. With smp guests, fs reloading is
    frequent due to fs being used by threads.
    
    Unbundle the reloads to reduce expensive gs reloads.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f3e0aa2b4593e7d5dd064a3c56c919f85ef0d9eb
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Aug 22 18:09:29 2007 +0300

    KVM: X86 emulator: fix 'push reg' writeback
    
    Pointed out by Rusty Russell.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 46a948d80db725870da4ebdf5893d8efc426446d
Author: Izik Eidus <izike@qumranet.com>
Date:   Mon Aug 20 18:11:00 2007 +0300

    KVM: Support more memory slots
    
    Needed for mapping memory at 4GB.
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a88bbc1699461bab7479fdcef3ea1c12069acd1f
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Aug 20 15:44:39 2007 +0300

    KVM: MMU: Fix rare oops on guest context switch
    
    A guest context switch to an uncached cr3 can require allocation of
    shadow pages, but we only recycle shadow pages in kvm_mmu_page_fault().
    
    Move shadow page recycling to mmu_topup_memory_caches(), which is called
    from both the page fault handler and from guest cr3 reload.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit feeb915ce6cd7a5f51b2e56b6ff8dffb959a9594
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Aug 19 13:51:00 2007 +0300

    KVM: Avoid calling smp_call_function_single() with interrupts disabled
    
    When taking a cpu down, we need to hardware_disable() it.  Unfortunately,
    the CPU_DYING notifier is called with interrupts disabled, which means
    we can't use smp_call_function_single().  Fortunately, the CPU_DYING notifier
    is always called on the dying cpu, so we don't need to use the function at
    all and can simply call hardware_disable() directly.
    
    Tested-by: Paolo Ornati <ornati@fastwebnet.it>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 086f2ee50db8a1f39b0e17ab17d9c79b5964f0d7
Author: Izik Eidus <izike@qumranet.com>
Date:   Sun Aug 19 22:24:58 2007 +0300

    KVM: VMX: allow rmode_tss_base() to work with >2G of guest memory
    
    Signed-off-by: Izik Eidus <izike@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a843332b0445c9d60e4c9bda965b10cbe632a088
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sun Aug 19 11:07:06 2007 +0300

    KVM: x86 emulator: implement 'push reg' (opcodes 0x50-0x57)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b23f94e52c4cec9e5ad404ac1426c49d64902dbf
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sun Aug 19 11:03:13 2007 +0300

    KVM: x86 emulator: Implement 'jmp rel short' instruction (opcode 0xeb)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1bee2245738d473f57dcc973e16a497ff800c026
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Sun Aug 19 11:00:36 2007 +0300

    KVM: x86 emulator: implement 'jmp rel' instruction (opcode 0xe9)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1757c2eb9f4a6b3b3aac1026dd15f26eb5dea041
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Fri Aug 17 15:17:41 2007 +0300

    KVM: x86 emulator: implement 'and $imm, %{al|ax|eax}'
    
    Implement emulation of instruction
        and al imm8 (opcode 0x24)
        and ax/eax imm16/imm32 (opcode 0x25)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9bcbb7df435e154b148662df3721f71847a9342c
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Thu Aug 16 13:01:00 2007 +0300

    KVM: Communicate cr8 changes to userspace
    
    This allows running 64-bit Windows.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 93d097821cce141b3c74bbf20735c6dde443715f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Aug 15 15:23:34 2007 +0300

    KVM: Close minor race in signal handling
    
    We need to check for signals inside the critical section, otherwise a
    signal can be sent which we will not notice.  Also move the check
    before entry, so that if the signal happens before the first entry,
    we exit immediately instead of waiting for something to happen to the
    guest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 83aecfbf44f3ba92abde47957a3c9175f1ec7165
Author: Glauber de Oliveira Costa <gcosta@redhat.com>
Date:   Wed Aug 15 05:36:45 2007 +0300

    KVM: VMX: Don't require cr8 load/store exit capability when running on 32-bit
    
    This is because cr8 is not available on IA-32. It is just used in 64-bit mode.
    The rdmsr will then report this as not present, and it will lead us to return
    an -EIO.
    
    Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7a3773c7d8a0b488e86b98571e5b858a222b12a5
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Sun Aug 5 10:43:32 2007 +0300

    KVM: Clean up kvm_setup_pio()
    
    Split kvm_setup_pio() into two functions, one to setup in/out pio
    (kvm_emulate_pio()) and one to setup ins/outs pio (kvm_emulate_pio_string()).
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 091b206f6c56f2329e11bac2fa40d6f236ab0bc2
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Sun Aug 5 10:36:40 2007 +0300

    KVM: Cleanup string I/O instruction emulation
    
    Both vmx and svm decode the I/O instructions, and both botch the job,
    requiring the instruction prefixes to be fetched in order to completely
    decode the instruction.
    
    So, if we see a string I/O instruction, use the x86 emulator to decode it,
    as it already has all the prefix decoding machinery.
    
    This patch defines ins/outs opcodes in x86_emulate.c and calls
    emulate_instruction() from io_interception() (svm.c) and from handle_io()
    (vmx.c).  It removes all vmx/svm prefix instruction decoders
    (get_addr_size(), io_get_override(), io_address(), get_io_count())
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3415130f97a18f354853cab694d392553aa51af8
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Wed Aug 1 21:51:09 2007 +0300

    KVM: Remove useless assignment
    
    Line 1809 of kvm_main.c is useless, value is overwritten in line 1815:
    
    1809         now = min(count, PAGE_SIZE / size);
    1810
    1811         if (!down)
    1812                 in_page = PAGE_SIZE - offset_in_page(address);
    1813         else
    1814                 in_page = offset_in_page(address) + size;
    1815         now = min(count, (unsigned long)in_page / size);
    1816         if (!now) {
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 777d55128214491c48efa4b88355c3fa38c3b057
Author: Li, Xin B <xin.b.li@intel.com>
Date:   Wed Aug 1 21:49:10 2007 +0300

    KVM: VMX: Remove a duplicated ia32e mode vm entry control
    
    Remove a duplicated ia32e mode VM Entry control definition and use the
    proper one.
    
    Signed-off-by: Xin Li <xin.b.li@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b6d8a8dd56ee037b64af90085dd4bd54cbf16ac5
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Aug 1 14:46:11 2007 +1000

    KVM: Use kmem_cache_free for kmem_cache_zalloc'ed objects
    
    We use kfree in svm.c and vmx.c, and this works, but it could break at
    any time.  kfree() is supposed to match up with kmalloc().
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c4a026d15eecb4ec9769210aa31d7992f2b87c74
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Aug 1 10:48:02 2007 +1000

    KVM: Add and use pr_unimpl for standard formatting of unimplemented features
    
    All guest-invokable printks should be ratelimited to prevent malicious
    guests from flooding logs.  This is a start.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 07b7ac315dfb33f8e56a3c19572a96318b8cbc43
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Aug 1 10:17:06 2007 +1000

    KVM: Remove unneeded kvm_dev_open and kvm_dev_release functions.
    
    Devices don't need open or release functions.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 71cda75733ec3020a5ce57a31eaf300f007c67b2
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Aug 1 10:12:22 2007 +1000

    KVM: Remove stat_set from debugfs
    
    We shouldn't define stat_set on the debug attributes, since that will
    cause silent failure on writing: without a set argument, userspace
    will get -EACCES.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6e0bfce30aa37a7fd5dd9c041296dfa237dae728
Author: Gabriel C <nix.or.die@googlemail.com>
Date:   Wed Aug 1 16:23:10 2007 +0200

    KVM: Fix defined but not used warning in drivers/kvm/vmx.c
    
    move_msr_up() is used only on X86_64 and generates a warning on !X86_64
    
    Signed-off-by: Gabriel Craciunescu <nix.or.die@googlemail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e203ad4bcf11981df6fc1677fedbdb29f6fa38e8
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 31 20:46:12 2007 +1000

    KVM: Remove redundant alloc_vmcs_cpu declaration
    
    alloc_vmcs_cpu is already declared (static) above, no need to
    redeclare.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8716bbed1f90ec805ca20a0c3264e181278c08cd
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 31 20:42:42 2007 +1000

    KVM: SVM: Make set_msr_interception more reliable
    
    set_msr_interception() is used by svm to set up which MSRs should be
    intercepted.  It can only fail if someone has changed the code to try
    to intercept an MSR without updating the array of ranges.
    
    The return value is ignored anyway: it should just BUG() if it doesn't
    work.  (A build-time failure would be better, but that's tricky).
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a3573510c9b6a93fffaa118e58494d439c37a17a
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 31 20:41:14 2007 +1000

    KVM: Cleanup mark_page_dirty
    
    For some reason, mark_page_dirty open-codes __gfn_to_memslot().
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 087ba994ef1267032319ff2ec2d8addb8bc5a567
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 31 20:45:03 2007 +1000

    KVM: Don't assign vcpu->cr3 if it's invalid: check first, set last
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c53b35b292e58cf234aa7ca08fc679e61d4b291b
Author: Yang, Sheng <sheng.yang@intel.com>
Date:   Tue Jul 31 14:23:01 2007 +0300

    KVM: VMX: Add cpu consistency check
    
    All the physical CPUs on the board should support the same VMX feature
    set.  Add check_processor_compatibility to kvm_arch_ops for the consistency
    check.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2e3bac2a9a2d52b6f349296812c5b752249e3e30
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 31 19:57:47 2007 +1000

    KVM: kvm_vm_ioctl_get_dirty_log restore "nothing dirty" optimization
    
    kvm_vm_ioctl_get_dirty_log scans bitmap to see if it's all zero, but
    doesn't use that information.
    
    Avi says:
    	Looks like it was used to guard	kvm_mmu_slot_remove_write_access();
    	optimizing the case where the guest just leaves the screen alone (which
    	it usually does, especially in benchmarks).
    
    	I'd rather reinstate that optimization.  See
    	90cb0529dd230548a7f0d6b315997be854caea1b where the damage was done.
    
    It's pretty simple: if the bitmap is all zero, we don't need to do anything to
    clean it.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 68fa04ca20fb8cf79e171c37bffd74466f12ad2b
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 21:13:43 2007 +1000

    KVM: Use alignment properties of vcpu to simplify FPU ops
    
    Now we use a kmem cache for allocating vcpus, we can get the 16-byte
    alignment required by fxsave & fxrstor instructions, and avoid
    manually aligning the buffer.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0ce565a6fc253c87f26d51c506cd13554889a598
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 21:12:19 2007 +1000

    KVM: Use kmem cache for allocating vcpus
    
    Avi wants the allocations of vcpus centralized again.  The easiest way
    is to add a "size" arg to kvm_init_arch, and expose the thus-prepared
    cache to the modules.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 29b8a493b293639ae509c44386dc6a8ff79debd0
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Jul 30 13:41:19 2007 +0300

    KVM: Remove kvm_{read,write}_guest()
    
    ... in favor of the more general emulator_{read,write}_*.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6d2b86f131a3cbf370b4a65f6a6db63081cb6efb
Author: Laurent Vivier <Laurent.Vivier@bull.net>
Date:   Mon Jul 30 13:35:24 2007 +0300

    KVM: Change the emulator_{read,write,cmpxchg}_* functions to take a vcpu
    
    ... instead of a x86_emulate_ctxt, so that other callers can use it easily.
    
    Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 80917728e43e248155c019f743655806b582b099
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Jul 30 15:56:36 2007 +0300

    KVM: x86 emulator: disable writeback for debug register instructions
    
    These are handled internally by the instruction.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1c23728a5acd3a1fe5d628e23e3e4c27ee77118f
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 20:08:05 2007 +1000

    KVM: SVM: internal function name cleanup
    
    Changes some svm.c internal function names:
    1) io_adress -> io_address  (de-germanify the spelling)
    2) kvm_reput_irq -> reput_irq  (it's not a generic kvm function)
    3) kvm_do_inject_irq -> (it's not a generic kvm function)
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 61736efb5398154eceafcce0337fe0621d7eeeb0
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 20:07:08 2007 +1000

    KVM: SVM: de-containization
    
    container_of is wonderful, but not casting at all is better.  This
    patch changes svm.c's internal functions to pass "struct vcpu_svm"
    instead of "struct kvm_vcpu" and using container_of.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b15c5febefc05f04b5db04552bef18a6902e657c
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 16:41:57 2007 +1000

    KVM: Remove three magic numbers
    
    There are several places where hardcoded numbers are used in place of
    the easily-available constant, which is poor form.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b21514dab8c88570bf2078249881a8210e50bafa
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 16:31:43 2007 +1000

    KVM: VMX: pass vcpu_vmx internally
    
    container_of is wonderful, but not casting at all is better.  This
    patch changes vmx.c's internal functions to pass "struct vcpu_vmx"
    instead of "struct kvm_vcpu" and using container_of.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7d3fd03221bb8352a263249e6adb1232064e4341
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 30 16:29:56 2007 +1000

    KVM: fx_init() needs preemption disabled while it plays with the FPU state
    
    Now that kvm generally runs with preemption enabled, we need to protect
    the fpu initialization sequence.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 985bc8087daf3719d89e5ed28fe59eecd58fae71
Author: Shaohua Li <shaohua.li@intel.com>
Date:   Mon Jul 23 14:51:37 2007 +0800

    KVM: Convert vm lock to a mutex
    
    This allows the kvm mmu to perform sleepy operations, such as memory
    allocation.
    
    Signed-off-by: Shaohua Li <shaohua.li@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8928fb48c7a7f9053a55f1d0023cbc533f2b3663
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jul 11 18:17:21 2007 +0300

    KVM: Use the scheduler preemption notifiers to make kvm preemptible
    
    Current kvm disables preemption while the new virtualization registers are
    in use.  This of course is not very good for latency sensitive workloads (one
    use of virtualization is to offload user interface and other latency
    insensitive stuff to a container, so that it is easier to analyze the
    remaining workload).  This patch re-enables preemption for kvm; preemption
    is now only disabled when switching the registers in and out, and during
    the switch to guest mode and back.
    
    Contains fixes from Shaohua Li <shaohua.li@intel.com>.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 510144c386fb650a5530311721ae9d90bf12eaee
Author: Yang, Sheng <sheng.yang@intel.com>
Date:   Sun Jul 29 11:07:42 2007 +0300

    KVM: VMX: Improve the method of writing vmcs control
    
    Put the cpu feature detection part in hardware_setup, and store the vmcs
    condition in a global variable for further checks.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fbc4f2e23aa26a8537f8f147c75a632e498c39c7
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Fri Jul 27 17:16:56 2007 +1000

    KVM: Dynamically allocate vcpus
    
    This patch converts the vcpus array in "struct kvm" to a pointer
    array, and changes the "vcpu_create" and "vcpu_setup" hooks into one
    "vcpu_create" call which does the allocation and initialization of the
    vcpu (calling back into the kvm_vcpu_init core helper).
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6532f26b4f39a409475918da47844eaff219f50b
Author: Gregory Haskins <ghaskins@novell.com>
Date:   Fri Jul 27 08:13:10 2007 -0400

    KVM: Remove arch specific components from the general code
    
    struct kvm_vcpu has vmx-specific members; remove them to a private structure.
    
    Signed-off-by: Gregory Haskins <ghaskins@novell.com>
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 35b8e2b29b372ab285819c3b84d6db1d0165998b
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Jul 25 13:29:51 2007 +1000

    KVM: load_pdptrs() cleanups
    
    load_pdptrs can be handed an invalid cr3, and it should not oops.
    This can happen because we injected #gp in set_cr3() after we set
    vcpu->cr3 to the invalid value, or from kvm_vcpu_ioctl_set_sregs(), or
    memory configuration changes after the guest did set_cr3().
    
    We should also copy the pdpte array once, before checking and
    assigning, otherwise an SMP guest can potentially alter the values
    between the check and the set.
    
    Finally one nitpick: ret = 1 should be done as late as possible: this
    allows GCC to check for unset "ret" should the function change in
    future.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9cb698bd020974a7e950eca6285254b50b0b64d5
Author: Aurelien Jarno <aurelien@aurel32.net>
Date:   Wed Jul 25 11:41:57 2007 +0200

    KVM: Remove dead code in the cmpxchg instruction emulation
    
    The writeback fixes (02c03a326a5df825cc01de426f72e160db2b9538) left
    some dead code in the cmpxchg instruction emulation. Remove it.
    
    Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d9cbd1d77543d731f31e8ea5d1738d4aad81694a
Author: Sheng Yang <sheng.yang@intel.com>
Date:   Wed Jul 25 12:17:06 2007 +0300

    KVM: VMX: Import some constants of vmcs from IA32 SDM
    
    This patch mainly imports some constants and renames two existing constants
    of vmcs according to IA32 SDM.
    
    It also adds two constants to indicate Lock bit and Enable bit in
    MSR_IA32_FEATURE_CONTROL, and replaces the hardcoded _5_ with these two
    bits.
    
    Signed-off-by: Sheng Yang <sheng.yang@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>
    
    --

commit bfa6c62f98bd0602025d7b48e267d817082f5d07
Author: Aurelien Jarno <aurelien@aurel32.net>
Date:   Wed Jul 25 10:19:54 2007 +0200

    KVM: disable writeback for 0x0f 0x01 instructions.
    
    0x0f 0x01 instructions (ie lgdt, lidt, smsw, lmsw and invlpg) do
    not use writeback. This patch sets no_wb=1 when emulating those
    instructions.
    
    This fixes a regression booting the FreeBSD kernel on AMD.
    
    Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 24beb1e24843f05c3acfd20fc2fbcf4f5ab18ec7
Author: Shaohua Li <shaohua.li@intel.com>
Date:   Mon Jul 23 14:51:39 2007 +0800

    KVM: Move gfn_to_page out of kmap/unmap pairs
    
    gfn_to_page might sleep with swap support. Move it out of the kmap calls.
    
    Signed-off-by: Shaohua Li <shaohua.li@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 33c5dfed96a8cb19ccc2e08073ef97e5c731dae3
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jul 25 09:22:12 2007 +0300

    KVM: Fix removal of nx capability from guest cpuid
    
    Testing the wrong bit caused kvm not to disable nx on the guest when it is
    disabled on the host (an mmu optimization relies on the nx bits being the
    same in the guest and host).
    
    This allows Windows to boot when nx is disabled on the host (e.g. when
    host pae is disabled).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d4faaba7b1ac40b96709dc244e7d81058918a08
Author: Shaohua Li <shaohua.li@intel.com>
Date:   Mon Jul 23 14:51:32 2007 +0800

    KVM: Hoist kvm_mmu_reload() out of the critical section
    
    vmx_cpu_run doesn't handle error correctly and kvm_mmu_reload might
    sleep with mutex changes, so I move it above.
    
    Signed-off-by: Shaohua Li <shaohua.li@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b41e5014dd8712e8de2b656617f7a7a158cd992a
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Jul 23 18:33:14 2007 +0300

    Revert "KVM: Avoid useless memory write when possible"
    
    This reverts commit 8a1449563b3e5ede56b28cc977c8da22a17cdf51.  While it
    does save useless updates, it (probably) defeats the fork detector, causing
    a massive performance loss.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4d69bc0c78587849583d63ada004c82dc6277829
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 23 17:11:02 2007 +1000

    KVM: Return if the pdptrs are invalid when the guest turns on PAE.
    
    Don't fall through and turn on PAE in this case.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e8c2eb98b58dd135b14d87e6dd1d621bc630d919
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Mon Jul 23 17:08:21 2007 +1000

    KVM: Fix unlikely kvm_create vs decache_vcpus_on_cpu race
    
    We add the kvm to the vm_list before initializing the vcpu mutexes,
    which can be mutex_trylock()'ed by decache_vcpus_on_cpu().
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aae0954ed6ac2a00ee76fd209aa2a39bb2f43a0c
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jul 22 18:48:54 2007 +0300

    KVM: Correctly handle writes crossing a page boundary
    
    Writes that are contiguous in virtual memory may not be contiguous in
    physical memory; so split writes that straddle a page boundary.
    
    Thanks to Aurelien for reporting the bug, patient testing, and a fix
    to this very patch.
    
    Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 76f0301b5e4d2603d8e1ee5295db29faea660b49
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jul 22 15:51:58 2007 +0300

    KVM: x86 emulator: fix faulty check for two-byte opcode
    
    Right now, the bug is harmless as we never emulate one-byte 0xb6 or 0xb7.
    But things may change.
    
    Noted by the mysterious Gabriel C.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 86ba3093d785da1d2d1c5ecbf060d91edd7a5092
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jul 22 12:32:57 2007 +0300

    KVM: Require CONFIG_ANON_INODES
    
    Found by Sebastian Siewior and randconfig.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6da018860ce19321e25b685b72f3836d243c2137
Author: Avi Kivity <avi@qumranet.com>
Date:   Sat Jul 21 09:00:21 2007 +0300

    KVM: MMU: Fix cleaning up the shadow page allocation cache
    
    __free_page() wants a struct page, not a virtual address.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 29530eb22ba3b0baf260e2767cb125b61151ed25
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Jul 20 12:30:58 2007 +0300

    KVM: x86 emulator: fix cmov for writeback changes
    
    The writeback fixes (02c03a326a5df825cc01de426f72e160db2b9538) broke
    cmov emulation.  Fix.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 92bd26eb2a199716ceeb5604b8f9f5ed7e69ac3d
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Jul 20 08:18:27 2007 +0300

    KVM: MMU: Fix oopses with SLUB
    
    The kvm mmu uses page->private on shadow page tables; so does slub, and
    an oops results.  Fix by allocating regular pages for shadows instead of
    using slub.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 860852357a6590299a273f1141dbf1871df0b491
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:37:17 2007 +1000

    KVM: Use standard CR8 flags, and fix TPR definition
    
    Intel manual (and KVM definition) say the TPR is 4 bits wide.  Also fix
    CR8_RESEVED_BITS typo.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Acked-by: H. Peter Anvin <hpa@zytor.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 56282e5368afbc8ec6eebb6413bbb2ec0733d0ed
Author: Jeff Dike <jdike@addtoit.com>
Date:   Tue Jul 17 12:26:59 2007 -0400

    KVM: Set exit_reason to KVM_EXIT_MMIO where run->mmio is initialized.
    
    Signed-off-by: Jeff Dike <jdike@linux.intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7e5437f39897a09e79e69bd0c8d4641f13715cc4
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Wed Jul 18 13:05:58 2007 +1000

    KVM: Trivial: Use standard BITMAP macros, open-code userspace-exposed header
    
    Creating one's own BITMAP macro seems suboptimal: if we use manual
    arithmetic in the one place exposed to userspace, we can use standard
    macros elsewhere.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0dfb860def58bfb2daa000af490ed1986373fea5
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:34:16 2007 +1000

    Use standard CR4 flags, tighten checking
    
    On this machine (Intel), writing to the CR4 bits 0x00000800 and
    0x00001000 cause a GPF.  The Intel manual is a little unclear, but
    AFAICT they're reserved, too.
    
    Also fix spelling of CR4_RESEVED_BITS.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2aee2b5274884f40475fe9ad6a7f7a3d608e0ea4
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:32:55 2007 +1000

    Use standard CR3 flags, tighten checking
    
    The kernel now has asm/cpu-features.h: use those macros instead of inventing
    our own.
    
    Also spell out definition of CR3_RESEVED_BITS, fix spelling and
    tighten it for the non-PAE case.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 688e14654b3ffb0292a209c052e7579948b17f27
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:19:08 2007 +1000

    KVM: Trivial: Use standard CR0 flags macros from asm/cpu-features.h
    
    The kernel now has asm/cpu-features.h: use those macros instead of
    inventing our own.
    
    Also spell out definition of CR0_RESEVED_BITS (no code change) and fix typo.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0da5e37f4dc3df7a941ddba8863b289863e8dd40
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:17:55 2007 +1000

    KVM: Trivial: Avoid hardware_disable predeclaration
    
    Don't pre-declare hardware_disable: shuffle the reboot hook down.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 24356bfad9c4b8ba70920153aec00e78698ccb9a
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:16:56 2007 +1000

    KVM: Trivial: Comment spelling may escape grep
    
    Speling error in comment.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 793551cce1b90fac232e0a38269247815fb0d02a
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:16:11 2007 +1000

    KVM: Trivial: Make decode_register() static
    
    I have shied away from touching x86_emulate.c (it could definitely use
    some love, but it is forked from the Xen code, and it would be more
    productive to cross-merge fixes).
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 53df15a3cae92d4528dc8de21132bed3aa929ca1
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:15:29 2007 +1000

    KVM: Trivial: Remove unused struct cpu_user_regs declaration
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a9531af471c86779d28ba973cf5f54f82cfbdb8d
Author: Rusty Russell <rusty@rustcorp.com.au>
Date:   Tue Jul 17 23:12:26 2007 +1000

    KVM: Trivial: /dev/kvm interface is no longer experimental.
    
    KVM interface is no longer experimental.
    
    Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 817d90b391f6c51d07bf9d6a94778a5957d46f65
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jul 17 14:20:30 2007 +0300

    KVM: x86 emulator: implement rdmsr and wrmsr
    
    Allow real-mode emulation of rdmsr and wrmsr.  This allows smp Windows to
    boot, presumably for its sipi trampoline.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 66d8a4e4d4bd470216028daabb9d887b73259c96
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jul 17 13:04:56 2007 +0300

    KVM: Fix memory slot management functions for guest smp
    
    The memory slot management functions were oriented against vcpu 0, where
    they should be kvm-wide.  This causes hangs starting X on guest smp.
    
    Fix by making the functions (and resultant tail in the mmu) non-vcpu-specific.
    Unfortunately this reduces the efficiency of the mmu object cache a bit.  We
    may have to revisit this later.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4dd0d9a876db49da29185c868cbea6c77c09c600
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Tue Jul 17 11:52:33 2007 +0300

    KVM: In-kernel string pio write support
    
    Add string pio write support to support some version of Windows.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7bb566d5c8661a179106579978c0c606e7fa8a93
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jul 17 11:45:55 2007 +0300

    KVM: Future-proof the exit information union ABI
    
    Note that as the size of struct kvm_run is not part of the ABI, we can add
    things at the end.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f2973ff11f9f8ef4b90413cea9cedd7f20639e3e
Author: Jeff Dike <jdike@addtoit.com>
Date:   Mon Jul 16 15:24:47 2007 -0400

    KVM - add hypercall nr to kvm_run
    
    Add the hypercall number to kvm_run and initialize it.  This changes the ABI,
    but as this particular ABI was unusable before this no users are affected.
    
    Signed-off-by: Jeff Dike <jdike@linux.intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 973ae594c1a65936fc09acab412be51d97b703b9
Author: Qing He <qing.he@intel.com>
Date:   Thu Jul 12 12:33:56 2007 +0300

    KVM: SMP: Add vcpu_id field in struct vcpu
    
    This patch adds a `vcpu_id' field in `struct vcpu', so we can
    differentiate BSP and APs without pointer comparison or arithmetic.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9f5aa99d6256aa14b64683283ba1c4be910bc67e
Author: Nguyen Anh Quynh <aquynh@gmail.com>
Date:   Wed Jul 11 14:30:54 2007 +0300

    KVM: Fix *nopage() in kvm_main.c
    
    *nopage() in kvm_main.c should only store the type of mmap() fault if
    the pointers are not NULL. This patch fixes the problem.
    
    Signed-off-by: Nguyen Anh Quynh <aquynh@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6287464e41b2b520d78d417f3d1b37aca9202a04
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jul 10 17:50:55 2007 +0300

    KVM: MMU: Store nx bit for large page shadows
    
    We need to distinguish between large page shadows which have the nx bit set
    and those which don't.  The problem shows up when booting a newer smp Linux
    kernel, where the trampoline page (which is in real mode, which uses the
    same shadow pages as large pages) is using the same mapping as a kernel data
    page, which is mapped using nx, causing kvm to spin on that page.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a737ba627a98f2ae66c308148c9c967c73f13f5d
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 13:11:41 2007 +0300

    KVM: Use CPU_DYING for disabling virtualization
    
    Only at the CPU_DYING stage can we be sure that no user process will
    be scheduled onto the cpu and oops when trying to use virtualization
    extensions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4fba051d7ec9ec1961f477d9a20311d8432738b7
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 13:09:41 2007 +0300

    KVM: Tune hotplug/suspend IPIs
    
    The hotplug IPIs can be called from the cpu on which we are currently
    running, so use on_cpu().  Similarly, drop on_each_cpu() for the
    suspend/resume callbacks, as we're in atomic context here and only one
    cpu is up anyway.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 63e8e638342401a5fd04ec310c5d0695c645e444
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 13:03:52 2007 +0300

    KVM: Keep track of which cpus have virtualization enabled
    
    By keeping track of which cpus have virtualization enabled, we
    prevent double-enable or double-disable during hotplug, which is a
    very fatal oops.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9b6f4dedfeb83190b6196fe201e2f33c97de1c73
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 12:42:10 2007 +0300

    SMP: Implement on_cpu()
    
    This defines on_cpu() which is similar to smp_call_function_single()
    except that it works if cpu happens to be the current cpu.  Can also be
    seen as a complement to on_each_cpu() (which also doesn't treat the
    current cpu specially).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 55971a0f3faab6ecdce1e17dafc6d968f3236ade
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 12:37:34 2007 +0300

    HOTPLUG: Adapt thermal throttle to CPU_DYING
    
    CPU_DYING is notified in atomic context, so no taking mutexes here.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 529bd39d193eeae66a7c0fc3b12169ea566dc0e5
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 12:33:15 2007 +0300

    HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING
    
    CPU_DYING is called in atomic context, so don't try to take any locks.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 33e6f5c2bd102cb43a1e9ae5fe210b0d5f9ac69f
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 12:23:10 2007 +0300

    HOTPLUG: Add CPU_DYING notifier
    
    KVM wants a notification when a cpu is about to die, so it can disable
    hardware extensions, but at a time when user processes cannot be scheduled
    on the cpu, so it doesn't try to use virtualization extensions after they
    have been disabled.
    
    This adds a CPU_DYING notification.  The notification is called in atomic
    context on the doomed cpu.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0d9c57e0a7ee426096af3d79114d23e50ed6d42b
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jul 8 11:15:32 2007 +0300

    KVM: Fix svm availability check miscompile on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 222a35d12ad9ef4f4a97da496f0e038e94681d3b
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jun 28 14:15:57 2007 -0400

    KVM: Clean up #includes
    
    Remove unnecessary ones, and rearrange the remaining in the standard order.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 41ac4b23696b12fec15191969bc18da42359861d
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jun 28 08:38:16 2007 -0400

    KVM: Remove kvmfs in favor of the anonymous inodes source
    
    kvm uses a pseudo filesystem, kvmfs, to generate inodes, a job that the
    new anonymous inodes source does much better.
    
    Cc: Davide Libenzi <davidel@xmailserver.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cfc329b216bc3e54fe1107e8f714c7b3bc133224
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Fri Jun 22 12:29:50 2007 +0300

    KVM: SVM: Reliably detect if SVM was disabled by BIOS
    
    This patch adds an implementation to the svm is_disabled function to
    detect reliably if the BIOS disabled the SVM feature in the CPU. This
    fixes the issues with kernel panics when loading the kvm-amd module on
    machines where SVM is available but disabled.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a2a8a256f8d4ff1595900b810fea90e5e5911b6d
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jun 21 11:54:45 2007 +0300

    KVM: VMX: Remove unnecessary code in vmx_tlb_flush()
    
    A vmexit implicitly flushes the tlb; the code is bogus.
    
    Noted by Shaohua Li.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 37ebbf17fbf71ec261c57c1404ac7c50ade97c13
Author: Shaohua Li <shaohua.li@intel.com>
Date:   Wed Jun 20 17:13:26 2007 +0800

    KVM: MMU: Fix Wrong tlb flush order
    
    Need to flush the tlb after updating a pte, not before.
    
    Signed-off-by: Shaohua Li <shaohua.li@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 030421334ae91b7f6302a1cfe9c971a8991b4870
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jun 20 11:20:04 2007 +0300

    KVM: VMX: Reinitialize the real-mode tss when entering real mode
    
    Protected mode code may have corrupted the real-mode tss, so re-initialize
    it when switching to real mode.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8a1449563b3e5ede56b28cc977c8da22a17cdf51
Author: Luca Tettamanti <kronos.it@gmail.com>
Date:   Tue Jun 19 22:41:38 2007 +0200

    KVM: Avoid useless memory write when possible
    
    When writing to normal memory and the memory area is unchanged the write
    can be safely skipped, avoiding the costly kvm_mmu_pte_write.
    
    Signed-Off-By: Luca Tettamanti <kronos.it@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ba9c20c048726037664d303362b688759fdf6e9d
Author: Luca Tettamanti <kronos.it@gmail.com>
Date:   Tue Jun 19 22:41:20 2007 +0200

    KVM: Fix x86 emulator writeback
    
    When the old value and new one are the same the emulator skips the
    write; this is undesirable when the destination is a MMIO area and the
    write shall be performed regardless of the previous value. This
    optimization breaks e.g. a Linux guest APIC compiled without
    X86_GOOD_APIC.
    
    Remove the check and perform the writeback stage in the emulation unless
    it's explicitly disabled (currently push and some 2 bytes instructions
    may disable the writeback).
    
    Signed-Off-By: Luca Tettamanti <kronos.it@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8e770bbe8651e8d13e1d09d426657fbed0fe052a
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Tue Jun 19 18:05:03 2007 +0300

    KVM: Add support for in-kernel pio handlers
    
    Useful for the PIC and PIT.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ecd01fac443e69a574cb064d44e78ff783a1e1a4
Author: Gregory Haskins <ghaskins@novell.com>
Date:   Thu May 31 14:08:58 2007 -0400

    KVM: VMX: Fix interrupt checking on lightweight exit
    
    With kernel-injected interrupts, we need to check for interrupts on
    lightweight exits too.
    
    Signed-off-by: Gregory Haskins <ghaskins@novell.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit af93971fab7729229a45ecd64c72f56421bbcd0f
Author: Gregory Haskins <ghaskins@novell.com>
Date:   Thu May 31 14:08:53 2007 -0400

    KVM: Adds support for in-kernel mmio handlers
    
    Signed-off-by: Gregory Haskins <ghaskins@novell.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e0d1fb847d117124da53145b2d9b7f4d3da8e82c
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Jun 19 11:21:15 2007 +0300

    KVM: Implement emulation of instruction "ret" (opcode 0xc3)
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 246e9cd14121973b3c653b990d80bcd1c2163dd5
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Tue Jun 19 11:16:04 2007 +0300

    KVM: Implement emulation of "pop reg" instruction (opcode 0x58-0x5f)
    
    For use in real mode.
    
    Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b0c4137315fc6f711fd3a0fc82aedb61a2536ac9
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Jun 17 12:24:23 2007 +0300

    KVM: Bring local tree in line with origin
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6685637b211ad67bdce21bfd9f91bc888b3acb4f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jun 13 19:55:28 2007 +0300

    KVM: VMX: Ensure vcpu time stamp counter is monotonic
    
    If the time stamp counter goes backwards, a guest delay loop can become
    infinite.  This can happen if a vcpu is migrated to another cpu, where
    the counter has a lower value than the first cpu.
    
    Since we're doing an IPI to the first cpu anyway, we can use that to pick
    up the old tsc, and use that to calculate the adjustment we need to make
    to the tsc offset.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8aefa5d7ac55d487af62755545ecc02bc53678af
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Jun 13 19:43:19 2007 +0300

    KVM: Initialize the BSP bit in the APIC_BASE msr correctly
    
    Needs to be set on vcpu 0 only.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 218179e7978af0308bcbd08f6c43bd5b3607a909
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 12 08:58:13 2007 +0300

    KVM: Require a cpu which can set 64-bit values atomically
    
    set_64bit() is not available on 80386 and i486.  Noticed by Adrian Bunk.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 74a54c5cfe3a1ea3777964a9e8e7bef119ca549b
Author: Shani Moideen <shani.moideen@wipro.com>
Date:   Mon Jun 11 09:31:33 2007 +0530

    KVM: VMX: Replace memset(<addr>, 0, PAGESIZE) with clear_page(<addr>)
    
    Signed-off-by: Shani Moideen <shani.moideen@wipro.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ff4d2f93a9459aa820b56a59e9dbd3967aa407ce
Author: Shani Moideen <shani.moideen@wipro.com>
Date:   Mon Jun 11 09:28:26 2007 +0530

    KVM: SVM: Replace memset(<addr>, 0, PAGESIZE) with clear_page(<addr>)
    
    Signed-off-by: Shani Moideen <shani.moideen@wipro.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3105c9a9a2d5f64c9e67745120b8ee5c205847a3
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jun 7 19:18:30 2007 +0300

    KVM: Flush remote tlbs when reducing shadow pte permissions
    
    When a vcpu causes a shadow tlb entry to have reduced permissions, it
    must also clear the tlb on remote vcpus.  We do that by:
    
    - setting a bit on the vcpu that requests a tlb flush before the next entry
    - if the vcpu is currently executing, we send an ipi to make sure it
      exits before we continue
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2c3ac418d752e7f73ca0d9081a4377278432d565
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Jun 7 19:11:53 2007 +0300

    KVM: Keep an upper bound of initialized vcpus
    
    That way, we don't need to loop for KVM_MAX_VCPUS for a single vcpu
    vm.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7ca30c3f2efbf9ab5ab595d9bc3e0bd3b705aba1
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 5 16:15:51 2007 +0300

    KVM: Emulate hlt on real mode for Intel
    
    This has two use cases: the bios can't boot from disk, and guest smp
    bootstrap.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e7ebb74dbacc100cfd621157ac63b95e63e3292d
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 5 15:53:05 2007 +0300

    KVM: Move duplicate halt handling code into kvm_main.c
    
    Will soon have a third user.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a80408da7a05e0be2ae99ad47dafd4bb4bc847cd
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 5 14:37:09 2007 +0300

    KVM: Enable guest smp
    
    As we don't support guest tlb shootdown yet, this is only reliable
    for real-mode guests.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 80b70c068ce4333e5e1242f32f538835a4e5d896
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 5 14:36:10 2007 +0300

    KVM: Fix adding an smp virtual machine to the vm list
    
    If we add the vm once per vcpu, we corrupt the list if the guest has
    multiple vcpus.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 16fb83998b62717831dca3d913455091c855b3cd
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Jun 5 12:17:03 2007 +0300

    KVM: Fix vcpu freeing for guest smp
    
    A vcpu can pin up to four mmu shadow pages, which means the freeing
    loop will never terminate.  Fix by first unpinning shadow pages on
    all vcpus, then freeing shadow pages.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 55ae364d6a882c94511db17e8023c8976d44cd2d
Author: Nguyen Anh Quynh <aquynh@gmail.com>
Date:   Tue Jun 5 10:35:19 2007 +0300

    KVM: Remove unnecessary initialization and checks in mark_page_dirty()
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0ae1aebcc9825fba4d115c197e9c099fd9644caf
Author: Robert P. J. Day <rpjday@mindspring.com>
Date:   Sun Jun 3 13:35:29 2007 -0400

    KVM: Replace C code with call to ARRAY_SIZE() macro.
    
    Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4b82b37a35a085a07d9ed84efee06c69655fd3d1
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Jun 4 15:58:30 2007 +0300

    KVM: Lazy guest cr3 switching
    
    Switching the guest paging context may require us to allocate memory, which
    might fail.  Instead of wiring up error paths everywhere, make context
    switching lazy and actually do the switch before the next guest entry,
    where we can return an error if allocation fails.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fa8cfb020b0ef0acef94ddc9035b932308840314
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Jun 4 11:11:23 2007 +0300

    KVM: VMX: Fix asm constraint
    
    "g" can select a memory location, in which case size information is lost
    and gas needs an instruction suffix.  Since the suffix is different for
    i386 and x86_64, we simply change the constraint to "r".
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 63275ba244275719d6fd4d77c10d6b15586aa727
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 18:28:51 2007 +0300

    KVM: MMU: Remove unused large page marker
    
    This has not been used for some time, as the same information is available
    in the page header.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 21e3670e57c34809d4c141ce1dde4fd8b23a4d60
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 18:24:09 2007 +0300

    KVM: MMU: Don't cache guest access bits in the shadow page table
    
    This was once used to avoid accessing the guest pte when upgrading
    the shadow pte from read-only to read-write.  But usually we need
    to set the guest pte dirty or accessed bits anyway, so this wasn't
    really exploited.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 319d035ef290b510edb7f848d41098c31ceaace0
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 18:20:14 2007 +0300

    KVM: MMU: Simplify accessed/dirty/present/nx bit handling
    
    Always set the accessed and dirty bit (since having them cleared causes
    a read-modify-write cycle), always set the present bit, and copy the
    nx bit from the guest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 080e7fd753ec60140ea89ebb0ea94625ae541534
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 17:17:06 2007 +0300

    KVM: MMU: Remove cr0.wp tricks
    
    No longer needed as we do everything in one place.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cc9d465c7a9ef3a109814fa866676f876ff42133
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 15:46:04 2007 +0300

    KVM: MMU: Make setting shadow ptes atomic on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 823c30e8740ad71bd9556f3cd235231ad00bfa55
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 15:23:35 2007 +0300

    KVM: Make shadow pte updates atomic
    
    With guest smp, a second vcpu might see partial updates when the first
    vcpu services a page fault.  So delay all updates until we have figured
    out what the pte should look like.
    
    Note that on i386, this is still not completely atomic as a 64-bit write
    will be split into two on a 32-bit machine.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b7bd6888968e797f2deaa4aa9f98466a2371392b
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 15:14:09 2007 +0300

    KVM: Move shadow pte modifications from set_pte/set_pde to set_pde_common()
    
    We want all shadow pte modifications in one place.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b70ccb0b3fd4ac02c0f6cf5153008c736fa27710
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 15:08:29 2007 +0300

    KVM: MMU: Fold fix_write_pf() into set_pte_common()
    
    This prevents some work from being performed twice, and, more importantly,
    reduces the number of places where we modify shadow ptes.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ad5555224aa01b2ddcc45ab9f0172b5497a7cd5d
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 11:56:54 2007 +0300

    KVM: MMU: Fold fix_read_pf() into set_pte_common()
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3f1380d422cbd5b9231c3e997e4cbec000e3a08f
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 31 11:45:18 2007 +0300

    KVM: MMU: Pass the guest pde to set_pte_common
    
    We will need the accessed bit (in addition to the dirty bit) and
    also write access (for setting the dirty bit) in a future patch.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5fe13ee0e2b404dd34dea17ec0849b4a940a5755
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 30 19:31:17 2007 +0300

    KVM: MMU: Move set_pte_common() to pte width dependent code
    
    In preparation of some modifications.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5ada0f87635fa10a40a22b8b249c3d1fedb79840
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 30 14:21:51 2007 +0300

    KVM: MMU: Simplify fetch() a little bit
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 67310badceaed0519cb8efbe6054d790563ea136
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 30 12:34:53 2007 +0300

    KVM: MMU: Use slab caches for shadow pages and their headers
    
    Use slab caches instead of a simple custom list.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6d9d80f421f77da043b8b6898e01327763adecd2
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Tue May 29 15:07:21 2007 +0300

    KVM: Use symbolic constants instead of magic numbers
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4eaa906699812e2e28c3237cfedd8c21cbd17c4b
Author: Markus Rechberger <markus.rechberger@amd.com>
Date:   Sun May 27 10:46:52 2007 +0300

    KVM: Fix includes
    
    KVM compilation fails for some .configs.  This fixes it.
    
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d67c455e06a1eaf8ab20b5c4e51f4ae8271b2637
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu May 24 11:17:33 2007 +0300

    KVM: x86 emulator: implement wbinvd
    
    Vista seems to trigger it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fc1193d546ec21c279a8e4e3e9eaf999275b2223
Author: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date:   Wed May 23 14:22:11 2007 -0700

    Use menuconfig objects II - KVM/Virt
    
    Make a "menuconfig" out of the Kconfig objects "menu, ..., endmenu",
    so that the user can disable all the options in that menu at once
    instead of having to disable each option separately.
    
    Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit a6935dbdaa7278d5e4a4d7478f29462f2a5db7fe
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon May 21 09:15:47 2007 +0300

    KVM: VMX: Remove warnings on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1ab29f3fb765b08e65de563d9053d4d05cc95f52
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Mon May 21 07:28:09 2007 +0300

    KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit
    
    MSR_EFER.LME/LMA bits are automatically saved/restored by VMX
    hardware; KVM only needs to save the NX/SCE bits at the time of a
    heavyweight VM exit. But clearing the NX bit in the host environment
    may cause a system hang if the host page table is using EXB bits,
    thus we leave the NX bit as it is. If host NX=1 and guest NX=0, we
    can do a guest page table EXB bits check before inserting a shadow
    pte (though no guest is expecting to see this kind of gp fault).
    If host NX=0, we do not present the Execute-Disable feature to the
    guest, thus there is no host NX=0, guest NX=1 combination.
    
    This patch reduces raw vmexit time by ~27%.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 64ce9a0cf0960f9a029e54d1bffc06123d3b5893
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun May 20 16:28:59 2007 +0300

    KVM: VMX: Fix a typo which mixes X86_64 and CONFIG_X86_64
    
    This prevents compilation on 64-bits.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cc1d717e078464a049cf8364417ec44267cd6143
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Sun May 20 10:50:08 2007 +0300

    KVM: VMX: Cleanup redundant code in MSR set
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8bf50c5c6b2af81355412ec1696a7e2c8ad940f2
Author: Daniel Hecken <dh@bahntechnik.de>
Date:   Sun May 20 10:32:14 2007 +0300

    KVM: VMX: Compile-fix for 32-bit hosts
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f552bf62c86b383dd74030c5830c8043bf41e0bd
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Thu May 17 18:55:15 2007 +0300

    KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit
    
    In a lightweight exit (where we exit and reenter the guest without
    scheduling or exiting to userspace in between), we don't need various
    msrs on the host, and avoiding shuffling them around reduces raw exit
    time by 8%.
    
    Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8edb11391b763357734cc5fd293d788d8591e6d7
Author: Nitin A Kamble <nitin.a.kamble@intel.com>
Date:   Thu May 17 15:50:34 2007 +0300

    KVM: VMX: Handle #SS faults from real mode
    
    Instructions with the address size override prefix (opcode 0x67)
    cause a #SS fault with error code 0 in VM86 mode.  Forward
    them to the emulator.
    
    Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bdf3f418471ba3c65aa78a1943da179d8320fdf8
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon May 14 20:41:13 2007 +0300

    KVM: VMX: Use local labels in inline assembly
    
    This makes oprofile dumps and disassembly easier to read.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ca76d209b88c344fc6a8eac17057c0088a3d6940
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun May 13 20:18:14 2007 +0300

    KVM: Remove merge artifact
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 52916bb7c142b5cf8a81da225bf51c2ea60c5b49
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 8 11:34:07 2007 +0300

    KVM: Fix vmx I/O bitmap initialization on highmem systems
    
    kunmap() expects a struct page, not a virtual address.  Fixes an oops loading
    kvm-intel.ko on i386 with CONFIG_HIGHMEM.
    
    Thanks to Michael Ivanov <deruhu@peterstar.ru> for reporting.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit facc2faaf471ca539ddd96fdbdf2e147421468a6
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon May 7 10:55:37 2007 +0300

    KVM: Avoid corrupting tr in real mode
    
    The real mode tr needs to be set to a specific tss so that I/O
    instructions can function.  Divert the new tr values to the real
    mode save area from where they will be restored on transition to
    protected mode.
    
    This fixes some crashes on reboot when the bios accesses an I/O
    instruction.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 05eb943c9b547ecc4de850f04ed4c09356440528
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun May 6 16:10:01 2007 +0300

    KVM: VMX: Only reload guest msrs if they are already loaded
    
    If we set an msr via an ioctl() instead of by handling a guest exit, we
    have the host state loaded, so reloading the msrs would clobber host
    state instead of guest state.
    
    This fixes a host oops (and loss of a cpu) on a guest reboot.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 242b0f9ae76651226fb42d9ec3ecb1a9d8d7b263
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun May 6 15:50:58 2007 +0300

    KVM: MMU: Store shadow page tables as kernel virtual addresses, not physical
    
    Simplifies things a bit.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 03aeb06a4440265777ae4ed62e8431955cbea865
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun May 6 15:36:30 2007 +0300

    KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f66b4a983d460d68ef5cc392285190065b0617e5
Author: Matthew Gregan <kinetik@flim.org>
Date:   Sun May 6 10:59:46 2007 +0300

    KVM: Implement IA32_EBL_CR_POWERON msr
    
    Attempting to boot the default 'bsd' kernel of OpenBSD 4.1 i386 in a guest
    fails early in the kernel init inside p3_get_bus_clock while trying to read
    the IA32_EBL_CR_POWERON MSR.  KVM logs an 'unhandled MSR' message and the
    guest kernel faults.
    
    This patch is sufficient to allow OpenBSD to boot, after which it seems to
    run fine.  I'm not sure if this is the correct solution for dealing with
    this particular MSR, but it works for me.
    
    Signed-off-by: Matthew Gregan <kinetik@flim.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7a57011a5e7c4082fdfd204115a8212298ef723f
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 23:06:22 2007 +0300

    KVM: Set cr0.mp for guests
    
    This allows fwait instructions to be trapped when the guest fpu is not
    loaded.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 90fb720a59dafb11d591a8e53a4a65bfa6fcfea9
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 22:57:13 2007 +0300

    KVM: Ensure host cr0.ts is saved
    
    Otherwise, host fpu state may be corrupted after an exit.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7616f59b208b088afd85d40aa06ca6d4d4a6ca1a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 20:40:00 2007 +0300

    KVM: Consolidate guest fpu activation and deactivation
    
    Easier to keep track of where the fpu is this way.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7ca14868fd7f3c0dc21450e61cca5b77a47daf0d
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 17:57:40 2007 +0300

    KVM: Rationalize exception bitmap usage
    
    Everyone owns a piece of the exception bitmap, but they happily write to
    the entire thing like there's no tomorrow.  Centralize handling in
    update_exception_bitmap() and have everyone call that.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit de32f820227fbe3e159ec42ce8fd55057155edca
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 17:33:43 2007 +0300

    KVM: Move some more msr mangling into vmx_save_host_state()
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fa580ecc53536620546659740ae2dfcea763d17c
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 17:30:48 2007 +0300

    KVM: Prevent guest fpu state from leaking into the host
    
    The lazy fpu changes did not take into account that some vmexit handlers
    can sleep.  Move loading the guest state into the inner loop so that it
    can be reloaded if necessary, and move loading the host state into
    vmx_vcpu_put() so it can be performed whenever we relinquish the vcpu.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit bc8dcc2107de0ba8f25fc910c4559ebe3df33045
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed May 2 16:54:03 2007 +0300

    KVM: Fix potential guest state leak into host
    
    The lightweight vmexit path avoids saving and reloading certain host
    state.  However in certain cases lightweight vmexit handling can schedule()
    which requires reloading the host state.
    
    So we store the host state in the vcpu structure, and reload it if we
    relinquish the vcpu.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 11bdaf6e26c0cbabd9b6c8f2e9de60190815d348
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 1 18:24:38 2007 +0300

    KVM: Increase mmu shadow cache to 1024 pages
    
    This improves kbuild times by about 10%, bringing it within a respectable
    25% of native.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d6540cdffea466f1ee17a52ef530d40577b476b2
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 1 16:53:31 2007 +0300

    KVM: Update shadow pte on write to guest pte
    
    A typical demand page/copy on write pattern is:
    
    - page fault on vaddr
    - kvm propagates fault to guest
    - guest handles fault, updates pte
    - kvm traps write, clears shadow pte, resumes guest
    - guest returns to userspace, re-faults on same vaddr
    - kvm installs shadow pte, resumes guest
    - guest continues
    
    So, three vmexits for a single guest page fault.  But if instead of clearing
    the page table entry, we update to correspond to the value that the guest
    has just written, we eliminate the third vmexit.
    
    This patch does exactly that, reducing kbuild time by about 10%.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 807762acc40f7cc16aefcfaef8a596a4af988b20
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 1 16:44:05 2007 +0300

    KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes
    
    When a guest writes to a page that has an mmu shadow, we have to clear
    the shadow pte corresponding to the memory location touched by the guest.
    
    Now, in nonpae mode, a single guest page may have two or four shadow
    pages (because a nonpae page maps 4MB or 4GB, whereas the pae shadow maps
    2MB or 1GB), so when we look up the page we find up to three additional
    aliases for the page.  Since we _clear_ the shadow pte, it doesn't matter
    except for a slight performance penalty, but if we want to _update_ the
    shadow pte instead of clearing it, it is vital that we don't modify the
    aliases.
    
    Fortunately, exactly which page is needed (the "quadrant") is easily
    computed, and is accessible in the shadow page header.  All we need is
    to ignore shadow pages from the wrong quadrants.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4a5c1655c9f6df8c668428d3c5d2ad4f67dce08d
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 1 14:16:52 2007 +0300

    KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write()
    
    Instead of calling two functions and repeating expensive checks, call one
    function and provide it with before/after information.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ff31cf26ff8e17c2f7164c39dc03fe309ed36506
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue May 1 11:32:28 2007 +0300

    KVM: Be more careful restoring fs on lightweight vmexit
    
    i386 wants fs for accessing the pda even on a lightweight exit, so ensure
    we can always restore it.  This fixes a regression on i386 introduced by
    the lightweight vmexit patch.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e6d2f6292194c931b2fa11373a66d640245e1b14
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 30 17:05:38 2007 +0300

    KVM: Reduce misfirings of the fork detector
    
    The kvm mmu tries to detect forks by looking for repeated writes to a
    page table.  If it sees a fork, it unshadows the page table so the page
    table copying can proceed at native speed instead of being emulated.
    
    However, the detector also triggered on simple demand paging access patterns:
    a linear walk of memory would of course cause repeated writes to the same
    pagetable page, causing it to unshadow prematurely.
    
    Fix by resetting the fork detector if we detect a demand fault.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f908e27039ab637013ad17c64e4ef77c4c0a24b8
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 30 16:15:58 2007 +0300

    KVM: Unindent some code
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5cf48c367dec74ba8553c53ed332cd075fa38b88
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 30 16:07:54 2007 +0300

    KVM: Avoid saving and restoring some host CPU state on lightweight vmexit
    
    Many msrs and the like will only be used by the host if we schedule() or
    return to userspace.  Therefore, we avoid saving them if we handle the
    exit within the kernel, and if a reschedule is not requested.
    
    Based on a patch from Eddie Dong <eddie.dong@intel.com> with a couple of
    fixes by me.
    
    Signed-off-by: Yaozu(Eddie) Dong <eddie.dong@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2d8d6944a2249f642420bbc70b199182c70ebc9a
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 30 14:47:02 2007 +0300

    KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages
    
    This allows us to remove write protection earlier than otherwise.  Should
    some mad OS choose to use byte writes to update pagetables, it will suffer
    a performance hit, but still work correctly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7d0e7eed6200c54462e884abc8dd6681df2f5e7d
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 30 12:42:43 2007 +0300

    KVM: Fix RMW mmio handling
    
    Commit 9bf671a47ed6af3164524a31dbef9360f1b66fb5 optimized the mmio
    read path by returning to the emulator directly after an mmio read request.
    But we may also need to return back to userspace in case the instruction
    was a read-modify-write instruction, which means we need to issue a write
    after completion of the read instead of returning to the guest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f05f41f9bb1cf72a13caf61c2931dbbf4bff51eb
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Apr 30 09:48:11 2007 +0300

    KVM: SVM: Allow direct guest access to PC debug port
    
    The PC debug port is used for IO delay and does not require emulation.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 99c7b51d71c0b0062b752c5f0a4b3498d3d165db
Author: He, Qing <qing.he@intel.com>
Date:   Mon Apr 30 09:45:24 2007 +0300

    KVM: VMX: Enable io bitmaps to avoid IO port 0x80 VMEXITs
    
    This patch enables IO bitmaps control on vmx and unmasks the 0x80 port to
    avoid VMEXITs caused by accessing port 0x80. 0x80 is used for delays (see
    include/asm/io.h), and handling VMEXITs on its access is unnecessary and
    slows things down. This patch improves the kernel build test by around
    3%~5%.
    	Because every VM uses the same io bitmap, it is shared between
    all VMs rather than a per-VM data structure.
    
    Signed-off-by: Qing He <qing.he@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c06d7c14c006c5e2dcd2a7d84603b51e9e60d7a7
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 29 16:25:49 2007 +0300

    KVM: Remove unused 'instruction_length'
    
    As we no longer emulate in userspace, this is meaningless.  We don't
    compute it on SVM anyway.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 20426d1309353b3e2771f9c7f534e01ce7a019f2
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 29 15:02:17 2007 +0300

    KVM: Don't require explicit indication of completion of mmio or pio
    
    It is illegal to return from a pio or mmio request without completing
    it, as mmio or pio is an atomic operation.  Therefore, we can simplify
    the userspace interface by avoiding the completion indication.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9bf671a47ed6af3164524a31dbef9360f1b66fb5
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 14 15:54:54 2007 +0200

    KVM: Remove extraneous guest entry on mmio read
    
    When emulating an mmio read, we actually emulate twice: once to determine
    the physical address of the mmio, and, after we've exited to userspace to
    get the mmio value, we emulate again to place the value in the result
    register and update any flags.
    
    But we don't really need to enter the guest again for that, only to take
    an immediate vmexit.  So, if we detect that we're doing an mmio read,
    emulate a single instruction before entering the guest again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8dfdb0d81fb9e858c14e03fd5e007b20167cd065
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 29 13:01:34 2007 +0300

    KVM: Remove trailing whitespace
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 1628bcc25417eae4c83ca87e0899c7e02961d975
Author: Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Date:   Sun Apr 29 11:56:06 2007 +0300

    KVM: SVM: Only save/restore MSRs when needed
    
    We only have to save/restore MSR_GS_BASE on every VMEXIT.  The rest can be
    saved/restored when we leave the VCPU.  Since we don't emulate the DEBUGCTL
    MSRs and the guest cannot write to them, we don't have to worry about
    saving/restoring them at all.
    
    This shaves a whopping 40% off raw vmexit costs on AMD.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 68ba823bbe6d546e3ceb63d006c62a84e92837db
Author: Adrian Bunk <bunk@stusta.de>
Date:   Sat Apr 28 21:20:48 2007 +0200

    KVM: fix an if() condition
    
    It might have worked in this case since PT_PRESENT_MASK is 1, but let's
    express this correctly.
    
    Signed-off-by: Adrian Bunk <bunk@stusta.de>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fe7dc1f2c0c3d0c21abf9dfa4387f0b748080688
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Fri Apr 27 09:29:49 2007 +0300

    KVM: VMX: Add lazy FPU support for VT
    
    Only save/restore the FPU host state when the guest is actually using the
    FPU.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4a579478e5259df8828a8b9e5b3ddac2a946ce88
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Fri Apr 27 09:29:21 2007 +0300

    KVM: VMX: Properly shadow the CR0 register in the vcpu struct
    
    Set all of the host mask bits for CR0 so that we can maintain a proper
    shadow of CR0.  This exposes CR0.TS, paving the way for lazy fpu handling.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aad1187a6c0201701026cdb2f7f6eeb49b2af4a2
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 25 16:57:46 2007 +0300

    KVM: Move need_resched() check to common code
    
    Pointed out by Anthony Liguori.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b08487bd204708241c9b71ebfc555e334a4e4711
Author: Eddie Dong <eddie.dong@intel.com>
Date:   Wed Apr 25 16:49:19 2007 +0300

    KVM: VMX: Avoid unnecessary vcpu_load()/vcpu_put() cycles
    
    By checking if a reschedule is needed, we avoid dropping the vcpu.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 25900fd20d141145348178ffe91948e47c83e2ab
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 25 11:51:06 2007 +0300

    KVM: Avoid unused function warning due to assertion removal
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2bd9b992631841b1be5883a5c27b9c58ae9bb96a
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 25 11:48:45 2007 +0300

    KVM: We want asserts on debug builds, not release
    
    Noticed by Michael Riepe.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c3efc3ab86aa651106f6302592e25c7ab8285c35
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 12 13:03:01 2007 +0300

    KVM: Initialize cr0 to indicate an fpu is present
    
    Solaris panics if it sees a cpu with no fpu, and it seems to rely on this
    bit.  Closes sourceforge bug 1698920.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 28b183145d34a8ad1bc462df565165a88bcb5220
Author: Yaozu Dong <eddie.dong@intel.com>
Date:   Wed Apr 25 14:17:25 2007 +0800

    KVM: MMU: Avoid heavy ASSERT at non debug mode.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 418987aef13b475140b76f9f780046d63eb16f86
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 25 11:01:28 2007 +0300

    KVM: Document MSR_K6_STAR's special place in the msr index array
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 90ca9e3d54c8b0ac2023c624d1c7260bb8926beb
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 25 10:59:52 2007 +0300

    KVM: Don't complain about cpu erratum AA15
    
    It slows down Windows x64 horribly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6f19cb49965e1316b285a443c9392031b1634f2e
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Apr 24 14:13:01 2007 +0300

    KVM: Fix msr-avoidance regression on Core processors
    
    Core processors don't have the STAR msr, so the attempt not to save
    it caused an underflow in the number of msrs.
    
    Fix by only avoiding the STAR msr if it is actually present.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ccf9e2f22e5caf6274b5e9aafd9814a32ef049d5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Mon Apr 23 09:17:21 2007 -0500

    KVM: Lazy FPU support for SVM
    
    Avoid saving and restoring the guest fpu state on every exit.  This
    shaves ~100 cycles off the guest/host switch.
    
    Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d558e0b49319cfc9aa92e9b7215580f265a2ead7
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 22 15:28:19 2007 +0300

    KVM: Allow passing 64-bit values to the emulated read/write API
    
    This simplifies the API somewhat (by eliminating the special-case
    cmpxchg8b on i386).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 551284356a39f20de70cd5556e85ae92080aec8c
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Apr 20 13:41:09 2007 +0300

    KVM: Silence compile warning on i386
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 459377fe9ba4a307144ead3ad86993cdee9f8fe8
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 19 17:27:43 2007 +0300

    KVM: Per-vcpu statistics
    
    Make the exit statistics per-vcpu instead of global.  This gives a 3.5%
    boost when running one virtual machine per core on my two socket dual core
    (4 cores total) machine.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 5c828f83928f186320d74627089122ebc9ea98ce
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 19 14:28:44 2007 +0300

    KVM: VMX: Only save/restore MSR_K6_STAR if necessary
    
    Intel hosts only support syscall/sysret in long mode (and only if efer.sce
    is enabled), so only reload the related MSR_K6_STAR if the guest will
    actually be able to use it.
    
    This reduces vmexit cost by about 500 cycles (6400 -> 5870) on my setup.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 37d6247b3636cbf47014694483d2d25c3806e8f2
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 19 13:26:39 2007 +0300

    KVM: Fold drivers/kvm/kvm_vmx.h into drivers/kvm/vmx.c
    
    No meat in that file.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ba9c2fc1015a2b2f1f930274d465662ed8b860e6
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 19 13:22:48 2007 +0300

    KVM: VMX: Don't switch 64-bit msrs for 32-bit guests
    
    Some msrs are only used by x86_64 instructions, and are therefore
    not needed when the guest is in legacy mode.  By not bothering to switch
    them, we reduce vmexit latency by 2400 cycles (from about 8800) when
    running a 32-bit guest on a 64-bit host.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8d6c8a0d891f8c37889f28f368c2621f85e50035
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 18 11:18:18 2007 +0300

    KVM: Fix off-by-one when writing to a nonpae guest pde
    
    Nonpae guest pdes are shadowed by two pae ptes, so we double the offset
    twice: once to account for the pte size difference, and once because we
    need two shadow pdes for a single guest pde.
    
    But when writing to the upper guest pde we also need to truncate the
    lower bits, otherwise the multiply shifts these bits into the pde index
    and causes an access to the wrong shadow pde.  If we're at the end of the
    page (accessing the very last guest pde) we can even overflow into the
    next host page and oops.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f0b9c908fa1451147a07f2f4e4a9409fb7b14160
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Apr 17 15:30:24 2007 +0300

    KVM: VMX: Reduce unnecessary saving of host msrs
    
    The automatically switched msrs are never changed on the host (with
    the exception of MSR_KERNEL_GS_BASE) and thus there is no need to save
    them on every vm entry.
    
    This reduces vmexit latency by ~400 cycles on i386 and by ~900 cycles (10%)
    on x86_64.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7368e6550cdf72b0ad1b68dbe923f85e37ef4d08
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Apr 17 10:53:22 2007 +0300

    KVM: Handle guest page faults when emulating mmio
    
    Usually, guest page faults are detected by the kvm page fault handler,
    which detects if they are shadow faults, mmio faults, pagetable faults,
    or normal guest page faults.
    
    However, in certain circumstances, we can detect a page fault much later.
    One of these events is the following combination:
    
    - A two memory operand instruction (e.g. movsb) is executed.
    - The first operand is in mmio space (which is the fault reported to kvm)
    - The second operand is in an unmapped address (e.g. a guest page fault)
    
    The Windows 2000 installer does such an access, and promptly hangs.  Fix
    by adding the missing page fault injection on that path.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 894f5a5efc0c48482eb10ad48891054a659e5941
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 16 14:28:40 2007 +0300

    KVM: SVM: Report hardware exit reason to userspace instead of dmesg
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 94d806a6efd4401ce43358af6a9e8df5a63151ae
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 16 13:36:10 2007 +0300

    KVM: Fix pio completion
    
    Check cur_count instead of count to avoid false completions.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit d3344ae6f6293913d6e4f230ebee0b370f2e3f98
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 16 11:53:17 2007 +0300

    KVM: Retry sleeping allocation if atomic allocation fails
    
    This avoids -ENOMEM under memory pressure.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 327585c3b4c1d6b04bb752f70f350d98ca855080
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 15 16:31:09 2007 +0300

    KVM: Use slab caches to allocate mmu data structures
    
    Better leak detection, statistics, memory use, speed -- goodness all
    around.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3079541923d2cdf702490eff7081610b7320e37f
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 15 15:48:11 2007 +0300

    KVM: Fix string pio when count == 0
    
    Surprisingly, VT traps when executing a string pio instruction with zero
    count.  Perhaps more surprisingly, the Windows ne2000 driver issues such
    instructions.
    
    Since we aren't prepared to handle completions of these instructions,
    avoid the entire mess by continuing execution without escaping to userspace.
    
    This fixes the networking problems reported by Leslie Mann <lmann@nt.net>
    with recent versions of kvm.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3ef1110c81993e01343e1b473f5d7d1a23e6a8a3
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Apr 12 17:35:58 2007 +0300

    KVM: Handle partial pae pdptr
    
    Some guests (Solaris) do not set up all four pdptrs, but leave some invalid.
    kvm incorrectly treated these as valid page directories, pinning the
    wrong pages and causing general confusion.
    
    Fix by checking the valid bit of a pae pdpte.  This closes sourceforge bug
    1698922.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4e9d9d330d9c9e66c449be10950562e407366a73
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Apr 11 19:04:39 2007 +0300

    KVM: Fix memory leak on pio completion
    
    We get_page() the pages participating in pio before we return to userspace,
    yet we neglect to free them.  This can leak all guest memory in a few seconds
    by doing a
    
        hdparm -d 0 /dev/hda; dd < /dev/hda > /dev/null
    
    on the guest.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b630b9c6819844e29cddcfeaee901f6ada5d571b
Author: Eric Sesterhenn / Snakebyte <snakebyte@gmx.de>
Date:   Mon Apr 9 16:15:05 2007 +0200

    KVM: Fix overflow bug in overflow detection code
    
    The expression
    
       sp - 6 < sp
    
    where sp is a u16 is undefined in C since 'sp - 6' is promoted to int,
    and signed overflow is undefined in C.  gcc 4.2 actually warns about it.
    Replace with a simpler test.
    
    Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c338c271f150ab2ded369ef4c1882f85b28af709
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Apr 2 13:05:50 2007 +0300

    KVM: Use kernel-standard types
    
    Noted by Joerg Roedel.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0ea6eecef44923d66409a49d71e4fa87fa0f5bed
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Apr 1 16:34:31 2007 +0300

    KVM: Add fpu get/set operations
    
    These are really helpful when migrating a floating point app to another
    machine.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 05671a064c73b8cb8966ddd037ece2d6ae2cb75b
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Mar 30 16:54:30 2007 +0300

    KVM: Add physical memory aliasing feature
    
    With this, we can specify that accesses to one physical memory range will
    be remapped to another.  This is useful for the vga window at 0xa0000 which
    is used as a movable window into the (much larger) framebuffer.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8e08039818b6a5b8c81b905f863adaa18d774171
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Mar 30 14:02:32 2007 +0300

    KVM: Simplify gfn_to_page()
    
    Mapping a guest page to a host page is a common operation.  Currently,
    one has first to find the memory slot where the page belongs (gfn_to_memslot),
    then locate the page itself (gfn_to_page()).
    
    This is clumsy, and also won't work well with memory aliases.  So simplify
    gfn_to_page() not to require memory slot translation first, and instead do it
    internally.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 66a9932c55ff7240955d57b7d1e62178a9e80868
Author: Dor Laor <dor.laor@qumranet.com>
Date:   Fri Mar 30 13:06:33 2007 +0300

    Add mmu cache clear function
    
    Functions that play around with the physical memory map
    need a way to clear mappings to possibly nonexistent or
    invalid memory.  Both the mmu cache and the processor tlb
    are cleared.
    
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6095d7b8291fc3e05f3b8790a9bc86b54af281a2
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Fri Mar 30 17:02:14 2007 +0300

    KVM: SVM: enable LBRV virtualization if available
    
    This patch enables the virtualization of the last branch record MSRs on
    SVM if this feature is available in hardware. It also introduces a small
    and simple check feature for specific SVM extensions.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8f1469e8477bea483d5a6348a30a534449048c8d
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 28 20:04:16 2007 +0200

    KVM: x86 emulator: fix bit string operations operand size
    
    On x86, bit operations operate on a string of bits that can reside in
    multiple words.  For example, 'btsl %eax, (blah)' will touch the word
    at blah+4 if %eax is between 32 and 63.
    
    The x86 emulator compensates for that by advancing the operand address
    by (bit offset / BITS_PER_LONG) and truncating the bit offset to the
    range (0..BITS_PER_LONG-1).  This has a side effect of forcing the operand
    size to 8 bytes on 64-bit hosts.
    
    Now, a 32-bit guest goes and fork()s a process.  It write protects a stack
    page at 0xbffff000 using the 'btr' instruction, at offset 0xffc in the page
    table, with bit offset 1 (for the write permission bit).
    
    The emulator now forces the operand size to 8 bytes as previously described,
    and an innocent page table update turns into a cross-page-boundary write,
    which is assumed by the mmu code not to be a page table, so it doesn't
    actually clear the corresponding shadow page table entry.  The guest and
    host permissions are out of sync and guest memory is corrupted soon
    afterwards, leading to guest failure.
    
    Fix by not using BITS_PER_LONG as the word size; instead use the actual
    operand size, so we get a 32-bit write in that case.
    
    Note we still have to teach the mmu to handle cross-page-boundary writes
    to guest page table; but for now this allows Damn Small Linux 0.4 (2.4.20)
    to boot.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e3a065c4e99bb8282d72a2c3c75234d7d7408be6
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 27 17:50:20 2007 +0200

    KVM: Remove debug message
    
    No longer interesting.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 19cd40d605bb99fc9058973a69ef208c8b5b1e42
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 27 16:12:41 2007 +0200

    Revert "added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot"
    
    This reverts commit ade11a015f83d270d1201c440199146f852fe5e4.
    
    As the balloon path will be through qemu, it will have direct knowledge of
    released gfns, so this API is not directly needed.  If it becomes useful in
    the future, it will be un-reverted.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 932bf20c0c2075f958bb86b481d8f359197b4d6a
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Mar 26 19:31:52 2007 +0200

    KVM: Use list_move()
    
    Use list_move() where possible.  Noticed by Dor Laor.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 31e82571e8a77d5feb1093627ef0b31f28649590
Author: Michal Piotrowski <michal.k.k.piotrowski@gmail.com>
Date:   Sun Mar 25 17:59:32 2007 +0200

    KVM: Remove unused function
    
    Remove unused function
    
    CC      drivers/kvm/svm.o
    drivers/kvm/svm.c:207: warning: ‘inject_db’ defined but not used
    
    Signed-off-by: Michal Piotrowski <michal.k.k.piotrowski@gmail.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9207113c121519986a114ee5c498184e618ffd68
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Mar 25 12:07:27 2007 +0200

    KVM: SVM: Ensure timestamp counter monotonicity
    
    When a vcpu is migrated from one cpu to another, its timestamp counter
    may lose its monotonic property if the host has unsynced timestamp counters.
    This can confuse the guest, sometimes to the point of refusing to boot.
    
    As the rdtsc instruction is rather fast on AMD processors (7-10 cycles),
    we can simply record the last host tsc when we drop the cpu, and adjust
    the vcpu tsc offset when we detect that we've migrated to a different cpu.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b40faf227eb371a52aa21d08f8e9c33fc06602b4
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Mar 23 09:55:25 2007 +0200

    KVM: MMU: Fix hugepage pdes mapping same physical address with different access
    
    The kvm mmu keeps a shadow page for hugepage pdes; if several such pdes map
    the same physical address, they share the same shadow page.  This is a fairly
    common case (kernel mappings on i386 nonpae Linux, for example).
    
    However, if the two pdes map the same memory but with different permissions, kvm
    will happily use the cached shadow page.  If the access through the more
    permissive pde will occur after the access to the strict pde, an endless pagefault
    loop will be generated and the guest will make no progress.
    
    Fix by making the access permissions part of the cache lookup key.
    
    The fix allows Xen pae to boot on kvm and run guest domains.
    
    Thanks to Jeremy Fitzhardinge for reporting the bug and testing the fix.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 061bba1190514205594d2046f5dc31a01a135163
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 22 15:10:32 2007 +0200

    Revert "KVM: Remove extraneous guest entry on mmio read"
    
    This reverts commit b0092d187cfa19dfcada3b85d728af5ae27989dc.
    
    While the optimization is sound, it regresses booting the Fedora Core 6
    32 bit kernel.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4cec1674d1436157c7dcc2b5b6f625b08b2b96e8
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Wed Mar 21 19:47:00 2007 +0100

    KVM: SVM: forbid guest to execute monitor/mwait
    
    This patch forbids the guest to execute monitor/mwait instructions on
    SVM. This is necessary because the guest can execute these instructions
    if they are available even if the kvm cpuid doesn't report their
    existence.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7921ad9e303f3f03dd81b552e3b0cd87ef355219
Author: Sergey Kiselev <sergey.kiselev@intel.com>
Date:   Thu Mar 22 14:06:18 2007 +0200

    KVM: Handle writes to MCG_STATUS msr
    
    Some older (~2.6.7) kernels write MCG_STATUS register during kernel
    boot (mce_clear_all() function, called from mce_init()). It's not
    currently handled by kvm and will cause it to inject a GPF.
    Following patch adds a "nop" handler for this.
    
    Signed-off-by: Sergey Kiselev <sergey.kiselev@intel.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 36809e1326c13887d324025d4592958ead8758d5
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 21 18:14:42 2007 +0200

    KVM: Remove unused and write-only variables
    
    Trivial cleanup.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 262e17b818054dad314a062a439681d79a336d48
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 21 18:11:36 2007 +0200

    KVM: Don't allow the guest to turn off the cpu cache
    
    The cpu cache is a host resource; the guest should not be able to turn
    it off (even for itself).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8c37a70d93ba3e4286ad7524f7915a32ed39cac9
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 21 17:58:32 2007 +0200

    KVM: Hack real-mode segments on vmx from KVM_SET_SREGS
    
    As usual, we need to mangle segment registers when emulating real mode
    as vm86 has specific constraints.  We special case the reset segment base,
    and set the "access rights" (or descriptor flags) to vm86 compatible values.
    
    This fixes reboot on vmx.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0bf8d346418255335dc9062d96b9f8814b471690
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 21 13:44:58 2007 +0200

    KVM: Modify guest segments after potentially switching modes
    
    The SET_SREGS ioctl modifies both cr0.pe (real mode/protected mode) and
    guest segment registers.  Since segment handling is modified by the mode on
    Intel processors, update the segment registers after the mode switch has taken
    place.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f97af70b3aa8a92ddeabb7d42477e7d13dd0a192
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 20 18:44:51 2007 +0200

    KVM: Remove set_cr0_no_modeswitch() arch op
    
    set_cr0_no_modeswitch() was a hack to avoid corrupting segment registers.
    As we now cache the protected mode values on entry to real mode, this
    isn't an issue anymore, and it interferes with reboot (which usually _is_
    a modeswitch).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e314dde30e3851e8effc017c6fffced11d90183a
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 20 18:40:40 2007 +0200

    KVM: Workaround vmx inability to virtualize the reset state
    
    The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000,
    which aren't compatible with vm86 mode, which is used for real mode
    virtualization.
    
    When we create a vcpu, we set cs.base to 0xf0000, but if we get there by
    way of a reset, the values are inconsistent and vmx refuses to enter
    guest mode.
    
    Workaround by detecting the state and munging it appropriately.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 88aea7ddfae755633b0a80ccfa56244b3c79c7b0
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 20 14:34:28 2007 +0200

    KVM: MMU: Remove global pte tracking
    
    The initial, noncaching, version of the kvm mmu flushed all the nonglobal
    shadow page table translations (much like a native tlb flush).  The new
    implementation flushes translations only when they change, rendering global
    pte tracking superfluous.
    
    This removes the unused tracking mechanism and storage space.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 66e5d5c81b5b89e39aa86e3bf9864d228f468b0d
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 20 14:29:06 2007 +0200

    KVM: MMU: Remove unnecessary check for pdptr access
    
    We already special case the pdptr access, so no need to check it again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c01571ed56754dfea458cc37d553c360082411a1
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 20 12:46:50 2007 +0200

    KVM: Avoid guest virtual addresses in string pio userspace interface
    
    The current string pio interface communicates using guest virtual addresses,
    relying on userspace to translate addresses and to check permissions.  This
    interface cannot fully support guest smp, as the check needs to take into
    account two pages at once in case an unaligned string transfer straddles a
    page boundary.
    
    Change the interface not to communicate guest addresses at all; instead use
    a buffer page (mmaped by userspace) and do transfers there.  The kernel
    manages the virtual to physical translation and can perform the checks
    atomically by taking the appropriate locks.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 74c24de6e7848a45d6109d987d4fd2ccd83e432e
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 7 13:11:17 2007 +0200

    KVM: Future-proof argument-less ioctls
    
    Some ioctls ignore their arguments.  By requiring them to be zero now,
    we allow a nonzero value to have some special meaning in the future.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 29e686a1dc9631b7898d087a0ab1c4716672e209
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 7 13:05:38 2007 +0200

    KVM: Allow kernel to select size of mmap() buffer
    
    This allows us to store offsets in the kernel/user kvm_run area, and be
    sure that userspace has them mapped.  As offsets can be outside the
    kvm_run struct, userspace has no way of knowing how much to mmap.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cce3a1062817218c67163732339e2ea25e9f023b
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Mar 5 19:46:05 2007 +0200

    KVM: Add guest mode signal mask
    
    Allow a special signal mask to be used while executing in guest mode.  This
    allows signals to be used to interrupt a vcpu without requiring signal
    delivery to a userspace handler, which is quite expensive.  Userspace still
    receives -EINTR and can get the signal via sigwait().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cd3aaa2392baec9674792d71d304ec41e540b517
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Mar 5 17:45:40 2007 +0200

    KVM: Initialize the apic_base msr on svm too
    
    Older userspace didn't care, but newer userspace (with the cpuid changes)
    does.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c303c0efc5b2ff8c0f77c9079fa66f62801da93d
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Mar 4 14:24:03 2007 +0200

    KVM: Add a special exit reason when exiting due to an interrupt
    
    This is redundant, as we also return -EINTR from the ioctl, but it
    allows us to examine the exit_reason field on resume without seeing
    old data.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 62919332e00e3226dd1f728ff83107d06a6d9a81
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Mar 4 14:17:08 2007 +0200

    KVM: Fold kvm_run::exit_type into kvm_run::exit_reason
    
    Currently, userspace is told about the nature of the last exit from the
    guest using two fields, exit_type and exit_reason, where exit_type has
    just two enumerations (and no need for more).  So fold exit_type into
    exit_reason, reducing the complexity of determining what really happened.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 9e16898f4f5d6cdc35030bb272631611b71548fe
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Mar 4 13:59:30 2007 +0200

    KVM: Allow userspace to process hypercalls which have no kernel handler
    
    This is useful for paravirtualized graphics devices, for example.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 440fd9098bceb2ca0856d962ff62db9af4d1094a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 17:56:20 2007 +0200

    KVM: Add method to check for backwards-compatible API extensions
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0b37dedb178bcb3b0a28f65e6ae835bf58184301
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 17:20:13 2007 +0200

    KVM: Renumber ioctls
    
    The recent changes have left the ioctl numbers in complete disarray.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 95cab16b18e1c1a786a9fc5ea6fcd68b29ae3481
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 16:47:06 2007 +0200

    KVM: Remove minor wart from KVM_CREATE_VCPU ioctl
    
    That ioctl does not transfer any data, so it should be an _IO rather than an
    _IOW.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ba5cb15b027b76ba7b4d247914eb6d20065c0767
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 16:20:40 2007 +0200

    KVM: Remove the 'emulated' field from the userspace interface
    
    We no longer emulate single instructions in userspace.  Instead, we service
    mmio or pio requests.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 706e8fe655be36aa686f1fbb398d3a4470d4939b
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Feb 28 20:46:53 2007 +0200

    KVM: Handle cpuid in the kernel instead of punting to userspace
    
    KVM used to handle cpuid by letting userspace decide what values to
    return to the guest.  We now handle cpuid completely in the kernel.  We
    still let userspace decide which values the guest will see by having
    userspace set up the value table beforehand (this is necessary to allow
    management software to set the cpu features to the least common denominator,
    so that live migration can work).
    
    The motivation for the change is that kvm kernel code can be impacted by
    cpuid features, for example the x86 emulator.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit aad2f6e0faf4b03e087bbe6751acdacd72e911b6
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Feb 22 19:48:43 2007 +0200

    KVM: Initialize PIO I/O count
    
    This allows userspace to ignore the io.rep field.  Not a big deal, but
    friendly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit e668cf946ee8654c7f5afe3feeed686a3566c22a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Feb 22 19:39:30 2007 +0200

    KVM: Do not communicate to userspace through cpu registers during PIO
    
    Currently when passing a PIO emulation request to userspace, we
    rely on userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx
    (on string instructions).  This (a) requires two extra ioctls for getting
    and setting the registers and (b) is unfriendly to non-x86 archs, when
    they get kvm ports.
    
    So fix by doing the register fixups in the kernel and passing to userspace
    only an abstract description of the PIO to be done.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 3de857cd1335bd2e02b60d3a50b7da93ccbabf1d
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Feb 22 12:58:31 2007 +0200

    KVM: Use a shared page for kernel/user communication when running a vcpu
    
    Instead of passing a 'struct kvm_run' back and forth between the kernel and
    userspace, allocate a page and allow the user to mmap() it.  This reduces
    needless copying and makes the interface expandable by providing lots of
    free space.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 128e159e11e999496ec44a549fcac91de3802389
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Mar 19 13:18:10 2007 +0200

    KVM: Prevent system selectors leaking into guest on real->protected mode transition on vmx
    
    Intel virtualization extensions do not support virtualizing real mode.  So
    kvm uses virtualized vm86 mode to run real mode code.  Unfortunately, this
    virtualized vm86 mode does not support the so called "big real" mode, where
    the segment selector and base do not agree with each other according to the
    real mode rules (base == selector << 4).
    
    To work around this, kvm checks whether a selector/base pair violates the
    virtualized vm86 rules, and if so, forces it into conformance.  On a
    transition back to protected mode, if we see that the guest did not touch
    a forced segment, we restore it back to the original protected mode value.
    
    This pile of hacks breaks down if the gdt has changed in real mode, as it
    can cause a segment selector to point to a system descriptor instead of a
    normal data segment.  In fact, this happens with the Windows bootloader
    and the qemu acpi bios, where a protected mode memcpy routine issues an
    innocent 'pop %es' and traps on an attempt to load a system descriptor.
    
    "Fix" by checking if the to-be-restored selector points at a system segment,
    and if so, coercing it into a normal data segment.  The long term solution,
    of course, is to abandon vm86 mode and use emulation for big real mode.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ade11a015f83d270d1201c440199146f852fe5e4
Author: Uri Lublin <uril@qumranet.com>
Date:   Wed Mar 14 19:21:06 2007 +0200

    added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot
    
    To be used when there may be "holes" in the memory.
    Specifically to not break VM migration when ballooning mechanism exists
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>

commit b0092d187cfa19dfcada3b85d728af5ae27989dc
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 14 15:54:54 2007 +0200

    KVM: Remove extraneous guest entry on mmio read
    
    When emulating an mmio read, we actually emulate twice: once to determine
    the physical address of the mmio, and, after we've exited to userspace to
    get the mmio value, we emulate again to place the value in the result
    register and update any flags.
    
    But we don't really need to enter the guest again for that, only to take
    an immediate vmexit.  So, if we detect that we're doing an mmio read,
    emulate a single instruction before entering the guest again.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 470db88b8b3491199e8d55b771d66e74b2fd53cd
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sun Mar 11 13:52:33 2007 +0100

    KVM: always reload segment selectors
    
    failed VM entry on VMX might still change %fs or %gs, thus make sure
    that KVM always reloads the segment selectors. This is crucial on both
    x86 and x86_64: x86 has __KERNEL_PDA in %fs on which things like
    'current' depends and x86_64 has 0 there and needs MSR_GS_BASE to work.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit f7edc6a39584a3f95687a5320675fadb23bccbe5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Sat Mar 10 11:22:51 2007 +0100

    KVM: trivial whitespace fixes
    
    trivial whitespace fixes.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit f3a33bfeaa5cade1a9ac1facb5cb904a483b1e5c
Author: Avi Kivity <avi@qumranet.com>
Date:   Fri Mar 9 13:04:31 2007 +0200

    KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram
    
    PAGE_MASK is an unsigned long, so using it to mask physical addresses on
    i386 (which are 64-bit wide) leads to truncation.  This can result in
    page->private of unrelated memory pages being modified, with disastrous
    results.
    
    Fix by not using PAGE_MASK for physical addresses; instead calculate
    the correct value directly from PAGE_SIZE.  Also fix a similar BUG_ON().
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 6ee9853b015f8807f497ffad39b142ddc1403aa9
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 8 17:13:32 2007 +0200

    KVM: MMU: Fix guest writes to nonpae pde
    
    KVM shadow page tables are always in pae mode, regardless of the guest
    setting.  This means that a guest pde (mapping 4MB of memory) is mapped
    to two shadow pdes (mapping 2MB each).
    
    When the guest writes to a pte or pde, we intercept the write and emulate it.
    We also remove any shadowed mappings corresponding to the write.  Since the
    mmu did not account for the doubling in the number of pdes, it removed the
    wrong entry, resulting in a mismatch between shadow page tables and guest
    page tables, followed shortly by guest memory corruption.
    
    This patch fixes the problem by detecting the special case of writing to
    a non-pae pde and adjusting the address and number of shadow pdes zapped
    accordingly.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 374c1509c7d04a4e351b1812c2f0b9dac3ea0c0a
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 8 11:48:09 2007 +0200

    KVM: Fix bogus sign extension in mmu mapping audit
    
    When auditing a 32-bit guest on a 64-bit host, sign extension of the page
    table directory pointer table index caused bogus addresses to be shown on
    audit errors.
    
    Fix by declaring the index unsigned.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fac539542cbf923a39238b10557c88f99fd45b59
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Mar 7 09:29:48 2007 +0200

    KVM: Export <linux/kvm.h>
    
    This allows users to actually build programs that use kvm without
    the entire source tree.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c14a46343cc9f04f15ebc67573031fe8bbe1555a
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Mar 6 12:05:53 2007 +0200

    KVM: Fix guest sysenter on vmx
    
    The vmx code currently treats the guest's sysenter support msrs as 32-bit
    values, which breaks 32-bit compat mode userspace on 64-bit guests.  Fix by
    using the native word width of the machine.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit ea135e7671189ffb7e67843bf98740dac0c6ccfa
Author: Avi Kivity <avi@qumranet.com>
Date:   Sun Mar 4 13:27:36 2007 +0200

    KVM: Use own minor number
    
    Use the minor number (232) allocated to kvm by lanana.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 21af17507f37658414191b1cf1337efbaf7dd530
Author: Dor Laor <dor.laor@qumranet.com>
Date:   Mon Feb 19 18:25:43 2007 +0200

    KVM: Use the generic skip_emulated_instruction() in hypercall code
    
    Instead of twiddling the rip registers directly, use the
    skip_emulated_instruction() function to do that for us.
    
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 57d78025d84fb607aa335d015a79b257517aa209
Author: Dor Laor <dor.laor@qumranet.com>
Date:   Mon Feb 19 16:44:49 2007 +0200

    KVM: Fix guest register corruption on paravirt hypercall
    
    The hypercall code mixes up the ->cache_regs() and ->decache_regs()
    callbacks, resulting in guest register corruption.
    
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 28e9803c9134683a884efe05abdb3f814c1ca7e7
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 19:21:03 2007 +0200

    KVM: Unset kvm_arch_ops if arch module loading failed
    
    Otherwise, the core module thinks the arch module is loaded, and won't
    let you reload it after you've fixed the bug.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 426bc2fd1462706ec92d0e9efdb0cf3643f4eb67
Author: Avi Kivity <avi@qumranet.com>
Date:   Thu Mar 1 11:28:13 2007 +0200

    KVM: Move kvmfs magic number to <linux/magic.h>
    
    From: Andrew Morton <akpm@linux-foundation.org>
    
    Use the standard magic.h for kvmfs.
    
    Cc: Avi Kivity <avi@qumranet.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c1a8557e1da6e7d8bf8f77cb1b47c077f5c2a67d
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 26 16:29:43 2007 +0200

    KVM: Fix bogus failure in kvm.ko module initialization
    
    A bogus 'return r' can cause an otherwise successful module load to fail.
    This both denies users the use of kvm, and it also denies them the use of
    their machine, as it leaves a filesystem registered with its callbacks
    pointing into now-freed module memory.
    
    Fix by returning a zero like a good module.
    
    Thanks to Richard Lucassen <mailinglists@lucassen.org> (?) for reporting
    the problem and for providing access to a machine which exhibited it.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7703ff91ee2ed171f2175d030e7f063c4efab2f5
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 17:37:32 2007 +0200

    KVM: Remove write access permissions when dirty-page-logging is enabled
    
    Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl.
    If the memory region already exists, we need to remove write accesses,
    so writes will be caught, and dirty pages will be logged.
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b77fd1f62576463434fc434cbdcd808847e169a1
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 17:15:33 2007 +0200

    kvm: move do_remove_write_access() up
    
    To be called from kvm_vm_ioctl_set_memory_region()
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 62e287e7210d6ff142b3b05233fa1f5df686b794
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 16:43:09 2007 +0200

    KVM: Fix dirty page log bitmap size/access calculation
    
    Since dirty_bitmap is an unsigned long array, the alignment and size need
    to take that into account.
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 871574eb14e959c19d94fdee7c3e2b88ae06770f
Author: Uri Lublin <uril@qumranet.com>
Date:   Wed Feb 21 18:25:21 2007 +0200

    KVM: Add missing calls to mark_page_dirty()
    
    A few places where we modify guest memory fail to call mark_page_dirty(),
    causing live migration to fail.  This adds the missing calls.
    
    Signed-off-by: Uri Lublin <uril@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 42017e8bf8eb7b6f65b95bca1368ee274fc5ef50
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 17:37:32 2007 +0200

    kvm: dirty page logging: remove write access permissions when dirty-page-logging is enabled
    
    Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl.
    If the memory region already exists, there is a need to remove write accesses,
        so writes will be caught, and dirty pages will be logged.

commit a9fd29cfcb643b97cd76c7d836be4d0ed80f69e0
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 17:15:33 2007 +0200

    kvm: move do_remove_write_access() up
    
    To be called from kvm_vm_ioctl_set_memory_region()

commit fba4ba9c513ad2cd328f5f16980aa7b90d40cec0
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 16:43:09 2007 +0200

    kvm: dirty pages log: fix bitmap size/access calculation
    
    Since dirty_bitmap is an unsigned long array (pointer)

commit ae160d732685ab33d5a3a495663aa2b54c4d4734
Author: Uri Lublin <uril@qumranet.com>
Date:   Thu Feb 22 15:47:42 2007 +0200

    .gitignore: ignore emacs backup files (*~)

commit 8267c1cd9a8a038e91c94e0cabc571a3614dc3e5
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Feb 21 19:47:40 2007 +0200

    KVM: Bump API version
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit c65237e78c19b8173338a49933c611dece13c1c6
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Feb 21 18:04:26 2007 +0200

    KVM: Per-vcpu inodes
    
    Allocate a distinct inode for every vcpu in a VM.  This has the following
    benefits:
    
     - the filp cachelines are no longer bounced when f_count is incremented on
       every ioctl()
     - the API and internal code are distinctly clearer; for example, on the
       KVM_GET_REGS ioctl, there is no need to copy the vcpu number from
       userspace and then copy the registers back; the vcpu identity is derived
       from the fd used to make the call
    
    Right now the performance benefits are completely theoretical since (a) we
    don't support more than one vcpu per VM and (b) virtualization hardware
    inefficiencies completely overwhelm any cacheline bouncing effects.  But
    both of these will change, and we need to prepare the API today.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 11c1297fadc533d1f66252088b4f4775018bafbb
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Feb 20 18:41:05 2007 +0200

    KVM: Move kvm_vm_ioctl_create_vcpu() around
    
    In preparation of some hacking.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f3ad84386727171d8308338a2c5dee1deac2e50d
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Feb 20 18:27:58 2007 +0200

    KVM: Rename some kvm_dev_ioctl_*() functions to kvm_vm_ioctl_*()
    
    This reflects the changed scope, from device-wide to single vm (previously
    every device open created a virtual machine).
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 733e3f74f1c51bbc2e7a99df8b51767504b58de2
Author: Avi Kivity <avi@qumranet.com>
Date:   Wed Feb 21 19:28:04 2007 +0200

    KVM: Create an inode per virtual machine
    
    This avoids having filp->f_op and the corresponding inode->i_fop different,
    which is a little unorthodox.
    
    The ioctl list is split into two: global kvm ioctls and per-vm ioctls.  A new
    ioctl, KVM_CREATE_VM, is used to create VMs and return the VM fd.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 52a96114380f8ab615626e4cec57b7015895bd0f
Author: Avi Kivity <avi@qumranet.com>
Date:   Tue Feb 20 14:07:37 2007 +0200

    KVM: Add internal filesystem for generating inodes
    
    The kvmfs inodes will represent virtual machines and vcpus, as necessary,
    reducing cacheline bouncing due to inodes and filps being shared.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit b00bc8b10197715f5b842f1f9a60e67a3484b10f
Author: Uri Lublin <uril@qumranet.com>
Date:   Wed Feb 21 18:25:21 2007 +0200

    kvm, dirty pages log: adding some calls to mark_page_dirty()

commit 58a214eba321d92f833221c26777e2119e34a19d
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:48 2007 +0200

    KVM: More 0 -> NULL conversions
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f73199bb57b4c8feb7d8f60c6f1a25107de18dab
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: SVM: intercept SMI to handle it at host level
    
    This patch changes the SVM code to intercept SMIs and handle it
    outside the guest.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit fa2742c78f10fad8682e3af17df3e9fc2eece9e4
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: svm: init cr0 with the wp bit set
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8da588a919dc0bef76e384d16fd13ea2189aa82d
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Wire up hypercall handlers to a central arch-independent location
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 68f16784f188d280c75b39e2367ebc1adbc66d9d
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Add hypercall host support for svm
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7c8bd4d6fc0e2bfb35cd4c0e8ff39c4f8972d951
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Add host hypercall support for vmx
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit f846fa34a14ec37dc0194c6f47ea4374c140e6f1
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: add MSR based hypercall API
    
    This adds a special MSR based hypercall API to KVM. This is to be
    used by paravirtual kernels and virtual drivers.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 8aa04bb13cf90d68c26d6bea1e4c720f1f027be0
Author: Markus Rechberger <markus.rechberger@amd.com>
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Use page_private()/set_page_private() apis
    
    Besides using an established api, this allows using kvm in older kernels.
    
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 4d5a7e81cc63d28e94373cdeb74dc44045edaa10
Author: Ahmed S. Darwish <darwish.07@gmail.com>
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Use ARRAY_SIZE macro instead of manual calculation.
    
    Signed-off-by: Ahmed S. Darwish <darwish.07@gmail.com>
    Signed-off-by: Dor Laor <dor.laor@qumranet.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 0fe9875fb3f9946a6c1cef6f1b9a286edc8ee2b9
Author: Markus Rechberger <markus.rechberger@amd.com>
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch
    
    From: Joerg Roedel <joerg.roedel@amd.com>
    
    The whole thing is rotten, but this allows vmx to boot with the guest reboot
    fix.
    
    Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 7e6e2bbad7f5dbccb389ee6d79be661972b18b15
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Cosmetics
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit cc66daca849ca8c2900ba8cc7640de664296d36a
Author: Jeremy Katz <katzj@redhat.com>
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Move virtualization deactivation from CPU_DEAD state to CPU_DOWN_PREPARE
    
    This gives it more chances of surviving suspend.
    
    Signed-off-by: Jeremy Katz <katzj@redhat.com>
    Signed-off-by: Avi Kivity <avi@qumranet.com>

commit 2959cd13ecc1fbe1b2339937481844ff963f1e7f
Author: Avi Kivity <avi@qumranet.com>
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: mmu: add missing dirty page tracking cases
    
    We fail to mark a page dirty in three cases:
    
    - setting the accessed bit in a pte
    - setting the dirty bit in a pte
    - emulating a write into a pagetable
    
    This fix adds the missing cases.
    
    Signed-off-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86/Kconfig                  |    6 
 arch/x86/Makefile                 |    2 
 arch/x86/kvm/Kconfig              |   54 
 arch/x86/kvm/Makefile             |   14 
 arch/x86/kvm/i8259.c              |  450 +++
 arch/x86/kvm/irq.c                |   78 
 arch/x86/kvm/irq.h                |   88 
 arch/x86/kvm/kvm_svm.h            |   45 
 arch/x86/kvm/lapic.c              | 1154 ++++++++
 arch/x86/kvm/lapic.h              |   50 
 arch/x86/kvm/mmu.c                | 1894 ++++++++++++++
 arch/x86/kvm/mmu.h                |   44 
 arch/x86/kvm/paging_tmpl.h        |  481 +++
 arch/x86/kvm/segment_descriptor.h |   29 
 arch/x86/kvm/svm.c                | 1731 +++++++++++++
 arch/x86/kvm/svm.h                |  325 ++
 arch/x86/kvm/vmx.c                | 2678 ++++++++++++++++++++
 arch/x86/kvm/vmx.h                |  324 ++
 arch/x86/kvm/x86.c                | 3287 +++++++++++++++++++++++++
 arch/x86/kvm/x86_emulate.c        | 1912 ++++++++++++++
 drivers/Kconfig                   |    2 
 drivers/Makefile                  |    1 
 drivers/kvm/Kconfig               |   54 
 drivers/kvm/Makefile              |   10 
 drivers/kvm/i8259.c               |  450 ---
 drivers/kvm/ioapic.c              |  388 --
 drivers/kvm/irq.c                 |   98 
 drivers/kvm/irq.h                 |  165 -
 drivers/kvm/kvm.h                 |  796 ------
 drivers/kvm/kvm_main.c            | 3628 ----------------------------
 drivers/kvm/kvm_svm.h             |   45 
 drivers/kvm/lapic.c               | 1080 --------
 drivers/kvm/mmu.c                 | 1498 -----------
 drivers/kvm/paging_tmpl.h         |  511 ---
 drivers/kvm/segment_descriptor.h  |   17 
 drivers/kvm/svm.c                 | 1754 -------------
 drivers/kvm/svm.h                 |  324 --
 drivers/kvm/vmx.c                 | 2566 -------------------
 drivers/kvm/vmx.h                 |  310 --
 drivers/kvm/x86_emulate.c         | 1662 ------------
 drivers/kvm/x86_emulate.h         |  155 -
 include/asm-x86/Kbuild            |    1 
 include/asm-x86/kvm.h             |  191 +
 include/asm-x86/kvm_host.h        |  612 ++++
 include/asm-x86/kvm_para.h        |  105 
 include/asm-x86/kvm_x86_emulate.h |  186 +
 include/linux/Kbuild              |    2 
 include/linux/kvm.h               |  203 -
 include/linux/kvm_host.h          |  299 ++
 include/linux/kvm_para.h          |   80 
 include/linux/kvm_types.h         |   54 
 kernel/fork.c                     |    1 
 virt/kvm/ioapic.c                 |  403 +++
 virt/kvm/ioapic.h                 |   95 
 virt/kvm/iodev.h                  |   63 
 virt/kvm/kvm_main.c               | 1400 ++++++++++
 56 files changed, 18126 insertions(+), 15729 deletions(-)

diff -puN arch/x86/Kconfig~git-kvm arch/x86/Kconfig
--- a/arch/x86/Kconfig~git-kvm
+++ a/arch/x86/Kconfig
@@ -467,6 +467,10 @@ config SWIOTLB
 	  access 32-bits of memory can be used on systems with more than
 	  3 GB of memory. If unsure, say Y.
 
+config ARCH_SUPPORTS_KVM
+	bool
+	default y
+
 
 config NR_CPUS
 	int "Maximum number of CPUs (2-255)"
@@ -1625,4 +1629,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff -puN arch/x86/Makefile~git-kvm arch/x86/Makefile
--- a/arch/x86/Makefile~git-kvm
+++ a/arch/x86/Makefile
@@ -17,3 +17,5 @@ else
         UTS_MACHINE := x86_64
         include $(srctree)/arch/x86/Makefile_64
 endif
+
+core-$(CONFIG_KVM) += arch/x86/kvm/
diff -puN /dev/null arch/x86/kvm/Kconfig
--- /dev/null
+++ a/arch/x86/kvm/Kconfig
@@ -0,0 +1,54 @@
+#
+# KVM configuration
+#
+menuconfig VIRTUALIZATION
+	bool "Virtualization"
+	depends on ARCH_SUPPORTS_KVM || X86
+	default y
+	---help---
+	  Say Y here to get to see options for using your Linux host to run other
+	  operating systems inside virtual machines (guests).
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and disabled.
+
+if VIRTUALIZATION
+
+config KVM
+	tristate "Kernel-based Virtual Machine (KVM) support"
+	depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
+	select PREEMPT_NOTIFIERS
+	select ANON_INODES
+	---help---
+	  Support hosting fully virtualized guest machines using hardware
+	  virtualization extensions.  You will need a fairly recent
+	  processor equipped with virtualization extensions. You will also
+	  need to select one or more of the processor modules below.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  To compile this as a module, choose M here: the module
+	  will be called kvm.
+
+	  If unsure, say N.
+
+config KVM_INTEL
+	tristate "KVM for Intel processors support"
+	depends on KVM
+	---help---
+	  Provides support for KVM on Intel processors equipped with the VT
+	  extensions.
+
+config KVM_AMD
+	tristate "KVM for AMD processors support"
+	depends on KVM
+	---help---
+	  Provides support for KVM on AMD processors equipped with the AMD-V
+	  (SVM) extensions.
+
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/lguest/Kconfig
+
+endif # VIRTUALIZATION
diff -puN /dev/null arch/x86/kvm/Makefile
--- /dev/null
+++ a/arch/x86/kvm/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+obj-$(CONFIG_KVM) += kvm.o
+kvm-intel-objs = vmx.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+kvm-amd-objs = svm.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff -puN /dev/null arch/x86/kvm/i8259.c
--- /dev/null
+++ a/arch/x86/kvm/i8259.c
@@ -0,0 +1,450 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *   Port from Qemu.
+ */
+#include <linux/mm.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+	int mask;
+	mask = 1 << irq;
+	if (s->elcr & mask)	/* level triggered */
+		if (level) {
+			s->irr |= mask;
+			s->last_irr |= mask;
+		} else {
+			s->irr &= ~mask;
+			s->last_irr &= ~mask;
+		}
+	else	/* edge triggered */
+		if (level) {
+			if ((s->last_irr & mask) == 0)
+				s->irr |= mask;
+			s->last_irr |= mask;
+		} else
+			s->last_irr &= ~mask;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+	int priority;
+	if (mask == 0)
+		return 8;
+	priority = 0;
+	while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+		priority++;
+	return priority;
+}
+
+/*
+ * return the irq the pic wants to raise, or -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+	int mask, cur_priority, priority;
+
+	mask = s->irr & ~s->imr;
+	priority = get_priority(s, mask);
+	if (priority == 8)
+		return -1;
+	/*
+	 * compute current priority. If special fully nested mode on the
+	 * master, the IRQ coming from the slave is not taken into account
+	 * for the priority computation.
+	 */
+	mask = s->isr;
+	if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+		mask &= ~(1 << 2);
+	cur_priority = get_priority(s, mask);
+	if (priority < cur_priority)
+		/*
+		 * higher priority found: an irq should be generated
+		 */
+		return (priority + s->priority_add) & 7;
+	else
+		return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+	int irq2, irq;
+
+	irq2 = pic_get_irq(&s->pics[1]);
+	if (irq2 >= 0) {
+		/*
+		 * if irq request by slave pic, signal master PIC
+		 */
+		pic_set_irq1(&s->pics[0], 2, 1);
+		pic_set_irq1(&s->pics[0], 2, 0);
+	}
+	irq = pic_get_irq(&s->pics[0]);
+	if (irq >= 0)
+		s->irq_request(s->irq_request_opaque, 1);
+	else
+		s->irq_request(s->irq_request_opaque, 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+	pic_update_irq(s);
+}
+
+void kvm_pic_set_irq(void *opaque, int irq, int level)
+{
+	struct kvm_pic *s = opaque;
+
+	pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+	pic_update_irq(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+	if (s->auto_eoi) {
+		if (s->rotate_on_auto_eoi)
+			s->priority_add = (irq + 1) & 7;
+	} else
+		s->isr |= (1 << irq);
+	/*
+	 * We don't clear a level sensitive interrupt here
+	 */
+	if (!(s->elcr & (1 << irq)))
+		s->irr &= ~(1 << irq);
+}
+
+int kvm_pic_read_irq(struct kvm_pic *s)
+{
+	int irq, irq2, intno;
+
+	irq = pic_get_irq(&s->pics[0]);
+	if (irq >= 0) {
+		pic_intack(&s->pics[0], irq);
+		if (irq == 2) {	/* cascade input: consult the slave */
+			irq2 = pic_get_irq(&s->pics[1]);
+			if (irq2 >= 0)
+				pic_intack(&s->pics[1], irq2);
+			else
+				/*
+				 * spurious IRQ on slave controller
+				 */
+				irq2 = 7;
+			intno = s->pics[1].irq_base + irq2;
+			irq = irq2 + 8;
+		} else
+			intno = s->pics[0].irq_base + irq;
+	} else {
+		/*
+		 * spurious IRQ on master controller
+		 */
+		irq = 7;
+		intno = s->pics[0].irq_base + irq;
+	}
+	pic_update_irq(s);
+
+	return intno;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+	s->last_irr = 0;
+	s->irr = 0;
+	s->imr = 0;
+	s->isr = 0;
+	s->priority_add = 0;
+	s->irq_base = 0;
+	s->read_reg_select = 0;
+	s->poll = 0;
+	s->special_mask = 0;
+	s->init_state = 0;
+	s->auto_eoi = 0;
+	s->rotate_on_auto_eoi = 0;
+	s->special_fully_nested_mode = 0;
+	s->init4 = 0;
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+	struct kvm_kpic_state *s = opaque;
+	int priority, cmd, irq;
+
+	addr &= 1;	/* only bit 0 of the port address is decoded */
+	if (addr == 0) {
+		if (val & 0x10) {
+			kvm_pic_reset(s);	/* init */
+			/*
+			 * deassert a pending interrupt
+			 */
+			s->pics_state->irq_request(s->pics_state->
+						   irq_request_opaque, 0);
+			s->init_state = 1;
+			s->init4 = val & 1;
+			if (val & 0x02)
+				printk(KERN_ERR "single mode not supported\n");
+			if (val & 0x08)
+				printk(KERN_ERR
+				       "level sensitive irq not supported\n");
+		} else if (val & 0x08) {
+			if (val & 0x04)
+				s->poll = 1;
+			if (val & 0x02)
+				s->read_reg_select = val & 1;
+			if (val & 0x40)
+				s->special_mask = (val >> 5) & 1;
+		} else {
+			cmd = val >> 5;
+			switch (cmd) {
+			case 0:
+			case 4:
+				s->rotate_on_auto_eoi = cmd >> 2;
+				break;
+			case 1:	/* end of interrupt */
+			case 5:
+				priority = get_priority(s, s->isr);
+				if (priority != 8) {
+					irq = (priority + s->priority_add) & 7;
+					s->isr &= ~(1 << irq);
+					if (cmd == 5)
+						s->priority_add = (irq + 1) & 7;
+					pic_update_irq(s->pics_state);
+				}
+				break;
+			case 3:	/* clear the ISR bit selected by val */
+				irq = val & 7;
+				s->isr &= ~(1 << irq);
+				pic_update_irq(s->pics_state);
+				break;
+			case 6:	/* set the priority rotation from val */
+				s->priority_add = (val + 1) & 7;
+				pic_update_irq(s->pics_state);
+				break;
+			case 7:	/* as case 3, then rotate priority past it */
+				irq = val & 7;
+				s->isr &= ~(1 << irq);
+				s->priority_add = (irq + 1) & 7;
+				pic_update_irq(s->pics_state);
+				break;
+			default:
+				break;	/* no operation */
+			}
+		}
+	} else
+		switch (s->init_state) {
+		case 0:		/* normal mode */
+			s->imr = val;
+			pic_update_irq(s->pics_state);
+			break;
+		case 1:		/* first write after init: vector base */
+			s->irq_base = val & 0xf8;
+			s->init_state = 2;
+			break;
+		case 2:		/* value ignored; go on only if init4 set */
+			if (s->init4)
+				s->init_state = 3;
+			else
+				s->init_state = 0;
+			break;
+		case 3:		/* last init write: nesting mode, auto-EOI */
+			s->special_fully_nested_mode = (val >> 4) & 1;
+			s->auto_eoi = (val >> 1) & 1;
+			s->init_state = 0;
+			break;
+		}
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+	int ret;
+
+	ret = pic_get_irq(s);
+	if (ret >= 0) {
+		if (addr1 >> 7) {
+			s->pics_state->pics[0].isr &= ~(1 << 2);
+			s->pics_state->pics[0].irr &= ~(1 << 2);
+		}
+		s->irr &= ~(1 << ret);
+		s->isr &= ~(1 << ret);
+		if (addr1 >> 7 || ret != 2)
+			pic_update_irq(s->pics_state);
+	} else {
+		ret = 0x07;
+		pic_update_irq(s->pics_state);
+	}
+
+	return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr1)
+{
+	struct kvm_kpic_state *s = opaque;
+	unsigned int addr;
+	int ret;
+
+	addr = addr1;
+	addr &= 1;
+	if (s->poll) {
+		ret = pic_poll_read(s, addr1);
+		s->poll = 0;
+	} else
+		if (addr == 0)
+			if (s->read_reg_select)
+				ret = s->isr;
+			else
+				ret = s->irr;
+		else
+			ret = s->imr;
+	return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+	struct kvm_kpic_state *s = opaque;
+	s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+	struct kvm_kpic_state *s = opaque;
+	return s->elcr;
+}
+
+static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+	case 0x4d0:
+	case 0x4d1:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static void picdev_write(struct kvm_io_device *this,
+			 gpa_t addr, int len, const void *val)
+{
+	struct kvm_pic *s = this->private;
+	unsigned char data = *(unsigned char *)val;
+
+	if (len != 1) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "PIC: non byte write\n");
+		return;
+	}
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+		pic_ioport_write(&s->pics[addr >> 7], addr, data);
+		break;
+	case 0x4d0:
+	case 0x4d1:
+		elcr_ioport_write(&s->pics[addr & 1], addr, data);
+		break;
+	}
+}
+
+static void picdev_read(struct kvm_io_device *this,
+			gpa_t addr, int len, void *val)
+{
+	struct kvm_pic *s = this->private;
+	unsigned char data = 0;
+
+	if (len != 1) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "PIC: non byte read\n");
+		return;
+	}
+	switch (addr) {
+	case 0x20:
+	case 0x21:
+	case 0xa0:
+	case 0xa1:
+		data = pic_ioport_read(&s->pics[addr >> 7], addr);
+		break;
+	case 0x4d0:
+	case 0x4d1:
+		data = elcr_ioport_read(&s->pics[addr & 1], addr);
+		break;
+	}
+	*(unsigned char *)val = data;
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(void *opaque, int level)
+{
+	struct kvm *kvm = opaque;
+	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+
+	pic_irqchip(kvm)->output = level;
+	if (vcpu)
+		kvm_vcpu_kick(vcpu);
+}
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+{
+	struct kvm_pic *s;
+	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+	if (!s)
+		return NULL;
+	s->pics[0].elcr_mask = 0xf8;
+	s->pics[1].elcr_mask = 0xde;
+	s->irq_request = pic_irq_request;
+	s->irq_request_opaque = kvm;
+	s->pics[0].pics_state = s;
+	s->pics[1].pics_state = s;
+
+	/*
+	 * Initialize PIO device
+	 */
+	s->dev.read = picdev_read;
+	s->dev.write = picdev_write;
+	s->dev.in_range = picdev_in_range;
+	s->dev.private = s;
+	kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+	return s;
+}
diff -puN /dev/null arch/x86/kvm/irq.c
--- /dev/null
+++ a/arch/x86/kvm/irq.c
@@ -0,0 +1,78 @@
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+
+/*
+ * Report whether an interrupt is pending for this vcpu, without
+ * acknowledging it.  The LAPIC is consulted first; the PIC output is used
+ * only when the LAPIC has nothing and accepts PIC interrupts.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+	/* LAPIC has a deliverable vector */
+	if (kvm_apic_has_interrupt(v) != -1)
+		return 1;
+
+	/* LAPIC does not take PIC interrupts */
+	if (!kvm_apic_accept_pic_intr(v))
+		return 0;
+
+	return pic_irqchip(v->kvm)->output;	/* PIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Fetch and acknowledge the pending interrupt vector: the LAPIC is tried
+ * first, then the PIC if the LAPIC accepts PIC interrupts.  Returns -1
+ * when the LAPIC has nothing and PIC interrupts are not accepted.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+	struct kvm_pic *pic;
+	int vec = kvm_get_apic_interrupt(v);	/* APIC */
+
+	if (vec != -1 || !kvm_apic_accept_pic_intr(v))
+		return vec;
+
+	/* fall back to the PIC: clear its latched output, then read the irq */
+	pic = pic_irqchip(v->kvm);		/* PIC */
+	pic->output = 0;
+	return kvm_pic_read_irq(pic);
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	kvm_inject_apic_timer_irqs(vcpu);	/* LAPIC timer is the only source handled */
+	/* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+	kvm_apic_timer_intr_post(vcpu, vec);	/* forward vec to the LAPIC timer hook */
+	/* TODO: PIT, RTC etc. */
+}
+EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff -puN /dev/null arch/x86/kvm/irq.h
--- /dev/null
+++ a/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+	u8 last_irr;	/* edge detection */
+	u8 irr;		/* interrupt request register */
+	u8 imr;		/* interrupt mask register */
+	u8 isr;		/* interrupt service register */
+	u8 priority_add;	/* highest irq priority */
+	u8 irq_base;
+	u8 read_reg_select;
+	u8 poll;
+	u8 special_mask;
+	u8 init_state;
+	u8 auto_eoi;
+	u8 rotate_on_auto_eoi;
+	u8 special_fully_nested_mode;
+	u8 init4;		/* true if 4 byte init */
+	u8 elcr;		/* PIIX edge/trigger selection */
+	u8 elcr_mask;		/* presumably the writable ELCR bits -- confirm */
+	struct kvm_pic *pics_state;	/* back-pointer to the owning kvm_pic */
+};
+
+struct kvm_pic {
+	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+	irq_request_func *irq_request;	/* callback of type (opaque, level) */
+	void *irq_request_opaque;	/* cookie for irq_request; kvm_create_pic stores the kvm */
+	int output;		/* intr from master PIC */
+	struct kvm_io_device dev;	/* device registered on the PIO bus */
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+	return kvm->arch.vpic;	/* may be NULL; irqchip_in_kernel() tests for that */
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return pic_irqchip(kvm) != NULL;	/* an allocated PIC marks the in-kernel irqchip */
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
diff -puN /dev/null arch/x86/kvm/kvm_svm.h
--- /dev/null
+++ a/arch/x86/kvm/kvm_svm.h
@@ -0,0 +1,45 @@
+#ifndef __KVM_SVM_H
+#define __KVM_SVM_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <asm/msr.h>
+
+#include "svm.h"
+
+static const u32 host_save_user_msrs[] = {	/* its length sizes vcpu_svm.host_user_msrs[] */
+#ifdef CONFIG_X86_64
+	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+	MSR_FS_BASE,
+#endif
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define NUM_DB_REGS 4
+
+struct kvm_vcpu;
+
+struct vcpu_svm {
+	struct kvm_vcpu vcpu;	/* generic vcpu state, embedded as the first member */
+	struct vmcb *vmcb;
+	unsigned long vmcb_pa;	/* presumably the physical address of vmcb -- confirm */
+	struct svm_cpu_data *svm_data;
+	uint64_t asid_generation;
+
+	unsigned long db_regs[NUM_DB_REGS];
+
+	u64 next_rip;
+
+	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];	/* one slot per host_save_user_msrs[] entry */
+	u64 host_gs_base;
+	unsigned long host_cr2;
+	unsigned long host_db_regs[NUM_DB_REGS];
+	unsigned long host_dr6;
+	unsigned long host_dr7;
+};
+
+#endif
+
diff -puN /dev/null arch/x86/kvm/lapic.c
--- /dev/null
+++ a/arch/x86/kvm/lapic.c
@@ -0,0 +1,1154 @@
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ *
+ * Authors:
+ *   Dor Laor <dor.laor@qumranet.com>
+ *   Gregory Haskins <ghaskins@novell.com>
+ *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+#include "irq.h"
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...)
+
+#define APIC_LVT_NUM			6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION			(0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH		(1 << 12)
+/* followed define is not in apicdef.h */
+#define APIC_SHORT_MASK			0xc0000
+#define APIC_DEST_NOSHORT		0x0
+#define APIC_DEST_MASK			0x800
+#define MAX_APIC_VECTOR			256
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+
+static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+	return *((u32 *) (apic->regs + reg_off));	/* 32-bit read at byte offset reg_off */
+}
+
+static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+	*((u32 *) (apic->regs + reg_off)) = val;	/* 32-bit store at byte offset reg_off */
+}
+
+static inline int apic_test_and_set_vector(int vec, void *bitmap)
+{
+	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));	/* bit vec%32, register at 16 bytes per 32 vectors */
+}
+
+static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+{
+	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));	/* bit vec%32, register at 16 bytes per 32 vectors */
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));	/* bit vec%32, register at 16 bytes per 32 vectors */
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));	/* bit vec%32, register at 16 bytes per 32 vectors */
+}
+
+static inline int apic_hw_enabled(struct kvm_lapic *apic)
+{
+	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;	/* enable bit of the APIC base value */
+}
+
+static inline int  apic_sw_enabled(struct kvm_lapic *apic)
+{
+	return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;	/* enable bit of the SPIV register */
+}
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+	return apic_sw_enabled(apic) &&	apic_hw_enabled(apic);	/* needs both the SPIV and the base-MSR enable */
+}
+
+#define LVT_MASK	\
+	(APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK	\
+	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+	return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;	/* ID lives in bits 31:24 */
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+	return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);	/* 1 when the LVT entry is not masked */
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;	/* vector field of the LVT entry */
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;	/* non-zero when the timer LVT is periodic */
+}
+
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {	/* bits kept on LVT writes, indexed by (offset - APIC_LVTT) >> 4 */
+	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
+	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
+	LINT_MASK, LINT_MASK,	/* LVT0-1 */
+	LVT_MASK		/* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+	u32 *word = bitmap;
+	int word_offset = MAX_APIC_VECTOR >> 5;
+	/* scan the 32-vector registers from the top; they sit 4 u32s apart */
+	while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
+		continue;
+	/* reached register 0 and it is empty too: no vector is set */
+	if (likely(!word_offset && !word[0]))
+		return -1;
+	else
+		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+}
+
+static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+{
+	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);	/* non-zero if vec was already pending */
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+	apic_clear_vector(vec, apic->regs + APIC_IRR);	/* drop vec from the pending set */
+}
+
+/* Highest vector pending in the IRR, or -1 if none is set. */
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+	int result = find_highest_vector(apic->regs + APIC_IRR);
+
+	/* vectors below 16 are not expected in the IRR */
+	ASSERT(result == -1 || result >= 16);
+	return result;
+}
+
+/* Highest pending IRR vector of the vcpu; 0 if the vcpu has no LAPIC. */
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic == NULL)
+		return 0;
+
+	/* may be -1 when nothing is pending */
+	return apic_find_highest_irr(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+/* Set vec in the IRR; returns 1 if it was newly set, 0 if already pending. */
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic_test_and_set_irr(vec, apic))
+		return 0;
+	/* a new pending irq is set in IRR; record its trigger mode in TMR */
+	if (trig)
+		apic_set_vector(vec, apic->regs + APIC_TMR);
+	else
+		apic_clear_vector(vec, apic->regs + APIC_TMR);
+	kvm_vcpu_kick(apic->vcpu);
+	return 1;
+}
+
+/* Highest vector marked in service in the ISR, or -1 if none is set. */
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+	int result = find_highest_vector(apic->regs + APIC_ISR);
+
+	/* vectors below 16 are not expected in the ISR */
+	ASSERT(result == -1 || result >= 16);
+	return result;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+	u32 tpr, isrv, ppr;
+	int isr;
+
+	tpr = apic_get_reg(apic, APIC_TASKPRI);
+	isr = apic_find_highest_isr(apic);
+	isrv = (isr != -1) ? isr : 0;
+	/* PPR is the TPR when its class (bits 7:4) is >= the in-service class */
+	if ((tpr & 0xf0) >= (isrv & 0xf0))
+		ppr = tpr & 0xff;
+	else
+		ppr = isrv & 0xf0;
+
+	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+		   apic, ppr, isr, isrv);
+
+	apic_set_reg(apic, APIC_PROCPRI, ppr);
+}
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+	apic_set_reg(apic, APIC_TASKPRI, tpr);
+	apic_update_ppr(apic);	/* PPR is derived from the TPR, so refresh it */
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+{
+	return kvm_apic_id(apic) == dest;	/* exact APIC ID comparison, no broadcast handling */
+}
+
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+{
+	int result = 0;
+	u8 logical_id;
+
+	logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+	/* the matching rule depends on the destination format register */
+	switch (apic_get_reg(apic, APIC_DFR)) {
+	case APIC_DFR_FLAT:
+		if (logical_id & mda)	/* any common bit matches */
+			result = 1;
+		break;
+	case APIC_DFR_CLUSTER:
+		if (((logical_id >> 4) == (mda >> 0x4))	/* same high nibble ... */
+		    && (logical_id & mda & 0xf))	/* ... and a common low-nibble bit */
+			result = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
+		       apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+		break;
+	}
+
+	return result;
+}
+
+/* Return 1 if vcpu's LAPIC is a destination of the message, 0 otherwise. */
+static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+			   int short_hand, int dest, int dest_mode)
+{
+	int result = 0;
+	struct kvm_lapic *target = vcpu->arch.apic;
+
+	apic_debug("target %p, source %p, dest 0x%x, "
+		   "dest_mode 0x%x, short_hand 0x%x",
+		   target, source, dest, dest_mode, short_hand);
+	/* target is dereferenced below, so it must be non-NULL here */
+	ASSERT(target);
+	switch (short_hand) {
+	case APIC_DEST_NOSHORT:
+		if (dest_mode == 0) {
+			/* Physical mode. */
+			if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
+				result = 1;
+		} else
+			/* Logical mode. */
+			result = kvm_apic_match_logical_addr(target, dest);
+		break;
+	case APIC_DEST_SELF:
+		if (target == source)
+			result = 1;
+		break;
+	case APIC_DEST_ALLINC:
+		result = 1;
+		break;
+	case APIC_DEST_ALLBUT:
+		if (target != source)
+			result = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Bad dest shorthand value %x\n",
+		       short_hand);
+		break;
+	}
+	return result;
+}
+
+/*
+ * Add a pending IRQ into lapic (fixed/lowest); INIT/SIPI only change mp_state.
+ * Return 1 if the vector was newly set in the IRR and 0 otherwise.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+			     int vector, int level, int trig_mode)
+{
+	int orig_irr, result = 0;
+	struct kvm_vcpu *vcpu = apic->vcpu;
+
+	switch (delivery_mode) {
+	case APIC_DM_FIXED:
+	case APIC_DM_LOWEST:
+		/* FIXME add logic for vcpu on reset */
+		if (unlikely(!apic_enabled(apic)))
+			break;
+
+		orig_irr = apic_test_and_set_irr(vector, apic);
+		if (orig_irr && trig_mode) {
+			apic_debug("level trig mode repeatedly for vector %d",
+				   vector);
+			break;
+		}
+
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d", vector);
+			apic_set_vector(vector, apic->regs + APIC_TMR);
+		} else
+			apic_clear_vector(vector, apic->regs + APIC_TMR);
+		/* kick a runnable vcpu; make a halted one runnable and wake it */
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+			kvm_vcpu_kick(vcpu);
+		else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+			vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
+		/* "added" only if the vector was not already pending */
+		result = (orig_irr == 0);
+		break;
+
+	case APIC_DM_REMRD:
+		printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+		break;
+
+	case APIC_DM_SMI:
+		printk(KERN_DEBUG "Ignoring guest SMI\n");
+		break;
+	case APIC_DM_NMI:
+		printk(KERN_DEBUG "Ignoring guest NMI\n");
+		break;
+
+	case APIC_DM_INIT:
+		if (level) {
+			if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+				printk(KERN_DEBUG
+				       "INIT on a runnable vcpu %d\n",
+				       vcpu->vcpu_id);
+			vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			kvm_vcpu_kick(vcpu);
+		} else {
+			printk(KERN_DEBUG
+			       "Ignoring de-assert INIT to vcpu %d\n",
+			       vcpu->vcpu_id);
+		}
+
+		break;
+
+	case APIC_DM_STARTUP:
+		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
+		       vcpu->vcpu_id, vector);
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {	/* SIPI is honoured only after INIT */
+			vcpu->arch.sipi_vector = vector;
+			vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+			if (waitqueue_active(&vcpu->wq))
+				wake_up_interruptible(&vcpu->wq);
+		}
+		break;
+
+	default:
+		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+		       delivery_mode);
+		break;
+	}
+	return result;
+}
+
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap)
+{
+	int last;
+	int next;
+	struct kvm_lapic *apic = NULL;
+
+	last = kvm->arch.round_robin_prev_vcpu;
+	next = last;
+	/* walk the vcpu slots circularly, starting after the previous pick */
+	do {
+		if (++next == KVM_MAX_VCPUS)
+			next = 0;
+		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
+			continue;	/* slot empty or not a candidate in bitmap */
+		apic = kvm->vcpus[next]->arch.apic;
+		if (apic && apic_enabled(apic))
+			break;
+		apic = NULL;
+	} while (next != last);
+	kvm->arch.round_robin_prev_vcpu = next;	/* remember where the scan stopped */
+
+	if (!apic)
+		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
+
+	return apic;
+}
+
+/*
+ * Lowest-priority target among the vcpus in bitmap, or NULL if none is usable
+ */
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+		unsigned long bitmap)
+{
+	struct kvm_lapic *chosen = kvm_apic_round_robin(kvm, vector, bitmap);
+
+	return chosen ? chosen->vcpu : NULL;
+}
+
+static void apic_set_eoi(struct kvm_lapic *apic)
+{
+	int vector = apic_find_highest_isr(apic);
+
+	/*
+	 * Not every EOI write has a corresponding ISR bit set;
+	 * one example is when the kernel checks the timer in setup_IO_APIC
+	 */
+	if (vector == -1)
+		return;
+
+	apic_clear_vector(vector, apic->regs + APIC_ISR);
+	apic_update_ppr(apic);
+	/* vectors with their TMR bit set are also EOI'd to the IOAPIC */
+	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
+		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+}
+
+/* Deliver the IPI described by the ICR to every LAPIC it addresses. */
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+	u32 icr_low = apic_get_reg(apic, APIC_ICR);
+	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
+	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
+	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	unsigned int level = icr_low & APIC_INT_ASSERT;
+	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
+	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+	unsigned int vector = icr_low & APIC_VECTOR_MASK;
+	struct kvm *kvm = apic->vcpu->kvm;
+	struct kvm_vcpu *target;
+	struct kvm_vcpu *vcpu;
+	unsigned long lpr_map = 0;
+	int i;
+
+	apic_debug("icr_high 0x%x, icr_low 0x%x, "
+		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		   icr_high, icr_low, short_hand, dest,
+		   trig_mode, level, dest_mode, delivery_mode, vector);
+
+	for (i = 0; i < KVM_MAX_VCPUS; i++) {
+		vcpu = kvm->vcpus[i];
+		if (!vcpu)
+			continue;
+
+		if (vcpu->arch.apic &&
+		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
+			if (delivery_mode == APIC_DM_LOWEST)
+				set_bit(vcpu->vcpu_id, &lpr_map);
+			else
+				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
+						  vector, level, trig_mode);
+		}
+	}
+	/* vcpu may be NULL after the loop, so the kvm comes from the sender */
+	if (delivery_mode == APIC_DM_LOWEST) {
+		target = kvm_get_lowest_prio_vcpu(kvm, vector, lpr_map);
+		if (target != NULL)
+			__apic_accept_irq(target->arch.apic, delivery_mode,
+					  vector, level, trig_mode);
+	}
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+	u64 counter_passed;
+	ktime_t passed, now;
+	u32 tmcct;
+
+	ASSERT(apic != NULL);
+
+	now = apic->timer.dev.base->get_time();
+	tmcct = apic_get_reg(apic, APIC_TMICT);	/* starts out as the initial count */
+
+	/* if initial count is 0, current count should also be 0 */
+	if (tmcct == 0)
+		return 0;
+
+	if (unlikely(ktime_to_ns(now) <=
+		ktime_to_ns(apic->timer.last_update))) {
+		/* Wrap around */
+		passed = ktime_add(( {
+				    (ktime_t) {
+				    .tv64 = KTIME_MAX -
+				    (apic->timer.last_update).tv64}; }
+				   ), now);
+		apic_debug("time elapsed\n");
+	} else
+		passed = ktime_sub(now, apic->timer.last_update);
+	/* convert the elapsed ns into timer ticks at the current divider */
+	counter_passed = div64_64(ktime_to_ns(passed),
+				  (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+	/* more ticks elapsed than the initial count: the counter reached 0 */
+	if (counter_passed > tmcct) {
+		if (unlikely(!apic_lvtt_period(apic))) {
+			/* one-shot timers stick at 0 until reset */
+			tmcct = 0;
+		} else {
+			/*
+			 * periodic timers reset to APIC_TMICT when they
+			 * hit 0. The while loop simulates this happening N
+			 * times. (counter_passed %= tmcct) would also work,
+			 * but might be slower or not work on 32-bit??
+			 */
+			while (counter_passed > tmcct)
+				counter_passed -= tmcct;
+			tmcct -= counter_passed;
+		}
+	} else {
+		tmcct -= counter_passed;
+	}
+
+	return tmcct;
+}
+
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	struct kvm_run *run = vcpu->run;
+	/* flag the request and record rip/is_write of the access in vcpu->run */
+	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_x86_ops->cache_regs(vcpu);	/* presumably refreshes vcpu->arch.rip before it is read -- confirm */
+	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	if (apic->vcpu->arch.tpr_access_reporting)	/* report only when reporting is switched on */
+		__report_tpr_access(apic, write);
+}
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+	u32 val = 0;
+
+	if (offset >= LAPIC_MMIO_LENGTH)
+		return 0;
+	/* ARBPRI reads as 0, TMCCT is computed, the rest comes from the register page */
+	switch (offset) {
+	case APIC_ARBPRI:
+		printk(KERN_WARNING "Access APIC ARBPRI register "
+		       "which is for P6\n");
+		break;
+
+	case APIC_TMCCT:	/* Timer CCR */
+		val = apic_get_tmcct(apic);
+		break;
+
+	case APIC_TASKPRI:
+		report_tpr_access(apic, false);
+		/* fall thru */
+	default:
+		apic_update_ppr(apic);	/* refresh PPR before any stored register is read */
+		val = apic_get_reg(apic, offset);
+		break;
+	}
+
+	return val;
+}
+
+static void apic_mmio_read(struct kvm_io_device *this,
+			   gpa_t address, int len, void *data)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+	unsigned int offset = address - apic->base_address;
+	unsigned char alignment = offset & 0xf;
+	u32 result;
+	/* MMIO read handler: the access must lie within the register's low 4 bytes */
+	if ((alignment + len) > 4) {
+		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d\n",
+		       (unsigned long)address, len);
+		return;
+	}
+	result = __apic_read(apic, offset & ~0xf);
+	/* hand back only the bytes requested from the 32-bit register value */
+	switch (len) {
+	case 1:
+	case 2:
+	case 4:
+		memcpy(data, (char *)&result + alignment, len);
+		break;
+	default:
+		printk(KERN_ERR "Local APIC read with len = %x, "
+		       "should be 1,2, or 4 instead\n", len);
+		break;
+	}
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+	u32 tmp1, tmp2, tdcr;
+
+	tdcr = apic_get_reg(apic, APIC_TDCR);
+	tmp1 = tdcr & 0xf;
+	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;	/* bits 0, 1 and 3 form a 3-bit code, plus 1 */
+	apic->timer.divide_count = 0x1 << (tmp2 & 0x7);	/* code 7 wraps to shift 0, i.e. divide by 1 */
+
+	apic_debug("timer divide count is 0x%x\n",
+				   apic->timer.divide_count);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+	ktime_t now = apic->timer.dev.base->get_time();
+
+	apic->timer.last_update = now;
+	/* period in ns = initial count * bus cycle length * divider */
+	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
+		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
+	atomic_set(&apic->timer.pending, 0);	/* drop ticks queued by the previous programming */
+	hrtimer_start(&apic->timer.dev,
+		      ktime_add_ns(now, apic->timer.period),
+		      HRTIMER_MODE_ABS);
+
+	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+			   PRIx64 ", "
+			   "timer initial count 0x%x, period %lldns, "
+			   "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+			   apic_get_reg(apic, APIC_TMICT),
+			   apic->timer.period,
+			   ktime_to_ns(ktime_add_ns(now,
+					apic->timer.period)));
+}
+
+static void apic_mmio_write(struct kvm_io_device *this,
+			    gpa_t address, int len, const void *data)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+	unsigned int offset = address - apic->base_address;
+	unsigned char alignment = offset & 0xf;
+	u32 val;
+
+	/*
+	 * APIC register must be aligned on 128-bits boundary.
+	 * 32/64/128 bits registers must be accessed thru 32 bits.
+	 * Refer SDM 8.4.1
+	 */
+	if (len != 4 || alignment) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "apic write: bad size=%d %lx\n",
+			       len, (long)address);
+		return;
+	}
+
+	val = *(u32 *) data;
+
+	/* too common printing */
+	if (offset != APIC_EOI)
+		apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+			   "0x%x\n", __FUNCTION__, offset, len, val);
+	/* keep only the 16-byte aligned register offset within the page */
+	offset &= 0xff0;
+
+	switch (offset) {
+	case APIC_ID:		/* Local APIC ID */
+		apic_set_reg(apic, APIC_ID, val);
+		break;
+
+	case APIC_TASKPRI:
+		report_tpr_access(apic, true);
+		apic_set_tpr(apic, val & 0xff);
+		break;
+
+	case APIC_EOI:
+		apic_set_eoi(apic);	/* the written value itself is ignored */
+		break;
+
+	case APIC_LDR:
+		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+		break;
+
+	case APIC_DFR:
+		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);	/* low 28 bits are always stored as ones */
+		break;
+
+	case APIC_SPIV:
+		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+		if (!(val & APIC_SPIV_APIC_ENABLED)) {	/* software disable: mask every LVT entry */
+			int i;
+			u32 lvt_val;
+
+			for (i = 0; i < APIC_LVT_NUM; i++) {
+				lvt_val = apic_get_reg(apic,
+						       APIC_LVTT + 0x10 * i);
+				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+					     lvt_val | APIC_LVT_MASKED);
+			}
+			atomic_set(&apic->timer.pending, 0);	/* and drop queued timer ticks */
+
+		}
+		break;
+
+	case APIC_ICR:
+		/* No delay here, so we always clear the pending bit */
+		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+		apic_send_ipi(apic);
+		break;
+
+	case APIC_ICR2:
+		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);	/* only bits 31:24 are kept */
+		break;
+
+	case APIC_LVTT:
+	case APIC_LVTTHMR:
+	case APIC_LVTPC:
+	case APIC_LVT0:
+	case APIC_LVT1:
+	case APIC_LVTERR:
+		/* TODO: Check vector */
+		if (!apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;	/* a software-disabled APIC forces the mask bit */
+
+		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
+		apic_set_reg(apic, offset, val);
+
+		break;
+
+	case APIC_TMICT:
+		hrtimer_cancel(&apic->timer.dev);	/* writing the initial count restarts the timer */
+		apic_set_reg(apic, APIC_TMICT, val);
+		start_apic_timer(apic);
+		return;
+
+	case APIC_TDCR:
+		if (val & 4)
+			printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+		apic_set_reg(apic, APIC_TDCR, val);
+		update_divide_count(apic);
+		break;
+
+	default:
+		apic_debug("Local APIC Write to read-only register %x\n",
+			   offset);
+		break;
+	}
+
+}
+
+/* in_range callback: 1 if addr hits the MMIO page of a hw-enabled LAPIC. */
+static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
+
+	if (!apic_hw_enabled(apic))
+		return 0;
+	if (addr < apic->base_address)
+		return 0;
+	if (addr >= (apic->base_address + LAPIC_MMIO_LENGTH))
+		return 0;
+	return 1;
+}
+
+/* Stop the LAPIC timer, then free the register page and the LAPIC itself. */
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic == NULL)
+		return;
+	hrtimer_cancel(&apic->timer.dev);
+	if (apic->regs_page != NULL)
+		__free_page(apic->regs_page);
+	kfree(apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!apic)
+		return;
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4)	/* CR8[3:0] becomes TPR[7:4] */
+		     | (apic_get_reg(apic, APIC_TASKPRI) & 4));	/* NOTE(review): keeps only bit 2 of the old TPR -- confirm intent */
+}
+
+/* CR8 view of the TPR: bits 7:4 of APIC_TASKPRI, or 0 without a LAPIC. */
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic == NULL)
+		return 0;
+
+	/* widen first, then extract the upper nibble of the low byte */
+	return ((u64) apic_get_reg(apic, APIC_TASKPRI) & 0xf0) >> 4;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!apic) {	/* no in-kernel LAPIC: just record the value with the BSP bit forced */
+		value |= MSR_IA32_APICBASE_BSP;
+		vcpu->arch.apic_base = value;
+		return;
+	}
+	if (apic->vcpu->vcpu_id)	/* only vcpu 0 may keep the BSP bit */
+		value &= ~MSR_IA32_APICBASE_BSP;
+
+	vcpu->arch.apic_base = value;
+	apic->base_address = apic->vcpu->arch.apic_base &
+			     MSR_IA32_APICBASE_BASE;
+
+	/* with FSB delivery interrupt, we can restart APIC functionality */
+	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+		   "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+
+}
+
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic_base;	/* cached base value; valid even without an in-kernel LAPIC */
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic;
+	int i;
+
+	apic_debug("%s\n", __FUNCTION__);
+
+	ASSERT(vcpu);
+	apic = vcpu->arch.apic;
+	ASSERT(apic != NULL);
+
+	/* Stop the timer in case it's a reset to an active apic */
+	hrtimer_cancel(&apic->timer.dev);
+
+	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);	/* APIC ID defaults to the vcpu id */
+	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+
+	for (i = 0; i < APIC_LVT_NUM; i++)	/* mask every LVT entry ... */
+		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+	apic_set_reg(apic, APIC_LVT0,	/* ... then rewrite LVT0 with ExtINT delivery */
+		     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+
+	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+	apic_set_reg(apic, APIC_SPIV, 0xff);
+	apic_set_reg(apic, APIC_TASKPRI, 0);
+	apic_set_reg(apic, APIC_LDR, 0);
+	apic_set_reg(apic, APIC_ESR, 0);
+	apic_set_reg(apic, APIC_ICR, 0);
+	apic_set_reg(apic, APIC_ICR2, 0);
+	apic_set_reg(apic, APIC_TDCR, 0);
+	apic_set_reg(apic, APIC_TMICT, 0);
+	for (i = 0; i < 8; i++) {	/* 8 x 32 bits cover all 256 vectors */
+		apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+	}
+	update_divide_count(apic);
+	atomic_set(&apic->timer.pending, 0);
+	if (vcpu->vcpu_id == 0)
+		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;	/* vcpu 0 is flagged as the BSP */
+	apic_update_ppr(apic);
+
+	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+		   vcpu, kvm_apic_id(apic),
+		   vcpu->arch.apic_base, apic->base_address);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reset);
+
+/* Non-zero iff the vcpu has a LAPIC enabled in both SPIV and the base MSR. */
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	/* no in-kernel LAPIC at all counts as disabled */
+	if (apic == NULL)
+		return 0;
+
+	return apic_enabled(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+/* TODO: make sure __apic_timer_fn runs in current pCPU */
+static int __apic_timer_fn(struct kvm_lapic *apic)
+{
+	int result = 0;
+	wait_queue_head_t *q = &apic->vcpu->wq;
+	/* only count the tick here; kvm_inject_apic_timer_irqs() consumes it */
+	atomic_inc(&apic->timer.pending);
+	if (waitqueue_active(q)) {	/* a waiter on the vcpu queue is made runnable and woken */
+		apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		wake_up_interruptible(q);
+	}
+	if (apic_lvtt_period(apic)) {	/* periodic mode: push expiry one period ahead, ask for restart */
+		result = 1;
+		apic->timer.dev.expires = ktime_add_ns(
+					apic->timer.dev.expires,
+					apic->timer.period);
+	}
+	return result;
+}
+
+/* Deliver the LVTT vector as a fixed interrupt with trig_mode 0. */
+static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+{
+	int timer_vec = apic_lvt_vector(apic, APIC_LVTT);
+
+	return __apic_accept_irq(apic, APIC_DM_FIXED, timer_vec, 1, 0);
+}
+
+/*
+ * hrtimer callback for the LAPIC timer.  Runs the tick handler and asks the
+ * hrtimer core to restart the timer when the handler returns non-zero
+ * (periodic mode), otherwise lets it stop.
+ */
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+	struct kvm_lapic *apic;
+
+	/* the hrtimer is embedded in the LAPIC as timer.dev */
+	apic = container_of(data, struct kvm_lapic, timer.dev);
+
+	return __apic_timer_fn(apic) ? HRTIMER_RESTART : HRTIMER_NORESTART;
+}
+
+/* Allocate and reset the in-kernel LAPIC of a vcpu; returns 0 or -ENOMEM. */
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic;
+
+	ASSERT(vcpu != NULL);
+	apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	if (!apic)
+		goto nomem;
+
+	apic->regs_page = alloc_page(GFP_KERNEL);
+	if (apic->regs_page == NULL) {
+		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+		       vcpu->vcpu_id);
+		goto nomem_free_apic;
+	}
+	apic->regs = page_address(apic->regs_page);
+	memset(apic->regs, 0, PAGE_SIZE);
+	apic->vcpu = vcpu;
+	vcpu->arch.apic = apic;	/* published only once fully allocated */
+
+	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	apic->timer.dev.function = apic_timer_fn;
+	apic->base_address = APIC_DEFAULT_PHYS_BASE;
+	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+
+	kvm_lapic_reset(vcpu);
+	apic->dev.read = apic_mmio_read;
+	apic->dev.write = apic_mmio_write;
+	apic->dev.in_range = apic_mmio_range;
+	apic->dev.private = apic;
+
+	return 0;
+nomem_free_apic:
+	kfree(apic);
+nomem:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(kvm_create_lapic);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	int highest_irr;
+
+	if (!apic || !apic_enabled(apic))
+		return -1;
+	/* a vector is deliverable only if its class exceeds the current PPR */
+	apic_update_ppr(apic);
+	highest_irr = apic_find_highest_irr(apic);
+	if ((highest_irr == -1) ||
+	    ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+		return -1;
+	return highest_irr;
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);	/* vcpu->arch.apic is dereferenced unchecked */
+	int r = 0;
+	/* only vcpu 0 qualifies: APIC hw-disabled, or LVT0 unmasked in ExtINT mode */
+	if (vcpu->vcpu_id == 0) {
+		if (!apic_hw_enabled(vcpu->arch.apic))
+			r = 1;
+		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+			r = 1;
+	}
+	return r;
+}
+
+/* Inject one queued LAPIC timer tick if LVTT is unmasked. */
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!apic || !apic_lvt_enabled(apic, APIC_LVTT) ||
+	    atomic_read(&apic->timer.pending) <= 0)
+		return;
+	if (__inject_apic_timer_irq(apic))
+		atomic_dec(&apic->timer.pending);
+}
+
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	/* when vec is the timer vector, advance last_update by one period */
+	if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
+		apic->timer.last_update = ktime_add_ns(
+				apic->timer.last_update,
+				apic->timer.period);
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+	int vector = kvm_apic_has_interrupt(vcpu);	/* -1 also covers a missing or disabled LAPIC */
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (vector == -1)
+		return -1;
+	/* acknowledge: move the vector from IRR to ISR and refresh the PPR */
+	apic_set_vector(vector, apic->regs + APIC_ISR);
+	apic_update_ppr(apic);
+	apic_clear_irr(vector, apic);
+	return vector;
+}
+
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	/* recompute everything derived from the restored register contents */
+	apic->base_address = vcpu->arch.apic_base &
+			     MSR_IA32_APICBASE_BASE;
+	apic_set_reg(apic, APIC_LVR, APIC_VERSION);	/* the version register is always overwritten */
+	apic_update_ppr(apic);
+	hrtimer_cancel(&apic->timer.dev);
+	update_divide_count(apic);
+	start_apic_timer(apic);	/* re-arm from the restored TMICT and divider */
+}
+
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	struct hrtimer *timer;
+
+	if (!apic)
+		return;
+	/* if the timer was active, start it again with its previous expiry */
+	timer = &apic->timer.dev;
+	if (hrtimer_cancel(timer))
+		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+}
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+	/* read the 32-bit word at vapic_addr's offset inside the vapic page */
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+	kunmap_atomic(vapic, KM_USER0);
+
+	apic_set_tpr(vcpu->arch.apic, data & 0xff);	/* the low byte carries the TPR */
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data, tpr;
+	int max_irr, max_isr;
+	struct kvm_lapic *apic;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+
+	apic = vcpu->arch.apic;
+	tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+	max_irr = apic_find_highest_irr(apic);
+	if (max_irr < 0)	/* nothing pending is published as 0 */
+		max_irr = 0;
+	max_isr = apic_find_highest_isr(apic);
+	if (max_isr < 0)	/* nothing in service is published as 0 */
+		max_isr = 0;
+	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);	/* byte 0: TPR, bits 15:12: ISR class, byte 3: IRR vector */
+
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+	kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+	if (!irqchip_in_kernel(vcpu->kvm))	/* ignored without an in-kernel irqchip */
+		return;
+
+	vcpu->arch.apic->vapic_addr = vapic_addr;	/* 0 makes the vapic sync functions return early */
+}
diff -puN /dev/null arch/x86/kvm/lapic.h
--- /dev/null
+++ a/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+	unsigned long base_address;	/* apic_base & MSR_IA32_APICBASE_BASE */
+	struct kvm_io_device dev;
+	struct {
+		atomic_t pending;
+		s64 period;	/* unit: ns */
+		u32 divide_count;
+		ktime_t last_update;
+		struct hrtimer dev;	/* hrtimer backing the apic timer */
+	} timer;
+	struct kvm_vcpu *vcpu;
+	struct page *regs_page;
+	void *regs;
+	gpa_t vapic_addr;	/* guest physical address; 0 = no vapic sync */
+	struct page *vapic_page;	/* page kmap'ed by the vapic sync helpers */
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff -puN /dev/null arch/x86/kvm/mmu.c
--- /dev/null
+++ a/arch/x86/kvm/mmu.c
@@ -0,0 +1,1894 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x) do {	/* one statement: safe in unbraced if/else */	\
+	if (!(x))							\
+		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
+		       __FILE__, __LINE__, #x);				\
+} while (0)
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+/* Bit position at which the table index for 'level' starts (level 1 = pte). */
+#define PT64_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+/* Same helpers for 32-bit page tables: 10 index bits per level. */
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+			| PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+	u64 *shadow_ptes[RMAP_EXT];
+	struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+	shadow_trap_nonpresent_pte = trap_pte;
+	shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+	return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+	return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+	pte &= ~PT_SHADOW_IO_MARK;
+	return pte != shadow_trap_nonpresent_pte
+		&& pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+	return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+	return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+	return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+	return pte != shadow_trap_nonpresent_pte
+		&& pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+	/* Moves the PSE36 field from pte bit 13 up to gfn bit 32 - PAGE_SHIFT. */
+	return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+	set_64bit((unsigned long *)sptep, spte);
+#else
+	set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  struct kmem_cache *base_cache, int min)
+{
+	void *obj;
+
+	if (cache->nobjs >= min)
+		return 0;	/* at least min objects are already cached */
+	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {	/* fill to capacity */
+		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+		if (!obj)
+			return -ENOMEM;	/* objects obtained so far stay cached */
+		cache->objects[cache->nobjs++] = obj;
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+				       int min)
+{
+	struct page *page;
+
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		set_page_private(page, 0);
+		cache->objects[cache->nobjs++] = page_address(page);
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	/* Refill the four per-vcpu caches in order; stop at the first error. */
+	err = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+				     pte_chain_cache, 4);
+	if (err)
+		return err;
+	err = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+				     rmap_desc_cache, 1);
+	if (err)
+		return err;
+	err = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+	if (err)
+		return err;
+
+	return mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+				      mmu_page_header_cache, 4);
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+				    size_t size)
+{
+	void *p;
+
+	BUG_ON(!mc->nobjs);	/* an empty cache here is fatal */
+	p = mc->objects[--mc->nobjs];
+	memset(p, 0, size);	/* objects are handed out zeroed */
+	return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+				      sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+	kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+				      sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+	kfree(rd);
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called.
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If bit zero of *rmapp is zero, then *rmapp points to the shadow page table
+ * entry that points to page_address(page).
+ *
+ * If bit zero of *rmapp is one, then (*rmapp & ~1) points to a struct
+ * kvm_rmap_desc containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_rmap_desc *desc;
+	unsigned long *rmapp;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	gfn = unalias_gfn(vcpu->kvm, gfn);
+	sp = page_header(__pa(spte));
+	sp->gfns[spte - sp->spt] = gfn;
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	if (!*rmapp) {
+		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+		*rmapp = (unsigned long)spte;
+	} else if (!(*rmapp & 1)) {
+		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+		desc = mmu_alloc_rmap_desc(vcpu);
+		desc->shadow_ptes[0] = (u64 *)*rmapp;
+		desc->shadow_ptes[1] = spte;
+		*rmapp = (unsigned long)desc | 1;
+	} else {
+		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+			desc = desc->more;
+		if (desc->shadow_ptes[RMAP_EXT-1]) {
+			desc->more = mmu_alloc_rmap_desc(vcpu);
+			desc = desc->more;
+		}
+		for (i = 0; desc->shadow_ptes[i]; ++i)
+			;
+		desc->shadow_ptes[i] = spte;
+	}
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+				   struct kvm_rmap_desc *desc,
+				   int i,
+				   struct kvm_rmap_desc *prev_desc)
+{
+	int j;
+
+	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+		;	/* j = last used slot at or after i */
+	desc->shadow_ptes[i] = desc->shadow_ptes[j];
+	desc->shadow_ptes[j] = NULL;
+	if (j != 0)
+		return;	/* descriptor still holds entries */
+	if (!prev_desc && !desc->more)
+		*rmapp = (unsigned long)desc->shadow_ptes[0];	/* [0] is NULL now */
+	else
+		if (prev_desc)
+			prev_desc->more = desc->more;
+		else
+			*rmapp = (unsigned long)desc->more | 1;
+	mmu_free_rmap_desc(desc);	/* emptied descriptor is unlinked above */
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+	struct kvm_rmap_desc *desc;
+	struct kvm_rmap_desc *prev_desc;
+	struct kvm_mmu_page *sp;
+	struct page *page;
+	unsigned long *rmapp;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	sp = page_header(__pa(spte));
+	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	mark_page_accessed(page);
+	if (is_writeble_pte(*spte))
+		kvm_release_page_dirty(page);
+	else
+		kvm_release_page_clean(page);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	if (!*rmapp) {
+		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		BUG();
+	} else if (!(*rmapp & 1)) {
+		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+		if ((u64 *)*rmapp != spte) {
+			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
+			       spte, *spte);
+			BUG();
+		}
+		*rmapp = 0;
+	} else {
+		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+		prev_desc = NULL;
+		while (desc) {
+			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+				if (desc->shadow_ptes[i] == spte) {
+					rmap_desc_remove_entry(rmapp,
+							       desc, i,
+							       prev_desc);
+					return;
+				}
+			prev_desc = desc;
+			desc = desc->more;
+		}
+		BUG();
+	}
+}
+
+/* spte == NULL returns the first rmap entry, otherwise the entry after spte. */
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+	struct kvm_rmap_desc *desc;
+	u64 *prev_spte;
+	int i;
+
+	if (!*rmapp)
+		return NULL;
+	else if (!(*rmapp & 1)) {
+		if (!spte)
+			return (u64 *)*rmapp;
+		return NULL;
+	}
+	/* Bit zero set: walk the descriptor chain looking for spte's successor. */
+	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+	prev_spte = NULL;
+	while (desc) {
+		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+			if (prev_spte == spte)
+				return desc->shadow_ptes[i];
+			prev_spte = desc->shadow_ptes[i];
+		}
+		desc = desc->more;
+	}
+	return NULL;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	unsigned long *rmapp;
+	u64 *spte;
+	int write_protected = 0;
+
+	gfn = unalias_gfn(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn);
+	/* Clear the writable bit in every spte found on gfn's rmap. */
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		/* Every spte reachable through the rmap must be present. */
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+		if (is_writeble_pte(*spte)) {
+			set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	if (write_protected)
+		kvm_flush_remote_tlbs(kvm);	/* only when some spte changed */
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+	u64 *pos;
+	u64 *end;
+
+	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+		if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+			       pos, *pos);
+			return 0;
+		}
+	return 1;
+}
+#endif
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	ASSERT(is_empty_shadow_page(sp->spt));
+	list_del(&sp->link);
+	__free_page(virt_to_page(sp->spt));
+	__free_page(virt_to_page(sp->gfns));
+	kfree(sp);
+	++kvm->arch.n_free_mmu_pages;
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+					       u64 *parent_pte)
+{
+	struct kvm_mmu_page *sp;
+
+	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+	ASSERT(is_empty_shadow_page(sp->spt));
+	sp->slot_bitmap = 0;
+	sp->multimapped = 0;
+	sp->parent_pte = parent_pte;
+	--vcpu->kvm->arch.n_free_mmu_pages;
+	return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!parent_pte)
+		return;
+	if (!sp->multimapped) {
+		u64 *old = sp->parent_pte;
+
+		if (!old) {
+			sp->parent_pte = parent_pte;
+			return;
+		}
+		sp->multimapped = 1;
+		pte_chain = mmu_alloc_pte_chain(vcpu);
+		INIT_HLIST_HEAD(&sp->parent_ptes);
+		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+		pte_chain->parent_ptes[0] = old;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+			continue;
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+			if (!pte_chain->parent_ptes[i]) {
+				pte_chain->parent_ptes[i] = parent_pte;
+				return;
+			}
+	}
+	pte_chain = mmu_alloc_pte_chain(vcpu);
+	BUG_ON(!pte_chain);
+	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+	pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+				       u64 *parent_pte)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!sp->multimapped) {
+		BUG_ON(sp->parent_pte != parent_pte);
+		sp->parent_pte = NULL;
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			if (pte_chain->parent_ptes[i] != parent_pte)
+				continue;
+			while (i + 1 < NR_PTE_CHAIN_ENTRIES
+				&& pte_chain->parent_ptes[i + 1]) {
+				pte_chain->parent_ptes[i]
+					= pte_chain->parent_ptes[i + 1];
+				++i;
+			}
+			pte_chain->parent_ptes[i] = NULL;
+			if (i == 0) {
+				hlist_del(&pte_chain->link);
+				mmu_free_pte_chain(pte_chain);
+				if (hlist_empty(&sp->parent_ptes)) {
+					sp->multimapped = 0;
+					sp->parent_pte = NULL;
+				}
+			}
+			return;
+		}
+	BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node;
+
+	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry(sp, node, bucket, hash_link)
+		if (sp->gfn == gfn && !sp->role.metaphysical) {
+			pgprintk("%s: found role %x\n",
+				 __FUNCTION__, sp->role.word);
+			return sp;
+		}
+	return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+					     gfn_t gfn,
+					     gva_t gaddr,
+					     unsigned level,
+					     int metaphysical,
+					     unsigned access,
+					     u64 *parent_pte,
+					     bool *new_page)
+{
+	union kvm_mmu_page_role role;
+	unsigned index;
+	unsigned quadrant;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node;
+
+	role.word = 0;
+	role.glevels = vcpu->arch.mmu.root_level;
+	role.level = level;
+	role.metaphysical = metaphysical;
+	role.access = access;
+	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+		role.quadrant = quadrant;
+	}
+	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+		 gfn, role.word);
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry(sp, node, bucket, hash_link)
+		if (sp->gfn == gfn && sp->role.word == role.word) {
+			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+			pgprintk("%s: found\n", __FUNCTION__);
+			return sp;
+		}
+	++vcpu->kvm->stat.mmu_cache_miss;
+	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+	if (!sp)
+		return sp;
+	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+	sp->gfn = gfn;
+	sp->role = role;
+	hlist_add_head(&sp->hash_link, bucket);
+	vcpu->arch.mmu.prefetch_page(vcpu, sp);
+	if (!metaphysical)
+		rmap_write_protect(vcpu->kvm, gfn);
+	if (new_page)
+		*new_page = 1;
+	return sp;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+					 struct kvm_mmu_page *sp)
+{
+	unsigned i;
+	u64 *pt;
+	u64 ent;
+
+	pt = sp->spt;
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (is_shadow_present_pte(pt[i]))
+				rmap_remove(kvm, &pt[i]);
+			pt[i] = shadow_trap_nonpresent_pte;
+		}
+		kvm_flush_remote_tlbs(kvm);
+		return;
+	}
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		ent = pt[i];
+
+		pt[i] = shadow_trap_nonpresent_pte;
+		if (!is_shadow_present_pte(ent))
+			continue;
+		ent &= PT64_BASE_ADDR_MASK;
+		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+	}
+	kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+	mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i)
+		if (kvm->vcpus[i])
+			kvm->vcpus[i]->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	u64 *parent_pte;
+
+	++kvm->stat.mmu_shadow_zapped;
+	while (sp->multimapped || sp->parent_pte) {
+		if (!sp->multimapped)
+			parent_pte = sp->parent_pte;
+		else {
+			struct kvm_pte_chain *chain;
+
+			chain = container_of(sp->parent_ptes.first,
+					     struct kvm_pte_chain, link);
+			parent_pte = chain->parent_ptes[0];
+		}
+		BUG_ON(!parent_pte);
+		kvm_mmu_put_page(sp, parent_pte);
+		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+	}
+	kvm_mmu_page_unlink_children(kvm, sp);
+	if (!sp->root_count) {
+		hlist_del(&sp->hash_link);
+		kvm_mmu_free_page(kvm, sp);
+	} else
+		list_move(&sp->link, &kvm->arch.active_mmu_pages);
+	kvm_mmu_reset_last_pte_updated(kvm);
+}
+
+/*
+ * Change the number of mmu pages allocated to the vm.
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+	/*
+	 * If the new limit is smaller than the number of pages currently in
+	 * use, we must zap pages from the tail of the active list before we
+	 * change the value.
+	 */
+
+	if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+	    kvm_nr_mmu_pages) {
+		int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
+				       - kvm->arch.n_free_mmu_pages;
+
+		while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+			struct kvm_mmu_page *page;
+
+			page = container_of(kvm->arch.active_mmu_pages.prev,
+					    struct kvm_mmu_page, link);
+			kvm_mmu_zap_page(kvm, page);
+			n_used_mmu_pages--;
+		}
+		kvm->arch.n_free_mmu_pages = 0;
+	}
+	else
+		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+					 - kvm->arch.n_alloc_mmu_pages;
+
+	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node, *n;
+	int r;
+
+	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	r = 0;
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+		if (sp->gfn == gfn && !sp->role.metaphysical) {
+			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+				 sp->role.word);
+			kvm_mmu_zap_page(kvm, sp);
+			r = 1;
+		}
+	return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_mmu_page *sp;
+
+	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
+		pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+		kvm_mmu_zap_page(kvm, sp);
+	}
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+	struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+	__set_bit(slot, &sp->slot_bitmap);
+}
+
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+	if (gpa == UNMAPPED_GVA)
+		return NULL;
+	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+/* Install the shadow pte for gfn/page at *shadow_pte and update the rmap. */
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+			 unsigned pt_access, unsigned pte_access,
+			 int user_fault, int write_fault, int dirty,
+			 int *ptwrite, gfn_t gfn, struct page *page)
+{
+	u64 spte;
+	int was_rmapped = is_rmap_pte(*shadow_pte);
+	int was_writeble = is_writeble_pte(*shadow_pte);
+
+	pgprintk("%s: spte %llx access %x write_fault %d"
+		 " user_fault %d gfn %lx\n",
+		 __FUNCTION__, *shadow_pte, pt_access,
+		 write_fault, user_fault, gfn);
+
+	/*
+	 * We don't set the accessed bit, since we sometimes want to see
+	 * whether the guest actually used the pte (in order to detect
+	 * demand paging).
+	 */
+	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+	if (!dirty)
+		pte_access &= ~ACC_WRITE_MASK;
+	if (!(pte_access & ACC_EXEC_MASK))
+		spte |= PT64_NX_MASK;
+
+	/* PT_PRESENT_MASK is already part of the initial value above. */
+	if (pte_access & ACC_USER_MASK)
+		spte |= PT_USER_MASK;
+
+	if (is_error_page(page)) {
+		set_shadow_pte(shadow_pte,
+			       shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+		kvm_release_page_clean(page);
+		return;
+	}
+
+	spte |= page_to_phys(page);
+
+	if ((pte_access & ACC_WRITE_MASK)
+	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+		struct kvm_mmu_page *shadow;
+
+		spte |= PT_WRITABLE_MASK;
+		if (user_fault) {
+			mmu_unshadow(vcpu->kvm, gfn);
+			goto unshadowed;
+		}
+
+		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+		if (shadow) {
+			pgprintk("%s: found shadow page for %lx, marking ro\n",
+				 __FUNCTION__, gfn);
+			pte_access &= ~ACC_WRITE_MASK;
+			if (is_writeble_pte(spte)) {
+				spte &= ~PT_WRITABLE_MASK;
+				kvm_x86_ops->tlb_flush(vcpu);
+			}
+			if (write_fault && ptwrite)	/* ptwrite may be NULL */
+				*ptwrite = 1;
+		}
+	}
+
+unshadowed:
+	if (pte_access & ACC_WRITE_MASK)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	set_shadow_pte(shadow_pte, spte);
+	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+	if (!was_rmapped) {
+		rmap_add(vcpu, shadow_pte, gfn);
+		if (!is_rmap_pte(*shadow_pte))
+			kvm_release_page_clean(page);
+	} else {
+		if (was_writeble)
+			kvm_release_page_dirty(page);
+		else
+			kvm_release_page_clean(page);
+	}
+	if (!ptwrite || !*ptwrite)
+		vcpu->arch.last_pte_updated = shadow_pte;
+}
+
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+			   gfn_t gfn, struct page *page)
+{
+	int level = PT32E_ROOT_LEVEL;
+	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+	int pt_write = 0;
+
+	for (; ; level--) {
+		u32 index = PT64_INDEX(v, level);
+		u64 *table;
+
+		ASSERT(VALID_PAGE(table_addr));
+		table = __va(table_addr);
+
+		if (level == 1) {
+			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+				     0, write, 1, &pt_write, gfn, page);
+			return pt_write || is_io_pte(table[index]);
+		}
+
+		if (table[index] == shadow_trap_nonpresent_pte) {
+			struct kvm_mmu_page *new_table;
+			gfn_t pseudo_gfn;
+
+			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+				>> PAGE_SHIFT;
+			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+						     v, level - 1,
+						     1, ACC_ALL, &table[index],
+						     NULL);
+			if (!new_table) {
+				pgprintk("nonpaging_map: ENOMEM\n");
+				kvm_release_page_clean(page);
+				return -ENOMEM;
+			}
+
+			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+				| PT_WRITABLE_MASK | PT_USER_MASK;
+		}
+		table_addr = table[index] & PT64_BASE_ADDR_MASK;
+	}
+}
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+	int r;
+
+	struct page *page;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, gfn);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_free_some_pages(vcpu);
+	r = __nonpaging_map(vcpu, v, write, gfn, page);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	up_read(&current->mm->mmap_sem);
+
+	return r;
+}
+
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	spin_lock(&vcpu->kvm->mmu_lock);
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+
+		sp = page_header(root);
+		--sp->root_count;
+		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		return;
+	}
+#endif
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			--sp->root_count;
+		}
+		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+	}
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	gfn_t root_gfn;
+	struct kvm_mmu_page *sp;
+
+	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+
+		ASSERT(!VALID_PAGE(root));
+		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+		root = __pa(sp->spt);
+		++sp->root_count;
+		vcpu->arch.mmu.root_hpa = root;
+		return;
+	}
+#endif
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		ASSERT(!VALID_PAGE(root));
+		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+			if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+				vcpu->arch.mmu.pae_root[i] = 0;
+				continue;
+			}
+			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+		} else if (vcpu->arch.mmu.root_level == 0)
+			root_gfn = 0;
+		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+				      PT32_ROOT_LEVEL, !is_paging(vcpu),
+				      ACC_ALL, NULL, NULL);
+		root = __pa(sp->spt);
+		++sp->root_count;
+		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+	}
+	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+	return vaddr;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+				u32 error_code)
+{
+	gfn_t gfn;
+	int r;
+
+	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
+
+	ASSERT(vcpu);
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	gfn = gva >> PAGE_SHIFT;
+
+	return nonpaging_map(vcpu, gva & PAGE_MASK,
+			     error_code & PFERR_WRITE_MASK, gfn);
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+	mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+
+	context->new_cr3 = nonpaging_new_cr3;
+	context->page_fault = nonpaging_page_fault;
+	context->gva_to_gpa = nonpaging_gva_to_gpa;
+	context->free = nonpaging_free;
+	context->prefetch_page = nonpaging_prefetch_page;
+	context->root_level = 0;
+	context->shadow_root_level = PT32E_ROOT_LEVEL;
+	context->root_hpa = INVALID_PAGE;
+	return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.tlb_flush;
+	kvm_x86_ops->tlb_flush(vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+	mmu_free_roots(vcpu);	/* drop the references held on the current roots */
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+			      u64 addr,
+			      u32 err_code)
+{
+	kvm_inject_page_fault(vcpu, addr, err_code);	/* thin forwarding wrapper */
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+	nonpaging_free(vcpu);	/* paging and non-paging contexts share one teardown */
+}
+
+#define PTTYPE 64
+#include "paging_tmpl.h"	/* instantiates the paging64_*() functions */
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"	/* instantiates the paging32_*() functions */
+#undef PTTYPE
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+	ASSERT(is_pae(vcpu));
+	mmu->root_hpa = INVALID_PAGE;
+	mmu->root_level = level;	/* guest and shadow depths are equal here */
+	mmu->shadow_root_level = level;
+	mmu->new_cr3 = paging_new_cr3;
+	mmu->page_fault = paging64_page_fault;
+	mmu->gva_to_gpa = paging64_gva_to_gpa;
+	mmu->free = paging_free;
+	mmu->prefetch_page = paging64_prefetch_page;
+	return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);	/* chosen by init_kvm_mmu() for long mode */
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+	mmu->root_hpa = INVALID_PAGE;
+	mmu->root_level = PT32_ROOT_LEVEL;
+	mmu->shadow_root_level = PT32E_ROOT_LEVEL;	/* shadow level differs from guest level */
+	mmu->new_cr3 = paging_new_cr3;
+	mmu->page_fault = paging32_page_fault;
+	mmu->gva_to_gpa = paging32_gva_to_gpa;
+	mmu->prefetch_page = paging32_prefetch_page;
+	mmu->free = paging_free;
+	return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);	/* PAE shares the paging64_*() callbacks */
+}
+
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	if (!is_paging(vcpu))		/* CR0.PG clear */
+		return nonpaging_init_context(vcpu);
+	if (is_long_mode(vcpu))
+		return paging64_init_context(vcpu);
+	if (is_pae(vcpu))
+		return paging32E_init_context(vcpu);
+	/* paging on, but neither long mode nor PAE: plain 32-bit tables */
+	return paging32_init_context(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;		/* no root loaded, nothing to release */
+	vcpu->arch.mmu.free(vcpu);
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+	destroy_kvm_mmu(vcpu);		/* release the current context's root, if any */
+	return init_kvm_mmu(vcpu);	/* re-select the callbacks for the current paging mode */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+	int err = mmu_topup_memory_caches(vcpu);
+
+	if (err)
+		return err;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_free_some_pages(vcpu);
+	mmu_alloc_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	/* hand the freshly built root to the backend, then flush the TLB */
+	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	kvm_mmu_flush_tlb(vcpu);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+	mmu_free_roots(vcpu);	/* counterpart of kvm_mmu_load(): drops the shadow roots */
+}
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp,
+				  u64 *spte)
+{
+	u64 pte;
+	struct kvm_mmu_page *child;
+
+	pte = *spte;
+	if (is_shadow_present_pte(pte)) {
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+			rmap_remove(vcpu->kvm, spte);	/* last level: drop the reverse mapping */
+		else {
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, spte);	/* upper level: unlink from the child page */
+		}
+	}
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);	/* done whether or not the old entry was present */
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp,
+				  u64 *spte,
+				  const void *new)
+{
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+		++vcpu->kvm->stat.mmu_pde_zapped;
+		return;		/* only last-level entries are refilled; others stay zapped */
+	}
+
+	++vcpu->kvm->stat.mmu_pte_updated;
+	if (sp->role.glevels == PT32_ROOT_LEVEL)	/* pick the updater matching the guest pte size */
+		paging32_update_pte(vcpu, sp, spte, new);
+	else
+		paging64_update_pte(vcpu, sp, spte, new);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+	u64 revoked;
+
+	if (!is_shadow_present_pte(old))
+		return false;
+	if (!is_shadow_present_pte(new) || ((old ^ new) & PT64_BASE_ADDR_MASK))
+		return true;
+	/* flip NX on both sides so that a set bit always means "permitted" */
+	revoked = (old ^ PT64_NX_MASK) & ~(new ^ PT64_NX_MASK);
+	return (revoked & PT64_PERM_MASK) != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+{
+	if (need_remote_flush(old, new))
+		kvm_flush_remote_tlbs(vcpu->kvm);	/* VM-wide flush */
+	else
+		kvm_mmu_flush_tlb(vcpu);		/* this vcpu only */
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+	u64 *last = vcpu->arch.last_pte_updated;	/* NULL when nothing is being tracked */
+
+	return last != NULL && (*last & PT_ACCESSED_MASK) != 0;
+}
+
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+					  const u8 *new, int bytes)
+{
+	gfn_t gfn;
+	int r;
+	u64 gpte = 0;
+
+	if (bytes != 4 && bytes != 8)
+		return;		/* only 4- and 8-byte writes are considered */
+
+	/*
+	 * Assume that the pte write is to a page table of the same type
+	 * as the current vcpu paging mode.  This is nearly always true
+	 * (might be false while changing modes).  Note it is verified later
+	 * by update_pte().
+	 */
+	if (is_pae(vcpu)) {
+		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+		if ((bytes == 4) && (gpa % 4 == 0)) {
+			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+			if (r)
+				return;
+			memcpy((void *)&gpte + (gpa % 8), new, 4);	/* patch in the half being written */
+		} else if ((bytes == 8) && (gpa % 8 == 0)) {
+			memcpy((void *)&gpte, new, 8);
+		}
+	} else {
+		if ((bytes == 4) && (gpa % 4 == 0))
+			memcpy((void *)&gpte, new, 4);
+	}
+	if (!is_present_pte(gpte))	/* gpte is still 0 if no case above matched */
+		return;
+	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+	vcpu->arch.update_pte.gfn = gfn;
+	vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);	/* read by update_pte(); released in kvm_mmu_pte_write() */
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+		       const u8 *new, int bytes)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node, *n;
+	struct hlist_head *bucket;
+	unsigned index;
+	u64 entry, gentry;
+	u64 *spte;
+	unsigned offset = offset_in_page(gpa);
+	unsigned pte_size;
+	unsigned page_offset;
+	unsigned misaligned;
+	unsigned quadrant;
+	int level;
+	int flooded = 0;
+	int npte;
+	int r;
+
+	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);	/* may set update_pte.page; released at the end */
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_free_some_pages(vcpu);
+	++vcpu->kvm->stat.mmu_pte_write;
+	kvm_mmu_audit(vcpu, "pre pte write");
+	if (gfn == vcpu->arch.last_pt_write_gfn
+	    && !last_updated_pte_accessed(vcpu)) {
+		++vcpu->arch.last_pt_write_count;
+		if (vcpu->arch.last_pt_write_count >= 3)	/* third write in a row to this gfn */
+			flooded = 1;
+	} else {
+		vcpu->arch.last_pt_write_gfn = gfn;
+		vcpu->arch.last_pt_write_count = 1;
+		vcpu->arch.last_pte_updated = NULL;
+	}
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {	/* _safe: sp may be zapped below */
+		if (sp->gfn != gfn || sp->role.metaphysical)
+			continue;
+		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);	/* first and last byte in different ptes */
+		misaligned |= bytes < 4;
+		if (misaligned || flooded) {
+			/*
+			 * Misaligned accesses are too much trouble to fix
+			 * up; also, they usually indicate a page is not used
+			 * as a page table.
+			 *
+			 * If we're seeing too many writes to a page,
+			 * it may no longer be a page table, or we may be
+			 * forking, in which case it is better to unmap the
+			 * page.
+			 */
+			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+				 gpa, bytes, sp->role.word);
+			kvm_mmu_zap_page(vcpu->kvm, sp);
+			++vcpu->kvm->stat.mmu_flooded;	/* counted for misaligned writes as well */
+			continue;
+		}
+		page_offset = offset;
+		level = sp->role.level;
+		npte = 1;
+		if (sp->role.glevels == PT32_ROOT_LEVEL) {
+			page_offset <<= 1;	/* 32->64 */
+			/*
+			 * A 32-bit pde maps 4MB while the shadow pdes map
+			 * only 2MB.  So we need to double the offset again
+			 * and zap two pdes instead of one.
+			 */
+			if (level == PT32_ROOT_LEVEL) {
+				page_offset &= ~7; /* kill rounding error */
+				page_offset <<= 1;
+				npte = 2;
+			}
+			quadrant = page_offset >> PAGE_SHIFT;
+			page_offset &= ~PAGE_MASK;
+			if (quadrant != sp->role.quadrant)	/* this shadow page covers another part of the guest table */
+				continue;
+		}
+		spte = &sp->spt[page_offset / sizeof(*spte)];
+		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {	/* partial pte write: fetch the whole guest pte */
+			gentry = 0;
+			r = kvm_read_guest_atomic(vcpu->kvm,
+						  gpa & ~(pte_size - 1),
+						  &gentry, pte_size);
+			new = (const void *)&gentry;	/* NOTE(review): 'new' stays redirected (or NULL) for later pages in this bucket */
+			if (r < 0)
+				new = NULL;
+		}
+		while (npte--) {
+			entry = *spte;	/* old value, needed for the flush decision */
+			mmu_pte_write_zap_pte(vcpu, sp, spte);
+			if (new)
+				mmu_pte_write_new_pte(vcpu, sp, spte, new);
+			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+			++spte;
+		}
+	}
+	kvm_mmu_audit(vcpu, "post pte write");
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (vcpu->arch.update_pte.page) {
+		kvm_release_page_clean(vcpu->arch.update_pte.page);
+		vcpu->arch.update_pte.page = NULL;
+	}
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	gpa_t gpa;
+	int r;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);	/* NOTE(review): result is not checked for UNMAPPED_GVA */
+	up_read(&current->mm->mmap_sem);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	return r;
+}
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+		struct kvm_mmu_page *sp;
+
+		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,	/* tail of the active list */
+				  struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		++vcpu->kvm->stat.mmu_recycled;
+	}
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+	int r;
+	enum emulation_result er;
+
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+	if (r < 0)
+		goto out;	/* error from the mode-specific handler */
+
+	if (!r) {
+		r = 1;		/* handler asked for no emulation: return 1 to the caller */
+		goto out;
+	}
+
+	r = mmu_topup_memory_caches(vcpu);	/* r > 0 here: the instruction must be emulated */
+	if (r)
+		goto out;
+
+	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+
+	switch (er) {
+	case EMULATE_DONE:
+		return 1;
+	case EMULATE_DO_MMIO:
+		++vcpu->stat.mmio_exits;
+		return 0;
+	case EMULATE_FAIL:
+		kvm_report_emulation_failure(vcpu, "pagetable");
+		return 1;
+	default:
+		BUG();
+	}
+out:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+
+	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {	/* note: this list hangs off the VM, not the vcpu */
+		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+				  struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu->kvm, sp);	/* presumably unlinks sp, otherwise this loop would not end */
+	}
+	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+	int i;
+
+	ASSERT(vcpu);
+
+	if (vcpu->kvm->arch.n_requested_mmu_pages)	/* an explicit request overrides the computed default */
+		vcpu->kvm->arch.n_free_mmu_pages =
+					vcpu->kvm->arch.n_requested_mmu_pages;
+	else
+		vcpu->kvm->arch.n_free_mmu_pages =
+					vcpu->kvm->arch.n_alloc_mmu_pages;
+	/*
+	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+	 * Therefore we need to allocate shadow page tables in the first
+	 * 4GB of memory, which happens to fit the DMA32 zone.
+	 */
+	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+	if (!page)
+		goto error_1;
+	vcpu->arch.mmu.pae_root = page_address(page);
+	for (i = 0; i < 4; ++i)		/* the four PAE root slots start out invalid */
+		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+	return 0;
+
+error_1:
+	free_mmu_pages(vcpu);	/* NOTE(review): this also zaps every page on the VM-wide active list */
+	return -ENOMEM;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	return alloc_mmu_pages(vcpu);	/* 0 on success, -ENOMEM if the pae_root page cannot be allocated */
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	return init_kvm_mmu(vcpu);	/* installs the callbacks matching the current paging mode */
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+
+	destroy_kvm_mmu(vcpu);		/* release the loaded root, if any */
+	free_mmu_pages(vcpu);		/* zap the shadow pages and free pae_root */
+	mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+		int i;
+		u64 *pt;
+
+		if (!test_bit(slot, &sp->slot_bitmap))	/* this shadow page has nothing from the slot */
+			continue;
+
+		pt = sp->spt;
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+			/* avoid RMW: store only to entries that have the writable bit set */
+			if (pt[i] & PT_WRITABLE_MASK)
+				pt[i] &= ~PT_WRITABLE_MASK;
+	}
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+
+	spin_lock(&kvm->mmu_lock);
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)	/* _safe: entries are zapped while iterating */
+		kvm_mmu_zap_page(kvm, sp);
+	spin_unlock(&kvm->mmu_lock);
+
+	kvm_flush_remote_tlbs(kvm);	/* issued after mmu_lock has been dropped */
+}
+
+void kvm_mmu_module_exit(void)
+{
+	if (pte_chain_cache)		/* NULL checks let kvm_mmu_module_init() call this after a partial setup */
+		kmem_cache_destroy(pte_chain_cache);
+	if (rmap_desc_cache)
+		kmem_cache_destroy(rmap_desc_cache);
+	if (mmu_page_header_cache)
+		kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+					    sizeof(struct kvm_pte_chain),
+					    0, 0, NULL);
+	if (!pte_chain_cache)
+		goto nomem;
+	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+					    sizeof(struct kvm_rmap_desc),
+					    0, 0, NULL);
+	if (!rmap_desc_cache)
+		goto nomem;
+
+	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+						  sizeof(struct kvm_mmu_page),
+						  0, 0, NULL);
+	if (!mmu_page_header_cache)
+		goto nomem;
+
+	return 0;
+
+nomem:
+	kvm_mmu_module_exit();	/* destroys whichever caches were created before the failure */
+	return -ENOMEM;
+}
+
+/*
+ * Calculate mmu pages needed for kvm.
+ */
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+	unsigned int total_guest_pages = 0;
+	unsigned int budget;
+	int slot;
+
+	for (slot = 0; slot < kvm->nmemslots; slot++)
+		total_guest_pages += kvm->memslots[slot].npages;
+
+	budget = total_guest_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+	if (budget < (unsigned int) KVM_MIN_ALLOC_MMU_PAGES)	/* enforce the floor */
+		budget = (unsigned int) KVM_MIN_ALLOC_MMU_PAGES;
+
+	return budget;
+}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+	gva = (long long)(gva << 16) >> 16;	/* replicate bit 47 into bits 48-63 */
+#endif
+	return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));	/* address span of one entry at this level */
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 ent = pt[i];
+
+		if (ent == shadow_trap_nonpresent_pte)
+			continue;
+
+		va = canonicalize(va);
+		if (level > 1) {
+			if (ent == shadow_notrap_nonpresent_pte)
+				printk(KERN_ERR "audit: (%s) nontrapping pte"
+				       " in nonleaf level: levels %d gva %lx"
+				       " level %d pte %llx\n", audit_msg,
+				       vcpu->arch.mmu.root_level, va, level, ent);
+
+			audit_mappings_page(vcpu, ent, va, level - 1);	/* NOTE(review): also reached for the entry just reported above */
+		} else {
+			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+			struct page *page = gpa_to_page(vcpu, gpa);
+			hpa_t hpa = page_to_phys(page);
+
+			if (is_shadow_present_pte(ent)
+			    && (ent & PT64_BASE_ADDR_MASK) != hpa)	/* shadow maps a frame the guest tables do not resolve to */
+				printk(KERN_ERR "xx audit error: (%s) levels %d"
+				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+				       audit_msg, vcpu->arch.mmu.root_level,
+				       va, gpa, hpa, ent,
+				       is_shadow_present_pte(ent));
+			else if (ent == shadow_notrap_nonpresent_pte
+				 && !is_error_hpa(hpa))
+				printk(KERN_ERR "audit: (%s) notrap shadow,"
+				       " valid guest gva %lx\n", audit_msg, va);
+			kvm_release_page_clean(page);
+
+		}
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);	/* single 4-level root */
+	else
+		for (i = 0; i < 4; ++i)		/* four pae_root slots, 1GB (i << 30) apart */
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+	int nmaps = 0;
+	int i, j, k;
+
+	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+		struct kvm_rmap_desc *d;
+
+		for (j = 0; j < m->npages; ++j) {
+			unsigned long *rmapp = &m->rmap[j];
+
+			if (!*rmapp)
+				continue;
+			if (!(*rmapp & 1)) {	/* low bit clear: counted as a single mapping */
+				++nmaps;
+				continue;
+			}
+			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);	/* low bit set: value is a tagged descriptor pointer */
+			while (d) {
+				for (k = 0; k < RMAP_EXT; ++k)
+					if (d->shadow_ptes[k])
+						++nmaps;
+					else
+						break;	/* first empty slot ends this descriptor */
+				d = d->more;
+			}
+		}
+	}
+	return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+	int nmaps = 0;
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)	/* only last-level shadow pages are counted */
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			u64 ent = pt[i];
+
+			if (!(ent & PT_PRESENT_MASK))
+				continue;
+			if (!(ent & PT_WRITABLE_MASK))
+				continue;
+			++nmaps;	/* present and writable */
+		}
+	}
+	return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	int n_rmap = count_rmaps(vcpu);
+	int n_actual = count_writable_mappings(vcpu);
+
+	if (n_rmap != n_actual)		/* the two independent counts are expected to agree */
+		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+		       __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	gfn_t gfn;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.metaphysical)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);	/* NOTE(review): not checked for NULL - confirm it cannot fail here */
+		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[gfn - slot->base_gfn];
+		if (*rmapp)	/* any rmap entry for a shadowed guest page table is reported */
+			printk(KERN_ERR "%s: (%s) shadow page has writable"
+			       " mappings: gfn %lx role %x\n",
+			       __FUNCTION__, audit_msg, sp->gfn,
+			       sp->role.word);
+	}
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+	int olddbg = dbg;
+
+	dbg = 0;		/* cleared for the duration of the audit, restored below */
+	audit_msg = msg;	/* tag included in every audit printk */
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	audit_mappings(vcpu);
+	dbg = olddbg;
+}
+
+#endif
diff -puN /dev/null arch/x86/kvm/mmu.h
--- /dev/null
+++ a/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))	/* inline fast path: usually nothing to do */
+		__kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(vcpu->arch.mmu.root_hpa == INVALID_PAGE))
+		return kvm_mmu_load(vcpu);
+
+	return 0;	/* common case: a root is already loaded */
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+	return vcpu->arch.shadow_efer & EFER_LMA;	/* LMA = long mode active; LME only means "enabled" */
+#else
+	return 0;	/* without CONFIG_X86_64 the guest is never treated as being in long mode */
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr4 & X86_CR4_PAE;	/* non-zero (not necessarily 1) when CR4.PAE is set */
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr4 & X86_CR4_PSE;	/* non-zero (not necessarily 1) when CR4.PSE is set */
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr0 & X86_CR0_PG;	/* non-zero (not necessarily 1) when CR0.PG is set */
+}
+
+#endif
diff -puN /dev/null arch/x86/kvm/paging_tmpl.h
--- /dev/null
+++ a/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,481 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+	#define pt_element_t u64
+	#define guest_walker guest_walker64
+	#define FNAME(name) paging##64_##name	/* e.g. FNAME(page_fault) -> paging64_page_fault */
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#ifdef CONFIG_X86_64
+	#define PT_MAX_FULL_LEVELS 4
+	#define CMPXCHG cmpxchg
+	#else
+	#define CMPXCHG cmpxchg64	/* 64-bit ptes need the 64-bit variant when CONFIG_X86_64 is unset */
+	#define PT_MAX_FULL_LEVELS 2
+	#endif
+#elif PTTYPE == 32
+	#define pt_element_t u32
+	#define guest_walker guest_walker32
+	#define FNAME(name) paging##32_##name	/* e.g. FNAME(page_fault) -> paging32_page_fault */
+	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)	/* shadow indexing uses the 64-bit layout even here */
+	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+	#define PT_LEVEL_BITS PT32_LEVEL_BITS
+	#define PT_MAX_FULL_LEVELS 2
+	#define CMPXCHG cmpxchg
+#else
+	#error Invalid PTTYPE value
+#endif
+
+#define gpte_to_gfn FNAME(gpte_to_gfn)	/* per-PTTYPE names, so both instantiations can coexist */
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+	int level;				/* level at which walk_addr stopped */
+	gfn_t table_gfn[PT_MAX_FULL_LEVELS];	/* per level (index level - 1): gfn of the table read */
+	pt_element_t ptes[PT_MAX_FULL_LEVELS];	/* per level: the guest pte read */
+	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];	/* per level: guest-physical address of that pte */
+	unsigned pt_access;			/* access bits accumulated over the levels above the final pte */
+	unsigned pte_access;			/* pt_access further restricted by the final pte */
+	gfn_t gfn;				/* guest frame the address resolves to */
+	u32 error_code;				/* set by walk_addr when it returns 0 */
+};
+
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+{
+	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;	/* frame number from the pte's base-address field */
+}
+
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+{
+	return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;	/* as gpte_to_gfn(), with the directory-entry mask */
+}
+
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+			 gfn_t table_gfn, unsigned index,
+			 pt_element_t orig_pte, pt_element_t new_pte)
+{
+	pt_element_t ret;
+	pt_element_t *table;
+	struct page *page;
+
+	page = gfn_to_page(kvm, table_gfn);
+	table = kmap_atomic(page, KM_USER0);
+
+	ret = CMPXCHG(&table[index], orig_pte, new_pte);	/* ret is the value found in the guest table */
+
+	kunmap_atomic(table, KM_USER0);
+
+	kvm_release_page_dirty(page);
+
+	return (ret != orig_pte);	/* true means the guest pte had changed and nothing was written */
+}
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+	unsigned access;
+
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;	/* exec allowed unless NX removes it below */
+#if PTTYPE == 64
+	if (is_nx(vcpu))
+		access &= ~(gpte >> PT64_NX_SHIFT);	/* presumably NX lands on the ACC_EXEC_MASK bit after the shift */
+#endif
+	return access;
+}
+
+/*
+ * Walk the guest page tables for @addr: 1 on success, 0 with walker->error_code set.
+ */
+static int FNAME(walk_addr)(struct guest_walker *walker,
+			    struct kvm_vcpu *vcpu, gva_t addr,
+			    int write_fault, int user_fault, int fetch_fault)
+{
+	pt_element_t pte;
+	gfn_t table_gfn;
+	unsigned index, pt_access, pte_access;
+	gpa_t pte_gpa;
+
+	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
+	walker->level = vcpu->arch.mmu.root_level;
+	pte = vcpu->arch.cr3;
+#if PTTYPE == 64
+	if (!is_long_mode(vcpu)) {
+		pte = vcpu->arch.pdptrs[(addr >> 30) & 3];	/* PAE: top level comes from the cached pdptrs */
+		if (!is_present_pte(pte))
+			goto not_present;
+		--walker->level;
+	}
+#endif
+	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+
+	pt_access = ACC_ALL;
+
+	for (;;) {
+		index = PT_INDEX(addr, walker->level);
+
+		table_gfn = gpte_to_gfn(pte);
+		pte_gpa = gfn_to_gpa(table_gfn);
+		pte_gpa += index * sizeof(pt_element_t);
+		walker->table_gfn[walker->level - 1] = table_gfn;
+		walker->pte_gpa[walker->level - 1] = pte_gpa;
+		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+			 walker->level - 1, table_gfn);
+
+		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+			goto not_present;	/* unreadable table: do not reuse the stale parent pte */
+		if (!is_present_pte(pte))
+			goto not_present;
+
+		if (write_fault && !is_writeble_pte(pte))
+			if (user_fault || is_write_protection(vcpu))
+				goto access_error;
+
+		if (user_fault && !(pte & PT_USER_MASK))
+			goto access_error;
+
+#if PTTYPE == 64
+		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+			goto access_error;
+#endif
+
+		if (!(pte & PT_ACCESSED_MASK)) {
+			mark_page_dirty(vcpu->kvm, table_gfn);
+			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+			    index, pte, pte|PT_ACCESSED_MASK))
+				goto walk;	/* guest changed the pte under us: restart the walk */
+			pte |= PT_ACCESSED_MASK;
+		}
+
+		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+		walker->ptes[walker->level - 1] = pte;
+
+		if (walker->level == PT_PAGE_TABLE_LEVEL) {
+			walker->gfn = gpte_to_gfn(pte);
+			break;
+		}
+
+		if (walker->level == PT_DIRECTORY_LEVEL
+		    && (pte & PT_PAGE_SIZE_MASK)
+		    && (PTTYPE == 64 || is_pse(vcpu))) {	/* large page: the walk ends at the directory */
+			walker->gfn = gpte_to_gfn_pde(pte);
+			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+			if (PTTYPE == 32 && is_cpuid_PSE36())
+				walker->gfn += pse36_gfn_delta(pte);
+			break;
+		}
+
+		pt_access = pte_access;
+		--walker->level;
+	}
+
+	if (write_fault && !is_dirty_pte(pte)) {
+		bool ret;
+
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+			    pte|PT_DIRTY_MASK);
+		if (ret)
+			goto walk;
+		pte |= PT_DIRTY_MASK;
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));	/* tell the shadow mmu about our own pte write */
+		walker->ptes[walker->level - 1] = pte;
+	}
+
+	walker->pt_access = pt_access;
+	walker->pte_access = pte_access;
+	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+		 __FUNCTION__, (u64)pte, pte_access, pt_access);
+	return 1;
+
+not_present:
+	walker->error_code = 0;
+	goto err;
+
+access_error:
+	walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+	if (write_fault)
+		walker->error_code |= PFERR_WRITE_MASK;
+	if (user_fault)
+		walker->error_code |= PFERR_USER_MASK;
+	if (fetch_fault)
+		walker->error_code |= PFERR_FETCH_MASK;
+	return 0;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+			      u64 *spte, const void *pte)
+{
+	pt_element_t gpte;
+	unsigned pte_access;
+	struct page *npage;
+
+	gpte = *(const pt_element_t *)pte;
+	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {	/* present or accessed bit missing */
+		if (!is_present_pte(gpte))
+			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+		return;
+	}
+	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)	/* the guess made in mmu_guess_page_from_pte_write() was wrong */
+		return;
+	npage = vcpu->arch.update_pte.page;
+	if (!npage)
+		return;
+	get_page(npage);	/* extra reference, taken just before npage is passed to mmu_set_spte() */
+	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+			 struct guest_walker *walker,
+			 int user_fault, int write_fault, int *ptwrite,
+			 struct page *page)
+{
+	hpa_t shadow_addr;
+	int level;
+	u64 *shadow_ent;
+	unsigned access = walker->pt_access;
+
+	if (!is_present_pte(walker->ptes[walker->level - 1]))
+		return NULL;	/* NOTE(review): unlike the NULL return below, @page is not released here */
+
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];	/* start below the four-entry PAE root */
+		shadow_addr &= PT64_BASE_ADDR_MASK;
+		--level;
+	}
+
+	for (; ; level--) {
+		u32 index = SHADOW_PT_INDEX(addr, level);
+		struct kvm_mmu_page *shadow_page;
+		u64 shadow_pte;
+		int metaphysical;
+		gfn_t table_gfn;
+		bool new_page = 0;
+
+		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+		if (level == PT_PAGE_TABLE_LEVEL)
+			break;	/* reached the last-level slot; it is filled after the loop */
+		if (is_shadow_present_pte(*shadow_ent)) {
+			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+			continue;
+		}
+
+		if (level - 1 == PT_PAGE_TABLE_LEVEL
+		    && walker->level == PT_DIRECTORY_LEVEL) {	/* guest walk ended at a directory entry (large page) */
+			metaphysical = 1;
+			if (!is_dirty_pte(walker->ptes[level - 1]))
+				access &= ~ACC_WRITE_MASK;
+			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
+		} else {
+			metaphysical = 0;
+			table_gfn = walker->table_gfn[level - 2];
+		}
+		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+					       metaphysical, access,
+					       shadow_ent, &new_page);
+		if (new_page && !metaphysical) {
+			int r;
+			pt_element_t curr_pte;
+			r = kvm_read_guest_atomic(vcpu->kvm,
+						  walker->pte_gpa[level - 2],
+						  &curr_pte, sizeof(curr_pte));
+			if (r || curr_pte != walker->ptes[level - 2]) {	/* guest pte changed since walk_addr read it */
+				kvm_release_page_clean(page);
+				return NULL;
+			}
+		}
+		shadow_addr = __pa(shadow_page->spt);
+		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+			| PT_WRITABLE_MASK | PT_USER_MASK;
+		*shadow_ent = shadow_pte;
+	}
+
+	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+		     user_fault, write_fault,
+		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+		     ptwrite, walker->gfn, page);
+
+	return shadow_ent;
+}
+
+/*
+ * Page fault handler.  There are several causes for a page fault:
+ *   - there is no shadow pte for the guest pte
+ *   - write access through a shadow pte marked read only so that we can set
+ *     the dirty bit
+ *   - write access to a shadow pte marked read only so we can update the page
+ *     dirty bitmap, when userspace requests it
+ *   - mmio access; in this case we will never install a present shadow pte
+ *   - normal guest page fault due to the guest pte marked not present, not
+ *     writable, or not executable
+ *
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ *           a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+			       u32 error_code)
+{
+	int write_fault = error_code & PFERR_WRITE_MASK;
+	int user_fault = error_code & PFERR_USER_MASK;
+	int fetch_fault = error_code & PFERR_FETCH_MASK;
+	struct guest_walker walker;
+	u64 *shadow_pte;
+	int write_pt = 0;
+	int r;
+	struct page *page;
+
+	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+	kvm_mmu_audit(vcpu, "pre page fault");
+
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
+
+	down_read(&current->mm->mmap_sem);
+	/*
+	 * Walk the guest page tables for the faulting address.
+	 */
+	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+			     fetch_fault);
+
+	/*
+	 * The page is not mapped by the guest.  Let the guest handle it.
+	 */
+	if (!r) {
+		pgprintk("%s: guest page fault\n", __FUNCTION__);
+		inject_page_fault(vcpu, addr, walker.error_code);
+		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+		up_read(&current->mm->mmap_sem);
+		return 0;
+	}
+
+	page = gfn_to_page(vcpu->kvm, walker.gfn);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_free_some_pages(vcpu);
+	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+				  &write_pt, page);
+	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+		 shadow_pte, shadow_pte ? *shadow_pte : (u64)0, write_pt);	/* fetch() may return NULL */
+
+	if (!write_pt)
+		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+	/*
+	 * mmio: emulate if accessible, otherwise it's a guest fault.
+	 */
+	if (shadow_pte && is_io_pte(*shadow_pte)) {
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		up_read(&current->mm->mmap_sem);
+		return 1;
+	}
+
+	++vcpu->stat.pf_fixed;
+	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
+
+	return write_pt;	/* set via mmu_set_spte(); non-zero asks the caller to emulate */
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+	struct guest_walker walker;
+	gpa_t page_base;
+
+	/*
+	 * Plain lookup: the walk is done as a non-write, non-user,
+	 * non-fetch access.
+	 */
+	if (!FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0))
+		return UNMAPPED_GVA;
+
+	page_base = gfn_to_gpa(walker.gfn);
+	return page_base | (vaddr & ~PAGE_MASK);
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu_page *sp)
+{
+	int i, offset = 0, r = 0;
+	pt_element_t pt;
+
+	if (sp->role.metaphysical
+	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+		nonpaging_prefetch_page(vcpu, sp);	/* these cases fall back to the non-paging prefetch */
+		return;
+	}
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;	/* first guest entry covered by this quadrant */
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+		pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+					  sizeof(pt_element_t));
+		if (r || is_present_pte(pt))	/* unreadable or present: use the trapping value */
+			sp->spt[i] = shadow_trap_nonpresent_pte;
+		else
+			sp->spt[i] = shadow_notrap_nonpresent_pte;
+	}
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_pde
+#undef CMPXCHG
diff -puN /dev/null arch/x86/kvm/segment_descriptor.h
--- /dev/null
+++ a/arch/x86/kvm/segment_descriptor.h
@@ -0,0 +1,29 @@
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+
+struct segment_descriptor {	/* packed, 8 bytes in total */
+	u16 limit_low;		/* low 16 bits of the 20-bit limit */
+	u16 base_low;		/* low 16 bits of the 32-bit base */
+	u8  base_mid;		/* next 8 bits of the base */
+	u8  type : 4;
+	u8  system : 1;
+	u8  dpl : 2;
+	u8  present : 1;
+	u8  limit_high : 4;	/* top 4 bits of the limit */
+	u8  avl : 1;
+	u8  long_mode : 1;
+	u8  default_op : 1;
+	u8  granularity : 1;
+	u8  base_high;		/* top 8 bits of the base */
+} __attribute__((packed));
+
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+	struct segment_descriptor s;	/* the 8-byte descriptor comes first */
+	u32 base_higher;	/* presumably base bits 63:32 -- confirm */
+	u32 pad_zero;
+};
+
+#endif
+#endif
diff -puN /dev/null arch/x86/kvm/svm.c
--- /dev/null
+++ a/arch/x86/kvm/svm.c
@@ -0,0 +1,1731 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+
+#include "kvm_svm.h"
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+
+#include <asm/desc.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#define IOPM_ALLOC_ORDER 2	/* page order of the I/O permission map */
+#define MSRPM_ALLOC_ORDER 1	/* page order of the MSR permission map */
+/* Exception vector numbers used in this file. */
+#define DB_VECTOR 1
+#define UD_VECTOR 6
+#define GP_VECTOR 13
+/* Debug register bits tested and set by svm_set_dr(). */
+#define DR7_GD_MASK (1 << 13)
+#define DR6_BD_MASK (1 << 13)
+/* Type values handed to init_sys_seg(). */
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+/* Bits of svm_features, which is loaded from cpuid_edx(SVM_CPUID_FUNC). */
+#define SVM_FEATURE_NPT  (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_DEATURE_SVML (1 << 2)	/* NOTE(review): likely meant SVM_FEATURE_SVML */
+
+static void kvm_reput_irq(struct vcpu_svm *svm);
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{	/* vcpu is the member named "vcpu" of a struct vcpu_svm */
+	return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+unsigned long iopm_base;
+unsigned long msrpm_base;
+
+struct kvm_ldttss_desc {	/* packed, 16 bytes; used for the GDT TSS slot */
+	u16 limit0;
+	u16 base0;
+	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+	u32 base3;	/* base0..base3: successive pieces of the base */
+	u32 zero1;
+} __attribute__((packed));
+
+struct svm_cpu_data {
+	int cpu;			/* cpu this structure was set up for */
+
+	u64 asid_generation;		/* bumped by new_asid() on wrap */
+	u32 max_asid;			/* cpuid_ebx(SVM_CPUID_FUNC) - 1 */
+	u32 next_asid;			/* next value new_asid() hands out */
+	struct kvm_ldttss_desc *tss_desc;	/* GDT_ENTRY_TSS slot of the GDT */
+
+	struct page *save_area;		/* page written to MSR_VM_HSAVE_PA */
+};
+
+static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+static uint32_t svm_features;
+
+struct svm_init_data {
+	int cpu;
+	int r;
+};
+
+static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+#define MAX_INST_SIZE 15
+
+static inline u32 svm_has(u32 feat)
+{	/* non-zero when any bit of feat is set in svm_features */
+	return svm_features & feat;
+}
+
+static inline u8 pop_irq(struct kvm_vcpu *vcpu)
+{
+	int word_index = __ffs(vcpu->arch.irq_summary);	/* first non-empty word */
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+	int irq = word_index * BITS_PER_LONG + bit_index;
+	/* Remove the irq; drop the summary bit once its word is empty. */
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);
+	return irq;
+}
+
+static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
+{
+	set_bit(irq, vcpu->arch.irq_pending);	/* mark the irq pending */
+	set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);	/* and its word */
+}
+
+static inline void clgi(void)
+{	/* emit the instruction supplied by the SVM_CLGI macro */
+	asm volatile (SVM_CLGI);
+}
+
+static inline void stgi(void)
+{	/* emit the instruction supplied by the SVM_STGI macro */
+	asm volatile (SVM_STGI);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{	/* addr is passed in the "a" register, asid in the "c" register */
+	asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
+}
+
+static inline unsigned long kvm_read_cr2(void)
+{
+	unsigned long cr2;
+	/* Copy the cr2 register of the executing cpu into a local. */
+	asm volatile ("mov %%cr2, %0" : "=r" (cr2));
+	return cr2;
+}
+
+static inline void kvm_write_cr2(unsigned long val)
+{	/* load val into the cr2 register of the executing cpu */
+	asm volatile ("mov %0, %%cr2" :: "r" (val));
+}
+
+static inline unsigned long read_dr6(void)
+{
+	unsigned long dr6;
+	/* Copy the dr6 register of the executing cpu into a local. */
+	asm volatile ("mov %%dr6, %0" : "=r" (dr6));
+	return dr6;
+}
+
+static inline void write_dr6(unsigned long val)
+{	/* load val into the dr6 register of the executing cpu */
+	asm volatile ("mov %0, %%dr6" :: "r" (val));
+}
+
+static inline unsigned long read_dr7(void)
+{
+	unsigned long dr7;
+	/* Copy the dr7 register of the executing cpu into a local. */
+	asm volatile ("mov %%dr7, %0" : "=r" (dr7));
+	return dr7;
+}
+
+static inline void write_dr7(unsigned long val)
+{	/* load val into the dr7 register of the executing cpu */
+	asm volatile ("mov %0, %%dr7" :: "r" (val));
+}
+
+static inline void force_new_asid(struct kvm_vcpu *vcpu)
+{	/* presumably makes the vcpu's generation stale so an ASID is reassigned -- confirm */
+	to_svm(vcpu)->asid_generation--;
+}
+
+static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
+{	/* implemented purely in terms of force_new_asid() */
+	force_new_asid(vcpu);
+}
+
+static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	if (!(efer & EFER_LMA))		/* LME is kept only together with LMA */
+		efer &= ~EFER_LME;
+	/* The VMCB copy always has SVME set; the shadow copy does not add it. */
+	to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
+	vcpu->arch.shadow_efer = efer;
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Build event_inj: vector nr, valid, exception type, optional error code. */
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
+	svm->vmcb->control.event_inj_err = error_code;	/* stored unconditionally */
+}
+
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* NOTE(review): true when exit_int_info is NOT marked valid -- confirm vs. name */
+	return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+}
+
+static int is_external_interrupt(u32 info)
+{
+	const u32 wanted = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; /* valid + INTR */
+	return (info & (SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID)) == wanted;
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Advance the guest rip to svm->next_rip; a zero next_rip is a no-op. */
+	if (!svm->next_rip) {
+		printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+		return;
+	}
+	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)	/* log only */
+		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
+		       __FUNCTION__,
+		       svm->vmcb->save.rip,
+		       svm->next_rip);
+
+	vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	/* The interrupt shadow bit was cleared just above. */
+	vcpu->arch.interrupt_window_open = 1;
+}
+
+static int has_svm(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+	/* Returns 1 when all three checks below pass, 0 otherwise. */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+		printk(KERN_INFO "has_svm: not amd\n");
+		return 0;
+	}
+
+	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+	if (eax < SVM_CPUID_FUNC) {	/* eax of leaf 0x80000000 is too small */
+		printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+		return 0;
+	}
+
+	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {	/* feature bit clear */
+		printk(KERN_DEBUG "has_svm: svm not available\n");
+		return 0;
+	}
+	return 1;
+}
+
+static void svm_hardware_disable(void *garbage)
+{
+	struct svm_cpu_data *svm_data
+		= per_cpu(svm_data, raw_smp_processor_id());
+	/* Nothing to do when this cpu has no per-cpu data; garbage is unused. */
+	if (svm_data) {
+		uint64_t efer;
+
+		wrmsrl(MSR_VM_HSAVE_PA, 0);
+		rdmsrl(MSR_EFER, efer);
+		wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);	/* clear SVME */
+		per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+		__free_page(svm_data->save_area);	/* from svm_cpu_init() */
+		kfree(svm_data);
+	}
+}
+
+static void svm_hardware_enable(void *garbage)
+{
+	/* Enable SVM on the executing cpu; garbage is unused. */
+	struct svm_cpu_data *svm_data;
+	uint64_t efer;
+#ifdef CONFIG_X86_64
+	struct desc_ptr gdt_descr;
+#else
+	struct Xgt_desc_struct gdt_descr;
+#endif
+	struct desc_struct *gdt;
+	int me = raw_smp_processor_id();
+
+	if (!has_svm()) {
+		printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
+		return;
+	}
+	svm_data = per_cpu(svm_data, me);	/* allocated by svm_cpu_init() */
+
+	if (!svm_data) {
+		printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+		       me);
+		return;
+	}
+
+	svm_data->asid_generation = 1;
+	svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+	svm_data->next_asid = svm_data->max_asid + 1;	/* first new_asid() wraps */
+	svm_features = cpuid_edx(SVM_CPUID_FUNC);
+	/* Remember where the TSS descriptor lives in the GDT sgdt reports. */
+	asm volatile ("sgdt %0" : "=m"(gdt_descr));
+	gdt = (struct desc_struct *)gdt_descr.address;
+	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+	rdmsrl(MSR_EFER, efer);
+	wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);	/* set SVME */
+	/* Physical address of the per-cpu save_area page. */
+	wrmsrl(MSR_VM_HSAVE_PA,
+	       page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
+}
+
+static int svm_cpu_init(int cpu)
+{
+	struct svm_cpu_data *sd;
+
+	/* Zeroed bookkeeping structure for this cpu. */
+	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->cpu = cpu;
+
+	/* One page, later programmed into MSR_VM_HSAVE_PA. */
+	sd->save_area = alloc_page(GFP_KERNEL);
+	if (!sd->save_area) {
+		kfree(sd);
+		return -ENOMEM;
+	}
+
+	/* Publish the structure only once it is fully set up. */
+	per_cpu(svm_data, cpu) = sd;
+
+	return 0;
+}
+
+static void set_msr_interception(u32 *msrpm, unsigned msr,
+				 int read, int write)
+{
+	int i;
+	/* Two bits per MSR: a non-zero read/write clears bit 0/bit 1. */
+	for (i = 0; i < NUM_MSR_MAPS; i++) {
+		if (msr >= msrpm_ranges[i] &&
+		    msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
+			u32 msr_offset = (i * MSRS_IN_RANGE + msr -
+					  msrpm_ranges[i]) * 2;
+			/* msr_shift can be 30, so the field mask is unsigned. */
+			u32 *base = msrpm + (msr_offset / 32);
+			u32 msr_shift = msr_offset % 32;
+			u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
+			*base = (*base & ~(0x3U << msr_shift)) |
+				(mask << msr_shift);
+			return;
+		}
+	}
+	BUG();	/* msr lies in none of the msrpm_ranges windows */
+}
+
+static __init int svm_hardware_setup(void)
+{
+	int cpu;
+	struct page *iopm_pages;
+	struct page *msrpm_pages;
+	void *iopm_va, *msrpm_va;
+	int r;
+	/* Allocate both permission maps, then the per-cpu data; 0 or -errno. */
+	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+
+	if (!iopm_pages)
+		return -ENOMEM;
+
+	iopm_va = page_address(iopm_pages);
+	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+	clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
+	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+	/* MSR map: every bit set first, selected MSRs are opened below. */
+	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+
+	r = -ENOMEM;
+	if (!msrpm_pages)
+		goto err_1;
+
+	msrpm_va = page_address(msrpm_pages);
+	memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+	msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+	set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
+	set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
+	set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
+	set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
+	set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
+	set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
+#endif
+	set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
+	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
+	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
+	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
+
+	for_each_online_cpu(cpu) {
+		r = svm_cpu_init(cpu);
+		if (r)
+			goto err_2;
+	}
+	return 0;
+	/* NOTE(review): per-cpu data of cpus already initialised is not freed here. */
+err_2:
+	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+	msrpm_base = 0;
+err_1:
+	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
+	iopm_base = 0;
+	return r;
+}
+
+static __exit void svm_hardware_unsetup(void)
+{	/* free both permission maps, located through their saved addresses */
+	__free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
+	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+	iopm_base = msrpm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{	/* reset state of a data segment: selector 0, base 0, limit 0xffff */
+	seg->selector = 0;
+	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+		SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+	seg->limit = 0xffff;
+	seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{	/* like init_seg(), but attrib is the present bit plus the given type */
+	seg->selector = 0;
+	seg->attrib = SVM_SELECTOR_P_MASK | type;
+	seg->limit = 0xffff;
+	seg->base = 0;
+}
+
+static void init_vmcb(struct vmcb *vmcb)
+{
+	struct vmcb_control_area *control = &vmcb->control;
+	struct vmcb_save_area *save = &vmcb->save;
+	/* Fill in the intercept set and the initial guest register state. */
+	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
+					INTERCEPT_CR3_MASK |
+					INTERCEPT_CR4_MASK |
+					INTERCEPT_CR8_MASK;
+
+	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
+					INTERCEPT_CR3_MASK |
+					INTERCEPT_CR4_MASK |
+					INTERCEPT_CR8_MASK;
+
+	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
+					INTERCEPT_DR1_MASK |
+					INTERCEPT_DR2_MASK |
+					INTERCEPT_DR3_MASK;
+
+	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
+					INTERCEPT_DR1_MASK |
+					INTERCEPT_DR2_MASK |
+					INTERCEPT_DR3_MASK |
+					INTERCEPT_DR5_MASK |
+					INTERCEPT_DR7_MASK;
+
+	control->intercept_exceptions = (1 << PF_VECTOR) |
+					(1 << UD_VECTOR);
+
+
+	control->intercept = 	(1ULL << INTERCEPT_INTR) |
+				(1ULL << INTERCEPT_NMI) |
+				(1ULL << INTERCEPT_SMI) |
+		/*
+		 * selective cr0 intercept bug?
+		 *    	0:   0f 22 d8                mov    %eax,%cr3
+		 *	3:   0f 20 c0                mov    %cr0,%eax
+		 *	6:   0d 00 00 00 80          or     $0x80000000,%eax
+		 *	b:   0f 22 c0                mov    %eax,%cr0
+		 * set cr3 ->interception
+		 * get cr0 ->interception
+		 * set cr0 -> no interception
+		 */
+		/*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
+				(1ULL << INTERCEPT_CPUID) |
+				(1ULL << INTERCEPT_INVD) |
+				(1ULL << INTERCEPT_HLT) |
+				(1ULL << INTERCEPT_INVLPGA) |
+				(1ULL << INTERCEPT_IOIO_PROT) |
+				(1ULL << INTERCEPT_MSR_PROT) |
+				(1ULL << INTERCEPT_TASK_SWITCH) |
+				(1ULL << INTERCEPT_SHUTDOWN) |
+				(1ULL << INTERCEPT_VMRUN) |
+				(1ULL << INTERCEPT_VMMCALL) |
+				(1ULL << INTERCEPT_VMLOAD) |
+				(1ULL << INTERCEPT_VMSAVE) |
+				(1ULL << INTERCEPT_STGI) |
+				(1ULL << INTERCEPT_CLGI) |
+				(1ULL << INTERCEPT_SKINIT) |
+				(1ULL << INTERCEPT_WBINVD) |
+				(1ULL << INTERCEPT_MONITOR) |
+				(1ULL << INTERCEPT_MWAIT);
+	/* Both permission maps are the module-wide ones from hardware setup. */
+	control->iopm_base_pa = iopm_base;
+	control->msrpm_base_pa = msrpm_base;
+	control->tsc_offset = 0;
+	control->int_ctl = V_INTR_MASKING_MASK;
+
+	init_seg(&save->es);
+	init_seg(&save->ss);
+	init_seg(&save->ds);
+	init_seg(&save->fs);
+	init_seg(&save->gs);
+
+	save->cs.selector = 0xf000;
+	/* Executable/Readable Code Segment */
+	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+	save->cs.limit = 0xffff;
+	/*
+	 * cs.base should really be 0xffff0000, but vmx can't handle that, so
+	 * be consistent with it.
+	 *
+	 * Replace when we have real mode working for vmx.
+	 */
+	save->cs.base = 0xf0000;
+
+	save->gdtr.limit = 0xffff;
+	save->idtr.limit = 0xffff;
+
+	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+	save->efer = MSR_EFER_SVME_MASK;
+	save->dr6 = 0xffff0ff0;
+	save->dr7 = 0x400;
+	save->rflags = 2;
+	save->rip = 0x0000fff0;
+
+	/*
+	 * The cr0 value on cpu init should be 0x60000010, but we enable the
+	 * cpu cache by default; the orderly way is to enable it in the bios.
+	 */
+	save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+	save->cr4 = X86_CR4_PAE;
+	/* rdx = ?? */
+}
+
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Re-run init_vmcb(); always returns 0. */
+	init_vmcb(svm->vmcb);
+
+	if (vcpu->vcpu_id != 0) {	/* other vcpus start from sipi_vector */
+		svm->vmcb->save.rip = 0;
+		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
+	}
+
+	return 0;
+}
+
+static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+	struct vcpu_svm *svm;
+	struct page *page;
+	int err;
+	/* Returns the embedded kvm_vcpu, or ERR_PTR(-errno) on failure. */
+	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!svm) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = kvm_vcpu_init(&svm->vcpu, kvm, id);
+	if (err)
+		goto free_svm;
+
+	page = alloc_page(GFP_KERNEL);	/* one page backs the VMCB */
+	if (!page) {
+		err = -ENOMEM;
+		goto uninit;
+	}
+
+	svm->vmcb = page_address(page);
+	clear_page(svm->vmcb);
+	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+	svm->asid_generation = 0;
+	memset(svm->db_regs, 0, sizeof(svm->db_regs));
+	init_vmcb(svm->vmcb);
+
+	fx_init(&svm->vcpu);
+	svm->vcpu.fpu_active = 1;
+	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	if (svm->vcpu.vcpu_id == 0)	/* only vcpu 0 gets the BSP flag */
+		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+
+	return &svm->vcpu;
+	/* Unwind in reverse order of the steps above. */
+uninit:
+	kvm_vcpu_uninit(&svm->vcpu);
+free_svm:
+	kmem_cache_free(kvm_vcpu_cache, svm);
+out:
+	return ERR_PTR(err);
+}
+
+static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Release the VMCB page, then the vcpu, then the container. */
+	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, svm);
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int i;
+
+	if (unlikely(cpu != vcpu->cpu)) {	/* vcpu last ran on another cpu */
+		u64 tsc_this, delta;
+
+		/*
+		 * Make sure that the guest sees a monotonically
+		 * increasing TSC.
+		 */
+		rdtscll(tsc_this);
+		delta = vcpu->arch.host_tsc - tsc_this;	/* host_tsc: see svm_vcpu_put() */
+		svm->vmcb->control.tsc_offset += delta;
+		vcpu->cpu = cpu;
+		kvm_migrate_apic_timer(vcpu);
+	}
+	/* Snapshot the host MSR values that svm_vcpu_put() writes back. */
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int i;
+	/* Write back the MSR values saved by svm_vcpu_load(). */
+	++vcpu->stat.host_state_reload;
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	/* Record the TSC; svm_vcpu_load() uses it when the cpu changes. */
+	rdtscll(vcpu->arch.host_tsc);
+}
+
+static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
+{	/* intentionally empty */
+}
+
+static void svm_cache_regs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Copy rax, rsp and rip from the VMCB save area into vcpu->arch. */
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.rip = svm->vmcb->save.rip;
+}
+
+static void svm_decache_regs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);	/* inverse of svm_cache_regs() */
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.rip;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{	/* rflags is read straight from the VMCB save area */
+	return to_svm(vcpu)->vmcb->save.rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{	/* rflags is written straight into the VMCB save area */
+	to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	/* Map a VCPU_SREG_* index to the matching save-area segment. */
+	switch (seg) {
+	case VCPU_SREG_CS: return &save->cs;
+	case VCPU_SREG_DS: return &save->ds;
+	case VCPU_SREG_ES: return &save->es;
+	case VCPU_SREG_FS: return &save->fs;
+	case VCPU_SREG_GS: return &save->gs;
+	case VCPU_SREG_SS: return &save->ss;
+	case VCPU_SREG_TR: return &save->tr;
+	case VCPU_SREG_LDTR: return &save->ldtr;
+	}
+	BUG();	/* any other index is a caller bug */
+	return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+	/* Base address as currently held in the VMCB copy of the segment. */
+	u64 base = svm_seg(vcpu, seg)->base;
+	return base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg)
+{
+	struct vmcb_seg *s = svm_seg(vcpu, seg);
+	/* Unpack the VMCB segment, splitting attrib into separate fields. */
+	var->base = s->base;
+	var->limit = s->limit;
+	var->selector = s->selector;
+	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+	var->unusable = !var->present;	/* derived, not stored in attrib */
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Copy the idtr limit and base out of the VMCB save area. */
+	dt->limit = svm->vmcb->save.idtr.limit;
+	dt->base = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Store the idtr limit and base into the VMCB save area. */
+	svm->vmcb->save.idtr.limit = dt->limit;
+	svm->vmcb->save.idtr.base = dt->base ;
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Copy the gdtr limit and base out of the VMCB save area. */
+	dt->limit = svm->vmcb->save.gdtr.limit;
+	dt->base = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* Store the gdtr limit and base into the VMCB save area. */
+	svm->vmcb->save.gdtr.limit = dt->limit;
+	svm->vmcb->save.gdtr.base = dt->base ;
+}
+
+static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{	/* intentionally empty */
+}
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.shadow_efer & EFER_LME) {
+		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {	/* paging turned on */
+			vcpu->arch.shadow_efer |= EFER_LMA;
+			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+		}
+
+		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {	/* paging turned off */
+			vcpu->arch.shadow_efer &= ~EFER_LMA;
+			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+		}
+	}
+#endif
+	if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {	/* TS cleared */
+		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+		vcpu->fpu_active = 1;
+	}
+	/* arch.cr0 keeps the requested value; the VMCB forces PG/WP on, CD/NW off. */
+	vcpu->arch.cr0 = cr0;
+	cr0 |= X86_CR0_PG | X86_CR0_WP;
+	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+	svm->vmcb->save.cr0 = cr0;
+}
+
+static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{	/* arch.cr4 keeps the requested value; the VMCB copy always has PAE set */
+       vcpu->arch.cr4 = cr4;
+       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_seg *s = svm_seg(vcpu, seg);
+	/* Pack var into the VMCB segment; the inverse of svm_get_segment(). */
+	s->base = var->base;
+	s->limit = var->limit;
+	s->selector = var->selector;
+	if (var->unusable)	/* unusable segments get an all-zero attrib */
+		s->attrib = 0;
+	else {
+		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
+		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+	}
+	if (seg == VCPU_SREG_CS)	/* cpl follows the dpl field of the new cs */
+		svm->vmcb->save.cpl
+			= (svm->vmcb->save.cs.attrib
+			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
+
+}
+
+/* FIXME:
+
+	to_svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
+	to_svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
+
+*/
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{	/* always fails with -EOPNOTSUPP; both arguments are ignored */
+	return -EOPNOTSUPP;
+}
+
+static int svm_get_irq(struct kvm_vcpu *vcpu)
+{
+	u32 info = to_svm(vcpu)->vmcb->control.exit_int_info;
+
+	/* Only an external interrupt recorded at exit yields a vector. */
+	if (!is_external_interrupt(info))
+		return -1;
+	return info & SVM_EVTINJ_VEC_MASK;
+}
+
+static void load_host_msrs(struct kvm_vcpu *vcpu)
+{	/* 64-bit only: write back the value stored by save_host_msrs() */
+#ifdef CONFIG_X86_64
+	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
+static void save_host_msrs(struct kvm_vcpu *vcpu)
+{	/* 64-bit only: remember MSR_GS_BASE in host_gs_base */
+#ifdef CONFIG_X86_64
+	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+#endif
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
+{
+	if (svm_data->next_asid > svm_data->max_asid) {	/* ASIDs used up */
+		++svm_data->asid_generation;
+		svm_data->next_asid = 1;	/* numbering restarts at 1 */
+		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+	}
+	/* Tie the vcpu to this cpu's current generation and take an ASID. */
+	svm->vcpu.cpu = svm_data->cpu;
+	svm->asid_generation = svm_data->asid_generation;
+	svm->vmcb->control.asid = svm_data->next_asid++;
+}
+
+static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
+{	/* NOTE(review): dr is not range-checked here; callers must keep it within db_regs */
+	return to_svm(vcpu)->db_regs[dr];
+}
+
+static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+		       int *exception)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* *exception ends up 0 on success, else the vector to raise. */
+	*exception = 0;
+
+	if (svm->vmcb->save.dr7 & DR7_GD_MASK) {	/* GD set in the guest dr7 */
+		svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
+		svm->vmcb->save.dr6 |= DR6_BD_MASK;
+		*exception = DB_VECTOR;
+		return;
+	}
+
+	switch (dr) {
+	case 0 ... 3:
+		svm->db_regs[dr] = value;	/* kept in software only here */
+		return;
+	case 4 ... 5:
+		if (vcpu->arch.cr4 & X86_CR4_DE) {
+			*exception = UD_VECTOR;
+			return;
+		}	/* NOTE(review): else falls through; dr 4 is stored as dr7 too */
+	case 7: {
+		if (value & ~((1ULL << 32) - 1)) {	/* any of bits 63:32 set */
+			*exception = GP_VECTOR;
+			return;
+		}
+		svm->vmcb->save.dr7 = value;
+		return;
+	}
+	default:
+		printk(KERN_DEBUG "%s: unexpected dr %u\n",
+		       __FUNCTION__, dr);
+		*exception = UD_VECTOR;
+		return;
+	}
+}
+
+static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	u32 exit_int_info = svm->vmcb->control.exit_int_info;
+	struct kvm *kvm = svm->vcpu.kvm;
+	u64 fault_address;
+	u32 error_code;
+	/* Without an in-kernel irqchip, requeue an external irq seen at exit. */
+	if (!irqchip_in_kernel(kvm) &&
+		is_external_interrupt(exit_int_info))
+		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+
+	fault_address  = svm->vmcb->control.exit_info_2;
+	error_code = svm->vmcb->control.exit_info_1;
+	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
+
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	int er;
+	/* Try to emulate; queue #UD for the guest if that does not finish. */
+	er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+	if (er != EMULATE_DONE)
+		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+	return 1;
+}
+
+static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);	/* stop trapping #NM */
+	if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))	/* guest's own cr0 has TS clear */
+		svm->vmcb->save.cr0 &= ~X86_CR0_TS;
+	svm->vcpu.fpu_active = 1;
+
+	return 1;
+}
+
+static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	/*
+	 * VMCB is undefined after a SHUTDOWN intercept
+	 * so reinitialize it.
+	 */
+	clear_page(svm->vmcb);
+	init_vmcb(svm->vmcb);
+	/* Report the shutdown through kvm_run and return 0. */
+	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+	return 0;
+}
+
+static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+	int size, in, string;
+	unsigned port;
+
+	/* IOIO intercept: count it, then emulate the access from exit_info_1. */
+	++svm->vcpu.stat.io_exits;
+	/* exit_info_2 is taken as the rip to continue at afterwards. */
+	svm->next_rip = svm->vmcb->control.exit_info_2;
+
+	string = (io_info & SVM_IOIO_STR_MASK) != 0;
+
+	/* String I/O is handed to the instruction emulator. */
+	if (string) {
+		if (emulate_instruction(&svm->vcpu,
+					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+			return 0;
+		return 1;
+	}
+
+	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+	port = io_info >> 16;
+	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+}
+
+static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{	/* no handling needed; returns 1 */
+	return 1;
+}
+
+static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	svm->next_rip = svm->vmcb->save.rip + 1;	/* step over 1 byte */
+	skip_emulated_instruction(&svm->vcpu);
+	return kvm_emulate_halt(&svm->vcpu);
+}
+
+static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	svm->next_rip = svm->vmcb->save.rip + 3;	/* step over 3 bytes */
+	skip_emulated_instruction(&svm->vcpu);
+	kvm_emulate_hypercall(&svm->vcpu);	/* its return value is ignored */
+	return 1;
+}
+
+static int invalid_op_interception(struct vcpu_svm *svm,
+				   struct kvm_run *kvm_run)
+{	/* answer the intercepted instruction with #UD */
+	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+	return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm,
+				    struct kvm_run *kvm_run)
+{	/* not implemented: log it and report KVM_EXIT_UNKNOWN */
+	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
+	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+	return 0;
+}
+
+static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	svm->next_rip = svm->vmcb->save.rip + 2;	/* 2 bytes past the current rip */
+	kvm_emulate_cpuid(&svm->vcpu);	/* presumably performs the rip skip itself -- confirm */
+	return 1;
+}
+
+static int emulate_on_interception(struct vcpu_svm *svm,
+				   struct kvm_run *kvm_run)
+{	/* emulate the instruction; a failure is only logged */
+	if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+		pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+	return 1;
+}
+
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);	/* result is not checked */
+	if (irqchip_in_kernel(svm->vcpu.kvm))
+		return 1;
+	kvm_run->exit_reason = KVM_EXIT_SET_TPR;	/* no in-kernel irqchip */
+	return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* MSRs kept in the VMCB are read here; the rest go to common code. */
+	switch (ecx) {
+	case MSR_IA32_TIME_STAMP_COUNTER: {
+		u64 tsc;
+
+		rdtscll(tsc);
+		*data = svm->vmcb->control.tsc_offset + tsc;	/* current tsc + offset */
+		break;
+	}
+	case MSR_K6_STAR:
+		*data = svm->vmcb->save.star;
+		break;
+#ifdef CONFIG_X86_64
+	case MSR_LSTAR:
+		*data = svm->vmcb->save.lstar;
+		break;
+	case MSR_CSTAR:
+		*data = svm->vmcb->save.cstar;
+		break;
+	case MSR_KERNEL_GS_BASE:
+		*data = svm->vmcb->save.kernel_gs_base;
+		break;
+	case MSR_SYSCALL_MASK:
+		*data = svm->vmcb->save.sfmask;
+		break;
+#endif
+	case MSR_IA32_SYSENTER_CS:
+		*data = svm->vmcb->save.sysenter_cs;
+		break;
+	case MSR_IA32_SYSENTER_EIP:
+		*data = svm->vmcb->save.sysenter_eip;
+		break;
+	case MSR_IA32_SYSENTER_ESP:
+		*data = svm->vmcb->save.sysenter_esp;
+		break;
+	default:
+		return kvm_get_msr_common(vcpu, ecx, data);
+	}
+	return 0;	/* handled locally */
+}
+
+static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];	/* MSR index */
+	u64 data;
+	/* A failed read injects #GP and leaves rip untouched. */
+	if (svm_get_msr(&svm->vcpu, ecx, &data))
+		kvm_inject_gp(&svm->vcpu, 0);
+	else {
+		svm->vmcb->save.rax = data & 0xffffffff;	/* low 32 bits */
+		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;	/* high 32 bits */
+		svm->next_rip = svm->vmcb->save.rip + 2;
+		skip_emulated_instruction(&svm->vcpu);
+	}
+	return 1;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	/* MSRs kept in the VMCB are written here; the rest go to common code. */
+	switch (ecx) {
+	case MSR_IA32_TIME_STAMP_COUNTER: {
+		u64 tsc;
+
+		rdtscll(tsc);
+		svm->vmcb->control.tsc_offset = data - tsc;	/* offset from the current tsc */
+		break;
+	}
+	case MSR_K6_STAR:
+		svm->vmcb->save.star = data;
+		break;
+#ifdef CONFIG_X86_64
+	case MSR_LSTAR:
+		svm->vmcb->save.lstar = data;
+		break;
+	case MSR_CSTAR:
+		svm->vmcb->save.cstar = data;
+		break;
+	case MSR_KERNEL_GS_BASE:
+		svm->vmcb->save.kernel_gs_base = data;
+		break;
+	case MSR_SYSCALL_MASK:
+		svm->vmcb->save.sfmask = data;
+		break;
+#endif
+	case MSR_IA32_SYSENTER_CS:
+		svm->vmcb->save.sysenter_cs = data;
+		break;
+	case MSR_IA32_SYSENTER_EIP:
+		svm->vmcb->save.sysenter_eip = data;
+		break;
+	case MSR_IA32_SYSENTER_ESP:
+		svm->vmcb->save.sysenter_esp = data;
+		break;
+	case MSR_K7_EVNTSEL0:
+	case MSR_K7_EVNTSEL1:
+	case MSR_K7_EVNTSEL2:
+	case MSR_K7_EVNTSEL3:
+		/*
+		 * only support writing 0 to the performance counters for now
+		 * to make Windows happy. Should be replaced by a real
+		 * performance counter emulation later.
+		 */
+		if (data != 0)
+			goto unhandled;
+		break;	/* a zero write is accepted and dropped */
+	default:
+	unhandled:
+		return kvm_set_msr_common(vcpu, ecx, data);
+	}
+	return 0;	/* handled locally */
+}
+
+static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];	/* MSR index */
+	u64 data = (svm->vmcb->save.rax & -1u)	/* low 32 bits from rax */
+		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+	svm->next_rip = svm->vmcb->save.rip + 2;
+	if (svm_set_msr(&svm->vcpu, ecx, data))	/* failure: #GP, rip unchanged */
+		kvm_inject_gp(&svm->vcpu, 0);
+	else
+		skip_emulated_instruction(&svm->vcpu);
+	return 1;
+}
+
+static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	/* Non-zero exit_info_1 selects the write handler, zero the read one. */
+	int is_write = svm->vmcb->control.exit_info_1 != 0;
+	return is_write ? wrmsr_interception(svm, kvm_run)
+			: rdmsr_interception(svm, kvm_run);
+}
+
+static int interrupt_window_interception(struct vcpu_svm *svm,
+				   struct kvm_run *kvm_run)
+{
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);	/* stop the VINTR intercept */
+	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+	/*
+	 * If the user space waits to inject interrupts, exit as soon as
+	 * possible
+	 */
+	if (kvm_run->request_interrupt_window &&
+	    !svm->vcpu.arch.irq_summary) {	/* and nothing is queued */
+		++svm->vcpu.stat.irq_window_exits;
+		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+		return 0;
+	}
+
+	return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
+				      struct kvm_run *kvm_run) = {	/* indexed by SVM_EXIT_* */
+	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
+	/* for now: */
+	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
+	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
+	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
+	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
+	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR1]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
+	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
+	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
+	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
+	[SVM_EXIT_INTR] 			= nop_on_interception,
+	[SVM_EXIT_NMI]				= nop_on_interception,
+	[SVM_EXIT_SMI]				= nop_on_interception,
+	[SVM_EXIT_INIT]				= nop_on_interception,
+	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
+	[SVM_EXIT_CPUID]			= cpuid_interception,
+	[SVM_EXIT_INVD]                         = emulate_on_interception,
+	[SVM_EXIT_HLT]				= halt_interception,
+	[SVM_EXIT_INVLPG]			= emulate_on_interception,
+	[SVM_EXIT_INVLPGA]			= invalid_op_interception,
+	[SVM_EXIT_IOIO] 		  	= io_interception,
+	[SVM_EXIT_MSR]				= msr_interception,
+	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
+	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
+	[SVM_EXIT_VMRUN]			= invalid_op_interception,
+	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
+	[SVM_EXIT_VMLOAD]			= invalid_op_interception,
+	[SVM_EXIT_VMSAVE]			= invalid_op_interception,
+	[SVM_EXIT_STGI]				= invalid_op_interception,
+	[SVM_EXIT_CLGI]				= invalid_op_interception,
+	[SVM_EXIT_SKINIT]			= invalid_op_interception,
+	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
+	[SVM_EXIT_MONITOR]			= invalid_op_interception,
+	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+};	/* exit codes without an entry are left as NULL */
+
+
+static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u32 exit_code = svm->vmcb->control.exit_code;
+
+	/* Dispatch via svm_exit_handlers[]; unknown codes yield KVM_EXIT_UNKNOWN. */
+	kvm_reput_irq(svm);
+
+	if (exit_code == SVM_EXIT_ERR) {
+		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		kvm_run->fail_entry.hardware_entry_failure_reason = exit_code;
+		return 0;
+	}
+
+	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+		       "exit_code 0x%x\n",
+		       __FUNCTION__, svm->vmcb->control.exit_int_info,
+		       exit_code);
+
+	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+	    || !svm_exit_handlers[exit_code]) {
+		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+		kvm_run->hw.hardware_exit_reason = exit_code;
+		return 0;
+	}
+
+	return svm_exit_handlers[exit_code](svm, kvm_run);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+
+	/* Mark this CPU's TSS descriptor as type 9, then call load_TR_desc(). */
+	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+	load_TR_desc();
+}
+
+static void pre_svm_run(struct vcpu_svm *svm)
+{
+	int cpu = raw_smp_processor_id();
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+	/* Default to no TLB action; take a new ASID on CPU or generation change. */
+	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+	if (svm->vcpu.cpu == cpu && svm->asid_generation == sd->asid_generation)
+		return;
+	new_asid(svm, sd);
+}
+
+
+static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
+{
+	struct vmcb_control_area *control;
+
+	control = &svm->vmcb->control;
+	control->int_vector = irq;		/* vector to present */
+	control->int_ctl &= ~V_INTR_PRIO_MASK;	/* reset the priority field */
+	control->int_ctl |= V_IRQ_MASK |	/* set V_IRQ, priority fixed at 0xf */
+		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+}
+
+static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm_inject_irq(svm, irq);
+}
+
+static void svm_intr_assist(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	int intr_vector = -1;
+
+	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
+	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
+		intr_vector = vmcb->control.exit_int_info &
+			      SVM_EVTINJ_VEC_MASK;
+		vmcb->control.exit_int_info = 0;	/* consumed here */
+		svm_inject_irq(svm, intr_vector);	/* re-inject the type-0 (INTR) event */
+		return;
+	}
+
+	if (vmcb->control.int_ctl & V_IRQ_MASK)
+		return;		/* V_IRQ already set: an injection is outstanding */
+
+	if (!kvm_cpu_has_interrupt(vcpu))
+		return;		/* nothing pending to inject */
+
+	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
+	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
+		/* cannot deliver now: intercept VINTR and inject vector 0 */
+		vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+		svm_inject_irq(svm, 0x0);
+		return;
+	}
+	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
+	intr_vector = kvm_cpu_get_interrupt(vcpu);
+	svm_inject_irq(svm, intr_vector);
+	kvm_timer_intr_post(vcpu, intr_vector);
+}
+
+static void kvm_reput_irq(struct vcpu_svm *svm)
+{
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	if ((control->int_ctl & V_IRQ_MASK)
+	    && !irqchip_in_kernel(svm->vcpu.kvm)) {	/* V_IRQ still set, no in-kernel irqchip */
+		control->int_ctl &= ~V_IRQ_MASK;	/* withdraw the request from the VMCB */
+		push_irq(&svm->vcpu, control->int_vector);	/* NOTE(review): presumably re-queues the vector - confirm push_irq() */
+	}
+
+	svm->vcpu.arch.interrupt_window_open =
+		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);	/* open unless the shadow bit is set */
+}
+
+static void svm_do_inject_vector(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+	int word_index = __ffs(vcpu->arch.irq_summary);	/* caller checks irq_summary != 0 */
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+	int irq = word_index * BITS_PER_LONG + bit_index;	/* position in the two-level bitmap */
+
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);	/* word drained: clear its summary bit */
+	svm_inject_irq(svm, irq);
+}
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				       struct kvm_run *kvm_run)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	svm->vcpu.arch.interrupt_window_open =
+		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
+
+	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
+		/*
+		 * Window open (IF set, no interrupt shadow) and a vector queued.
+		 */
+		svm_do_inject_vector(svm);
+
+	/*
+	 * Window closed with work pending: intercept VINTR; else drop it.
+	 */
+	if (!svm->vcpu.arch.interrupt_window_open &&
+	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
+		control->intercept |= 1ULL << INTERCEPT_VINTR;
+	 else
+		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+}
+
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+	return 0;
+}
+
+static void save_db_regs(unsigned long *db_regs)
+{
+	asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
+	asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
+	asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
+	asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
+}
+
+static void load_db_regs(unsigned long *db_regs)
+{
+	asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
+	asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
+	asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
+	asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
+}
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	force_new_asid(vcpu);
+}
+
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u16 fs_selector;
+	u16 gs_selector;
+	u16 ldt_selector;
+
+	pre_svm_run(svm);
+
+	save_host_msrs(vcpu);
+	fs_selector = read_fs();
+	gs_selector = read_gs();
+	ldt_selector = read_ldt();
+	svm->host_cr2 = kvm_read_cr2();
+	svm->host_dr6 = read_dr6();
+	svm->host_dr7 = read_dr7();
+	svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+	if (svm->vmcb->save.dr7 & 0xff) {
+		write_dr7(0);
+		save_db_regs(svm->host_db_regs);
+		load_db_regs(svm->db_regs);
+	}
+
+	clgi();
+
+	local_irq_enable();
+
+	asm volatile (
+#ifdef CONFIG_X86_64
+		"push %%rbp; \n\t"
+#else
+		"push %%ebp; \n\t"
+#endif
+
+#ifdef CONFIG_X86_64
+		"mov %c[rbx](%[svm]), %%rbx \n\t"
+		"mov %c[rcx](%[svm]), %%rcx \n\t"
+		"mov %c[rdx](%[svm]), %%rdx \n\t"
+		"mov %c[rsi](%[svm]), %%rsi \n\t"
+		"mov %c[rdi](%[svm]), %%rdi \n\t"
+		"mov %c[rbp](%[svm]), %%rbp \n\t"
+		"mov %c[r8](%[svm]),  %%r8  \n\t"
+		"mov %c[r9](%[svm]),  %%r9  \n\t"
+		"mov %c[r10](%[svm]), %%r10 \n\t"
+		"mov %c[r11](%[svm]), %%r11 \n\t"
+		"mov %c[r12](%[svm]), %%r12 \n\t"
+		"mov %c[r13](%[svm]), %%r13 \n\t"
+		"mov %c[r14](%[svm]), %%r14 \n\t"
+		"mov %c[r15](%[svm]), %%r15 \n\t"
+#else
+		"mov %c[rbx](%[svm]), %%ebx \n\t"
+		"mov %c[rcx](%[svm]), %%ecx \n\t"
+		"mov %c[rdx](%[svm]), %%edx \n\t"
+		"mov %c[rsi](%[svm]), %%esi \n\t"
+		"mov %c[rdi](%[svm]), %%edi \n\t"
+		"mov %c[rbp](%[svm]), %%ebp \n\t"
+#endif
+
+#ifdef CONFIG_X86_64
+		/* Enter guest mode */
+		"push %%rax \n\t"
+		"mov %c[vmcb](%[svm]), %%rax \n\t"
+		SVM_VMLOAD "\n\t"
+		SVM_VMRUN "\n\t"
+		SVM_VMSAVE "\n\t"
+		"pop %%rax \n\t"
+#else
+		/* Enter guest mode */
+		"push %%eax \n\t"
+		"mov %c[vmcb](%[svm]), %%eax \n\t"
+		SVM_VMLOAD "\n\t"
+		SVM_VMRUN "\n\t"
+		SVM_VMSAVE "\n\t"
+		"pop %%eax \n\t"
+#endif
+
+		/* Save guest registers, load host registers */
+#ifdef CONFIG_X86_64
+		"mov %%rbx, %c[rbx](%[svm]) \n\t"
+		"mov %%rcx, %c[rcx](%[svm]) \n\t"
+		"mov %%rdx, %c[rdx](%[svm]) \n\t"
+		"mov %%rsi, %c[rsi](%[svm]) \n\t"
+		"mov %%rdi, %c[rdi](%[svm]) \n\t"
+		"mov %%rbp, %c[rbp](%[svm]) \n\t"
+		"mov %%r8,  %c[r8](%[svm]) \n\t"
+		"mov %%r9,  %c[r9](%[svm]) \n\t"
+		"mov %%r10, %c[r10](%[svm]) \n\t"
+		"mov %%r11, %c[r11](%[svm]) \n\t"
+		"mov %%r12, %c[r12](%[svm]) \n\t"
+		"mov %%r13, %c[r13](%[svm]) \n\t"
+		"mov %%r14, %c[r14](%[svm]) \n\t"
+		"mov %%r15, %c[r15](%[svm]) \n\t"
+
+		"pop  %%rbp; \n\t"
+#else
+		"mov %%ebx, %c[rbx](%[svm]) \n\t"
+		"mov %%ecx, %c[rcx](%[svm]) \n\t"
+		"mov %%edx, %c[rdx](%[svm]) \n\t"
+		"mov %%esi, %c[rsi](%[svm]) \n\t"
+		"mov %%edi, %c[rdi](%[svm]) \n\t"
+		"mov %%ebp, %c[rbp](%[svm]) \n\t"
+
+		"pop  %%ebp; \n\t"
+#endif
+		:
+		: [svm]"a"(svm),
+		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
+		  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+		  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+		  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+		  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+		  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+		  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+		  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+		  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+		  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+		  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+		  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+		  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+		  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
+#endif
+		: "cc", "memory"
+#ifdef CONFIG_X86_64
+		, "rbx", "rcx", "rdx", "rsi", "rdi"
+		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+		, "ebx", "ecx", "edx" , "esi", "edi"
+#endif
+		);
+
+	if ((svm->vmcb->save.dr7 & 0xff))
+		load_db_regs(svm->host_db_regs);
+
+	vcpu->arch.cr2 = svm->vmcb->save.cr2;
+
+	write_dr6(svm->host_dr6);
+	write_dr7(svm->host_dr7);
+	kvm_write_cr2(svm->host_cr2);
+
+	load_fs(fs_selector);
+	load_gs(gs_selector);
+	load_ldt(ldt_selector);
+	load_host_msrs(vcpu);
+
+	reload_tss(vcpu);
+
+	local_irq_disable();
+
+	stgi();
+
+	svm->next_rip = 0;
+}
+
+static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->save.cr3 = root;	/* install the new CR3 value in the VMCB */
+	force_new_asid(vcpu);		/* same call svm_flush_tlb() makes */
+
+	if (vcpu->fpu_active) {		/* deactivate the FPU: intercept #NM, set CR0.TS */
+		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+		svm->vmcb->save.cr0 |= X86_CR0_TS;
+		vcpu->fpu_active = 0;
+	}
+}
+
+static int is_disabled(void)
+{
+	u64 msr_val;
+
+	/* Read MSR_VM_CR and test its SVM_VM_CR_SVM_DISABLE bit. */
+	rdmsrl(MSR_VM_CR, msr_val);
+
+	/* Normalise the masked bit to exactly 0 or 1. */
+	return !!(msr_val & (1 << SVM_VM_CR_SVM_DISABLE));
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+	/*
+	 * Patch in the 3-byte VMMCALL instruction (0f 01 d9); vcpu is unused:
+	 */
+	hypercall[0] = 0x0f;
+	hypercall[1] = 0x01;
+	hypercall[2] = 0xd9;	/* buffer must hold at least 3 writable bytes */
+}
+
+static void svm_check_processor_compat(void *rtn)
+{
+	*(int *)rtn = 0;
+}
+
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+	return false;
+}
+
+static struct kvm_x86_ops svm_x86_ops = {
+	.cpu_has_kvm_support = has_svm,
+	.disabled_by_bios = is_disabled,
+	.hardware_setup = svm_hardware_setup,
+	.hardware_unsetup = svm_hardware_unsetup,
+	.check_processor_compatibility = svm_check_processor_compat,
+	.hardware_enable = svm_hardware_enable,
+	.hardware_disable = svm_hardware_disable,
+	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+
+	.vcpu_create = svm_create_vcpu,
+	.vcpu_free = svm_free_vcpu,
+	.vcpu_reset = svm_vcpu_reset,
+
+	.prepare_guest_switch = svm_prepare_guest_switch,
+	.vcpu_load = svm_vcpu_load,
+	.vcpu_put = svm_vcpu_put,
+	.vcpu_decache = svm_vcpu_decache,
+
+	.set_guest_debug = svm_guest_debug,
+	.get_msr = svm_get_msr,
+	.set_msr = svm_set_msr,
+	.get_segment_base = svm_get_segment_base,
+	.get_segment = svm_get_segment,
+	.set_segment = svm_set_segment,
+	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
+	.set_cr0 = svm_set_cr0,
+	.set_cr3 = svm_set_cr3,
+	.set_cr4 = svm_set_cr4,
+	.set_efer = svm_set_efer,
+	.get_idt = svm_get_idt,
+	.set_idt = svm_set_idt,
+	.get_gdt = svm_get_gdt,
+	.set_gdt = svm_set_gdt,
+	.get_dr = svm_get_dr,
+	.set_dr = svm_set_dr,
+	.cache_regs = svm_cache_regs,
+	.decache_regs = svm_decache_regs,
+	.get_rflags = svm_get_rflags,
+	.set_rflags = svm_set_rflags,
+
+	.tlb_flush = svm_flush_tlb,
+
+	.run = svm_vcpu_run,
+	.handle_exit = handle_exit,
+	.skip_emulated_instruction = skip_emulated_instruction,
+	.patch_hypercall = svm_patch_hypercall,
+	.get_irq = svm_get_irq,
+	.set_irq = svm_set_irq,
+	.queue_exception = svm_queue_exception,
+	.exception_injected = svm_exception_injected,
+	.inject_pending_irq = svm_intr_assist,
+	.inject_pending_vectors = do_interrupt_requests,
+
+	.set_tss_addr = svm_set_tss_addr,
+};
+
+static int __init svm_init(void)
+{
+	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
+			      THIS_MODULE);
+}
+
+static void __exit svm_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff -puN /dev/null arch/x86/kvm/svm.h
--- /dev/null
+++ a/arch/x86/kvm/svm.h
@@ -0,0 +1,325 @@
+#ifndef __SVM_H
+#define __SVM_H
+
+enum {	/* bit positions in vmcb_control_area.intercept; SVM_EXIT_INTR + n is the matching exit code */
+	INTERCEPT_INTR,		/* bit 0 */
+	INTERCEPT_NMI,
+	INTERCEPT_SMI,
+	INTERCEPT_INIT,
+	INTERCEPT_VINTR,
+	INTERCEPT_SELECTIVE_CR0,
+	INTERCEPT_STORE_IDTR,
+	INTERCEPT_STORE_GDTR,
+	INTERCEPT_STORE_LDTR,	/* bit 8 */
+	INTERCEPT_STORE_TR,
+	INTERCEPT_LOAD_IDTR,
+	INTERCEPT_LOAD_GDTR,
+	INTERCEPT_LOAD_LDTR,
+	INTERCEPT_LOAD_TR,
+	INTERCEPT_RDTSC,
+	INTERCEPT_RDPMC,
+	INTERCEPT_PUSHF,	/* bit 16 */
+	INTERCEPT_POPF,
+	INTERCEPT_CPUID,
+	INTERCEPT_RSM,
+	INTERCEPT_IRET,
+	INTERCEPT_INTn,
+	INTERCEPT_INVD,
+	INTERCEPT_PAUSE,
+	INTERCEPT_HLT,		/* bit 24 */
+	INTERCEPT_INVLPG,
+	INTERCEPT_INVLPGA,
+	INTERCEPT_IOIO_PROT,
+	INTERCEPT_MSR_PROT,
+	INTERCEPT_TASK_SWITCH,
+	INTERCEPT_FERR_FREEZE,
+	INTERCEPT_SHUTDOWN,
+	INTERCEPT_VMRUN,	/* bit 32: shifts need a 64-bit operand (1ULL) from here on */
+	INTERCEPT_VMMCALL,
+	INTERCEPT_VMLOAD,
+	INTERCEPT_VMSAVE,
+	INTERCEPT_STGI,
+	INTERCEPT_CLGI,
+	INTERCEPT_SKINIT,
+	INTERCEPT_RDTSCP,
+	INTERCEPT_ICEBP,	/* bit 40 */
+	INTERCEPT_WBINVD,
+	INTERCEPT_MONITOR,
+	INTERCEPT_MWAIT,
+	INTERCEPT_MWAIT_COND,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+	u16 intercept_cr_read;
+	u16 intercept_cr_write;
+	u16 intercept_dr_read;
+	u16 intercept_dr_write;
+	u32 intercept_exceptions;
+	u64 intercept;
+	u8 reserved_1[44];
+	u64 iopm_base_pa;
+	u64 msrpm_base_pa;
+	u64 tsc_offset;
+	u32 asid;
+	u8 tlb_ctl;
+	u8 reserved_2[3];
+	u32 int_ctl;
+	u32 int_vector;
+	u32 int_state;
+	u8 reserved_3[4];
+	u32 exit_code;
+	u32 exit_code_hi;
+	u64 exit_info_1;
+	u64 exit_info_2;
+	u32 exit_int_info;
+	u32 exit_int_info_err;
+	u64 nested_ctl;
+	u8 reserved_4[16];
+	u32 event_inj;
+	u32 event_inj_err;
+	u64 nested_cr3;
+	u64 lbr_ctl;
+	u8 reserved_5[832];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+	u16 selector;
+	u16 attrib;
+	u32 limit;
+	u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+	struct vmcb_seg es;
+	struct vmcb_seg cs;
+	struct vmcb_seg ss;
+	struct vmcb_seg ds;
+	struct vmcb_seg fs;
+	struct vmcb_seg gs;
+	struct vmcb_seg gdtr;
+	struct vmcb_seg ldtr;
+	struct vmcb_seg idtr;
+	struct vmcb_seg tr;
+	u8 reserved_1[43];
+	u8 cpl;
+	u8 reserved_2[4];
+	u64 efer;
+	u8 reserved_3[112];
+	u64 cr4;
+	u64 cr3;
+	u64 cr0;
+	u64 dr7;
+	u64 dr6;
+	u64 rflags;
+	u64 rip;
+	u8 reserved_4[88];
+	u64 rsp;
+	u8 reserved_5[24];
+	u64 rax;
+	u64 star;
+	u64 lstar;
+	u64 cstar;
+	u64 sfmask;
+	u64 kernel_gs_base;
+	u64 sysenter_cs;
+	u64 sysenter_esp;
+	u64 sysenter_eip;
+	u64 cr2;
+	u8 reserved_6[32];
+	u64 g_pat;
+	u64 dbgctl;
+	u64 br_from;
+	u64 br_to;
+	u64 last_excp_from;
+	u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+	struct vmcb_control_area control;
+	struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FEATURE_SHIFT 2
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define MSR_EFER_SVME_MASK (1ULL << 12)
+#define MSR_VM_CR       0xc0010114
+#define MSR_VM_HSAVE_PA 0xc0010117ULL
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_MASK 1
+#define INTERCEPT_CR3_MASK (1 << 3)
+#define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
+
+#define INTERCEPT_DR0_MASK 1
+#define INTERCEPT_DR1_MASK (1 << 1)
+#define INTERCEPT_DR2_MASK (1 << 2)
+#define INTERCEPT_DR3_MASK (1 << 3)
+#define INTERCEPT_DR4_MASK (1 << 4)
+#define INTERCEPT_DR5_MASK (1 << 5)
+#define INTERCEPT_DR6_MASK (1 << 6)
+#define INTERCEPT_DR7_MASK (1 << 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1U << 31)	/* unsigned: 1 << 31 overflows a signed int */
+#define SVM_EVTINJ_VALID_ERR (1 << 11)	/* tested together with the VALID bit */
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+
+#define	SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define	SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define	SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define	SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define	SVM_EXIT_READ_CR0 	0x000
+#define	SVM_EXIT_READ_CR3 	0x003
+#define	SVM_EXIT_READ_CR4 	0x004
+#define	SVM_EXIT_READ_CR8 	0x008
+#define	SVM_EXIT_WRITE_CR0 	0x010
+#define	SVM_EXIT_WRITE_CR3 	0x013
+#define	SVM_EXIT_WRITE_CR4 	0x014
+#define	SVM_EXIT_WRITE_CR8 	0x018
+#define	SVM_EXIT_READ_DR0 	0x020
+#define	SVM_EXIT_READ_DR1 	0x021
+#define	SVM_EXIT_READ_DR2 	0x022
+#define	SVM_EXIT_READ_DR3 	0x023
+#define	SVM_EXIT_READ_DR4 	0x024
+#define	SVM_EXIT_READ_DR5 	0x025
+#define	SVM_EXIT_READ_DR6 	0x026
+#define	SVM_EXIT_READ_DR7 	0x027
+#define	SVM_EXIT_WRITE_DR0 	0x030
+#define	SVM_EXIT_WRITE_DR1 	0x031
+#define	SVM_EXIT_WRITE_DR2 	0x032
+#define	SVM_EXIT_WRITE_DR3 	0x033
+#define	SVM_EXIT_WRITE_DR4 	0x034
+#define	SVM_EXIT_WRITE_DR5 	0x035
+#define	SVM_EXIT_WRITE_DR6 	0x036
+#define	SVM_EXIT_WRITE_DR7 	0x037
+#define SVM_EXIT_EXCP_BASE      0x040
+#define SVM_EXIT_INTR		0x060
+#define SVM_EXIT_NMI		0x061
+#define SVM_EXIT_SMI		0x062
+#define SVM_EXIT_INIT		0x063
+#define SVM_EXIT_VINTR		0x064
+#define SVM_EXIT_CR0_SEL_WRITE	0x065
+#define SVM_EXIT_IDTR_READ	0x066
+#define SVM_EXIT_GDTR_READ	0x067
+#define SVM_EXIT_LDTR_READ	0x068
+#define SVM_EXIT_TR_READ	0x069
+#define SVM_EXIT_IDTR_WRITE	0x06a
+#define SVM_EXIT_GDTR_WRITE	0x06b
+#define SVM_EXIT_LDTR_WRITE	0x06c
+#define SVM_EXIT_TR_WRITE	0x06d
+#define SVM_EXIT_RDTSC		0x06e
+#define SVM_EXIT_RDPMC		0x06f
+#define SVM_EXIT_PUSHF		0x070
+#define SVM_EXIT_POPF		0x071
+#define SVM_EXIT_CPUID		0x072
+#define SVM_EXIT_RSM		0x073
+#define SVM_EXIT_IRET		0x074
+#define SVM_EXIT_SWINT		0x075
+#define SVM_EXIT_INVD		0x076
+#define SVM_EXIT_PAUSE		0x077
+#define SVM_EXIT_HLT		0x078
+#define SVM_EXIT_INVLPG		0x079
+#define SVM_EXIT_INVLPGA	0x07a
+#define SVM_EXIT_IOIO		0x07b
+#define SVM_EXIT_MSR		0x07c
+#define SVM_EXIT_TASK_SWITCH	0x07d
+#define SVM_EXIT_FERR_FREEZE	0x07e
+#define SVM_EXIT_SHUTDOWN	0x07f
+#define SVM_EXIT_VMRUN		0x080
+#define SVM_EXIT_VMMCALL	0x081
+#define SVM_EXIT_VMLOAD		0x082
+#define SVM_EXIT_VMSAVE		0x083
+#define SVM_EXIT_STGI		0x084
+#define SVM_EXIT_CLGI		0x085
+#define SVM_EXIT_SKINIT		0x086
+#define SVM_EXIT_RDTSCP		0x087
+#define SVM_EXIT_ICEBP		0x088
+#define SVM_EXIT_WBINVD		0x089
+#define SVM_EXIT_MONITOR	0x08a
+#define SVM_EXIT_MWAIT		0x08b
+#define SVM_EXIT_MWAIT_COND	0x08c
+#define SVM_EXIT_NPF  		0x400
+
+#define SVM_EXIT_ERR		(-1)	/* parenthesized so it expands safely in any expression */
+
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
+
+#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
+#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
+#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
+#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
+#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
+#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
+
+#endif
+
diff -puN /dev/null arch/x86/kvm/vmx.c
--- /dev/null
+++ a/arch/x86/kvm/vmx.c
@@ -0,0 +1,2678 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "vmx.h"
+#include "segment_descriptor.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+
+#include <asm/io.h>
+#include <asm/desc.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static int bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, 0);
+
+struct vmcs {
+	u32 revision_id;
+	u32 abort;
+	char data[0];
+};
+
+struct vcpu_vmx {
+	struct kvm_vcpu       vcpu;
+	int                   launched;
+	u8                    fail;
+	u32                   idt_vectoring_info;
+	struct kvm_msr_entry *guest_msrs;
+	struct kvm_msr_entry *host_msrs;
+	int                   nmsrs;
+	int                   save_nmsrs;
+	int                   msr_offset_efer;
+#ifdef CONFIG_X86_64
+	int                   msr_offset_kernel_gs_base;
+#endif
+	struct vmcs          *vmcs;
+	struct {
+		int           loaded;
+		u16           fs_sel, gs_sel, ldt_sel;
+		int           gs_ldt_reload_needed;
+		int           fs_reload_needed;
+		int           guest_efer_loaded;
+	} host_state;
+	struct {
+		struct {
+			bool pending;
+			u8 vector;
+			unsigned rip;
+		} irq;
+	} rmode;
+};
+
+static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static int init_rmode_tss(struct kvm *kvm);
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+
+static struct page *vmx_io_bitmap_a;
+static struct page *vmx_io_bitmap_b;
+
+static struct vmcs_config {
+	int size;
+	int order;
+	u32 revision_id;
+	u32 pin_based_exec_ctrl;
+	u32 cpu_based_exec_ctrl;
+	u32 cpu_based_2nd_exec_ctrl;
+	u32 vmexit_ctrl;
+	u32 vmentry_ctrl;
+} vmcs_config;
+
+#define VMX_SEGMENT_FIELD(seg)					\
+	[VCPU_SREG_##seg] = {                                   \
+		.selector = GUEST_##seg##_SELECTOR,		\
+		.base = GUEST_##seg##_BASE,		   	\
+		.limit = GUEST_##seg##_LIMIT,		   	\
+		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
+	}
+
+static struct kvm_vmx_segment_field {
+	unsigned selector;
+	unsigned base;
+	unsigned limit;
+	unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+	VMX_SEGMENT_FIELD(CS),
+	VMX_SEGMENT_FIELD(DS),
+	VMX_SEGMENT_FIELD(ES),
+	VMX_SEGMENT_FIELD(FS),
+	VMX_SEGMENT_FIELD(GS),
+	VMX_SEGMENT_FIELD(SS),
+	VMX_SEGMENT_FIELD(TR),
+	VMX_SEGMENT_FIELD(LDTR),
+};
+
+/*
+ * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const u32 vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+#endif
+	MSR_EFER, MSR_K6_STAR,
+};
+#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
+
+static void load_msrs(struct kvm_msr_entry *e, int n)
+{
+	int left;
+
+	for (left = n; left > 0; --left, ++e)	/* write each entry's data to its MSR */
+		wrmsrl(e->index, e->data);
+}
+
+static void save_msrs(struct kvm_msr_entry *e, int n)
+{
+	int left;
+
+	for (left = n; left > 0; --left, ++e)	/* read each MSR into its entry */
+		rdmsrl(e->index, e->data);
+}
+
+static inline int is_page_fault(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_no_device(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_invalid_opcode(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_external_interrupt(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static inline int cpu_has_vmx_tpr_shadow(void)
+{
+	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+}
+
+static inline int vm_need_tpr_shadow(struct kvm *kvm)
+{
+	return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+}
+
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+	return (vmcs_config.cpu_based_exec_ctrl &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+	return (vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+		(irqchip_in_kernel(kvm)));
+}
+
+static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+{
+	int i = 0;
+
+	/* Linear scan of guest_msrs[0..nmsrs); -1 when msr is absent. */
+	while (i < vmx->nmsrs && vmx->guest_msrs[i].index != msr)
+		++i;
+	return i < vmx->nmsrs ? i : -1;
+}
+
+static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+{
+	int slot = __find_msr_index(vmx, msr);
+
+	/* A negative slot means the MSR is not in guest_msrs[]. */
+	if (slot < 0)
+		return NULL;
+	return &vmx->guest_msrs[slot];
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(vmcs);	/* the instruction is given the physical address */
+	u8 error;
+
+	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)	/* rax points at the in-memory operand */
+		      : "cc", "memory");
+	if (error)	/* setna: non-zero when the instruction left CF or ZF set */
+		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+		       vmcs, phys_addr);
+}
+
+static void __vcpu_clear(void *arg)	/* callback passed to smp_call_function_single() by vcpu_clear() */
+{
+	struct vcpu_vmx *vmx = arg;
+	int cpu = raw_smp_processor_id();
+
+	if (vmx->vcpu.cpu == cpu)	/* only clear on the CPU recorded for this vcpu */
+		vmcs_clear(vmx->vmcs);
+	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+		per_cpu(current_vmcs, cpu) = NULL;	/* drop this CPU's current_vmcs reference */
+	rdtscll(vmx->vcpu.arch.host_tsc);	/* record the TSC value in host_tsc */
+}
+
+static void vcpu_clear(struct vcpu_vmx *vmx)
+{
+	if (vmx->vcpu.cpu == -1)
+		return;		/* NOTE(review): -1 presumably means "not loaded on any CPU" - confirm */
+	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);	/* run the clear on the recorded CPU */
+	vmx->launched = 0;
+}
+
+static unsigned long vmcs_readl(unsigned long field)
+{
+	unsigned long value;
+
+	asm volatile (ASM_VMX_VMREAD_RDX_RAX
+		      : "=a"(value) : "d"(field) : "cc");
+	return value;
+}
+
+static u16 vmcs_read16(unsigned long field)
+{
+	return vmcs_readl(field);
+}
+
+static u32 vmcs_read32(unsigned long field)
+{
+	return vmcs_readl(field);
+}
+
+static u64 vmcs_read64(unsigned long field)
+{
+#ifdef CONFIG_X86_64
+	return vmcs_readl(field);
+#else	/* 32-bit: two reads, field+1 supplies bits 63:32 */
+	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
+#endif
+}
+
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+	dump_stack();
+}
+
+static void vmcs_writel(unsigned long field, unsigned long value)
+{
+	u8 error;
+
+	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
+		       : "=q"(error) : "a"(value), "d"(field) : "cc");
+	if (unlikely(error))
+		vmwrite_error(field, value);
+}
+
+static void vmcs_write16(unsigned long field, u16 value)
+{
+	vmcs_writel(field, value);
+}
+
+static void vmcs_write32(unsigned long field, u32 value)
+{
+	vmcs_writel(field, value);
+}
+
+static void vmcs_write64(unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+	vmcs_writel(field, value);
+#else	/* 32-bit: low half goes to field, high half to field+1 */
+	vmcs_writel(field, value);
+	asm volatile ("");	/* NOTE(review): empty asm between the two writes - purpose not evident here */
+	vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+	vmcs_writel(field, vmcs_readl(field) & ~mask);
+}
+
+static void vmcs_set_bits(unsigned long field, u32 mask)
+{
+	vmcs_writel(field, vmcs_readl(field) | mask);
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+	u32 eb;
+
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);	/* always part of the bitmap */
+	if (!vcpu->fpu_active)
+		eb |= 1u << NM_VECTOR;	/* FPU inactive: add #NM */
+	if (vcpu->guest_debug.enabled)
+		eb |= 1u << 1;		/* vector 1 is #DB, the debug exception */
+	if (vcpu->arch.rmode.active)
+		eb = ~0;		/* rmode active: every vector is set */
+	vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+static void reload_tss(void)
+{
+#ifndef CONFIG_X86_64
+
+	/*
+	 * VT restores TR but not its size.  Useless.
+	 */
+	struct descriptor_table gdt;
+	struct segment_descriptor *descs;
+
+	get_gdt(&gdt);
+	descs = (void *)gdt.base;
+	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
+	load_TR_desc();
+#endif
+}
+
+static void load_transition_efer(struct vcpu_vmx *vmx)
+{
+	int efer_offset = vmx->msr_offset_efer;
+	u64 host_efer, guest_efer, ignore_bits;
+
+	if (efer_offset < 0)	/* no EFER slot: must not index the arrays */
+		return;
+	host_efer = vmx->host_msrs[efer_offset].data;
+	guest_efer = vmx->guest_msrs[efer_offset].data;
+	/*
+	 * Write a guest EFER only when it differs from the host's in bits
+	 * that matter: NX is emulated; LMA and LME handled by hardware; SCE
+	 * meaningless outside long mode
+	 */
+	ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+	ignore_bits |= EFER_LMA | EFER_LME;
+	/* SCE is meaningful only in long mode on Intel */
+	if (guest_efer & EFER_LMA)
+		ignore_bits &= ~(u64)EFER_SCE;
+#endif
+	if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+		return;
+
+	vmx->host_state.guest_efer_loaded = 1;	/* tells reload_host_efer() to restore */
+	guest_efer = (guest_efer & ~ignore_bits) | (host_efer & ignore_bits);
+	wrmsrl(MSR_EFER, guest_efer);
+	vmx->vcpu.stat.efer_reload++;
+}
+
+/*
+ * Restore the host EFER entry via load_msrs() if load_transition_efer()
+ * flagged a guest value as loaded, and clear that flag.
+ */
+static void reload_host_efer(struct vcpu_vmx *vmx)
+{
+	if (!vmx->host_state.guest_efer_loaded)
+		return;
+
+	vmx->host_state.guest_efer_loaded = 0;
+	load_msrs(&vmx->host_msrs[vmx->msr_offset_efer], 1);
+}
+
+/*
+ * Switch from host to guest register state ahead of running the guest.
+ * Does nothing if host_state.loaded is already set.  Records the host
+ * LDT/FS/GS selectors, programs the VMCS host FS/GS selectors and bases,
+ * then loads the guest MSRs and the transitional EFER.
+ */
+static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->host_state.loaded)
+		return;
+
+	vmx->host_state.loaded = 1;
+	/*
+	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
+	 * allow segment selectors with cpl > 0 or ti == 1.
+	 */
+	vmx->host_state.ldt_sel = read_ldt();
+	/* A non-zero LDT selector already requests the gs/ldt reload path. */
+	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+	vmx->host_state.fs_sel = read_fs();
+	/*
+	 * Low three selector bits clear: hand the selector to the VMCS as
+	 * is.  Otherwise write 0 and remember to reload fs afterwards.
+	 */
+	if (!(vmx->host_state.fs_sel & 7)) {
+		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
+		vmx->host_state.fs_reload_needed = 0;
+	} else {
+		vmcs_write16(HOST_FS_SELECTOR, 0);
+		vmx->host_state.fs_reload_needed = 1;
+	}
+	/* Same treatment for gs, sharing the reload flag with the LDT. */
+	vmx->host_state.gs_sel = read_gs();
+	if (!(vmx->host_state.gs_sel & 7))
+		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
+	else {
+		vmcs_write16(HOST_GS_SELECTOR, 0);
+		vmx->host_state.gs_ldt_reload_needed = 1;
+	}
+
+	/* Host FS/GS bases: from the MSRs on 64-bit, from the descriptors otherwise. */
+#ifdef CONFIG_X86_64
+	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+	/* For a long-mode guest, snapshot the host's kernel GS base entry. */
+	if (is_long_mode(&vmx->vcpu))
+		save_msrs(vmx->host_msrs +
+			  vmx->msr_offset_kernel_gs_base, 1);
+
+#endif
+	/* Load the first save_nmsrs guest MSR entries, then fix up EFER. */
+	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+	load_transition_efer(vmx);
+}
+
+/*
+ * Undo vmx_save_host_state(): restore the host segment registers, task
+ * register and MSRs.  Does nothing unless host_state.loaded is set.
+ */
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+	unsigned long flags;
+
+	if (!vmx->host_state.loaded)
+		return;
+
+	++vmx->vcpu.stat.host_state_reload;
+	vmx->host_state.loaded = 0;
+	if (vmx->host_state.fs_reload_needed)
+		load_fs(vmx->host_state.fs_sel);
+	if (vmx->host_state.gs_ldt_reload_needed) {
+		load_ldt(vmx->host_state.ldt_sel);
+		/*
+		 * If we have to reload gs, we must take care to
+		 * preserve our gs base.
+		 */
+		/* Interrupts stay off between the gs load and the base restore. */
+		local_irq_save(flags);
+		load_gs(vmx->host_state.gs_sel);
+#ifdef CONFIG_X86_64
+		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+#endif
+		local_irq_restore(flags);
+	}
+	reload_tss();
+	/* Capture the guest's current MSR values before loading the host's. */
+	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
+	reload_host_efer(vmx);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ *
+ * If the vcpu last ran on a different cpu it is cleared first and its
+ * apic timer is migrated.  The VMCS is made current on @cpu with VMPTRLD
+ * when it is not already the per-cpu current one, and the per-cpu host
+ * fields (TR base, GDT base, SYSENTER_ESP) plus the TSC offset are
+ * refreshed after a cpu change.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 phys_addr = __pa(vmx->vmcs);
+	u64 tsc_this, delta;
+
+	if (vcpu->cpu != cpu) {
+		vcpu_clear(vmx);
+		kvm_migrate_apic_timer(vcpu);
+	}
+
+	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
+		u8 error;
+
+		per_cpu(current_vmcs, cpu) = vmx->vmcs;
+		/* setna captures the failure indication left by VMPTRLD. */
+		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
+			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+			      : "cc");
+		if (error)
+			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+			       vmx->vmcs, phys_addr);
+	}
+
+	if (vcpu->cpu != cpu) {
+		struct descriptor_table dt;
+		unsigned long sysenter_esp;
+
+		vcpu->cpu = cpu;
+		/*
+		 * Linux uses per-cpu TSS and GDT, so set these when switching
+		 * processors.
+		 */
+		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
+		get_gdt(&dt);
+		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+
+		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+		/*
+		 * Make sure the time stamp counter is monotonic: shift
+		 * TSC_OFFSET by the difference between the recorded
+		 * host_tsc and this cpu's current TSC.
+		 */
+		rdtscll(tsc_this);
+		delta = vcpu->arch.host_tsc - tsc_this;
+		vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+	}
+}
+
+/* Counterpart of vmx_vcpu_load(): hands the host its register state back. */
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmx_load_host_state(vmx);
+}
+
+/*
+ * Mark the guest FPU active.  CR0.TS is cleared in GUEST_CR0 and then set
+ * again if the guest's own cr0 has TS set; the exception bitmap is
+ * recomputed afterwards.  No-op when the FPU is already active.
+ */
+static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->fpu_active) {
+		vcpu->fpu_active = 1;
+		vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
+		if (vcpu->arch.cr0 & X86_CR0_TS)
+			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		update_exception_bitmap(vcpu);
+	}
+}
+
+/*
+ * Mark the guest FPU inactive: set CR0.TS in GUEST_CR0 and recompute the
+ * exception bitmap.  No-op when the FPU is already inactive.
+ */
+static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->fpu_active) {
+		vcpu->fpu_active = 0;
+		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		update_exception_bitmap(vcpu);
+	}
+}
+
+/* Thin wrapper that runs vcpu_clear() on the VMX state of @vcpu. */
+static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vcpu_clear(vmx);
+}
+
+/* Returns the guest RFLAGS value currently held in the VMCS. */
+static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags = vmcs_readl(GUEST_RFLAGS);
+
+	return rflags;
+}
+
+/*
+ * Writes the guest RFLAGS into the VMCS.  While rmode is active, IOPL and
+ * VM are forced on in the value that is written.
+ */
+static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	unsigned long hw_rflags = rflags;
+
+	if (vcpu->arch.rmode.active)
+		hw_rflags |= IOPL_MASK | X86_EFLAGS_VM;
+
+	vmcs_writel(GUEST_RFLAGS, hw_rflags);
+}
+
+/*
+ * Advance GUEST_RIP past the instruction that caused the exit, using the
+ * length reported in VM_EXIT_INSTRUCTION_LEN, and mark the interrupt
+ * window as open.
+ */
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	const u32 blocking_bits = 3;
+	unsigned long next_rip;
+	u32 intr_state;
+
+	next_rip = vmcs_readl(GUEST_RIP);
+	next_rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs_writel(GUEST_RIP, next_rip);
+
+	/*
+	 * The instruction has been emulated, so any temporary interrupt
+	 * blocking (the low two interruptibility bits) no longer applies.
+	 */
+	intr_state = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	if (intr_state & blocking_bits)
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+			     intr_state & ~blocking_bits);
+
+	vcpu->arch.interrupt_window_open = 1;
+}
+
+/*
+ * Program the VM-entry interruption fields so that exception @nr is
+ * delivered on the next entry.  @error_code is written only when
+ * @has_error_code is true.
+ */
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
+{
+	u32 intr_info = nr | INTR_TYPE_EXCEPTION | INTR_INFO_VALID_MASK;
+
+	if (has_error_code)
+		intr_info |= INTR_INFO_DELIEVER_CODE_MASK;
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+	if (has_error_code)
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+/*
+ * Returns true when VECTORING_INFO_VALID_MASK is clear in the cached
+ * idt_vectoring_info.
+ * NOTE(review): the function name suggests the opposite polarity --
+ * confirm against the callers.
+ */
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+	u32 vectoring = to_vmx(vcpu)->idt_vectoring_info;
+
+	return (vectoring & VECTORING_INFO_VALID_MASK) == 0;
+}
+
+/*
+ * Exchange entries @from and @to in both the guest and the host MSR
+ * arrays, so the two arrays keep the same ordering.
+ */
+#ifdef CONFIG_X86_64
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+	struct kvm_msr_entry guest_saved = vmx->guest_msrs[to];
+	struct kvm_msr_entry host_saved;
+
+	vmx->guest_msrs[to] = vmx->guest_msrs[from];
+	vmx->guest_msrs[from] = guest_saved;
+
+	host_saved = vmx->host_msrs[to];
+	vmx->host_msrs[to] = vmx->host_msrs[from];
+	vmx->host_msrs[from] = host_saved;
+}
+#endif
+
+/*
+ * Decide which MSR entries are switched on guest entry/exit and move them
+ * to the front of the guest/host MSR arrays; save_nmsrs records how many.
+ * The 64-bit MSRs are only included for a long-mode guest, since touching
+ * MSRs is very expensive.  Also caches the array positions of
+ * MSR_KERNEL_GS_BASE (64-bit builds) and MSR_EFER.
+ */
+static void setup_msrs(struct vcpu_vmx *vmx)
+{
+	int nr_switched = 0;
+
+#ifdef CONFIG_X86_64
+	if (is_long_mode(&vmx->vcpu)) {
+		/* Switched for every long-mode guest, in this order. */
+		static const u32 lmode_msrs[] = {
+			MSR_SYSCALL_MASK,
+			MSR_LSTAR,
+			MSR_CSTAR,
+			MSR_KERNEL_GS_BASE,
+		};
+		unsigned int n;
+		int slot;
+
+		for (n = 0; n < sizeof(lmode_msrs) / sizeof(lmode_msrs[0]); n++) {
+			slot = __find_msr_index(vmx, lmode_msrs[n]);
+			if (slot >= 0)
+				move_msr_up(vmx, slot, nr_switched++);
+		}
+		/*
+		 * MSR_K6_STAR is only needed on long mode guests, and only
+		 * if efer.sce is enabled.
+		 */
+		slot = __find_msr_index(vmx, MSR_K6_STAR);
+		if (slot >= 0 && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
+			move_msr_up(vmx, slot, nr_switched++);
+	}
+#endif
+	vmx->save_nmsrs = nr_switched;
+
+#ifdef CONFIG_X86_64
+	vmx->msr_offset_kernel_gs_base =
+		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
+#endif
+	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+}
+
+/*
+ * Returns the guest's view of the timestamp counter:
+ * guest_tsc = host_tsc + tsc_offset    -- 21.3
+ */
+static u64 guest_read_tsc(void)
+{
+	u64 now;
+
+	rdtscll(now);
+	return now + vmcs_read64(TSC_OFFSET);
+}
+
+/*
+ * Sets the guest's timestamp counter to 'guest_tsc' by programming the
+ * offset: guest_tsc = host_tsc + tsc_offset, hence
+ * tsc_offset = guest_tsc - host_tsc.
+ */
+static void guest_write_tsc(u64 guest_tsc)
+{
+	u64 now;
+	u64 offset;
+
+	rdtscll(now);
+	offset = guest_tsc - now;
+	vmcs_write64(TSC_OFFSET, offset);
+}
+
+/*
+ * Reads the value of MSR 'msr_index' into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ *
+ * MSRs backed by VMCS fields are read from the VMCS; anything else is
+ * looked up in the vcpu's MSR entries and, failing that, handed to
+ * kvm_get_msr_common().
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	struct kvm_msr_entry *entry;
+
+	if (!pdata) {
+		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
+		return -EINVAL;
+	}
+
+	switch (msr_index) {
+#ifdef CONFIG_X86_64
+	case MSR_FS_BASE:
+		*pdata = vmcs_readl(GUEST_FS_BASE);
+		return 0;
+	case MSR_GS_BASE:
+		*pdata = vmcs_readl(GUEST_GS_BASE);
+		return 0;
+	case MSR_EFER:
+		return kvm_get_msr_common(vcpu, msr_index, pdata);
+#endif
+	case MSR_IA32_TIME_STAMP_COUNTER:
+		*pdata = guest_read_tsc();
+		return 0;
+	case MSR_IA32_SYSENTER_CS:
+		*pdata = vmcs_read32(GUEST_SYSENTER_CS);
+		return 0;
+	case MSR_IA32_SYSENTER_EIP:
+		*pdata = vmcs_readl(GUEST_SYSENTER_EIP);
+		return 0;
+	case MSR_IA32_SYSENTER_ESP:
+		*pdata = vmcs_readl(GUEST_SYSENTER_ESP);
+		return 0;
+	default:
+		break;
+	}
+
+	entry = find_msr_entry(to_vmx(vcpu), msr_index);
+	if (!entry)
+		return kvm_get_msr_common(vcpu, msr_index, pdata);
+
+	*pdata = entry->data;
+	return 0;
+}
+
+/*
+ * Writes an msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ *
+ * MSRs backed by VMCS fields are written to the VMCS; anything else is
+ * stored in the vcpu's MSR entries when one exists, and otherwise handed
+ * to kvm_set_msr_common().
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *entry;
+#ifdef CONFIG_X86_64
+	int ret;
+#endif
+
+	switch (msr_index) {
+#ifdef CONFIG_X86_64
+	case MSR_EFER:
+		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		/* Guest state is live: redo the EFER transition right away. */
+		if (vmx->host_state.loaded) {
+			reload_host_efer(vmx);
+			load_transition_efer(vmx);
+		}
+		return ret;
+	case MSR_FS_BASE:
+		vmcs_writel(GUEST_FS_BASE, data);
+		return 0;
+	case MSR_GS_BASE:
+		vmcs_writel(GUEST_GS_BASE, data);
+		return 0;
+#endif
+	case MSR_IA32_SYSENTER_CS:
+		vmcs_write32(GUEST_SYSENTER_CS, data);
+		return 0;
+	case MSR_IA32_SYSENTER_EIP:
+		vmcs_writel(GUEST_SYSENTER_EIP, data);
+		return 0;
+	case MSR_IA32_SYSENTER_ESP:
+		vmcs_writel(GUEST_SYSENTER_ESP, data);
+		return 0;
+	case MSR_IA32_TIME_STAMP_COUNTER:
+		guest_write_tsc(data);
+		return 0;
+	default:
+		break;
+	}
+
+	entry = find_msr_entry(vmx, msr_index);
+	if (!entry)
+		return kvm_set_msr_common(vcpu, msr_index, data);
+
+	entry->data = data;
+	/* Guest MSRs are live: push the updated table to the cpu. */
+	if (vmx->host_state.loaded)
+		load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
+	return 0;
+}
+
+/*
+ * Copy the guest rsp and rip out of the vmcs into the vcpu structure, so
+ * that they can be accessed like the other registers through
+ * vcpu->arch.regs / vcpu->arch.rip.
+ */
+static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
+{
+	unsigned long guest_rsp = vmcs_readl(GUEST_RSP);
+	unsigned long guest_rip = vmcs_readl(GUEST_RIP);
+
+	vcpu->arch.regs[VCPU_REGS_RSP] = guest_rsp;
+	vcpu->arch.rip = guest_rip;
+}
+
+/*
+ * Write the vcpu structure's rsp and rip back into the vmcs.  Should be
+ * called after they may have been modified.
+ */
+static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+{
+	unsigned long guest_rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	unsigned long guest_rip = vcpu->arch.rip;
+
+	vmcs_writel(GUEST_RSP, guest_rsp);
+	vmcs_writel(GUEST_RIP, guest_rip);
+}
+
+/*
+ * Apply the debug settings in @dbg to @vcpu: record the enabled
+ * breakpoint addresses and the single-step flag, build a DR7 value for
+ * the guest, refresh the exception bitmap and write GUEST_DR7.
+ * Always returns 0.
+ */
+static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{
+	unsigned long dr7 = 0x400;
+	int was_stepping = vcpu->guest_debug.singlestep;
+
+	vcpu->guest_debug.enabled = dbg->enabled;
+	if (!vcpu->guest_debug.enabled) {
+		vcpu->guest_debug.singlestep = 0;
+	} else {
+		int n;
+
+		dr7 |= 0x200;  /* exact */
+		for (n = 0; n < 4; n++) {
+			if (dbg->breakpoints[n].enabled) {
+				vcpu->guest_debug.bp[n] =
+					dbg->breakpoints[n].address;
+				dr7 |= 2 << (n * 2);    /* global enable */
+				/*
+				 * The field at bit n*4+16 is left 0:
+				 * execution breakpoint.
+				 */
+			}
+		}
+
+		vcpu->guest_debug.singlestep = dbg->singlestep;
+	}
+
+	/* Single-stepping was just turned off: drop TF and RF from RFLAGS. */
+	if (was_stepping && !vcpu->guest_debug.singlestep) {
+		unsigned long rflags = vmcs_readl(GUEST_RFLAGS);
+
+		rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		vmcs_writel(GUEST_RFLAGS, rflags);
+	}
+
+	update_exception_bitmap(vcpu);
+	vmcs_writel(GUEST_DR7, dr7);
+
+	return 0;
+}
+
+/*
+ * Returns the vector of an external interrupt recorded in the cached
+ * idt_vectoring_info, or -1 when the info is not valid or describes
+ * something other than an external interrupt.
+ */
+static int vmx_get_irq(struct kvm_vcpu *vcpu)
+{
+	u32 vectoring = to_vmx(vcpu)->idt_vectoring_info;
+
+	if (!(vectoring & INTR_INFO_VALID_MASK))
+		return -1;
+
+	if (is_external_interrupt(vectoring))
+		return vectoring & VECTORING_INFO_VECTOR_MASK;
+
+	printk(KERN_DEBUG "pending exception: not handled yet\n");
+	return -1;
+}
+
+/* Reports whether CPUID leaf 1 advertises VMX (ECX bit 5). */
+static __init int cpu_has_kvm_support(void)
+{
+	unsigned long features = cpuid_ecx(1);
+
+	/* CPUID.1:ECX.VMX[bit 5] -> VT */
+	return test_bit(5, &features);
+}
+
+/*
+ * Returns non-zero when MSR_IA32_FEATURE_CONTROL is locked while VMXON is
+ * not enabled, i.e. VMX has been turned off and cannot be turned on.
+ */
+static __init int vmx_disabled_by_bios(void)
+{
+	const u64 relevant = MSR_IA32_FEATURE_CONTROL_LOCKED |
+			     MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED;
+	u64 feature_control;
+
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, feature_control);
+
+	/* locked but not enabled */
+	return (feature_control & relevant) == MSR_IA32_FEATURE_CONTROL_LOCKED;
+}
+
+/*
+ * Turn on VMX operation on the calling cpu: make sure
+ * MSR_IA32_FEATURE_CONTROL has both the lock and VMXON-enable bits set,
+ * set CR4.VMXE and execute VMXON on this cpu's vmxarea.  @garbage is
+ * unused.
+ */
+static void hardware_enable(void *garbage)
+{
+	int cpu = raw_smp_processor_id();
+	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+	u64 old;
+
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+	/* Only write the MSR when one of the two bits is still missing. */
+	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
+		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+	    != (MSR_IA32_FEATURE_CONTROL_LOCKED |
+		MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+		/* enable and lock */
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
+		       MSR_IA32_FEATURE_CONTROL_LOCKED |
+		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+	/* VMXON is handed the address of the variable holding the physical address. */
+	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
+		      : "memory", "cc");
+}
+
+/* Leave VMX operation on the calling cpu by executing VMXOFF; @garbage is unused. */
+static void hardware_disable(void *garbage)
+{
+	asm volatile (ASM_VMX_VMXOFF : : : "cc");
+}
+
+/*
+ * Fit a set of VMX control bits to what capability MSR @msr permits.
+ * @ctl_min holds bits that are required, @ctl_opt bits that are merely
+ * wanted.  On success the adjusted value is stored in @result and 0 is
+ * returned; -EIO is returned when a required bit is not available.
+ */
+static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+				      u32 msr, u32 *result)
+{
+	u32 must_be_one, may_be_one;
+	u32 ctl;
+
+	rdmsr(msr, must_be_one, may_be_one);
+
+	/*
+	 * bit == 0 in high word ==> must be zero
+	 * bit == 1 in low word  ==> must be one
+	 */
+	ctl = ((ctl_min | ctl_opt) & may_be_one) | must_be_one;
+
+	/* Ensure minimum (required) set of control bits are supported. */
+	if ((ctl & ctl_min) != ctl_min)
+		return -EIO;
+
+	*result = ctl;
+	return 0;
+}
+
+/*
+ * Probe the VMX capability MSRs and fill in @vmcs_conf with the VMCS
+ * size, allocation order, revision id and the control-field values to
+ * use.  Returns 0 on success, or -EIO when a required control is not
+ * available or the basic VMX information is not acceptable.
+ */
+static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+{
+	u32 vmx_msr_low, vmx_msr_high;
+	u32 min, opt;
+	u32 _pin_based_exec_control = 0;
+	u32 _cpu_based_exec_control = 0;
+	u32 _cpu_based_2nd_exec_control = 0;
+	u32 _vmexit_control = 0;
+	u32 _vmentry_control = 0;
+
+	/* Pin-based controls: external-interrupt and NMI exiting are required. */
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = 0;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	/* Primary processor-based controls. */
+	min = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+	      CPU_BASED_CR8_LOAD_EXITING |
+	      CPU_BASED_CR8_STORE_EXITING |
+#endif
+	      CPU_BASED_USE_IO_BITMAPS |
+	      CPU_BASED_MOV_DR_EXITING |
+	      CPU_BASED_USE_TSC_OFFSETING;
+	opt = CPU_BASED_TPR_SHADOW |
+	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+				&_cpu_based_exec_control) < 0)
+		return -EIO;
+#ifdef CONFIG_X86_64
+	/* With a TPR shadow available, CR8 accesses need not exit. */
+	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+					   ~CPU_BASED_CR8_STORE_EXITING;
+#endif
+	/* Secondary controls are all optional. */
+	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+		min = 0;
+		opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+			SECONDARY_EXEC_WBINVD_EXITING;
+		if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+					&_cpu_based_2nd_exec_control) < 0)
+			return -EIO;
+	}
+#ifndef CONFIG_X86_64
+	/* On 32-bit builds the TPR shadow is only kept alongside APIC-access virtualization. */
+	if (!(_cpu_based_2nd_exec_control &
+				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+	/* VM-exit controls. */
+	min = 0;
+#ifdef CONFIG_X86_64
+	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+	opt = 0;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+				&_vmexit_control) < 0)
+		return -EIO;
+
+	/* VM-entry controls: nothing required, nothing optional. */
+	min = opt = 0;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+				&_vmentry_control) < 0)
+		return -EIO;
+
+	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+		return -EIO;
+
+#ifdef CONFIG_X86_64
+	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+	if (vmx_msr_high & (1u<<16))
+		return -EIO;
+#endif
+
+	/* Require Write-Back (WB) memory type for VMCS accesses. */
+	if (((vmx_msr_high >> 18) & 15) != 6)
+		return -EIO;
+
+	vmcs_conf->size = vmx_msr_high & 0x1fff;
+	/*
+	 * Derive the order from the size just stored in @vmcs_conf rather
+	 * than from the global vmcs_config, so the result is right for
+	 * whichever structure the caller passed in.
+	 */
+	vmcs_conf->order = get_order(vmcs_conf->size);
+	vmcs_conf->revision_id = vmx_msr_low;
+
+	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+	vmcs_conf->vmexit_ctrl         = _vmexit_control;
+	vmcs_conf->vmentry_ctrl        = _vmentry_control;
+
+	return 0;
+}
+
+/*
+ * Allocate a VMCS region on the memory node of @cpu, zero the first
+ * vmcs_config.size bytes and stamp it with the configured revision id.
+ * Returns NULL when the page allocation fails.
+ */
+static struct vmcs *alloc_vmcs_cpu(int cpu)
+{
+	struct page *pg;
+	struct vmcs *region;
+
+	pg = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, vmcs_config.order);
+	if (pg == NULL)
+		return NULL;
+
+	region = page_address(pg);
+	memset(region, 0, vmcs_config.size);
+	region->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+	return region;
+}
+
+/* Allocate a VMCS region on the node of the cpu this is called on. */
+static struct vmcs *alloc_vmcs(void)
+{
+	int this_cpu = raw_smp_processor_id();
+
+	return alloc_vmcs_cpu(this_cpu);
+}
+
+/* Release a VMCS region obtained from alloc_vmcs()/alloc_vmcs_cpu(). */
+static void free_vmcs(struct vmcs *vmcs)
+{
+	unsigned long addr = (unsigned long)vmcs;
+
+	free_pages(addr, vmcs_config.order);
+}
+
+/* Free the per-cpu vmxarea region of every online cpu. */
+static void free_kvm_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct vmcs *area = per_cpu(vmxarea, cpu);
+
+		free_vmcs(area);
+	}
+}
+
+/*
+ * Allocate a vmxarea region for every online cpu.  Returns 0 on success;
+ * on allocation failure calls free_kvm_area() and returns -ENOMEM.
+ */
+static __init int alloc_kvm_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct vmcs *area = alloc_vmcs_cpu(cpu);
+
+		if (area) {
+			per_cpu(vmxarea, cpu) = area;
+			continue;
+		}
+
+		free_kvm_area();
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * One-time setup: probe the VMX capabilities into vmcs_config, then
+ * allocate the per-cpu vmxarea regions.  Returns -EIO when probing fails,
+ * otherwise the result of alloc_kvm_area().
+ */
+static __init int hardware_setup(void)
+{
+	int probe = setup_vmcs_config(&vmcs_config);
+
+	if (probe < 0)
+		return -EIO;
+
+	return alloc_kvm_area();
+}
+
+/* Counterpart of hardware_setup(): releases the per-cpu vmxarea regions. */
+static __exit void hardware_unsetup(void)
+{
+	free_kvm_area();
+}
+
+/*
+ * Fix up data segment @seg when leaving emulated real mode.  If the
+ * segment base in the VMCS still equals the value saved in @save (and the
+ * second condition below holds), the saved selector/base/limit/AR are
+ * restored; otherwise only the access rights are rewritten, with the DPL
+ * taken from the current selector's RPL bits.
+ */
+static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+{
+	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+	/*
+	 * NOTE(review): AR_S_MASK is tested against save->base, although
+	 * its name suggests an access-rights bit (save->ar) -- confirm.
+	 */
+	if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
+		vmcs_write16(sf->selector, save->selector);
+		vmcs_writel(sf->base, save->base);
+		vmcs_write32(sf->limit, save->limit);
+		vmcs_write32(sf->ar_bytes, save->ar);
+	} else {
+		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
+			<< AR_DPL_SHIFT;
+		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+	}
+}
+
+/*
+ * Leave emulated real mode: restore the TR fields saved by enter_rmode(),
+ * undo the RFLAGS and CR4.VME changes, recompute the exception bitmap and
+ * fix up the segment registers for protected mode.
+ */
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	vcpu->arch.rmode.active = 0;
+
+	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
+
+	/* Drop the forced IOPL/VM bits and put the saved IOPL back. */
+	flags = vmcs_readl(GUEST_RFLAGS);
+	flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
+	flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
+	vmcs_writel(GUEST_RFLAGS, flags);
+
+	/* CR4.VME reverts to the value held in the CR4 read shadow. */
+	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+	update_exception_bitmap(vcpu);
+
+	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+
+	vmcs_write16(GUEST_SS_SELECTOR, 0);
+	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
+
+	/* Clear the RPL bits of the CS selector. */
+	vmcs_write16(GUEST_CS_SELECTOR,
+		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
+	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+}
+
+/*
+ * Returns the guest address used for the real-mode TSS: the configured
+ * kvm->arch.tss_addr when it is non-zero, otherwise the start of the last
+ * three pages of memory slot 0.
+ */
+static gva_t rmode_tss_base(struct kvm *kvm)
+{
+	gfn_t base_gfn;
+
+	if (kvm->arch.tss_addr)
+		return kvm->arch.tss_addr;
+
+	base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
+	return base_gfn << PAGE_SHIFT;
+}
+
+/*
+ * Prepare segment @seg for emulated real mode.  The current
+ * selector/base/limit/AR are stashed in @save, then the VMCS fields are
+ * rewritten so that selector == base >> 4, with a 64K limit and access
+ * rights 0xf3.
+ */
+static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+{
+	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+	save->selector = vmcs_read16(sf->selector);
+	save->base = vmcs_readl(sf->base);
+	save->limit = vmcs_read32(sf->limit);
+	save->ar = vmcs_read32(sf->ar_bytes);
+	vmcs_write16(sf->selector, save->base >> 4);
+	/* NOTE(review): base is read with vmcs_readl() but written with vmcs_write32(). */
+	vmcs_write32(sf->base, save->base & 0xfffff);
+	vmcs_write32(sf->limit, 0xffff);
+	vmcs_write32(sf->ar_bytes, 0xf3);
+}
+
+/*
+ * Enter emulated real mode.  The TR fields and IOPL are saved in
+ * vcpu->arch.rmode so that enter_pmode() can restore them; RFLAGS gets
+ * IOPL and VM forced on, CR4.VME is set, and the segment registers are
+ * rewritten so that selector == base >> 4.
+ */
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+
+	vcpu->arch.rmode.active = 1;
+
+	/* Save each TR field, then point TR at the real-mode TSS. */
+	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+
+	vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+
+	vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+	flags = vmcs_readl(GUEST_RFLAGS);
+	vcpu->arch.rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
+
+	flags |= IOPL_MASK | X86_EFLAGS_VM;
+
+	vmcs_writel(GUEST_RFLAGS, flags);
+	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+	update_exception_bitmap(vcpu);
+
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
+	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+
+	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+	/* A CS base of 0xffff0000 is replaced by 0xf0000 before deriving the selector. */
+	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+		vmcs_writel(GUEST_CS_BASE, 0xf0000);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+
+	fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+	fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+	fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+
+	kvm_mmu_reset_context(vcpu);
+	/* NOTE(review): the return value of init_rmode_tss() is ignored here. */
+	init_rmode_tss(vcpu->kvm);
+}
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Switch the guest into long mode: make sure TR describes a busy 64-bit
+ * TSS, set LMA in the shadow EFER, set LMA|LME in the EFER MSR entry and
+ * turn on VM_ENTRY_IA32E_MODE.
+ */
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+	u32 guest_tr_ar;
+
+	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
+		       __FUNCTION__);
+		vmcs_write32(GUEST_TR_AR_BYTES,
+			     (guest_tr_ar & ~AR_TYPE_MASK)
+			     | AR_TYPE_BUSY_64_TSS);
+	}
+
+	vcpu->arch.shadow_efer |= EFER_LMA;
+
+	/* NOTE(review): find_msr_entry()'s result is dereferenced unchecked. */
+	find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     vmcs_read32(VM_ENTRY_CONTROLS)
+		     | VM_ENTRY_IA32E_MODE);
+}
+
+/*
+ * Take the guest out of long mode: clear LMA in the shadow EFER and turn
+ * off VM_ENTRY_IA32E_MODE in the VM-entry controls.
+ */
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+	u32 entry_ctls;
+
+	vcpu->arch.shadow_efer &= ~EFER_LMA;
+
+	entry_ctls = vmcs_read32(VM_ENTRY_CONTROLS);
+	entry_ctls &= ~VM_ENTRY_IA32E_MODE;
+	vmcs_write32(VM_ENTRY_CONTROLS, entry_ctls);
+}
+
+#endif
+
+/*
+ * Refresh vcpu->arch.cr4: bits inside KVM_GUEST_CR4_MASK keep their
+ * cached value, all other bits are taken from GUEST_CR4 in the VMCS.
+ */
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.cr4 = (vcpu->arch.cr4 & KVM_GUEST_CR4_MASK) |
+			 (vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK);
+}
+
+/*
+ * Install a new guest CR0 value.  Handles the transitions this implies
+ * (real <-> protected mode via CR0.PE and, on 64-bit builds with
+ * EFER.LME set, entering/leaving long mode via CR0.PG), then writes the
+ * read shadow and the hardware GUEST_CR0 and caches @cr0 in the vcpu.
+ */
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	vmx_fpu_deactivate(vcpu);
+
+	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+		enter_pmode(vcpu);
+
+	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+		enter_rmode(vcpu);
+
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.shadow_efer & EFER_LME) {
+		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+			enter_lmode(vcpu);
+		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+			exit_lmode(vcpu);
+	}
+#endif
+
+	/* The guest reads back @cr0; hardware runs with the always-on bits added. */
+	vmcs_writel(CR0_READ_SHADOW, cr0);
+	vmcs_writel(GUEST_CR0,
+		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+	vcpu->arch.cr0 = cr0;
+
+	/* Re-activate the FPU when TS or PE is clear in the new value. */
+	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
+		vmx_fpu_activate(vcpu);
+}
+
+/*
+ * Write the guest CR3 into the VMCS and, when the guest's cr0 has PE set,
+ * deactivate the guest FPU.
+ */
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	vmcs_writel(GUEST_CR3, cr3);
+
+	if (!(vcpu->arch.cr0 & X86_CR0_PE))
+		return;
+
+	vmx_fpu_deactivate(vcpu);
+}
+
+/*
+ * Install a new guest CR4 value.  The read shadow and the cached copy get
+ * @cr4 unchanged; the hardware GUEST_CR4 additionally gets the always-on
+ * bits for the current mode (real or protected).
+ */
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long hw_cr4 = cr4;
+
+	vmcs_writel(CR4_READ_SHADOW, cr4);
+
+	if (vcpu->arch.rmode.active)
+		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+	else
+		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
+	vmcs_writel(GUEST_CR4, hw_cr4);
+
+	vcpu->arch.cr4 = cr4;
+}
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Install a new guest EFER value.  The shadow copy takes @efer as is.
+ * VM_ENTRY_IA32E_MODE follows EFER.LMA, and the EFER MSR entry gets @efer
+ * with LME stripped while LMA is clear.  The set of switched MSRs is
+ * recomputed afterwards.
+ */
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *efer_entry = find_msr_entry(vmx, MSR_EFER);
+	u32 entry_ctls;
+
+	vcpu->arch.shadow_efer = efer;
+
+	entry_ctls = vmcs_read32(VM_ENTRY_CONTROLS);
+	if (efer & EFER_LMA)
+		entry_ctls |= VM_ENTRY_IA32E_MODE;
+	else
+		entry_ctls &= ~VM_ENTRY_IA32E_MODE;
+	vmcs_write32(VM_ENTRY_CONTROLS, entry_ctls);
+
+	if (efer & EFER_LMA)
+		efer_entry->data = efer;
+	else
+		efer_entry->data = efer & ~EFER_LME;
+
+	setup_msrs(vmx);
+}
+
+#endif
+
+/* Returns the base of guest segment @seg as stored in the VMCS. */
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+	return vmcs_readl(kvm_vmx_segment_fields[seg].base);
+}
+
+/*
+ * Read guest segment @seg from the VMCS into @var, unpacking the
+ * access-rights word into its individual fields.  When the access rights
+ * have AR_UNUSABLE_MASK set, every access-rights field is reported as 0.
+ * NOTE(review): this means var->unusable also reads back as 0 if
+ * AR_UNUSABLE_MASK is bit 16 -- confirm intended.
+ */
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg)
+{
+	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+	u32 raw_ar;
+	u32 ar;
+
+	var->base = vmcs_readl(sf->base);
+	var->limit = vmcs_read32(sf->limit);
+	var->selector = vmcs_read16(sf->selector);
+
+	raw_ar = vmcs_read32(sf->ar_bytes);
+	ar = (raw_ar & AR_UNUSABLE_MASK) ? 0 : raw_ar;
+
+	var->type = ar & 15;
+	var->s = !!(ar & (1u << 4));
+	var->dpl = (ar >> 5) & 3;
+	var->present = !!(ar & (1u << 7));
+	var->avl = !!(ar & (1u << 12));
+	var->l = !!(ar & (1u << 13));
+	var->db = !!(ar & (1u << 14));
+	var->g = !!(ar & (1u << 15));
+	var->unusable = !!(ar & (1u << 16));
+}
+
+/*
+ * Pack the fields of @var into a VMCS access-rights word.  An unusable
+ * segment yields only bit 16; a result of 0 is replaced by
+ * AR_UNUSABLE_MASK.
+ */
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+	u32 ar;
+
+	if (var->unusable) {
+		ar = 1 << 16;
+	} else {
+		ar = (var->type & 15) |
+		     ((var->s & 1) << 4) |
+		     ((var->dpl & 3) << 5) |
+		     ((var->present & 1) << 7) |
+		     ((var->avl & 1) << 12) |
+		     ((var->l & 1) << 13) |
+		     ((var->db & 1) << 14) |
+		     ((var->g & 1) << 15);
+	}
+
+	/* a 0 value means unusable */
+	return ar ? ar : AR_UNUSABLE_MASK;
+}
+
+/*
+ * Load guest segment @seg from @var.  While rmode is active, a TR update
+ * is only recorded in vcpu->arch.rmode.tr and the VMCS is left alone;
+ * otherwise base, limit, selector and access rights are written to the
+ * VMCS.
+ */
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg)
+{
+	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+	u32 ar;
+
+	if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+		vcpu->arch.rmode.tr.selector = var->selector;
+		vcpu->arch.rmode.tr.base = var->base;
+		vcpu->arch.rmode.tr.limit = var->limit;
+		vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
+		return;
+	}
+	vmcs_writel(sf->base, var->base);
+	vmcs_write32(sf->limit, var->limit);
+	vmcs_write16(sf->selector, var->selector);
+	if (vcpu->arch.rmode.active && var->s) {
+		/*
+		 * Hack real-mode segments into vm86 compatibility.
+		 */
+		/* Base 0xffff0000 with selector 0xf000 is rewritten to base 0xf0000. */
+		if (var->base == 0xffff0000 && var->selector == 0xf000)
+			vmcs_writel(sf->base, 0xf0000);
+		ar = 0xf3;
+	} else
+		ar = vmx_segment_access_rights(var);
+	vmcs_write32(sf->ar_bytes, ar);
+}
+
+/*
+ * Extract the D/B flag (bit 14) and the L flag (bit 13) from the guest CS
+ * access rights into *@db and *@l.
+ */
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+	const u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
+
+	*db = !!(cs_ar & (1u << 14));
+	*l = !!(cs_ar & (1u << 13));
+}
+
+/* Read the guest IDTR limit and base from the VMCS into @dt. */
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	u32 idt_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	unsigned long idt_base = vmcs_readl(GUEST_IDTR_BASE);
+
+	dt->limit = idt_limit;
+	dt->base = idt_base;
+}
+
+/* Write the guest IDTR limit and base from @dt into the VMCS. */
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	u32 idt_limit = dt->limit;
+
+	vmcs_write32(GUEST_IDTR_LIMIT, idt_limit);
+	vmcs_writel(GUEST_IDTR_BASE, dt->base);
+}
+
+/* Read the guest GDTR limit and base from the VMCS into @dt. */
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	u32 gdt_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	unsigned long gdt_base = vmcs_readl(GUEST_GDTR_BASE);
+
+	dt->limit = gdt_limit;
+	dt->base = gdt_base;
+}
+
+/* Write the guest GDTR limit and base from @dt into the VMCS. */
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	u32 gdt_limit = dt->limit;
+
+	vmcs_write32(GUEST_GDTR_LIMIT, gdt_limit);
+	vmcs_writel(GUEST_GDTR_BASE, dt->base);
+}
+
+/*
+ * Initialise the three guest pages used as the real-mode TSS, starting at
+ * rmode_tss_base(): all three are cleared, a 16-bit value is stored at
+ * offset 0x66 of the first page and a 0xff byte at the last byte of the
+ * TSS.  Runs with current->mm->mmap_sem held for reading.
+ *
+ * Returns 1 on success and 0 on failure (note: not 0/-errno).
+ */
+static int init_rmode_tss(struct kvm *kvm)
+{
+	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+	u16 data = 0;
+	int ret = 0;
+	int r;
+
+	down_read(&current->mm->mmap_sem);
+	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+	/* fn++ here and below: the following call works on the next page. */
+	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+	if (r < 0)
+		goto out;
+	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	data = ~0;
+	/* Only one byte of data is written, at the final byte of the TSS. */
+	r = kvm_write_guest_page(kvm, fn, &data,
+				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+				 sizeof(u8));
+	if (r < 0)
+		goto out;
+
+	ret = 1;
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Reset guest segment @seg in the VMCS: selector 0, base 0, limit 0xffff
+ * and access rights 0x93.
+ */
+static void seg_setup(int seg)
+{
+	struct kvm_vmx_segment_field *fields = &kvm_vmx_segment_fields[seg];
+
+	vmcs_write16(fields->selector, 0);
+	vmcs_writel(fields->base, 0);
+	vmcs_write32(fields->limit, 0xffff);
+	vmcs_write32(fields->ar_bytes, 0x93);
+}
+
+/*
+ * Make sure kvm->arch.apic_access_page is set up.  If it is not yet set,
+ * a one-page private memory slot is registered at guest physical address
+ * 0xfee00000 and the page behind it is recorded.  Runs with
+ * current->mm->mmap_sem held for writing.
+ *
+ * Returns 0 on success (including when the page already exists) or the
+ * error from __kvm_set_memory_region().
+ */
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+	struct kvm_userspace_memory_region kvm_userspace_mem;
+	int r = 0;
+
+	down_write(&current->mm->mmap_sem);
+	if (kvm->arch.apic_access_page)
+		goto out;
+	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+	kvm_userspace_mem.flags = 0;
+	kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+	kvm_userspace_mem.memory_size = PAGE_SIZE;
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+	if (r)
+		goto out;
+	/* gfn 0xfee00 is guest physical address 0xfee00000 with 4K pages. */
+	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+	up_write(&current->mm->mmap_sem);
+	return r;
+}
+
+/*
+ * Sets up the vmcs for emulated real mode.
+ *
+ * Programs the execution controls, the host-state area, the MSR
+ * save/restore tables and the CR0/CR4 guest/host masks.  Returns 0 on
+ * success, or -ENOMEM when the APIC access page is needed but cannot be
+ * set up.
+ */
+static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+	u32 host_sysenter_cs;
+	u32 junk;
+	unsigned long a;
+	struct descriptor_table dt;
+	int i;
+	unsigned long kvm_vmx_return;
+	u32 exec_control;
+
+	/* I/O */
+	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
+	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+	/* Control */
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		vmcs_config.pin_based_exec_ctrl);
+
+	/* Without a TPR shadow for this VM, CR8 accesses must exit (64-bit). */
+	exec_control = vmcs_config.cpu_based_exec_ctrl;
+	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+				CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/* Secondary controls, minus APIC-access virtualization if this VM does not need it. */
+	if (cpu_has_secondary_exec_ctrls()) {
+		exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+			exec_control &=
+				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+	/* Both page-fault filter fields are 1 when bypass_guest_pf is set, else 0. */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+
+	/* Host state: control registers as they are on the current cpu. */
+	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
+	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+	/* Host state: segment selectors. */
+	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
+	vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
+	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#ifdef CONFIG_X86_64
+	rdmsrl(MSR_FS_BASE, a);
+	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
+	rdmsrl(MSR_GS_BASE, a);
+	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
+#else
+	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+#endif
+
+	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
+
+	get_idt(&dt);
+	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+
+	/* VM exits resume the host at the .Lkvm_vmx_return label. */
+	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
+	/* The VMCS automatic MSR load/store lists are left empty. */
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+	/* Only the low half of SYSENTER_CS is used; the high half lands in junk. */
+	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
+	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
+	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
+	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
+
+	/*
+	 * Build the host/guest MSR tables from vmx_msr_index[].  An MSR is
+	 * skipped when it cannot be both read and written back on this cpu;
+	 * the guest entry starts out as a copy of the host entry.
+	 */
+	for (i = 0; i < NR_VMX_MSR; ++i) {
+		u32 index = vmx_msr_index[i];
+		u32 data_low, data_high;
+		u64 data;
+		int j = vmx->nmsrs;
+
+		if (rdmsr_safe(index, &data_low, &data_high) < 0)
+			continue;
+		if (wrmsr_safe(index, data_low, data_high) < 0)
+			continue;
+		data = data_low | ((u64)data_high << 32);
+		vmx->host_msrs[j].index = index;
+		vmx->host_msrs[j].reserved = 0;
+		vmx->host_msrs[j].data = data;
+		vmx->guest_msrs[j] = vmx->host_msrs[j];
+		++vmx->nmsrs;
+	}
+
+	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+	/* 22.2.1, 20.8.1 */
+	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+
+	/* All CR0 bits and the KVM_GUEST_CR4_MASK bits of CR4 go into the guest/host masks. */
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+
+	/* Any failure setting up the APIC access page is reported as -ENOMEM. */
+	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+		if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
+			return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Put a vcpu into its initial state by programming the VMCS guest-state
+ * fields and the cached vcpu state.
+ *
+ * vcpu 0 is marked as the bootstrap processor in its APIC base and starts
+ * at f000:fff0; every other vcpu starts at offset 0 of the code segment
+ * derived from its sipi_vector.
+ *
+ * Returns 0 on success, or -ENOMEM if init_rmode_tss() fails.
+ */
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 msr;
+	int ret;
+
+	if (!init_rmode_tss(vmx->vcpu.kvm)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	vmx->vcpu.arch.rmode.active = 0;
+
+	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+	set_cr8(&vmx->vcpu, 0);
+	/* Only vcpu 0 gets the BSP flag in its APIC base value. */
+	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	if (vmx->vcpu.vcpu_id == 0)
+		msr |= MSR_IA32_APICBASE_BSP;
+	kvm_set_apic_base(&vmx->vcpu, msr);
+
+	fx_init(&vmx->vcpu);
+
+	/*
+	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+	 */
+	if (vmx->vcpu.vcpu_id == 0) {
+		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+	} else {
+		/* Selector and base are both derived from the SIPI vector. */
+		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
+	}
+	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+
+	seg_setup(VCPU_SREG_DS);
+	seg_setup(VCPU_SREG_ES);
+	seg_setup(VCPU_SREG_FS);
+	seg_setup(VCPU_SREG_GS);
+	seg_setup(VCPU_SREG_SS);
+
+	vmcs_write16(GUEST_TR_SELECTOR, 0);
+	vmcs_writel(GUEST_TR_BASE, 0);
+	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+	vmcs_writel(GUEST_LDTR_BASE, 0);
+	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+	vmcs_write32(GUEST_SYSENTER_CS, 0);
+	vmcs_writel(GUEST_SYSENTER_ESP, 0);
+	vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+	vmcs_writel(GUEST_RFLAGS, 0x02);
+	/* vcpu 0 starts at offset 0xfff0, all other vcpus at offset 0. */
+	if (vmx->vcpu.vcpu_id == 0)
+		vmcs_writel(GUEST_RIP, 0xfff0);
+	else
+		vmcs_writel(GUEST_RIP, 0);
+	vmcs_writel(GUEST_RSP, 0);
+
+	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
+	vmcs_writel(GUEST_DR7, 0x400);
+
+	vmcs_writel(GUEST_GDTR_BASE, 0);
+	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+	vmcs_writel(GUEST_IDTR_BASE, 0);
+	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+	guest_write_tsc(0);
+
+	/* Special registers */
+	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+	setup_msrs(vmx);
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+
+	/*
+	 * Default the virtual-APIC page address to 0, then point it at the
+	 * in-kernel APIC register page when this VM uses the TPR shadow.
+	 */
+	if (cpu_has_vmx_tpr_shadow()) {
+		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+				page_to_phys(vmx->vcpu.arch.apic->regs_page));
+		vmcs_write32(TPR_THRESHOLD, 0);
+	}
+
+	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+
+	vmx->vcpu.arch.cr0 = 0x60000010;
+	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+	vmx_set_cr4(&vmx->vcpu, 0);
+#ifdef CONFIG_X86_64
+	vmx_set_efer(&vmx->vcpu, 0);
+#endif
+	vmx_fpu_activate(&vmx->vcpu);
+	update_exception_bitmap(&vmx->vcpu);
+
+	return 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Queue interrupt vector @irq for delivery on the next VM entry.
+ *
+ * Outside real mode the vector is written as an external interrupt.  In
+ * real mode it is written as a one-byte software interrupt with GUEST_RIP
+ * moved back by one; the vector and the original rip are remembered in
+ * vmx->rmode.irq so fixup_rmode_irq() can examine them after the exit.
+ */
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vcpu->arch.rmode.active) {
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+		return;
+	}
+
+	/* Real mode: record what is being injected before touching rip. */
+	vmx->rmode.irq.pending = true;
+	vmx->rmode.irq.vector = irq;
+	vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+	vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+}
+
+/*
+ * Take the lowest-numbered pending interrupt out of the irq_pending
+ * bitmap (clearing its irq_summary bit if that word becomes empty) and
+ * inject it into the guest.
+ */
+static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
+{
+	int word = __ffs(vcpu->arch.irq_summary);
+	int bit = __ffs(vcpu->arch.irq_pending[word]);
+	int vector = word * BITS_PER_LONG + bit;
+
+	clear_bit(bit, &vcpu->arch.irq_pending[word]);
+	if (vcpu->arch.irq_pending[word] == 0)
+		clear_bit(word, &vcpu->arch.irq_summary);
+
+	vmx_inject_irq(vcpu, vector);
+}
+
+
+/*
+ * Interrupt injection using the irq_pending bitmap.
+ *
+ * Recomputes interrupt_window_open, injects one pending interrupt when
+ * the window is open and no event is already queued for VM entry, and
+ * otherwise requests an interrupt-window exit if an interrupt is pending
+ * or userspace asked to be told when the window opens.
+ */
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				       struct kvm_run *kvm_run)
+{
+	u32 exec_ctl;
+
+	/* Open means RFLAGS.IF is set and interruptibility bits 0-1 are 0. */
+	if (!(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
+		vcpu->arch.interrupt_window_open = 0;
+	else
+		vcpu->arch.interrupt_window_open =
+			(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+
+	if (vcpu->arch.interrupt_window_open) {
+		if (vcpu->arch.irq_summary &&
+		    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD)
+		      & INTR_INFO_VALID_MASK))
+			kvm_do_inject_irq(vcpu);
+	}
+
+	exec_ctl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	if (vcpu->arch.interrupt_window_open ||
+	    !(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
+		exec_ctl &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	else
+		/* Interrupts blocked: exit as soon as they are unblocked. */
+		exec_ctl |= CPU_BASED_VIRTUAL_INTR_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_ctl);
+}
+
+/*
+ * Register a three-page memory region at guest physical address @addr
+ * in slot 8 and record it as the VM's tss_addr.
+ *
+ * Returns 0 on success or the error from kvm_set_memory_region(), in
+ * which case tss_addr is left untouched.
+ */
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+	struct kvm_userspace_memory_region region = {
+		.slot = 8,
+		.guest_phys_addr = addr,
+		.memory_size = PAGE_SIZE * 3,
+		.flags = 0,
+	};
+	int err = kvm_set_memory_region(kvm, &region, 0);
+
+	if (!err)
+		kvm->arch.tss_addr = addr;
+	return err;
+}
+
+/*
+ * Load the four breakpoint addresses from vcpu->guest_debug into debug
+ * registers 0-3 and, when single-stepping is requested, set TF and RF in
+ * the guest's RFLAGS.
+ */
+static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
+{
+	set_debugreg(vcpu->guest_debug.bp[0], 0);
+	set_debugreg(vcpu->guest_debug.bp[1], 1);
+	set_debugreg(vcpu->guest_debug.bp[2], 2);
+	set_debugreg(vcpu->guest_debug.bp[3], 3);
+
+	if (!vcpu->guest_debug.singlestep)
+		return;
+
+	vmcs_writel(GUEST_RFLAGS,
+		    vmcs_readl(GUEST_RFLAGS) | X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+/*
+ * Try to handle an exception taken while the guest runs in emulated real
+ * mode by emulating the faulting instruction.
+ *
+ * Only #GP and #SS with a zero error code are attempted: an instruction
+ * with the address size override prefix (opcode 0x67) causes #SS with
+ * error code 0 in VM86 mode.
+ *
+ * Returns 1 if the instruction was emulated, 0 otherwise.
+ */
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+				  int vec, u32 err_code)
+{
+	if (!vcpu->arch.rmode.active)
+		return 0;
+	if (err_code != 0)
+		return 0;
+	if (vec != GP_VECTOR && vec != SS_VECTOR)
+		return 0;
+
+	return emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE;
+}
+
+/*
+ * Handle a VM exit caused by an exception or NMI.
+ *
+ * NMIs (already handled in vmx_vcpu_run()), #NM and #UD are dealt with
+ * in the kernel; page faults go to kvm_mmu_page_fault(); real-mode #GP/#SS
+ * faults are emulated when possible.  Anything else is reported to
+ * userspace as KVM_EXIT_DEBUG (vector 1) or KVM_EXIT_EXCEPTION.
+ *
+ * Returns 1 to resume the guest and 0 to exit to userspace with @kvm_run
+ * filled in; the page-fault and halt paths return whatever
+ * kvm_mmu_page_fault() / kvm_emulate_halt() return.
+ */
+static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info, error_code;
+	unsigned long cr2;
+	u32 vect_info;
+	enum emulation_result er;
+
+	vect_info = vmx->idt_vectoring_info;
+	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+						!is_page_fault(intr_info))
+		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
+		       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+
+	/*
+	 * Without an in-kernel irqchip, an external interrupt that was being
+	 * delivered when the exit happened is put back into the pending
+	 * bitmap so it is injected again later.
+	 */
+	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
+		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
+		set_bit(irq, vcpu->arch.irq_pending);
+		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+	}
+
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		return 1;  /* already handled by vmx_vcpu_run() */
+
+	if (is_no_device(intr_info)) {
+		vmx_fpu_activate(vcpu);
+		return 1;
+	}
+
+	if (is_invalid_opcode(intr_info)) {
+		er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+		if (er != EMULATE_DONE)
+			kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	error_code = 0;
+	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	if (is_page_fault(intr_info)) {
+		/* The exit qualification is used as the faulting address. */
+		cr2 = vmcs_readl(EXIT_QUALIFICATION);
+		return kvm_mmu_page_fault(vcpu, cr2, error_code);
+	}
+
+	if (vcpu->arch.rmode.active &&
+	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
+								error_code)) {
+		if (vcpu->arch.halt_request) {
+			vcpu->arch.halt_request = 0;
+			return kvm_emulate_halt(vcpu);
+		}
+		return 1;
+	}
+
+	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+	    (INTR_TYPE_EXCEPTION | 1)) {
+		kvm_run->exit_reason = KVM_EXIT_DEBUG;
+		return 0;
+	}
+	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+	kvm_run->ex.error_code = error_code;
+	return 0;
+}
+
+/*
+ * Exit caused by an external interrupt: only the irq_exits statistic is
+ * updated here before the guest is resumed.
+ */
+static int handle_external_interrupt(struct kvm_vcpu *vcpu,
+				     struct kvm_run *kvm_run)
+{
+	vcpu->stat.irq_exits++;
+
+	return 1;
+}
+
+/*
+ * Triple fault in the guest: report KVM_EXIT_SHUTDOWN and return 0 so
+ * that control goes back to userspace.
+ */
+static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+	return 0;
+}
+
+/*
+ * Handle an I/O instruction exit.
+ *
+ * String instructions (exit qualification bit 4) are passed to the
+ * instruction emulator; this returns 0 when the emulator reports
+ * EMULATE_DO_MMIO and 1 otherwise.  For everything else the access size
+ * (bits 0-2, stored as size - 1), direction (bit 3, set for IN) and port
+ * number (bits 16 and up) are decoded from the exit qualification and
+ * handed to kvm_emulate_pio(), whose result is returned.
+ */
+static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	unsigned long exit_qualification;
+	int size, in, string;
+	unsigned port;
+
+	++vcpu->stat.io_exits;
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	string = (exit_qualification & 16) != 0;
+
+	if (string) {
+		if (emulate_instruction(vcpu,
+					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+			return 0;
+		return 1;
+	}
+
+	size = (exit_qualification & 7) + 1;
+	in = (exit_qualification & 8) != 0;
+	port = exit_qualification >> 16;
+
+	return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
+}
+
+/*
+ * Write the three-byte VMCALL instruction (0f 01 c1) into the buffer
+ * at @hypercall.
+ */
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(vmcall_insn); ++i)
+		hypercall[i] = vmcall_insn[i];
+}
+
+/*
+ * Handle a control-register access exit.
+ *
+ * The exit qualification encodes the control register number (bits 0-3),
+ * the access type (bits 4-5: 0 = mov to cr, 1 = mov from cr, 2 = clts,
+ * 3 = lmsw) and the general-purpose register involved (bits 8-11).
+ *
+ * Returns 1 when the access was emulated.  Returns 0 with
+ * KVM_EXIT_SET_TPR for a cr8 write when there is no in-kernel irqchip,
+ * and 0 with exit_reason 0 for any combination that is not handled.
+ */
+static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
+	int cr = qual & 15;
+	int reg = (qual >> 8) & 15;
+	int access = (qual >> 4) & 3;
+
+	if (access == 2) {
+		/* clts */
+		vcpu_load_rsp_rip(vcpu);
+		vmx_fpu_deactivate(vcpu);
+		vcpu->arch.cr0 &= ~X86_CR0_TS;
+		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+		vmx_fpu_activate(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (access == 3) {
+		/* lmsw */
+		lmsw(vcpu, (qual >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (access == 0) {
+		/* mov to cr: only cr0, cr3, cr4 and cr8 are handled */
+		if (cr != 0 && cr != 3 && cr != 4 && cr != 8)
+			goto unhandled;
+
+		vcpu_load_rsp_rip(vcpu);
+		if (cr == 0)
+			set_cr0(vcpu, vcpu->arch.regs[reg]);
+		else if (cr == 3)
+			set_cr3(vcpu, vcpu->arch.regs[reg]);
+		else if (cr == 4)
+			set_cr4(vcpu, vcpu->arch.regs[reg]);
+		else
+			set_cr8(vcpu, vcpu->arch.regs[reg]);
+		skip_emulated_instruction(vcpu);
+
+		/* A cr8 write is reported to userspace without a kernel irqchip. */
+		if (cr != 8 || irqchip_in_kernel(vcpu->kvm))
+			return 1;
+		kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+		return 0;
+	}
+
+	/* access == 1, mov from cr: only cr3 and cr8 are handled */
+	if (cr == 3 || cr == 8) {
+		vcpu_load_rsp_rip(vcpu);
+		if (cr == 3)
+			vcpu->arch.regs[reg] = vcpu->arch.cr3;
+		else
+			vcpu->arch.regs[reg] = get_cr8(vcpu);
+		vcpu_put_rsp_rip(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+unhandled:
+	kvm_run->exit_reason = 0;
+	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+	       access, cr);
+	return 0;
+}
+
+/*
+ * Handle a debug-register access exit.
+ *
+ * FIXME: this code assumes the host is debugging the guest.
+ *        need to deal with guest debugging itself too.
+ *
+ * Reads return fixed values (dr6 = 0xffff0ff0, dr7 = 0x400, all others
+ * 0); writes are ignored.  The instruction is always skipped and 1 is
+ * returned.
+ */
+static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
+	int dr = qual & 7;
+	int reg = (qual >> 8) & 15;
+
+	vcpu_load_rsp_rip(vcpu);
+	if (qual & 16) {
+		/* mov from dr */
+		if (dr == 6)
+			vcpu->arch.regs[reg] = 0xffff0ff0;
+		else if (dr == 7)
+			vcpu->arch.regs[reg] = 0x400;
+		else
+			vcpu->arch.regs[reg] = 0;
+	}
+	/* mov to dr: nothing is written anywhere */
+	vcpu_put_rsp_rip(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/*
+ * CPUID exit: hand the instruction to kvm_emulate_cpuid() and resume the
+ * guest.
+ */
+static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	kvm_emulate_cpuid(vcpu);
+	return 1;
+}
+
+/*
+ * RDMSR exit: read the MSR selected by the guest's ecx through
+ * vmx_get_msr().  On success the low and high 32 bits of the value go
+ * to rax and rdx and the instruction is skipped; on failure a #GP(0) is
+ * injected instead.  The guest is resumed either way.
+ */
+static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	u64 value;
+
+	if (vmx_get_msr(vcpu, msr_index, &value) == 0) {
+		/* FIXME: handling of bits 32:63 of rax, rdx */
+		vcpu->arch.regs[VCPU_REGS_RAX] = (u32)value;
+		vcpu->arch.regs[VCPU_REGS_RDX] = (u32)(value >> 32);
+		skip_emulated_instruction(vcpu);
+	} else {
+		kvm_inject_gp(vcpu, 0);
+	}
+
+	return 1;
+}
+
+/*
+ * WRMSR exit: combine the low 32 bits of rax and rdx into a 64-bit value
+ * and write it to the MSR selected by the guest's ecx via vmx_set_msr().
+ * A failing write injects #GP(0); a successful one skips the
+ * instruction.  The guest is resumed either way.
+ */
+static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	u64 low = (u32)vcpu->arch.regs[VCPU_REGS_RAX];
+	u64 high = (u32)vcpu->arch.regs[VCPU_REGS_RDX];
+
+	if (vmx_set_msr(vcpu, msr_index, low | (high << 32)) == 0)
+		skip_emulated_instruction(vcpu);
+	else
+		kvm_inject_gp(vcpu, 0);
+
+	return 1;
+}
+
+/*
+ * TPR-below-threshold exit: no work is done here, the guest is simply
+ * resumed.
+ */
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
+				      struct kvm_run *kvm_run)
+{
+	return 1;
+}
+
+/*
+ * Interrupt-window exit: turn the interrupt-window request off again.
+ *
+ * If userspace asked to be notified when the window opens and no
+ * interrupt is pending in irq_summary, exit to userspace with
+ * KVM_EXIT_IRQ_WINDOW_OPEN (returns 0); otherwise resume (returns 1).
+ */
+static int handle_interrupt_window(struct kvm_vcpu *vcpu,
+				   struct kvm_run *kvm_run)
+{
+	u32 exec_ctl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+
+	/* clear pending irq */
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+		     exec_ctl & ~CPU_BASED_VIRTUAL_INTR_PENDING);
+
+	if (!kvm_run->request_interrupt_window || vcpu->arch.irq_summary)
+		return 1;
+
+	/* Userspace is waiting to inject: hand control back right away. */
+	kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+	++vcpu->stat.irq_window_exits;
+	return 0;
+}
+
+/*
+ * HLT exit: skip the instruction, then return whatever
+ * kvm_emulate_halt() returns.
+ */
+static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_halt(vcpu);
+}
+
+/*
+ * VMCALL exit: skip the instruction, run kvm_emulate_hypercall() and
+ * resume the guest.  The hypercall's return value is not examined here.
+ */
+static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	skip_emulated_instruction(vcpu);
+	kvm_emulate_hypercall(vcpu);
+	return 1;
+}
+
+/*
+ * WBINVD exit: the instruction is skipped without any further action
+ * and the guest is resumed.
+ */
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	skip_emulated_instruction(vcpu);
+	/* TODO: Add support for VT-d/pass-through device */
+	return 1;
+}
+
+/*
+ * APIC-access exit: emulate the instruction that touched the
+ * APIC-access page.
+ *
+ * The exit qualification is read with vmcs_readl(), the same accessor
+ * handle_io(), handle_cr() and handle_dr() use for this field; its low
+ * 12 bits are only used for the error message.
+ *
+ * Returns 1 when the instruction was emulated, -ENOTSUPP otherwise.
+ */
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	unsigned long exit_qualification;
+	enum emulation_result er;
+	unsigned long offset;
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	offset = exit_qualification & 0xffful;
+
+	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+	if (er !=  EMULATE_DONE) {
+		printk(KERN_ERR
+		       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
+		       offset);
+		return -ENOTSUPP;
+	}
+	return 1;
+}
+
+/*
+ * Exit handler table, indexed by the VM exit reason.
+ *
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.  handle_apic_access() may also return
+ * a negative errno when emulation fails.
+ *
+ * Exit reasons without an entry are left NULL; kvm_handle_exit() reports
+ * those to userspace as KVM_EXIT_UNKNOWN.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
+				      struct kvm_run *kvm_run) = {
+	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
+	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
+	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
+	[EXIT_REASON_CR_ACCESS]               = handle_cr,
+	[EXIT_REASON_DR_ACCESS]               = handle_dr,
+	[EXIT_REASON_CPUID]                   = handle_cpuid,
+	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
+	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
+	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
+	[EXIT_REASON_HLT]                     = handle_halt,
+	[EXIT_REASON_VMCALL]                  = handle_vmcall,
+	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
+	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
+	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+};
+
+/* Number of slots in the table; used to bounds-check the exit reason. */
+static const int kvm_vmx_max_exit_handlers =
+	ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * The guest has exited.  Dispatch to the handler registered for the exit
+ * reason, or hand the exit to userspace when there is none.
+ *
+ * A failed VM entry is reported as KVM_EXIT_FAIL_ENTRY together with the
+ * VM-instruction error; an exit reason without a handler is reported as
+ * KVM_EXIT_UNKNOWN.  Both return 0.  Otherwise the handler's return
+ * value is passed through.
+ */
+static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		kvm_run->fail_entry.hardware_entry_failure_reason
+			= vmcs_read32(VM_INSTRUCTION_ERROR);
+		return 0;
+	}
+
+	/* Valid vectoring info is only expected on exception/NMI exits. */
+	if (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+	    (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK))
+		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
+		       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+
+	if (exit_reason >= kvm_vmx_max_exit_handlers
+	    || !kvm_vmx_exit_handlers[exit_reason]) {
+		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+		kvm_run->hw.hardware_exit_reason = exit_reason;
+		return 0;
+	}
+
+	return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
+}
+
+/* tlb_flush callback of vmx_x86_ops; intentionally does nothing here. */
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+}
+
+/*
+ * Recompute TPR_THRESHOLD for a VM that uses the TPR shadow.
+ *
+ * The threshold is 0 when the local APIC is disabled or has no pending
+ * interrupt request; otherwise it is the upper four bits of the smaller
+ * of the highest pending vector and the task priority (cr8 << 4).
+ */
+static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+{
+	int highest_irr = -1;
+	int tpr, threshold;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm))
+		return;
+
+	/* The IRR is only consulted while the local APIC is enabled. */
+	if (kvm_lapic_enabled(vcpu))
+		highest_irr = kvm_lapic_find_highest_irr(vcpu);
+
+	if (highest_irr == -1) {
+		vmcs_write32(TPR_THRESHOLD, 0);
+		return;
+	}
+
+	tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
+	if (highest_irr > tpr)
+		threshold = tpr >> 4;
+	else
+		threshold = highest_irr >> 4;
+	vmcs_write32(TPR_THRESHOLD, threshold);
+}
+
+/*
+ * Set CPU_BASED_VIRTUAL_INTR_PENDING in the processor-based execution
+ * controls, requesting an interrupt-window exit.
+ */
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+	u32 exec_ctl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+		     exec_ctl | CPU_BASED_VIRTUAL_INTR_PENDING);
+}
+
+/*
+ * Decide what to inject on the next VM entry.
+ *
+ * In order of priority:
+ *  1. An event is already queued in VM_ENTRY_INTR_INFO_FIELD: leave it.
+ *  2. The last exit interrupted an event delivery (valid IDT-vectoring
+ *     info): deliver that event again.
+ *  3. An interrupt is pending and the guest can take it: inject it.
+ *
+ * Whenever an interrupt is pending but was not injected, an
+ * interrupt-window exit is requested.
+ */
+static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 idtv_info, entry_info;
+	int irq_waiting;
+	int vector;
+
+	update_tpr_threshold(vcpu);
+
+	irq_waiting = kvm_cpu_has_interrupt(vcpu);
+	entry_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	idtv_info = vmx->idt_vectoring_info;
+
+	if (entry_info & INTR_INFO_VALID_MASK) {
+		/* TODO: fault when IDT_Vectoring */
+		if ((idtv_info & INTR_INFO_VALID_MASK) && printk_ratelimit())
+			printk(KERN_ERR "Fault when IDT_Vectoring\n");
+		goto request_window;
+	}
+
+	if (unlikely(idtv_info & INTR_INFO_VALID_MASK)) {
+		if (vcpu->arch.rmode.active &&
+		    (idtv_info & VECTORING_INFO_TYPE_MASK)
+		    == INTR_TYPE_EXT_INTR) {
+			/* Real-mode external interrupt: use the rmode path. */
+			vmx_inject_irq(vcpu,
+				(u8)(idtv_info & VECTORING_INFO_VECTOR_MASK));
+		} else {
+			/* Copy the interrupted event into the entry fields. */
+			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info);
+			vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+			if (unlikely(idtv_info & INTR_INFO_DELIEVER_CODE_MASK))
+				vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+					vmcs_read32(IDT_VECTORING_ERROR_CODE));
+		}
+		goto request_window;
+	}
+
+	if (!irq_waiting)
+		return;
+
+	/* Blocked while RFLAGS.IF is clear or interruptibility bits 0-1 set. */
+	if (!(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) ||
+	    (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) != 0)
+		goto request_window;
+
+	vector = kvm_cpu_get_interrupt(vcpu);
+	vmx_inject_irq(vcpu, vector);
+	kvm_timer_intr_post(vcpu, vector);
+	return;
+
+request_window:
+	if (irq_waiting)
+		enable_irq_window(vcpu);
+}
+
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ *
+ * If GUEST_RIP is still one below the rip saved at injection time, the
+ * rip is restored and the cached vectoring info is made to describe an
+ * external interrupt: existing valid info only has its type replaced,
+ * otherwise it is rebuilt from the saved vector.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+{
+	vmx->rmode.irq.pending = 0;
+
+	if (vmcs_readl(GUEST_RIP) + 1 == vmx->rmode.irq.rip) {
+		vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+
+		if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK)
+			vmx->idt_vectoring_info =
+				(vmx->idt_vectoring_info
+				 & ~VECTORING_INFO_TYPE_MASK)
+				| INTR_TYPE_EXT_INTR;
+		else
+			vmx->idt_vectoring_info =
+				VECTORING_INFO_VALID_MASK
+				| INTR_TYPE_EXT_INTR
+				| vmx->rmode.irq.vector;
+	}
+}
+
+/*
+ * Enter the guest and run it until the next VM exit.
+ *
+ * The inline assembly saves the host registers it needs on the stack,
+ * records the resulting stack pointer in HOST_RSP, loads the guest's
+ * general-purpose registers and cr2 from the vcpu, and executes VMLAUNCH
+ * (first entry) or VMRESUME (later entries).  On return at
+ * .Lkvm_vmx_return the guest registers and cr2 are stored back into the
+ * vcpu, the host registers are restored, and vmx->fail is set from the
+ * flags left by the entry instruction.
+ *
+ * Afterwards the IDT-vectoring info is cached, a pending real-mode
+ * injection is fixed up, and an NMI that caused the exit is forwarded to
+ * the host with "int $2".
+ */
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info;
+
+	/*
+	 * Loading guest fpu may have cleared host cr0.ts
+	 */
+	vmcs_writel(HOST_CR0, read_cr0());
+
+	asm(
+		/* Store host registers */
+#ifdef CONFIG_X86_64
+		"push %%rdx; push %%rbp;"
+		"push %%rcx \n\t"
+#else
+		"push %%edx; push %%ebp;"
+		"push %%ecx \n\t"
+#endif
+		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+		/* Check if vmlaunch or vmresume is needed */
+		"cmpl $0, %c[launched](%0) \n\t"
+		/* Load guest registers.  Don't clobber flags. */
+#ifdef CONFIG_X86_64
+		"mov %c[cr2](%0), %%rax \n\t"
+		"mov %%rax, %%cr2 \n\t"
+		"mov %c[rax](%0), %%rax \n\t"
+		"mov %c[rbx](%0), %%rbx \n\t"
+		"mov %c[rdx](%0), %%rdx \n\t"
+		"mov %c[rsi](%0), %%rsi \n\t"
+		"mov %c[rdi](%0), %%rdi \n\t"
+		"mov %c[rbp](%0), %%rbp \n\t"
+		"mov %c[r8](%0),  %%r8  \n\t"
+		"mov %c[r9](%0),  %%r9  \n\t"
+		"mov %c[r10](%0), %%r10 \n\t"
+		"mov %c[r11](%0), %%r11 \n\t"
+		"mov %c[r12](%0), %%r12 \n\t"
+		"mov %c[r13](%0), %%r13 \n\t"
+		"mov %c[r14](%0), %%r14 \n\t"
+		"mov %c[r15](%0), %%r15 \n\t"
+		"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
+#else
+		"mov %c[cr2](%0), %%eax \n\t"
+		"mov %%eax,   %%cr2 \n\t"
+		"mov %c[rax](%0), %%eax \n\t"
+		"mov %c[rbx](%0), %%ebx \n\t"
+		"mov %c[rdx](%0), %%edx \n\t"
+		"mov %c[rsi](%0), %%esi \n\t"
+		"mov %c[rdi](%0), %%edi \n\t"
+		"mov %c[rbp](%0), %%ebp \n\t"
+		"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
+#endif
+		/* Enter guest mode */
+		"jne .Llaunched \n\t"
+		ASM_VMX_VMLAUNCH "\n\t"
+		"jmp .Lkvm_vmx_return \n\t"
+		".Llaunched: " ASM_VMX_VMRESUME "\n\t"
+		".Lkvm_vmx_return: "
+		/* Save guest registers, load host registers, keep flags */
+		/*
+		 * The xchg swaps the guest's rcx/ecx with the vmx pointer that
+		 * was pushed last; the guest value left on the stack is then
+		 * copied into the vcpu with the push/pop pair.
+		 */
+#ifdef CONFIG_X86_64
+		"xchg %0,     (%%rsp) \n\t"
+		"mov %%rax, %c[rax](%0) \n\t"
+		"mov %%rbx, %c[rbx](%0) \n\t"
+		"pushq (%%rsp); popq %c[rcx](%0) \n\t"
+		"mov %%rdx, %c[rdx](%0) \n\t"
+		"mov %%rsi, %c[rsi](%0) \n\t"
+		"mov %%rdi, %c[rdi](%0) \n\t"
+		"mov %%rbp, %c[rbp](%0) \n\t"
+		"mov %%r8,  %c[r8](%0) \n\t"
+		"mov %%r9,  %c[r9](%0) \n\t"
+		"mov %%r10, %c[r10](%0) \n\t"
+		"mov %%r11, %c[r11](%0) \n\t"
+		"mov %%r12, %c[r12](%0) \n\t"
+		"mov %%r13, %c[r13](%0) \n\t"
+		"mov %%r14, %c[r14](%0) \n\t"
+		"mov %%r15, %c[r15](%0) \n\t"
+		"mov %%cr2, %%rax   \n\t"
+		"mov %%rax, %c[cr2](%0) \n\t"
+
+		/* First pop discards the guest rcx slot, second restores rbp. */
+		"pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
+#else
+		"xchg %0, (%%esp) \n\t"
+		"mov %%eax, %c[rax](%0) \n\t"
+		"mov %%ebx, %c[rbx](%0) \n\t"
+		"pushl (%%esp); popl %c[rcx](%0) \n\t"
+		"mov %%edx, %c[rdx](%0) \n\t"
+		"mov %%esi, %c[rsi](%0) \n\t"
+		"mov %%edi, %c[rdi](%0) \n\t"
+		"mov %%ebp, %c[rbp](%0) \n\t"
+		"mov %%cr2, %%eax  \n\t"
+		"mov %%eax, %c[cr2](%0) \n\t"
+
+		/* First pop discards the guest ecx slot, second restores ebp. */
+		"pop %%ebp; pop %%ebp; pop %%edx \n\t"
+#endif
+		/* fail = 1 if the entry instruction left CF or ZF set */
+		"setbe %c[fail](%0) \n\t"
+	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
+		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+	      : "cc", "memory"
+#ifdef CONFIG_X86_64
+		, "rbx", "rdi", "rsi"
+		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+		/*
+		 * NOTE(review): "rsi" in this 32-bit list looks like a typo
+		 * for "esi" -- confirm that the compiler treats both names
+		 * as the same register.
+		 */
+		, "ebx", "edi", "rsi"
+#endif
+	      );
+
+	/* Cache the vectoring info; fixup_rmode_irq() may rewrite it. */
+	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	if (vmx->rmode.irq.pending)
+		fixup_rmode_irq(vmx);
+
+	vcpu->arch.interrupt_window_open =
+		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+
+	/* Reload the host's data segment selectors. */
+	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+	vmx->launched = 1;
+
+	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/* We need to handle NMIs before interrupts are enabled */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		asm("int $2");
+}
+
+/*
+ * Release the vcpu's VMCS, if it has one: run __vcpu_clear() on every
+ * cpu first, then free the VMCS and reset the pointer to NULL.
+ */
+static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->vmcs)
+		return;
+
+	on_each_cpu(__vcpu_clear, vmx, 0, 1);
+	free_vmcs(vmx->vmcs);
+	vmx->vmcs = NULL;
+}
+
+/*
+ * Destroy a vcpu created by vmx_create_vcpu(): free its VMCS and both
+ * MSR save areas, uninitialise the generic vcpu part and return the
+ * vcpu_vmx structure to kvm_vcpu_cache.
+ */
+static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmx_free_vmcs(vcpu);
+	kfree(vmx->host_msrs);
+	kfree(vmx->guest_msrs);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vmx);
+}
+
+/*
+ * Allocate and set up a VMX vcpu with identifier @id for @kvm.
+ *
+ * Allocates the vcpu_vmx structure, the guest and host MSR save areas
+ * and a VMCS, then loads the vcpu on the current cpu to run
+ * vmx_vcpu_setup().
+ *
+ * Returns the new vcpu, or an ERR_PTR() value on failure.  Every
+ * allocation failure sets -ENOMEM explicitly: err still holds 0 from the
+ * successful kvm_vcpu_init() at that point, and ERR_PTR(0) would hand
+ * the caller a NULL pointer instead of an error.
+ */
+static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+	int err;
+	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	int cpu;
+
+	if (!vmx)
+		return ERR_PTR(-ENOMEM);
+
+	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->guest_msrs) {
+		err = -ENOMEM;
+		goto uninit_vcpu;
+	}
+
+	vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->host_msrs) {
+		err = -ENOMEM;
+		goto free_guest_msrs;
+	}
+
+	vmx->vmcs = alloc_vmcs();
+	if (!vmx->vmcs) {
+		err = -ENOMEM;
+		goto free_msrs;
+	}
+
+	vmcs_clear(vmx->vmcs);
+
+	/* Stay on one cpu while the vcpu is loaded for setup. */
+	cpu = get_cpu();
+	vmx_vcpu_load(&vmx->vcpu, cpu);
+	err = vmx_vcpu_setup(vmx);
+	vmx_vcpu_put(&vmx->vcpu);
+	put_cpu();
+	if (err)
+		goto free_vmcs;
+
+	return &vmx->vcpu;
+
+free_vmcs:
+	free_vmcs(vmx->vmcs);
+free_msrs:
+	kfree(vmx->host_msrs);
+free_guest_msrs:
+	kfree(vmx->guest_msrs);
+uninit_vcpu:
+	kvm_vcpu_uninit(&vmx->vcpu);
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vmx);
+	return ERR_PTR(err);
+}
+
+/*
+ * Check that the calling cpu yields the same VMCS configuration as the
+ * global vmcs_config.
+ *
+ * @rtn points to an int that receives 0 when the cpu is compatible and
+ * -EIO when setup_vmcs_config() fails or the configurations differ.
+ */
+static void __init vmx_check_processor_compat(void *rtn)
+{
+	struct vmcs_config vmcs_conf;
+	int *result = rtn;
+
+	*result = 0;
+	if (setup_vmcs_config(&vmcs_conf) < 0)
+		*result = -EIO;
+
+	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+				smp_processor_id());
+		*result = -EIO;
+	}
+}
+
+/*
+ * VMX implementation of the kvm_x86_ops callbacks; handed to kvm_init()
+ * by vmx_init().
+ */
+static struct kvm_x86_ops vmx_x86_ops = {
+	/* hardware detection and per-cpu enable/disable */
+	.cpu_has_kvm_support = cpu_has_kvm_support,
+	.disabled_by_bios = vmx_disabled_by_bios,
+	.hardware_setup = hardware_setup,
+	.hardware_unsetup = hardware_unsetup,
+	.check_processor_compatibility = vmx_check_processor_compat,
+	.hardware_enable = hardware_enable,
+	.hardware_disable = hardware_disable,
+	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+
+	/* vcpu lifetime */
+	.vcpu_create = vmx_create_vcpu,
+	.vcpu_free = vmx_free_vcpu,
+	.vcpu_reset = vmx_vcpu_reset,
+
+	.prepare_guest_switch = vmx_save_host_state,
+	.vcpu_load = vmx_vcpu_load,
+	.vcpu_put = vmx_vcpu_put,
+	.vcpu_decache = vmx_vcpu_decache,
+
+	/* guest register, MSR and segment state access */
+	.set_guest_debug = set_guest_debug,
+	.guest_debug_pre = kvm_guest_debug_pre,
+	.get_msr = vmx_get_msr,
+	.set_msr = vmx_set_msr,
+	.get_segment_base = vmx_get_segment_base,
+	.get_segment = vmx_get_segment,
+	.set_segment = vmx_set_segment,
+	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+	.set_cr0 = vmx_set_cr0,
+	.set_cr3 = vmx_set_cr3,
+	.set_cr4 = vmx_set_cr4,
+#ifdef CONFIG_X86_64
+	.set_efer = vmx_set_efer,
+#endif
+	.get_idt = vmx_get_idt,
+	.set_idt = vmx_set_idt,
+	.get_gdt = vmx_get_gdt,
+	.set_gdt = vmx_set_gdt,
+	.cache_regs = vcpu_load_rsp_rip,
+	.decache_regs = vcpu_put_rsp_rip,
+	.get_rflags = vmx_get_rflags,
+	.set_rflags = vmx_set_rflags,
+
+	.tlb_flush = vmx_flush_tlb,
+
+	/* guest entry, exit handling and event injection */
+	.run = vmx_vcpu_run,
+	.handle_exit = kvm_handle_exit,
+	.skip_emulated_instruction = skip_emulated_instruction,
+	.patch_hypercall = vmx_patch_hypercall,
+	.get_irq = vmx_get_irq,
+	.set_irq = vmx_inject_irq,
+	.queue_exception = vmx_queue_exception,
+	.exception_injected = vmx_exception_injected,
+	.inject_pending_irq = vmx_intr_assist,
+	.inject_pending_vectors = do_interrupt_requests,
+
+	.set_tss_addr = vmx_set_tss_addr,
+};
+
+/*
+ * Module entry point: allocate and fill the two I/O bitmap pages, then
+ * register the VMX backend with kvm_init().
+ *
+ * Every bit in both bitmaps is set except the bit for port 0x80 in
+ * bitmap A.
+ *
+ * Returns 0 on success, -ENOMEM if a bitmap page cannot be allocated,
+ * or the error returned by kvm_init(); the pages are freed on failure.
+ */
+static int __init vmx_init(void)
+{
+	void *va;
+	int err;
+
+	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	if (!vmx_io_bitmap_a)
+		return -ENOMEM;
+
+	vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	if (!vmx_io_bitmap_b) {
+		err = -ENOMEM;
+		goto free_bitmap_a;
+	}
+
+	/*
+	 * Allow direct access to the PC debug port (it is often used for I/O
+	 * delays, but the vmexits simply slow things down).
+	 */
+	va = kmap(vmx_io_bitmap_a);
+	memset(va, 0xff, PAGE_SIZE);
+	clear_bit(0x80, va);
+	kunmap(vmx_io_bitmap_a);
+
+	va = kmap(vmx_io_bitmap_b);
+	memset(va, 0xff, PAGE_SIZE);
+	kunmap(vmx_io_bitmap_b);
+
+	err = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+	if (err)
+		goto free_bitmap_b;
+
+	if (bypass_guest_pf)
+		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
+	return 0;
+
+free_bitmap_b:
+	__free_page(vmx_io_bitmap_b);
+free_bitmap_a:
+	__free_page(vmx_io_bitmap_a);
+	return err;
+}
+
+/*
+ * Module exit point: free the two I/O bitmap pages allocated by
+ * vmx_init() and call kvm_exit().
+ */
+static void __exit vmx_exit(void)
+{
+	__free_page(vmx_io_bitmap_b);
+	__free_page(vmx_io_bitmap_a);
+
+	kvm_exit();
+}
+
+/* Register the module's init and exit entry points. */
+module_init(vmx_init)
+module_exit(vmx_exit)
diff -puN /dev/null arch/x86/kvm/vmx.h
--- /dev/null
+++ a/arch/x86/kvm/vmx.h
@@ -0,0 +1,324 @@
+#ifndef VMX_H
+#define VMX_H
+
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ *    Avi Kivity <avi@qumranet.com>
+ *    Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
+#define CPU_BASED_HLT_EXITING                   0x00000080
+#define CPU_BASED_INVLPG_EXITING                0x00000200
+#define CPU_BASED_MWAIT_EXITING                 0x00000400
+#define CPU_BASED_RDPMC_EXITING                 0x00000800
+#define CPU_BASED_RDTSC_EXITING                 0x00001000
+#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
+#define CPU_BASED_CR8_STORE_EXITING             0x00100000
+#define CPU_BASED_TPR_SHADOW                    0x00200000
+#define CPU_BASED_MOV_DR_EXITING                0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
+#define CPU_BASED_USE_IO_BITMAPS                0x02000000
+#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
+#define CPU_BASED_MONITOR_EXITING               0x20000000
+#define CPU_BASED_PAUSE_EXITING                 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
+
+
+#define PIN_BASED_EXT_INTR_MASK                 0x00000001
+#define PIN_BASED_NMI_EXITING                   0x00000008
+#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+
+#define VM_ENTRY_IA32E_MODE                     0x00000200
+#define VM_ENTRY_SMM                            0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+
+/* VMCS Encodings */
+enum vmcs_field {	/* encodings passed to the VMCS read/write helpers */
+	GUEST_ES_SELECTOR               = 0x00000800, /* 16-bit guest state */
+	GUEST_CS_SELECTOR               = 0x00000802,
+	GUEST_SS_SELECTOR               = 0x00000804,
+	GUEST_DS_SELECTOR               = 0x00000806,
+	GUEST_FS_SELECTOR               = 0x00000808,
+	GUEST_GS_SELECTOR               = 0x0000080a,
+	GUEST_LDTR_SELECTOR             = 0x0000080c,
+	GUEST_TR_SELECTOR               = 0x0000080e,
+	HOST_ES_SELECTOR                = 0x00000c00, /* 16-bit host state */
+	HOST_CS_SELECTOR                = 0x00000c02,
+	HOST_SS_SELECTOR                = 0x00000c04,
+	HOST_DS_SELECTOR                = 0x00000c06,
+	HOST_FS_SELECTOR                = 0x00000c08,
+	HOST_GS_SELECTOR                = 0x00000c0a,
+	HOST_TR_SELECTOR                = 0x00000c0c,
+	IO_BITMAP_A                     = 0x00002000, /* 64-bit control */
+	IO_BITMAP_A_HIGH                = 0x00002001, /* _HIGH = base + 1 */
+	IO_BITMAP_B                     = 0x00002002,
+	IO_BITMAP_B_HIGH                = 0x00002003,
+	MSR_BITMAP                      = 0x00002004,
+	MSR_BITMAP_HIGH                 = 0x00002005,
+	VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
+	VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
+	VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
+	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
+	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
+	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+	TSC_OFFSET                      = 0x00002010,
+	TSC_OFFSET_HIGH                 = 0x00002011,
+	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
+	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+	APIC_ACCESS_ADDR		= 0x00002014,
+	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	VMCS_LINK_POINTER               = 0x00002800, /* 64-bit guest state */
+	VMCS_LINK_POINTER_HIGH          = 0x00002801,
+	GUEST_IA32_DEBUGCTL             = 0x00002802,
+	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000, /* 32-bit control */
+	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
+	EXCEPTION_BITMAP                = 0x00004004,
+	PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
+	PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
+	CR3_TARGET_COUNT                = 0x0000400a,
+	VM_EXIT_CONTROLS                = 0x0000400c,
+	VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
+	VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
+	VM_ENTRY_CONTROLS               = 0x00004012,
+	VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
+	VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
+	VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
+	VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
+	TPR_THRESHOLD                   = 0x0000401c,
+	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
+	VM_INSTRUCTION_ERROR            = 0x00004400, /* 32-bit read-only */
+	VM_EXIT_REASON                  = 0x00004402,
+	VM_EXIT_INTR_INFO               = 0x00004404,
+	VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
+	IDT_VECTORING_INFO_FIELD        = 0x00004408,
+	IDT_VECTORING_ERROR_CODE        = 0x0000440a,
+	VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
+	VMX_INSTRUCTION_INFO            = 0x0000440e,
+	GUEST_ES_LIMIT                  = 0x00004800, /* 32-bit guest state */
+	GUEST_CS_LIMIT                  = 0x00004802,
+	GUEST_SS_LIMIT                  = 0x00004804,
+	GUEST_DS_LIMIT                  = 0x00004806,
+	GUEST_FS_LIMIT                  = 0x00004808,
+	GUEST_GS_LIMIT                  = 0x0000480a,
+	GUEST_LDTR_LIMIT                = 0x0000480c,
+	GUEST_TR_LIMIT                  = 0x0000480e,
+	GUEST_GDTR_LIMIT                = 0x00004810,
+	GUEST_IDTR_LIMIT                = 0x00004812,
+	GUEST_ES_AR_BYTES               = 0x00004814,
+	GUEST_CS_AR_BYTES               = 0x00004816,
+	GUEST_SS_AR_BYTES               = 0x00004818,
+	GUEST_DS_AR_BYTES               = 0x0000481a,
+	GUEST_FS_AR_BYTES               = 0x0000481c,
+	GUEST_GS_AR_BYTES               = 0x0000481e,
+	GUEST_LDTR_AR_BYTES             = 0x00004820,
+	GUEST_TR_AR_BYTES               = 0x00004822,
+	GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
+	GUEST_ACTIVITY_STATE            = 0x00004826,
+	GUEST_SYSENTER_CS               = 0x0000482a,
+	HOST_IA32_SYSENTER_CS           = 0x00004c00, /* 32-bit host state */
+	CR0_GUEST_HOST_MASK             = 0x00006000, /* natural-width control */
+	CR4_GUEST_HOST_MASK             = 0x00006002,
+	CR0_READ_SHADOW                 = 0x00006004,
+	CR4_READ_SHADOW                 = 0x00006006,
+	CR3_TARGET_VALUE0               = 0x00006008,
+	CR3_TARGET_VALUE1               = 0x0000600a,
+	CR3_TARGET_VALUE2               = 0x0000600c,
+	CR3_TARGET_VALUE3               = 0x0000600e,
+	EXIT_QUALIFICATION              = 0x00006400, /* natural-width r/o */
+	GUEST_LINEAR_ADDRESS            = 0x0000640a,
+	GUEST_CR0                       = 0x00006800, /* natural-width guest */
+	GUEST_CR3                       = 0x00006802,
+	GUEST_CR4                       = 0x00006804,
+	GUEST_ES_BASE                   = 0x00006806,
+	GUEST_CS_BASE                   = 0x00006808,
+	GUEST_SS_BASE                   = 0x0000680a,
+	GUEST_DS_BASE                   = 0x0000680c,
+	GUEST_FS_BASE                   = 0x0000680e,
+	GUEST_GS_BASE                   = 0x00006810,
+	GUEST_LDTR_BASE                 = 0x00006812,
+	GUEST_TR_BASE                   = 0x00006814,
+	GUEST_GDTR_BASE                 = 0x00006816,
+	GUEST_IDTR_BASE                 = 0x00006818,
+	GUEST_DR7                       = 0x0000681a,
+	GUEST_RSP                       = 0x0000681c,
+	GUEST_RIP                       = 0x0000681e,
+	GUEST_RFLAGS                    = 0x00006820,
+	GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
+	GUEST_SYSENTER_ESP              = 0x00006824,
+	GUEST_SYSENTER_EIP              = 0x00006826,
+	HOST_CR0                        = 0x00006c00, /* natural-width host */
+	HOST_CR3                        = 0x00006c02,
+	HOST_CR4                        = 0x00006c04,
+	HOST_FS_BASE                    = 0x00006c06,
+	HOST_GS_BASE                    = 0x00006c08,
+	HOST_TR_BASE                    = 0x00006c0a,
+	HOST_GDTR_BASE                  = 0x00006c0c,
+	HOST_IDTR_BASE                  = 0x00006c0e,
+	HOST_IA32_SYSENTER_ESP          = 0x00006c10,
+	HOST_IA32_SYSENTER_EIP          = 0x00006c12,
+	HOST_RSP                        = 0x00006c14,
+	HOST_RIP                        = 0x00006c16,
+};
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI       0
+#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+#define EXIT_REASON_TRIPLE_FAULT        2
+
+#define EXIT_REASON_PENDING_INTERRUPT   7
+
+#define EXIT_REASON_TASK_SWITCH         9
+#define EXIT_REASON_CPUID               10
+#define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVLPG              14
+#define EXIT_REASON_RDPMC               15
+#define EXIT_REASON_RDTSC               16
+#define EXIT_REASON_VMCALL              18
+#define EXIT_REASON_VMCLEAR             19
+#define EXIT_REASON_VMLAUNCH            20
+#define EXIT_REASON_VMPTRLD             21
+#define EXIT_REASON_VMPTRST             22
+#define EXIT_REASON_VMREAD              23
+#define EXIT_REASON_VMRESUME            24
+#define EXIT_REASON_VMWRITE             25
+#define EXIT_REASON_VMOFF               26
+#define EXIT_REASON_VMON                27
+#define EXIT_REASON_CR_ACCESS           28
+#define EXIT_REASON_DR_ACCESS           29
+#define EXIT_REASON_IO_INSTRUCTION      30
+#define EXIT_REASON_MSR_READ            31
+#define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_WBINVD		54
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
+#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
+#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+
+#define VECTORING_INFO_VECTOR_MASK           	INTR_INFO_VECTOR_MASK
+#define VECTORING_INFO_TYPE_MASK        	INTR_INFO_INTR_TYPE_MASK
+#define VECTORING_INFO_DELIEVER_CODE_MASK    	INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
+
+#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
+#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
+#define CONTROL_REG_ACCESS_REG          0xf00   /* 11:8, general purpose reg. */
+#define LMSW_SOURCE_DATA_SHIFT 16
+#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 31:16, lmsw source */
+#define REG_EAX                         (0 << 8) /* CONTROL_REG_ACCESS_REG values */
+#define REG_ECX                         (1 << 8)
+#define REG_EDX                         (2 << 8)
+#define REG_EBX                         (3 << 8)
+#define REG_ESP                         (4 << 8)
+#define REG_EBP                         (5 << 8)
+#define REG_ESI                         (6 << 8)
+#define REG_EDI                         (7 << 8)
+#define REG_R8                         (8 << 8)
+#define REG_R9                         (9 << 8)
+#define REG_R10                        (10 << 8)
+#define REG_R11                        (11 << 8)
+#define REG_R12                        (12 << 8)
+#define REG_R13                        (13 << 8)
+#define REG_R14                        (14 << 8)
+#define REG_R15                        (15 << 8)
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
+#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
+#define TYPE_MOV_TO_DR                  (0 << 4)
+#define TYPE_MOV_FROM_DR                (1 << 4)
+#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
+
+
+/* segment AR */
+#define SEGMENT_AR_L_MASK (1 << 13)
+
+#define AR_TYPE_ACCESSES_MASK 1
+#define AR_TYPE_READABLE_MASK (1 << 1)
+#define AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define AR_TYPE_CODE_MASK (1 << 3)
+#define AR_TYPE_MASK 0x0f
+#define AR_TYPE_BUSY_64_TSS 11
+#define AR_TYPE_BUSY_32_TSS 11
+#define AR_TYPE_BUSY_16_TSS 3
+#define AR_TYPE_LDT 2
+
+#define AR_UNUSABLE_MASK (1 << 16)
+#define AR_S_MASK (1 << 4)
+#define AR_P_MASK (1 << 7)
+#define AR_L_MASK (1 << 13)
+#define AR_DB_MASK (1 << 14)
+#define AR_G_MASK (1 << 15)
+#define AR_DPL_SHIFT 5
+#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
+
+#define AR_RESERVD_MASK 0xfffe0f00
+
+#define MSR_IA32_VMX_BASIC                      0x480
+#define MSR_IA32_VMX_PINBASED_CTLS              0x481
+#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
+#define MSR_IA32_VMX_EXIT_CTLS                  0x483
+#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
+#define MSR_IA32_VMX_MISC                       0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
+#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
+#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
+
+#define MSR_IA32_FEATURE_CONTROL                0x3a
+#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
+#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
+
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
+
+#endif
diff -puN /dev/null arch/x86/kvm/x86.c
--- /dev/null
+++ a/arch/x86/kvm/x86.c
@@ -0,0 +1,3287 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "segment_descriptor.h"
+#include "irq.h"
+#include "mmu.h"
+
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/msr.h>
+
+#define MAX_IO_MSRS 256
+#define CR0_RESERVED_BITS						\
+	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+#define CR4_RESERVED_BITS						\
+	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
+			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
+			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+struct kvm_x86_ops *kvm_x86_ops;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = { /* name, offset, kind */
+	{ "pf_fixed", VCPU_STAT(pf_fixed) },	/* VCPU_STAT: struct kvm_vcpu */
+	{ "pf_guest", VCPU_STAT(pf_guest) },
+	{ "tlb_flush", VCPU_STAT(tlb_flush) },
+	{ "invlpg", VCPU_STAT(invlpg) },
+	{ "exits", VCPU_STAT(exits) },
+	{ "io_exits", VCPU_STAT(io_exits) },
+	{ "mmio_exits", VCPU_STAT(mmio_exits) },
+	{ "signal_exits", VCPU_STAT(signal_exits) },
+	{ "irq_window", VCPU_STAT(irq_window_exits) },
+	{ "halt_exits", VCPU_STAT(halt_exits) },
+	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "request_irq", VCPU_STAT(request_irq_exits) },
+	{ "irq_exits", VCPU_STAT(irq_exits) },
+	{ "host_state_reload", VCPU_STAT(host_state_reload) },
+	{ "efer_reload", VCPU_STAT(efer_reload) },
+	{ "fpu_reload", VCPU_STAT(fpu_reload) },
+	{ "insn_emulation", VCPU_STAT(insn_emulation) },
+	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, /* VM_STAT: struct kvm */
+	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
+	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+	{ "mmu_flooded", VM_STAT(mmu_flooded) },
+	{ "mmu_recycled", VM_STAT(mmu_recycled) },
+	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	{ NULL }	/* presumably the end-of-table marker; confirm in the reader */
+};
+
+
+unsigned long segment_base(u16 selector)	/* base address for @selector */
+{
+	struct descriptor_table gdt;
+	struct segment_descriptor *d;
+	unsigned long table_base;	/* base of the table holding @selector */
+	unsigned long v;
+
+	if (selector == 0)
+		return 0;	/* selector 0 is reported with base 0 */
+
+	asm("sgdt %0" : "=m"(gdt));	/* GDT register of the executing cpu */
+	table_base = gdt.base;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector;
+
+		asm("sldt %0" : "=g"(ldt_selector));
+		table_base = segment_base(ldt_selector); /* recurse for the LDT */
+	}
+	d = (struct segment_descriptor *)(table_base + (selector & ~7));
+	v = d->base_low | ((unsigned long)d->base_mid << 16) |
+		((unsigned long)d->base_high << 24);	/* base bits 31:0 */
+#ifdef CONFIG_X86_64
+	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+		v |= ((unsigned long) \
+		      ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+#endif
+	return v;
+}
+EXPORT_SYMBOL_GPL(segment_base);
+
+/* Return the APIC base value held in vcpu->arch.apic_base.  The same field
+ * is read whether or not the irqchip lives in the kernel, so there is no
+ * need to test irqchip_in_kernel() here. */
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+{
+	/* TODO: reserved-bits check; @data is currently taken as-is */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		vcpu->arch.apic_base = data;	/* no in-kernel irqchip */
+	else
+		kvm_lapic_set_base(vcpu, data);	/* lapic code applies it */
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+	WARN_ON(vcpu->arch.exception.pending);	/* single slot: overwritten */
+	vcpu->arch.exception.nr = nr;
+	vcpu->arch.exception.has_error_code = false;
+	vcpu->arch.exception.pending = true;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+			   u32 error_code)
+{
+	++vcpu->stat.pf_guest;	/* counted on the double-fault path as well */
+	if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+		printk(KERN_DEBUG "kvm: inject_page_fault:"
+		       " double fault 0x%lx\n", addr);
+		vcpu->arch.exception.nr = DF_VECTOR;	/* replaces the queued #PF */
+		vcpu->arch.exception.error_code = 0;
+		return;		/* cr2 is not updated on this path */
+	}
+	vcpu->arch.cr2 = addr;	/* record the faulting address */
+	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+}
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+	WARN_ON(vcpu->arch.exception.pending);	/* single slot: overwritten */
+	vcpu->arch.exception.nr = nr;
+	vcpu->arch.exception.error_code = error_code;
+	vcpu->arch.exception.has_error_code = true;
+	vcpu->arch.exception.pending = true;
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+static void __queue_exception(struct kvm_vcpu *vcpu)	/* pass to backend */
+{
+	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+				     vcpu->arch.exception.has_error_code,
+				     vcpu->arch.exception.error_code);
+}
+
+/*
+ * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;	/* guest frame holding the table */
+	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; /* in u64 units */
+	int i;
+	int ret;
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];	/* scratch copy */
+
+	down_read(&current->mm->mmap_sem);
+	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+				  offset * sizeof(u64), sizeof(pdpte));
+	if (ret < 0) {
+		ret = 0;	/* an unreadable table counts as invalid */
+		goto out;
+	}
+	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+			ret = 0;	/* bit 0 set together with a masked bit */
+			goto out;
+		}
+	}
+	ret = 1;
+
+	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); /* commit */
+out:
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];	/* fresh copy from guest */
+	bool changed = true;	/* stays true if the guest read fails */
+	int r;
+
+	if (is_long_mode(vcpu) || !is_pae(vcpu))
+		return false;	/* compared only for PAE outside long mode */
+
+	down_read(&current->mm->mmap_sem);
+	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); /* NOTE(review): ~31u is a 32-bit mask */
+	if (r < 0)
+		goto out;
+	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+	up_read(&current->mm->mmap_sem);
+
+	return changed;
+}
+
+void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	if (cr0 & CR0_RESERVED_BITS) {	/* a bit outside the CR0 flags listed */
+		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+		       cr0, vcpu->arch.cr0);
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
+		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
+		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
+		       "and a clear PE flag\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {	/* paging being enabled */
+#ifdef CONFIG_X86_64
+		if ((vcpu->arch.shadow_efer & EFER_LME)) {
+			int cs_db, cs_l;
+
+			if (!is_pae(vcpu)) {
+				printk(KERN_DEBUG "set_cr0: #GP, start paging "
+				       "in long mode while PAE is disabled\n");
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+			if (cs_l) {
+				printk(KERN_DEBUG "set_cr0: #GP, start paging "
+				       "in long mode while CS.L == 1\n");
+				kvm_inject_gp(vcpu, 0);
+				return;
+
+			}
+		} else	/* the "if" after #endif is this else's body */
+#endif
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
+			       "reserved bits\n");
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, cr0);	/* all checks passed */
+	vcpu->arch.cr0 = cr0;
+
+	kvm_mmu_reset_context(vcpu);	/* done on every accepted write */
+	return;
+}
+EXPORT_SYMBOL_GPL(set_cr0);
+
+void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)	/* emulate LMSW */
+{	/* msw bits 3:1 replace CR0 bits 3:1; bit 0 (PE) can only be set */
+	set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0eul) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(lmsw);
+
+void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	if (cr4 & CR4_RESERVED_BITS) {	/* a bit outside the CR4 flags listed */
+		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (!(cr4 & X86_CR4_PAE)) {
+			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
+			       "in long mode\n");
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
+		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) { /* PAE turning on */
+		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (cr4 & X86_CR4_VMXE) { /* passes the reserved mask, refused here */
+		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+	kvm_x86_ops->set_cr4(vcpu, cr4);	/* all checks passed */
+	vcpu->arch.cr4 = cr4;
+	kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_GPL(set_cr4);
+
+void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+		kvm_mmu_flush_tlb(vcpu);	/* same value: only flush */
+		return;
+	}
+
+	if (is_long_mode(vcpu)) {
+		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
+			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	} else {
+		if (is_pae(vcpu)) {
+			if (cr3 & CR3_PAE_RESERVED_BITS) {
+				printk(KERN_DEBUG
+				       "set_cr3: #GP, reserved bits\n");
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
+				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
+				       "reserved bits\n");
+				kvm_inject_gp(vcpu, 0);
+				return;
+			}
+		}
+		/*
+		 * We don't check reserved bits in nonpae mode, because
+		 * this isn't enforced, and VMware depends on this.
+		 */
+	}
+
+	down_read(&current->mm->mmap_sem);
+	/*
+	 * Does the new cr3 value map to physical memory? (Note, we
+	 * catch an invalid cr3 even in real-mode, because it would
+	 * cause trouble later on when we turn on paging anyway.)
+	 *
+	 * A real CPU would silently accept an invalid cr3 and would
+	 * attempt to use it - with largely undefined (and often hard
+	 * to debug) behavior on the guest side.
+	 */
+	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+		kvm_inject_gp(vcpu, 0);	/* vcpu->arch.cr3 stays unchanged */
+	else {
+		vcpu->arch.cr3 = cr3;
+		vcpu->arch.mmu.new_cr3(vcpu);	/* notify the active mmu mode */
+	}
+	up_read(&current->mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(set_cr3);
+
+void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (!(cr8 & CR8_RESERVED_BITS)) {	/* no reserved bit set */
+		if (irqchip_in_kernel(vcpu->kvm))
+			kvm_lapic_set_tpr(vcpu, cr8);
+		else
+			vcpu->arch.cr8 = cr8;
+		return;
+	}
+	printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+	kvm_inject_gp(vcpu, 0);
+}
+EXPORT_SYMBOL_GPL(set_cr8);
+
+unsigned long get_cr8(struct kvm_vcpu *vcpu)
+{
+	/* without an in-kernel irqchip the value is kept in vcpu->arch.cr8 */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return vcpu->arch.cr8;
+	return kvm_lapic_get_cr8(vcpu);
+}
+EXPORT_SYMBOL_GPL(get_cr8);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
+ */
+static u32 msrs_to_save[] = {
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+	MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+	MSR_IA32_TIME_STAMP_COUNTER,
+};
+
+static unsigned num_msrs_to_save;	/* msrs_to_save[] entries reported */
+
+static u32 emulated_msrs[] = {	/* listed after msrs_to_save[] in the ioctl */
+	MSR_IA32_MISC_ENABLE,
+};
+
+#ifdef CONFIG_X86_64
+
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	if (efer & EFER_RESERVED_BITS) {	/* bit in the reserved mask */
+		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+		       efer);
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	if (is_paging(vcpu)
+	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+		kvm_inject_gp(vcpu, 0);
+		return;
+	}
+
+	kvm_x86_ops->set_efer(vcpu, efer);	/* backend sees the raw value */
+
+	efer &= ~EFER_LMA;	/* LMA is not taken from the written value ... */
+	efer |= vcpu->arch.shadow_efer & EFER_LMA;	/* ... shadow bit is kept */
+
+	vcpu->arch.shadow_efer = efer;
+}
+
+#endif
+
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	return kvm_x86_ops->set_msr(vcpu, msr_index, data); /* backend hook */
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	return kvm_set_msr(vcpu, index, *data);	/* @data is only read here */
+}
+
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	switch (msr) {	/* returns 0 if handled here, 1 for an unknown msr */
+#ifdef CONFIG_X86_64
+	case MSR_EFER:
+		set_efer(vcpu, data);	/* may inject #GP; 0 is still returned */
+		break;
+#endif
+	case MSR_IA32_MC0_STATUS:
+		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+		       __FUNCTION__, data);
+		break;
+	case MSR_IA32_MCG_STATUS:
+		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+			__FUNCTION__, data);
+		break;
+	case MSR_IA32_UCODE_REV:
+	case MSR_IA32_UCODE_WRITE:
+	case 0x200 ... 0x2ff: /* MTRRs */
+		break;	/* accepted and discarded */
+	case MSR_IA32_APICBASE:
+		kvm_set_apic_base(vcpu, data);
+		break;
+	case MSR_IA32_MISC_ENABLE:
+		vcpu->arch.ia32_misc_enable_msr = data;	/* stored verbatim */
+		break;
+	default:
+		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); /* backend hook */
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+	u64 data;	/* value to store in *pdata on success */
+
+	switch (msr) {	/* returns 0 if handled here, 1 for an unknown msr */
+	case 0xc0010010: /* SYSCFG */
+	case 0xc0010015: /* HWCR */
+	case MSR_IA32_PLATFORM_ID:
+	case MSR_IA32_P5_MC_ADDR:
+	case MSR_IA32_P5_MC_TYPE:
+	case MSR_IA32_MC0_CTL:
+	case MSR_IA32_MCG_STATUS:
+	case MSR_IA32_MCG_CAP:
+	case MSR_IA32_MC0_MISC:
+	case MSR_IA32_MC0_MISC+4:
+	case MSR_IA32_MC0_MISC+8:
+	case MSR_IA32_MC0_MISC+12:
+	case MSR_IA32_MC0_MISC+16:
+	case MSR_IA32_UCODE_REV:
+	case MSR_IA32_PERF_STATUS:
+	case MSR_IA32_EBL_CR_POWERON:
+		/* MTRR registers */
+	case 0xfe:
+	case 0x200 ... 0x2ff:
+		data = 0;	/* every msr listed above reads as zero */
+		break;
+	case 0xcd: /* fsb frequency */
+		data = 3;
+		break;
+	case MSR_IA32_APICBASE:
+		data = kvm_get_apic_base(vcpu);
+		break;
+	case MSR_IA32_MISC_ENABLE:
+		data = vcpu->arch.ia32_misc_enable_msr;
+		break;
+#ifdef CONFIG_X86_64
+	case MSR_EFER:
+		data = vcpu->arch.shadow_efer;
+		break;
+#endif
+	default:
+		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+		return 1;	/* *pdata is left untouched */
+	}
+	*pdata = data;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+		    struct kvm_msr_entry *entries,
+		    int (*do_msr)(struct kvm_vcpu *vcpu,
+				  unsigned index, u64 *data))
+{
+	int done = 0;	/* entries the callback has accepted so far */
+
+	vcpu_load(vcpu);
+
+	while (done < msrs->nmsrs &&
+	       !do_msr(vcpu, entries[done].index, &entries[done].data))
+		++done;	/* stops at the first entry do_msr() rejects */
+
+	vcpu_put(vcpu);
+
+	return done;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+		  int (*do_msr)(struct kvm_vcpu *vcpu,
+				unsigned index, u64 *data),
+		  int writeback)
+{
+	struct kvm_msrs msrs;	/* kernel copy of the user header */
+	struct kvm_msr_entry *entries;	/* kernel copy of the entry array */
+	int r, n;
+	unsigned size;	/* byte size of the entry array */
+
+	r = -EFAULT;
+	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+		goto out;
+
+	r = -E2BIG;
+	if (msrs.nmsrs >= MAX_IO_MSRS)	/* at most MAX_IO_MSRS - 1 entries */
+		goto out;
+
+	r = -ENOMEM;
+	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+	entries = vmalloc(size);
+	if (!entries)
+		goto out;
+
+	r = -EFAULT;
+	if (copy_from_user(entries, user_msrs->entries, size))
+		goto out_free;
+
+	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+	if (r < 0)	/* NOTE(review): __msr_io() returns a count, never < 0 */
+		goto out_free;
+
+	r = -EFAULT;
+	if (writeback && copy_to_user(user_msrs->entries, entries, size))
+		goto out_free;	/* whole array is written, not only n entries */
+
+	r = n;	/* success: number of entries processed */
+
+out_free:
+	vfree(entries);
+out:
+	return r;
+}
+
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+	struct kvm *vm;
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	spin_lock(&kvm_lock);	/* held across the whole vm_list walk */
+	list_for_each_entry(vm, &vm_list, vm_list)
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = vm->vcpus[i];
+			if (!vcpu)
+				continue;	/* unused vcpu slot */
+			/*
+			 * If the vcpu is locked, then it is running on some
+			 * other cpu and therefore it is not cached on the
+			 * cpu in question.
+			 *
+			 * If it's not locked, check the last cpu it executed
+			 * on.
+			 */
+			if (mutex_trylock(&vcpu->mutex)) {
+				if (vcpu->cpu == cpu) {
+					kvm_x86_ops->vcpu_decache(vcpu);
+					vcpu->cpu = -1;	/* no longer tied to a cpu */
+				}
+				mutex_unlock(&vcpu->mutex);
+			}
+		}
+	spin_unlock(&kvm_lock);
+}
+
+/*
+ * Report whether the capability @ext is available: returns 1 for the
+ * capabilities listed unconditionally below, the negated result of the
+ * backend's cpu_has_accelerated_tpr() hook for KVM_CAP_VAPIC, and 0 for
+ * anything else.
+ */
+int kvm_dev_ioctl_check_extension(long ext)
+{
+	switch (ext) {
+	case KVM_CAP_IRQCHIP:
+	case KVM_CAP_HLT:
+	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_SET_TSS_ADDR:
+	case KVM_CAP_EXT_CPUID:
+		return 1;
+	case KVM_CAP_VAPIC:
+		/* reported only when the backend has no accelerated TPR */
+		return !kvm_x86_ops->cpu_has_accelerated_tpr();
+	default:
+		return 0;
+	}
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long r;	/* 0 on success, negative errno otherwise */
+
+	switch (ioctl) {
+	case KVM_GET_MSR_INDEX_LIST: {	/* msrs_to_save[] then emulated_msrs[] */
+		struct kvm_msr_list __user *user_msr_list = argp;
+		struct kvm_msr_list msr_list;
+		unsigned n;	/* entries the caller's indices[] can hold */
+
+		r = -EFAULT;
+		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+			goto out;
+		n = msr_list.nmsrs;
+		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+			goto out;	/* required count is reported first */
+		r = -E2BIG;
+		if (n < msr_list.nmsrs)	/* both tables must fit, not just one */
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+				 num_msrs_to_save * sizeof(u32)))
+			goto out;
+		/* pointer arithmetic on indices is already in elements */
+		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+				 &emulated_msrs,
+				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+			goto out;
+		r = 0;
+		break;
+	}
+	default:
+		r = -EINVAL;	/* no other arch device ioctl is handled here */
+	}
+out:
+	return r;
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	kvm_x86_ops->vcpu_load(vcpu, cpu);	/* forwarded to kvm_x86_ops */
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->vcpu_put(vcpu);	/* kvm_x86_ops hook runs first */
+	kvm_put_guest_fpu(vcpu);	/* then the guest fpu is put */
+}
+
+static int is_efer_nx(void)
+{
+	u64 msr_val;
+
+	rdmsrl(MSR_EFER, msr_val);	/* current value of MSR_EFER */
+	return msr_val & EFER_NX;	/* the EFER_NX bit, masked out */
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+	const u32 nx_mask = 1 << 20;
+	struct kvm_cpuid_entry2 *ext = NULL;
+	int n;
+
+	/* Find the first entry for function 0x80000001, if there is one. */
+	for (n = 0; n < vcpu->arch.cpuid_nent && !ext; ++n)
+		if (vcpu->arch.cpuid_entries[n].function == 0x80000001)
+			ext = &vcpu->arch.cpuid_entries[n];
+
+	/* Act only if edx bit 20 is offered while is_efer_nx() is false. */
+	if (!ext || !(ext->edx & nx_mask) || is_efer_nx())
+		return;
+
+	ext->edx &= ~nx_mask;
+	printk(KERN_INFO "kvm: guest NX capability removed\n");
+}
+
+/* Old-format cpuid table from userspace: convert each entry to entry2. */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+				    struct kvm_cpuid *cpuid,
+				    struct kvm_cpuid_entry __user *entries)
+{
+	const size_t bytes = sizeof(struct kvm_cpuid_entry) * cpuid->nent;
+	struct kvm_cpuid_entry *legacy;
+	int idx, r;
+
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		return -E2BIG;
+	legacy = vmalloc(bytes);
+	if (!legacy)
+		return -ENOMEM;
+	r = -EFAULT;
+	if (copy_from_user(legacy, entries, bytes))
+		goto out_free;
+	/* Copy the fields both layouts share; clear the entry2-only ones. */
+	for (idx = 0; idx < cpuid->nent; idx++) {
+		struct kvm_cpuid_entry2 *dst = &vcpu->arch.cpuid_entries[idx];
+		const struct kvm_cpuid_entry *src = &legacy[idx];
+
+		dst->function = src->function;
+		dst->eax = src->eax;
+		dst->ebx = src->ebx;
+		dst->ecx = src->ecx;
+		dst->edx = src->edx;
+		dst->index = 0;
+		dst->flags = 0;
+		dst->padding[0] = 0;
+		dst->padding[1] = 0;
+		dst->padding[2] = 0;
+	}
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	cpuid_fix_nx_cap(vcpu);
+	r = 0;
+out_free:
+	vfree(legacy);
+	return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+				    struct kvm_cpuid2 *cpuid,
+				    struct kvm_cpuid_entry2 __user *entries)
+{
+	/*
+	 * Install cpuid->nent entries copied straight from userspace into
+	 * the vcpu's cpuid table; no per-field conversion is performed.
+	 * Returns 0, -E2BIG for too many entries, or -EFAULT.
+	 */
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		return -E2BIG;
+
+	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+		return -EFAULT;
+	/* The count is updated only once the copy has succeeded. */
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	return 0;
+}
+
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+				    struct kvm_cpuid2 *cpuid,
+				    struct kvm_cpuid_entry2 __user *entries)
+{
+	int r = -E2BIG;
+
+	/* The caller's buffer must hold every entry currently installed. */
+	if (cpuid->nent >= vcpu->arch.cpuid_nent) {
+		if (!copy_to_user(entries, &vcpu->arch.cpuid_entries,
+				  vcpu->arch.cpuid_nent *
+				  sizeof(struct kvm_cpuid_entry2)))
+			return 0;
+		r = -EFAULT;
+	}
+
+	/* Failure only: store the installed entry count in cpuid->nent. */
+	cpuid->nent = vcpu->arch.cpuid_nent;
+	return r;
+}
+
+static inline u32 bit(int bitno)
+{
+	return 1U << (bitno & 31);	/* 1U: no signed shift into bit 31 */
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			  u32 index)
+{
+	entry->function = function;
+	entry->index = index;
+	entry->flags = 0;	/* do_cpuid_ent() ORs flags in afterwards */
+	cpuid_count(function, index,	/* fills eax..edx of the entry */
+		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+}
+
+/*
+ * Fill @entry, plus the entries after it for multi-entry functions, from
+ * the host cpuid, masked to what kvm supports.  *nent counts entries used.
+ */
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			 u32 index, int *nent, int maxnent)
+{
+	const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+	const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+		bit(X86_FEATURE_PGE) | bit(X86_FEATURE_CMOV) |
+		bit(X86_FEATURE_PSE36) | bit(X86_FEATURE_MMX) |
+		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_SYSCALL) |
+		/* NX is offered only while the host has EFER.NX enabled */
+		(is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+#ifdef CONFIG_X86_64
+		bit(X86_FEATURE_LM) |
+#endif
+		bit(X86_FEATURE_MMXEXT) |
+		bit(X86_FEATURE_3DNOWEXT) | bit(X86_FEATURE_3DNOW);
+	const u32 kvm_supported_word3_x86_features =
+		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+	const u32 kvm_supported_word6_x86_features =
+		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+	/* all func 2 cpuid_count() should be called on the same cpu */
+	get_cpu();
+	do_cpuid_1_ent(entry, function, index);
+	++*nent;
+
+	switch (function) {
+	case 0:
+		entry->eax = min(entry->eax, (u32)0xb);
+		break;
+	case 1:
+		entry->edx &= kvm_supported_word0_x86_features;
+		entry->ecx &= kvm_supported_word3_x86_features;
+		break;
+	/* function 2 entries are STATEFUL: repeated cpuid commands may return
+	 * different values, hence get_cpu() before the first command; this is
+	 * emulated in kvm_emulate_cpuid() via KVM_CPUID_FLAG_STATE_READ_NEXT */
+	case 2: {
+		int t, times = entry->eax & 0xff;
+
+		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		for (t = 1; t < times && *nent < maxnent; ++t) {
+			do_cpuid_1_ent(&entry[t], function, 0);
+			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+			++*nent;
+		}
+		break;
+	}
+	/* function 4 and 0xb have additional index. */
+	case 4: {
+		int index, cache_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until cache_type is zero */
+		for (index = 1; *nent < maxnent; ++index) {
+			cache_type = entry[index - 1].eax & 0x1f;
+			if (!cache_type)
+				break;
+			do_cpuid_1_ent(&entry[index], function, index);
+			entry[index].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 0xb: {
+		int index, level_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until level_type (ecx bits 8-15) is zero */
+		for (index = 1; *nent < maxnent; ++index) {
+			level_type = entry[index - 1].ecx & 0xff00;
+			if (!level_type)
+				break;
+			do_cpuid_1_ent(&entry[index], function, index);
+			entry[index].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 0x80000000:
+		entry->eax = min(entry->eax, 0x8000001a);
+		break;
+	case 0x80000001:
+		entry->edx &= kvm_supported_word1_x86_features;
+		entry->ecx &= kvm_supported_word6_x86_features;
+		break;
+	}
+	put_cpu();
+}
+
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+				    struct kvm_cpuid2 *cpuid,
+				    struct kvm_cpuid_entry2 __user *entries)
+{
+	struct kvm_cpuid_entry2 *cpuid_entries;
+	int limit, nent = 0, r = -E2BIG;
+	u32 func;
+
+	if (cpuid->nent < 1)
+		goto out;
+	r = -ENOMEM;
+	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+	if (!cpuid_entries)
+		goto out;
+	/* Entries go to userspace whole; don't leak stale padding bytes. */
+	memset(cpuid_entries, 0, sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+
+	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[0].eax;
+	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent);
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[nent - 1].eax;
+	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent);
+	r = -EFAULT;
+	if (copy_to_user(entries, cpuid_entries,
+			nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out_free;
+	cpuid->nent = nent;
+	r = 0;
+
+out_free:
+	vfree(cpuid_entries);
+out:
+	return r;
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+				    struct kvm_lapic_state *s)
+{
+	vcpu_load(vcpu);
+	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);	/* apic -> *s */
+	vcpu_put(vcpu);
+
+	return 0;	/* always succeeds */
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+				    struct kvm_lapic_state *s)
+{
+	vcpu_load(vcpu);
+	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);	/* *s -> apic */
+	kvm_apic_post_state_restore(vcpu);	/* run after the registers change */
+	vcpu_put(vcpu);
+
+	return 0;	/* always succeeds */
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+				    struct kvm_interrupt *irq)
+{
+	if (irq->irq < 0 || irq->irq >= 256)	/* only 0..255 accepted */
+		return -EINVAL;
+	if (irqchip_in_kernel(vcpu->kvm))	/* irqchip is in-kernel */
+		return -ENXIO;
+	vcpu_load(vcpu);
+	/* Mark the irq pending and flag its word in the summary bitmap. */
+	set_bit(irq->irq, vcpu->arch.irq_pending);
+	set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+					   struct kvm_tpr_access_ctl *tac)
+{
+	if (tac->flags)		/* any nonzero flags value is rejected */
+		return -EINVAL;
+	vcpu->arch.tpr_access_reporting = !!tac->enabled;	/* stored as 0/1 */
+	return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	switch (ioctl) {
+	case KVM_GET_LAPIC: {
+		struct kvm_lapic_state lapic;
+
+		memset(&lapic, 0, sizeof lapic);
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &lapic, sizeof lapic))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_LAPIC: {
+		struct kvm_lapic_state lapic;
+
+		r = -EFAULT;
+		if (copy_from_user(&lapic, argp, sizeof lapic))
+			goto out;
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_INTERRUPT: {
+		struct kvm_interrupt irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq, argp, sizeof irq))
+			goto out;
+		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_CPUID: {	/* old-format entries */
+		struct kvm_cpuid __user *cpuid_arg = argp;
+		struct kvm_cpuid cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_SET_CPUID2: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+				cpuid_arg->entries);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_GET_CPUID2: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+				cpuid_arg->entries);
+		if (r)	/* the header is not copied back on failure */
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_GET_MSRS:	/* last argument 1: entries are written back */
+		r = msr_io(vcpu, argp, kvm_get_msr, 1);
+		break;
+	case KVM_SET_MSRS:	/* last argument 0: no write-back */
+		r = msr_io(vcpu, argp, do_set_msr, 0);
+		break;
+	case KVM_TPR_ACCESS_REPORTING: {
+		struct kvm_tpr_access_ctl tac;
+
+		r = -EFAULT;
+		if (copy_from_user(&tac, argp, sizeof tac))
+			goto out;
+		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &tac, sizeof tac))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_VAPIC_ADDR: {
+		struct kvm_vapic_addr va;
+
+		r = -EINVAL;	/* requires the in-kernel irqchip */
+		if (!irqchip_in_kernel(vcpu->kvm))
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(&va, argp, sizeof va))
+			goto out;
+		r = 0;
+		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+		break;
+	}
+	default:
+		r = -EINVAL;
+	}
+out:
+	return r;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+	/*
+	 * Bounds-check @addr, then let the kvm_x86_ops backend set it.
+	 */
+	if (addr > (unsigned int)(-3 * PAGE_SIZE))
+		return -1;
+	return kvm_x86_ops->set_tss_addr(kvm, addr);
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+					  u32 kvm_nr_mmu_pages)
+{
+	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)	/* below the minimum */
+		return -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+	/* Apply the new page count and record it as the requested value. */
+	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+	up_write(&current->mm->mmap_sem);
+	return 0;
+}
+
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+	return kvm->arch.n_alloc_mmu_pages;	/* not n_requested_mmu_pages */
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_mem_alias *a = kvm->arch.aliases;
+	struct kvm_mem_alias *end = a + kvm->arch.naliases;
+
+	/* The first alias containing @gfn wins; map into its target. */
+	for (; a < end; ++a) {
+		if (gfn < a->base_gfn || gfn >= a->base_gfn + a->npages)
+			continue;
+		return a->target_gfn + gfn - a->base_gfn;
+	}
+	return gfn;	/* not aliased */
+}
+
+/*
+ * Set a new alias region.  Aliases map a portion of physical memory into
+ * another portion.  This is useful for memory windows, for example the PC
+ * VGA region.
+ * Returns 0, or -EINVAL when @alias fails the sanity checks.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+					 struct kvm_memory_alias *alias)
+{
+	struct kvm_mem_alias *slot;
+	int top;
+
+	/* Sanity checks: page alignment, slot range, no address wraparound. */
+	if (alias->memory_size & (PAGE_SIZE - 1))
+		return -EINVAL;
+	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+		return -EINVAL;
+	if (alias->slot >= KVM_ALIAS_SLOTS)
+		return -EINVAL;
+	if (alias->guest_phys_addr + alias->memory_size
+	    < alias->guest_phys_addr)
+		return -EINVAL;
+	if (alias->target_phys_addr + alias->memory_size
+	    < alias->target_phys_addr)
+		return -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Record the slot in page-frame units. */
+	slot = &kvm->arch.aliases[alias->slot];
+	slot->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+	slot->npages = alias->memory_size >> PAGE_SHIFT;
+	slot->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+
+	/* naliases ends just past the highest slot that still has pages. */
+	top = KVM_ALIAS_SLOTS;
+	while (top > 0 && !kvm->arch.aliases[top - 1].npages)
+		--top;
+	kvm->arch.naliases = top;
+
+	/* kvm_mmu_zap_all() follows every alias update. */
+	kvm_mmu_zap_all(kvm);
+
+	up_write(&current->mm->mmap_sem);
+
+	return 0;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+	/*
+	 * Copy the in-kernel state of the chip selected by chip->chip_id
+	 * into the matching member of chip->chip.  Returns 0, or -EINVAL
+	 * for an unrecognized chip_id.
+	 */
+	switch (chip->chip_id) {
+	case KVM_IRQCHIP_PIC_MASTER:
+		/* the master is pics[0] */
+		memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0],
+		       sizeof(struct kvm_pic_state));
+		return 0;
+	case KVM_IRQCHIP_PIC_SLAVE:
+		/* the slave is pics[1] */
+		memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1],
+		       sizeof(struct kvm_pic_state));
+		return 0;
+	case KVM_IRQCHIP_IOAPIC:
+		/* the ioapic state is copied as one struct */
+		memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm),
+		       sizeof(struct kvm_ioapic_state));
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+	int r = 0;
+
+	/* Load the caller-supplied state into the chip named by chip_id. */
+	switch (chip->chip_id) {
+	case KVM_IRQCHIP_PIC_MASTER:
+		memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic,
+		       sizeof(struct kvm_pic_state));
+		break;
+	case KVM_IRQCHIP_PIC_SLAVE:
+		memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic,
+		       sizeof(struct kvm_pic_state));
+		break;
+	case KVM_IRQCHIP_IOAPIC:
+		memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic,
+		       sizeof(struct kvm_ioapic_state));
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+	/*
+	 * NOTE: this runs for every chip_id, including one rejected above.
+	 */
+	kvm_pic_update_irq(pic_irqchip(kvm));
+	return r;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.  Runs with
+ * mmap_sem held for writing; returns whatever kvm_get_dirty_log() returned.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+				      struct kvm_dirty_log *log)
+{
+	struct kvm_memory_slot *memslot;
+	int is_dirty = 0;
+	int r, n;
+
+	down_write(&current->mm->mmap_sem);
+
+	r = kvm_get_dirty_log(kvm, log, &is_dirty);
+	/* On error, or if nothing is dirty, leave the page tables alone. */
+	if (r || !is_dirty)
+		goto out;
+
+	/* Drop write access to the slot's pages and flush remote tlbs. */
+	kvm_mmu_slot_remove_write_access(kvm, log->slot);
+	kvm_flush_remote_tlbs(kvm);
+
+	/* Clear the bitmap: one bit per page, rounded up to whole longs. */
+	memslot = &kvm->memslots[log->slot];
+	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+	memset(memslot->dirty_bitmap, 0, n);
+out:
+	up_write(&current->mm->mmap_sem);
+	return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r = -EINVAL;	/* result for ioctls not handled below */
+
+	switch (ioctl) {
+	case KVM_SET_TSS_ADDR:
+		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+		if (r < 0)
+			goto out;
+		break;
+	case KVM_SET_MEMORY_REGION: {
+		struct kvm_memory_region kvm_mem;
+		struct kvm_userspace_memory_region kvm_userspace_mem;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+			goto out;
+		kvm_userspace_mem.slot = kvm_mem.slot;
+		kvm_userspace_mem.flags = kvm_mem.flags;
+		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_SET_NR_MMU_PAGES:
+		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+		if (r)
+			goto out;
+		break;
+	case KVM_GET_NR_MMU_PAGES:
+		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+		break;
+	case KVM_SET_MEMORY_ALIAS: {
+		struct kvm_memory_alias alias;
+
+		r = -EFAULT;
+		if (copy_from_user(&alias, argp, sizeof alias))
+			goto out;
+		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_CREATE_IRQCHIP:
+		r = -ENOMEM;
+		kvm->arch.vpic = kvm_create_pic(kvm);
+		if (kvm->arch.vpic) {
+			r = kvm_ioapic_init(kvm);
+			if (r) {
+				kfree(kvm->arch.vpic);
+				kvm->arch.vpic = NULL;
+				goto out;
+			}
+		} else
+			goto out;
+		break;
+	case KVM_IRQ_LINE: {
+		struct kvm_irq_level irq_event;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq_event, argp, sizeof irq_event))
+			goto out;
+		/* Like KVM_GET/SET_IRQCHIP: -ENXIO without in-kernel irqchip. */
+		r = -ENXIO;
+		if (irqchip_in_kernel(kvm)) {
+			mutex_lock(&kvm->lock);
+			if (irq_event.irq < 16)
+				kvm_pic_set_irq(pic_irqchip(kvm),
+					irq_event.irq, irq_event.level);
+			kvm_ioapic_set_irq(kvm->arch.vioapic,
+					irq_event.irq, irq_event.level);
+			mutex_unlock(&kvm->lock);
+			r = 0;
+		}
+		break;
+	}
+	case KVM_GET_IRQCHIP: {
+		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+		struct kvm_irqchip chip;
+
+		r = -EFAULT;
+		if (copy_from_user(&chip, argp, sizeof chip))
+			goto out;
+		r = -ENXIO;
+		if (!irqchip_in_kernel(kvm))
+			goto out;
+		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &chip, sizeof chip))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_IRQCHIP: {
+		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+		struct kvm_irqchip chip;
+
+		r = -EFAULT;
+		if (copy_from_user(&chip, argp, sizeof chip))
+			goto out;
+		r = -ENXIO;
+		if (!irqchip_in_kernel(kvm))
+			goto out;
+		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_GET_SUPPORTED_CPUID: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+			cpuid_arg->entries);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+			goto out;
+		r = 0;
+		break;
+	}
+	default:
+		;
+	}
+out:
+	return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+	u32 lo, hi;
+	unsigned src, dst = 0;
+
+	for (src = 0; src < ARRAY_SIZE(msrs_to_save); src++) {
+		if (rdmsr_safe(msrs_to_save[src], &lo, &hi) < 0)
+			continue;	/* read failed: drop this MSR */
+		if (dst < src)		/* close the gap left by dropped MSRs */
+			msrs_to_save[dst] = msrs_to_save[src];
+		dst++;
+	}
+	num_msrs_to_save = dst;		/* number of entries kept */
+}
+
+/*
+ * Only the apic needs a per-vcpu MMIO device hook, so test it directly.
+ * Returns its device if present and claiming @addr, else NULL.
+ */
+static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
+						gpa_t addr)
+{
+	struct kvm_io_device *apic_dev;
+
+	if (!vcpu->arch.apic)
+		return NULL;
+
+	apic_dev = &vcpu->arch.apic->dev;
+	return apic_dev->in_range(apic_dev, addr) ? apic_dev : NULL;
+}
+
+
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+						gpa_t addr)
+{
+	struct kvm_io_device *hit = vcpu_find_pervcpu_dev(vcpu, addr);
+
+	/* Per-vcpu devices take precedence over the vm-wide mmio bus. */
+	if (hit)
+		return hit;
+	return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+
+int emulator_read_std(unsigned long addr,
+			     void *val,
+			     unsigned int bytes,
+			     struct kvm_vcpu *vcpu)
+{
+	int rc = X86EMUL_CONTINUE;
+	void *dst = val;
+
+	down_read(&current->mm->mmap_sem);
+	/* Translate and copy at most one guest page's worth at a time. */
+	while (bytes) {
+		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		unsigned in_page = addr & (PAGE_SIZE-1);
+		unsigned chunk = min(bytes, (unsigned)PAGE_SIZE - in_page);
+
+		/* gva_to_gpa() found no mapping for this address. */
+		if (gpa == UNMAPPED_GVA) {
+			rc = X86EMUL_PROPAGATE_FAULT;
+			break;
+		}
+		/* The guest-physical read itself failed. */
+		if (kvm_read_guest(vcpu->kvm, gpa, dst, chunk) < 0) {
+			rc = X86EMUL_UNHANDLEABLE;
+			break;
+		}
+
+		addr += chunk;
+		dst += chunk;
+		bytes -= chunk;
+	}
+	up_read(&current->mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(emulator_read_std);
+
+static int emulator_read_emulated(unsigned long addr,
+				  void *val,
+				  unsigned int bytes,
+				  struct kvm_vcpu *vcpu)
+{
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa;
+
+	if (vcpu->mmio_read_completed) {	/* data already in mmio_data */
+		memcpy(val, vcpu->mmio_data, bytes);
+		vcpu->mmio_read_completed = 0;
+		return X86EMUL_CONTINUE;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
+
+	/* For APIC access vmexit */
+	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		goto mmio;
+	/* Try a plain guest-memory read before treating this as MMIO. */
+	if (emulator_read_std(addr, val, bytes, vcpu)
+			== X86EMUL_CONTINUE)
+		return X86EMUL_CONTINUE;
+	if (gpa == UNMAPPED_GVA)
+		return X86EMUL_PROPAGATE_FAULT;
+
+mmio:
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mutex_lock(&vcpu->kvm->lock);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
+		return X86EMUL_CONTINUE;
+	}
+	mutex_unlock(&vcpu->kvm->lock);
+	/* No in-kernel device: record the read in the vcpu's mmio fields. */
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 0;
+
+	return X86EMUL_UNHANDLEABLE;
+}
+
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       const void *val, int bytes)
+{
+	int ret;
+
+	/* Returns 1 if guest memory was written, 0 if the write failed. */
+	down_read(&current->mm->mmap_sem);
+	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+	/* A successful write is also passed to kvm_mmu_pte_write(). */
+	if (ret >= 0)
+		kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	up_read(&current->mm->mmap_sem);
+	/* Collapse the result to a boolean for the caller. */
+	return ret >= 0;
+}
+
+static int emulator_write_emulated_onepage(unsigned long addr,
+					   const void *val,
+					   unsigned int bytes,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
+
+	if (gpa == UNMAPPED_GVA) {
+		kvm_inject_page_fault(vcpu, addr, 2);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	/* For APIC access vmexit */
+	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		goto mmio;
+	/* Try a plain guest-memory write before treating this as MMIO. */
+	if (emulator_write_phys(vcpu, gpa, val, bytes))
+		return X86EMUL_CONTINUE;
+
+mmio:
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mutex_lock(&vcpu->kvm->lock);
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
+		return X86EMUL_CONTINUE;
+	}
+	mutex_unlock(&vcpu->kvm->lock);
+	/* No in-kernel device: record the write, data included. */
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 1;
+	memcpy(vcpu->mmio_data, val, bytes);
+
+	return X86EMUL_CONTINUE;	/* unlike reads, still CONTINUE */
+}
+
+int emulator_write_emulated(unsigned long addr,
+				   const void *val,
+				   unsigned int bytes,
+				   struct kvm_vcpu *vcpu)
+{
+	/* Crossing a page boundary? */
+	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+		int rc, now;
+
+		now = -addr & ~PAGE_MASK;	/* bytes up to the page end */
+		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		addr += now;	/* advance past the part already written */
+		val += now;
+		bytes -= now;
+	}
+	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+}
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
+
+static int emulator_cmpxchg_emulated(unsigned long addr,
+				     const void *old,
+				     const void *new,
+				     unsigned int bytes,
+				     struct kvm_vcpu *vcpu)
+{
+	static int reported;
+
+	if (!reported) {	/* warn once: @old is never compared */
+		reported = 1;
+		printk(KERN_WARNING "kvm: emulating exchange as write\n");
+	}
+#ifndef CONFIG_X86_64
+	/* guests cmpxchg8b have to be emulated atomically */
+	if (bytes == 8) {
+		gpa_t gpa;
+		struct page *page;
+		char *kaddr;	/* kernel mapping; must not shadow @addr */
+		u64 val;
+
+		down_read(&current->mm->mmap_sem);
+		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		/* Unmapped or APIC-page addresses take only the write below. */
+		if (gpa == UNMAPPED_GVA ||
+		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+			goto emul_write;
+		/* So do accesses that straddle a page boundary. */
+		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+			goto emul_write;
+
+		val = *(u64 *)new;
+		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+		kaddr = kmap_atomic(page, KM_USER0);
+		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+		kunmap_atomic(kaddr, KM_USER0);
+		kvm_release_page_dirty(page);
+	emul_write:
+		up_read(&current->mm->mmap_sem);
+	}
+#endif
+
+	return emulator_write_emulated(addr, new, bytes, vcpu);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+	return kvm_x86_ops->get_segment_base(vcpu, seg);	/* backend hook */
+}
+
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+{
+	return X86EMUL_CONTINUE;	/* no work is done for @address here */
+}
+
+int emulate_clts(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);	/* TS off */
+	return X86EMUL_CONTINUE;
+}
+
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+{
+	struct kvm_vcpu *vcpu = ctxt->vcpu;
+
+	/* Only dr0-dr3 can be read here; anything else is not handled. */
+	if (dr < 0 || dr > 3) {
+		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+		return X86EMUL_UNHANDLEABLE;
+	}
+
+	*dest = kvm_x86_ops->get_dr(vcpu, dr);
+	return X86EMUL_CONTINUE;
+}
+
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+{
+	int fault;
+	unsigned long keep;
+
+	/* Outside 64-bit mode only the low 32 bits of @value are passed on. */
+	keep = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
+	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & keep, &fault);
+
+	/* FIXME: better handling */
+	return fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+{
+	static int reported;
+	unsigned long rip = vcpu->arch.rip;
+	unsigned long linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+	u8 insn[4];
+
+	/* Log the first failure only; later calls return silently. */
+	if (reported)
+		return;
+
+	/* Read four bytes at cs base + rip to show in the message. */
+	emulator_read_std(linear, (void *)insn, 4, vcpu);
+	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+	       context, rip, insn[0], insn[1], insn[2], insn[3]);
+
+	reported = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+
+struct x86_emulate_ops emulate_ops = {	/* passed to x86_decode/emulate_insn */
+	.read_std            = emulator_read_std,
+	.read_emulated       = emulator_read_emulated,
+	.write_emulated      = emulator_write_emulated,
+	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+};
+
+int emulate_instruction(struct kvm_vcpu *vcpu,
+			struct kvm_run *run,
+			unsigned long cr2,
+			u16 error_code,
+			int emulation_type)
+{
+	int r;
+	struct decode_cache *c;
+
+	vcpu->arch.mmio_fault_cr2 = cr2;
+	kvm_x86_ops->cache_regs(vcpu);
+	/* Reset per-instruction state that is examined after emulation. */
+	vcpu->mmio_is_write = 0;
+	vcpu->arch.pio.string = 0;
+
+	if (!(emulation_type & EMULTYPE_NO_DECODE)) {	/* decode requested */
+		int cs_db, cs_l;
+		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+		vcpu->arch.emulate_ctxt.vcpu = vcpu;
+		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.mode =
+			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+			? X86EMUL_MODE_REAL : cs_l
+			? X86EMUL_MODE_PROT64 :	cs_db
+			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+		/* In 64-bit mode cs/ds/es/ss bases are forced to zero. */
+		if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+			vcpu->arch.emulate_ctxt.cs_base = 0;
+			vcpu->arch.emulate_ctxt.ds_base = 0;
+			vcpu->arch.emulate_ctxt.es_base = 0;
+			vcpu->arch.emulate_ctxt.ss_base = 0;
+		} else {
+			vcpu->arch.emulate_ctxt.cs_base =
+					get_segment_base(vcpu, VCPU_SREG_CS);
+			vcpu->arch.emulate_ctxt.ds_base =
+					get_segment_base(vcpu, VCPU_SREG_DS);
+			vcpu->arch.emulate_ctxt.es_base =
+					get_segment_base(vcpu, VCPU_SREG_ES);
+			vcpu->arch.emulate_ctxt.ss_base =
+					get_segment_base(vcpu, VCPU_SREG_SS);
+		}
+		/* gs and fs bases are always taken from the vcpu. */
+		vcpu->arch.emulate_ctxt.gs_base =
+					get_segment_base(vcpu, VCPU_SREG_GS);
+		vcpu->arch.emulate_ctxt.fs_base =
+					get_segment_base(vcpu, VCPU_SREG_FS);
+
+		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+		/* When emulating because of an invalid-opcode trap, reject
+		 * every instruction other than VMCALL/VMMCALL */
+		c = &vcpu->arch.emulate_ctxt.decode;
+		if ((emulation_type & EMULTYPE_TRAP_UD) &&
+		    (!(c->twobyte && c->b == 0x01 &&
+		      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
+		       c->modrm_mod == 3 && c->modrm_rm == 1)))
+			return EMULATE_FAIL;
+
+		++vcpu->stat.insn_emulation;
+		if (r)  {	/* decode failed */
+			++vcpu->stat.insn_emulation_fail;
+			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+				return EMULATE_DONE;
+			return EMULATE_FAIL;
+		}
+	}
+
+	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+	if (vcpu->arch.pio.string)
+		return EMULATE_DO_MMIO;
+	/* Describe the mmio access in @run, when one was supplied. */
+	if ((r || vcpu->mmio_is_write) && run) {
+		run->exit_reason = KVM_EXIT_MMIO;
+		run->mmio.phys_addr = vcpu->mmio_phys_addr;
+		memcpy(run->mmio.data, vcpu->mmio_data, 8);
+		run->mmio.len = vcpu->mmio_size;
+		run->mmio.is_write = vcpu->mmio_is_write;
+	}
+
+	if (r) {	/* emulation did not complete */
+		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+			return EMULATE_DONE;
+		if (!vcpu->mmio_needed) {
+			kvm_report_emulation_failure(vcpu, "mmio");
+			return EMULATE_FAIL;
+		}
+		return EMULATE_DO_MMIO;
+	}
+	/* Success: write registers and rflags back through kvm_x86_ops. */
+	kvm_x86_ops->decache_regs(vcpu);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+	if (vcpu->mmio_is_write) {
+		vcpu->mmio_needed = 0;
+		return EMULATE_DO_MMIO;
+	}
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
+
+static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
+		if (vcpu->arch.pio.guest_pages[i]) {	/* slot in use */
+			kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
+			vcpu->arch.pio.guest_pages[i] = NULL;	/* forget it */
+		}
+}
+
+static int pio_copy_data(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pio_request *io = &vcpu->arch.pio;
+	int nr_pages = io->guest_pages[1] ? 2 : 1;
+	void *host = vcpu->arch.pio_data;
+	void *map, *guest;
+	int rc = -ENOMEM;
+
+	/* Map the guest page(s) contiguously so a single memcpy suffices. */
+	map = vmap(io->guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL);
+	if (map) {
+		unsigned bytes = io->size * io->cur_count;
+
+		guest = map + io->guest_page_offset;
+		if (io->in)	/* in: pio_data -> guest pages */
+			memcpy(guest, host, bytes);
+		else		/* out: guest pages -> pio_data */
+			memcpy(host, guest, bytes);
+		vunmap(map);
+		rc = 0;
+	}
+	free_pio_guest_pages(vcpu);	/* on success and failure alike */
+	return rc;
+}
+
+int complete_pio(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pio_request *io = &vcpu->arch.pio;
+	long delta;
+	int r;
+
+	kvm_x86_ops->cache_regs(vcpu);
+
+	if (!io->string) {
+		if (io->in)	/* non-string "in": result lands in RAX */
+			memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
+			       io->size);
+	} else {
+		if (io->in) {
+			r = pio_copy_data(vcpu);
+			if (r) {
+				/* NOTE(review): cache_regs again, not decache_regs? */
+				kvm_x86_ops->cache_regs(vcpu);
+				return r;
+			}
+		}
+
+		delta = 1;
+		if (io->rep) {
+			delta *= io->cur_count;
+			/*
+			 * The size of the register should really depend on
+			 * current address size.
+			 */
+			vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+		}
+		if (io->down)	/* "down": the index register moves backwards */
+			delta = -delta;
+		delta *= io->size;
+		if (io->in)	/* "in" advances RDI, "out" advances RSI */
+			vcpu->arch.regs[VCPU_REGS_RDI] += delta;
+		else
+			vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+	}
+
+	kvm_x86_ops->decache_regs(vcpu);
+	/* Account for the elements handled in this round. */
+	io->count -= io->cur_count;
+	io->cur_count = 0;
+
+	return 0;
+}
+
+/*
+ * Perform one non-string port access against an in-kernel device.
+ * The direction, port and size come from vcpu->arch.pio; @pd is the data
+ * buffer.  The device call is made with vcpu->kvm->lock held.
+ */
+static void kernel_pio(struct kvm_io_device *pio_dev,
+		       struct kvm_vcpu *vcpu,
+		       void *pd)
+{
+	struct kvm_pio_request *io = &vcpu->arch.pio;
+
+	/* TODO: String I/O for in kernel device */
+
+	mutex_lock(&vcpu->kvm->lock);
+	if (!io->in)
+		kvm_iodevice_write(pio_dev, io->port, io->size, pd);
+	else
+		kvm_iodevice_read(pio_dev, io->port, io->size, pd);
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+/*
+ * Feed the cur_count elements stored back to back in vcpu->arch.pio_data to
+ * an in-kernel device, one io->size-byte write per element, all under
+ * vcpu->kvm->lock.
+ */
+static void pio_string_write(struct kvm_io_device *pio_dev,
+			     struct kvm_vcpu *vcpu)
+{
+	struct kvm_pio_request *io = &vcpu->arch.pio;
+	void *elem = vcpu->arch.pio_data;
+	int done = 0;
+
+	mutex_lock(&vcpu->kvm->lock);
+	while (done < io->cur_count) {
+		kvm_iodevice_write(pio_dev, io->port, io->size, elem);
+		elem += io->size;
+		done++;
+	}
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+/*
+ * Return whatever kvm_io_bus_find_dev() finds for @addr on this VM's
+ * pio_bus.  Callers in this file treat a NULL result as "no in-kernel
+ * device".
+ */
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+					       gpa_t addr)
+{
+	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+}
+
+/*
+ * Emulate a single (non-string) port I/O instruction.
+ *
+ * The request is recorded in vcpu->arch.pio and mirrored into the
+ * KVM_EXIT_IO fields of vcpu->run, the low four bytes of RAX are staged in
+ * pio_data, and the instruction is skipped.  If an in-kernel device is
+ * registered on @port the access is carried out and completed right here.
+ *
+ * Returns 1 if an in-kernel device handled the access, 0 otherwise.
+ */
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+		  int size, unsigned port)
+{
+	struct kvm_pio_request *pio = &vcpu->arch.pio;
+	struct kvm_io_device *pio_dev;
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+	pio->size = size;
+	vcpu->run->io.size = pio->size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	pio->cur_count = 1;
+	pio->count = pio->cur_count;
+	vcpu->run->io.count = pio->count;
+	pio->port = port;
+	vcpu->run->io.port = pio->port;
+	pio->in = in;
+	pio->string = 0;
+	pio->down = 0;
+	pio->guest_page_offset = 0;
+	pio->rep = 0;
+
+	kvm_x86_ops->cache_regs(vcpu);
+	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+	kvm_x86_ops->decache_regs(vcpu);
+
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+	pio_dev = vcpu_find_pio_dev(vcpu, port);
+	if (!pio_dev)
+		return 0;
+
+	kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+	complete_pio(vcpu);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+
+/*
+ * Emulate a string port I/O instruction (optionally rep-prefixed).
+ *
+ * The request is recorded in vcpu->arch.pio and in the KVM_EXIT_IO fields of
+ * vcpu->run.  At most the elements that fit in the current guest page are
+ * processed per call; an element that straddles a page boundary is handled
+ * alone with two guest pages.  The instruction is skipped only when this
+ * call covers the whole remaining count.
+ *
+ * Return value, as computed below:
+ *   1        - nothing left to do (count == 0), a #GP was injected
+ *              (direction flag set or untranslatable address), or an
+ *              in-kernel device consumed a write and no elements remain;
+ *   0        - otherwise (including the not-yet-supported in-kernel read);
+ *   -ENOMEM  - pio_copy_data() could not map the guest pages.
+ */
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+		  int size, unsigned long count, int down,
+		  gva_t address, int rep, unsigned port)
+{
+	unsigned now, in_page;
+	int i, ret = 0;
+	int nr_pages = 1;
+	struct page *page;
+	struct kvm_io_device *pio_dev;
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+	vcpu->run->io.size = vcpu->arch.pio.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->run->io.port = vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = in;
+	vcpu->arch.pio.string = 1;
+	vcpu->arch.pio.down = down;
+	vcpu->arch.pio.guest_page_offset = offset_in_page(address);
+	vcpu->arch.pio.rep = rep;
+
+	/* A zero count transfers nothing: just step over the instruction. */
+	if (!count) {
+		kvm_x86_ops->skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	/* in_page: bytes available in the current page in the walk direction */
+	if (!down)
+		in_page = PAGE_SIZE - offset_in_page(address);
+	else
+		in_page = offset_in_page(address) + size;
+	/* now: number of whole elements handled by this call */
+	now = min(count, (unsigned long)in_page / size);
+	if (!now) {
+		/*
+		 * String I/O straddles page boundary.  Pin two guest pages
+		 * so that we satisfy atomicity constraints.  Do just one
+		 * transaction to avoid complexity.
+		 */
+		nr_pages = 2;
+		now = 1;
+	}
+	if (down) {
+		/*
+		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
+		 */
+		pr_unimpl(vcpu, "guest string pio down\n");
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+	vcpu->run->io.count = now;
+	vcpu->arch.pio.cur_count = now;
+
+	/* Only step over the instruction once the final chunk is being done. */
+	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
+		kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+	/*
+	 * Translate each guest page and remember it in pio.guest_pages.  A
+	 * failed translation injects #GP and releases any page already stored.
+	 */
+	for (i = 0; i < nr_pages; ++i) {
+		down_read(&current->mm->mmap_sem);
+		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
+		vcpu->arch.pio.guest_pages[i] = page;
+		up_read(&current->mm->mmap_sem);
+		if (!page) {
+			kvm_inject_gp(vcpu, 0);
+			free_pio_guest_pages(vcpu);
+			return 1;
+		}
+	}
+
+	pio_dev = vcpu_find_pio_dev(vcpu, port);
+	if (!vcpu->arch.pio.in) {
+		/* string PIO write */
+		/* stage the guest data in pio_data (this also frees the pages) */
+		ret = pio_copy_data(vcpu);
+		if (ret >= 0 && pio_dev) {
+			pio_string_write(pio_dev, vcpu);
+			complete_pio(vcpu);
+			if (vcpu->arch.pio.count == 0)
+				ret = 1;
+		}
+	} else if (pio_dev)
+		pr_unimpl(vcpu, "no string pio read support yet, "
+		       "port %x size %d count %ld\n",
+			port, size, count);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+
+/*
+ * Register the vendor ops table passed in @opaque.
+ *
+ * Fails with -EEXIST if an ops table is already registered, with
+ * -EOPNOTSUPP if the ops report missing hardware support or that the
+ * feature is disabled by the BIOS, or with the error returned by
+ * kvm_mmu_module_init().  Returns 0 once the ops have been installed.
+ */
+int kvm_arch_init(void *opaque)
+{
+	struct kvm_x86_ops *ops = opaque;
+	int err;
+
+	if (kvm_x86_ops) {
+		printk(KERN_ERR "kvm: already loaded the other module\n");
+		return -EEXIST;
+	}
+
+	if (!ops->cpu_has_kvm_support()) {
+		printk(KERN_ERR "kvm: no hardware support\n");
+		return -EOPNOTSUPP;
+	}
+	if (ops->disabled_by_bios()) {
+		printk(KERN_ERR "kvm: disabled by bios\n");
+		return -EOPNOTSUPP;
+	}
+
+	err = kvm_mmu_module_init();
+	if (err)
+		return err;
+
+	kvm_init_msr_list();
+
+	kvm_x86_ops = ops;
+	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+	return 0;
+}
+
+/*
+ * Counterpart of kvm_arch_init(): forget the registered ops table and call
+ * kvm_mmu_module_exit().
+ */
+void kvm_arch_exit(void)
+{
+	kvm_x86_ops = NULL;
+	kvm_mmu_module_exit();
+}
+
+/*
+ * Handle a guest halt.
+ *
+ * Without an in-kernel irqchip the exit is reported as KVM_EXIT_HLT and 0 is
+ * returned.  With one, the vcpu is marked halted and blocked; afterwards 1
+ * is returned if it has become runnable again, -EINTR otherwise.
+ */
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.halt_exits;
+
+	if (!irqchip_in_kernel(vcpu->kvm)) {
+		vcpu->run->exit_reason = KVM_EXIT_HLT;
+		return 0;
+	}
+
+	vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+	kvm_vcpu_block(vcpu);
+	if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+		return 1;
+	return -EINTR;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+/*
+ * Dispatch a guest hypercall.
+ *
+ * The call number is taken from RAX and up to four arguments from RBX, RCX,
+ * RDX and RSI; outside long mode all of them are truncated to 32 bits.  The
+ * result is written back to RAX.  Only KVM_HC_VAPIC_POLL_IRQ is recognised
+ * (result 0); anything else yields -KVM_ENOSYS.  Always returns 0.
+ */
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr, a0, a1, a2, a3, ret;
+
+	kvm_x86_ops->cache_regs(vcpu);
+
+	nr = vcpu->arch.regs[VCPU_REGS_RAX];
+	a0 = vcpu->arch.regs[VCPU_REGS_RBX];
+	a1 = vcpu->arch.regs[VCPU_REGS_RCX];
+	a2 = vcpu->arch.regs[VCPU_REGS_RDX];
+	a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+
+	if (!is_long_mode(vcpu)) {
+		nr &= 0xFFFFFFFF;
+		a0 &= 0xFFFFFFFF;
+		a1 &= 0xFFFFFFFF;
+		a2 &= 0xFFFFFFFF;
+		a3 &= 0xFFFFFFFF;
+	}
+
+	if (nr == KVM_HC_VAPIC_POLL_IRQ)
+		ret = 0;
+	else
+		ret = -KVM_ENOSYS;
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = ret;
+	kvm_x86_ops->decache_regs(vcpu);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+/*
+ * Overwrite the three instruction bytes at the guest's current rip with the
+ * hypercall sequence produced by the backend's patch_hypercall hook.
+ *
+ * Returns 0 on success, -EFAULT if the emulated write does not complete.
+ */
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+	char instruction[3];
+
+	/*
+	 * Blow out the MMU to ensure that no other VCPU has an active mapping
+	 * to ensure that the updated hypercall appears atomically across all
+	 * VCPUs.
+	 */
+	kvm_mmu_zap_all(vcpu->kvm);
+
+	kvm_x86_ops->cache_regs(vcpu);
+	kvm_x86_ops->patch_hypercall(vcpu, instruction);
+	if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+	    == X86EMUL_CONTINUE)
+		return 0;
+
+	return -EFAULT;
+}
+
+/*
+ * Combine the upper 32 bits of @curr_cr with @new_val as the lower 32 bits.
+ */
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+	u64 high = curr_cr & 0xffffffff00000000ULL;
+
+	return high | new_val;
+}
+
+/* Build a descriptor_table from @limit/@base and pass it to the set_gdt hook. */
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+	struct descriptor_table dt = { limit, base };
+
+	kvm_x86_ops->set_gdt(vcpu, &dt);
+}
+
+/* Build a descriptor_table from @limit/@base and pass it to the set_idt hook. */
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+	struct descriptor_table dt = { limit, base };
+
+	kvm_x86_ops->set_idt(vcpu, &dt);
+}
+
+/*
+ * Apply @msw through lmsw() and then store the guest rflags, as reported by
+ * the backend, into *@rflags.
+ */
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+		   unsigned long *rflags)
+{
+	lmsw(vcpu, msw);
+	*rflags = kvm_x86_ops->get_rflags(vcpu);
+}
+
+/*
+ * Return the value of guest control register @cr (0, 2, 3, 4 or 8).
+ * Any other register number is logged and reads as 0.
+ */
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+	unsigned long val = 0;
+
+	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+
+	if (cr == 0)
+		val = vcpu->arch.cr0;
+	else if (cr == 2)
+		val = vcpu->arch.cr2;
+	else if (cr == 3)
+		val = vcpu->arch.cr3;
+	else if (cr == 4)
+		val = vcpu->arch.cr4;
+	else if (cr == 8)
+		val = get_cr8(vcpu);
+	else
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+
+	return val;
+}
+
+/*
+ * Write @val to guest control register @cr (0, 2, 3, 4 or 8).
+ *
+ * For CR0 and CR4 only the low 32 bits are replaced (see mk_cr_64()); for
+ * CR8 only the low four bits of @val are used.  After a CR0 write the
+ * current guest rflags are stored in *@rflags.  Any other register number
+ * is logged and ignored.
+ */
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+		     unsigned long *rflags)
+{
+	if (cr == 0) {
+		set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+		*rflags = kvm_x86_ops->get_rflags(vcpu);
+	} else if (cr == 2) {
+		vcpu->arch.cr2 = val;
+	} else if (cr == 3) {
+		set_cr3(vcpu, val);
+	} else if (cr == 4) {
+		set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+	} else if (cr == 8) {
+		set_cr8(vcpu, val & 0xfUL);
+	} else {
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+	}
+}
+
+/*
+ * Pass the KVM_CPUID_FLAG_STATE_READ_NEXT marker from entry @i to the next
+ * entry, searching circularly, that has the same function number.
+ *
+ * Returns the index of the entry that now carries the marker.  When no other
+ * entry shares the function, the search wraps around and entry @i itself is
+ * selected again.  @i must be a valid index, so the table is non-empty.
+ */
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+	int j, nent = vcpu->arch.cpuid_nent;
+
+	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+	/*
+	 * Start at the slot after i, wrapping at nent so the index never
+	 * leaves the table.  No exit condition is needed: in the worst case
+	 * the walk comes back to j == i, where the function trivially
+	 * matches.  (A "j == i" loop condition would be false on entry and
+	 * the marker would never be handed on.)
+	 */
+	for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
+		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+		if (ej->function == e->function) {
+			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+			return j;
+		}
+	}
+	return 0; /* silence gcc, even though control never reaches here */
+}
+
+/*
+ * Return 1 if @e answers a CPUID query for (@function, @index), else 0.
+ *
+ * The function number must match.  The index is compared only when the
+ * entry is flagged as having a significant index, and a stateful entry
+ * matches only while it carries the "read next" marker.
+ */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+	u32 function, u32 index)
+{
+	return e->function == function &&
+	       (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) ||
+		e->index == index) &&
+	       (!(e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) ||
+		(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT));
+}
+
+/*
+ * Emulate the CPUID instruction from the vcpu's cpuid_entries table.
+ *
+ * RAX selects the function and RCX the index.  The first matching entry
+ * wins; failing that, the entry with the highest function number in the
+ * same range (bit 31 of the function equal) is used.  RAX/RBX/RCX/RDX are
+ * loaded from the chosen entry, or left at zero if none qualifies, and the
+ * instruction is skipped.
+ */
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *e, *best = NULL;
+	u32 function, index;
+	int i;
+
+	kvm_x86_ops->cache_regs(vcpu);
+	function = vcpu->arch.regs[VCPU_REGS_RAX];
+	index = vcpu->arch.regs[VCPU_REGS_RCX];
+	vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+	vcpu->arch.regs[VCPU_REGS_RBX] = 0;
+	vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+	vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+
+	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		e = &vcpu->arch.cpuid_entries[i];
+		if (!is_matching_cpuid_entry(e, function, index)) {
+			/*
+			 * Both basic or both extended?
+			 */
+			if (((e->function ^ function) & 0x80000000) == 0 &&
+			    (!best || e->function > best->function))
+				best = e;
+			continue;
+		}
+		/* exact hit: advance the stateful marker, then stop looking */
+		if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+			move_to_next_stateful_cpuid_entry(vcpu, i);
+		best = e;
+		break;
+	}
+
+	if (best) {
+		vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
+		vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
+		vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
+		vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+	}
+	kvm_x86_ops->decache_regs(vcpu);
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
+
+/*
+ * Return 1 when userspace has asked for an interrupt window and that window
+ * is open right now, 0 otherwise.
+ *
+ * An interrupt that is already queued (irq_summary non-zero) makes the exit
+ * to userspace unnecessary, so that case reports 0 as well.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+					  struct kvm_run *kvm_run)
+{
+	if (vcpu->arch.irq_summary)
+		return 0;
+	if (!kvm_run->request_interrupt_window)
+		return 0;
+	if (!vcpu->arch.interrupt_window_open)
+		return 0;
+
+	return (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+}
+
+/*
+ * Publish vcpu state into @kvm_run before returning from the run loop: the
+ * IF flag, CR8, the APIC base, and whether an interrupt can be injected.
+ * With an in-kernel irqchip the latter is always reported as 1.
+ */
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+			      struct kvm_run *kvm_run)
+{
+	kvm_run->if_flag = !!(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF);
+	kvm_run->cr8 = get_cr8(vcpu);
+	kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		kvm_run->ready_for_interrupt_injection =
+					(vcpu->arch.interrupt_window_open &&
+					 vcpu->arch.irq_summary == 0);
+	else
+		kvm_run->ready_for_interrupt_injection = 1;
+}
+
+/*
+ * If the vcpu has a local APIC with a vapic address set, look up the guest
+ * page containing that address (under mmap_sem) and remember it in
+ * apic->vapic_page.  Otherwise do nothing.
+ */
+static void vapic_enter(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic && apic->vapic_addr) {
+		down_read(&current->mm->mmap_sem);
+		apic->vapic_page = gfn_to_page(vcpu->kvm,
+					       apic->vapic_addr >> PAGE_SHIFT);
+		up_read(&current->mm->mmap_sem);
+	}
+}
+
+/*
+ * Undo vapic_enter(): release the remembered vapic page as dirty and mark
+ * its guest frame dirty.  Does nothing when there is no local APIC or no
+ * vapic address.
+ */
+static void vapic_exit(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic && apic->vapic_addr) {
+		kvm_release_page_dirty(apic->vapic_page);
+		mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+	}
+}
+
+/*
+ * Inner guest execution loop.
+ *
+ * Repeatedly enters the guest through kvm_x86_ops->run() and hands each exit
+ * to kvm_x86_ops->handle_exit().  A positive handler result means "keep
+ * running": the loop re-enters directly (again:) or, if a reschedule is
+ * pending, via kvm_resched() (preempted:).  The function therefore only
+ * returns values <= 0: 0 when the exit must be completed by the caller,
+ * -EINTR for a pending signal or a userspace interrupt-window request, or
+ * another error from vcpu_reset / kvm_mmu_reload / the exit handler.
+ */
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r;
+
+	/* A vcpu that just received a SIPI is reset before it first runs. */
+	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		kvm_lapic_reset(vcpu);
+		r = kvm_x86_ops->vcpu_reset(vcpu);
+		if (r)
+			return r;
+		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+	}
+
+	vapic_enter(vcpu);
+
+preempted:
+	if (vcpu->guest_debug.enabled)
+		kvm_x86_ops->guest_debug_pre(vcpu);
+
+again:
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		goto out;
+
+	/* Requests that are serviced with preemption and interrupts enabled. */
+	if (vcpu->requests) {
+		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+			__kvm_migrate_apic_timer(vcpu);
+		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+				       &vcpu->requests)) {
+			kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+			r = 0;
+			goto out;
+		}
+	}
+
+	kvm_inject_pending_timer_irqs(vcpu);
+
+	preempt_disable();
+
+	kvm_x86_ops->prepare_guest_switch(vcpu);
+	kvm_load_guest_fpu(vcpu);
+
+	local_irq_disable();
+
+	/*
+	 * Last chance to back out before entering the guest.  r = 1 makes the
+	 * out: label call kvm_resched() and start over at preempted:.
+	 */
+	if (need_resched()) {
+		local_irq_enable();
+		preempt_enable();
+		r = 1;
+		goto out;
+	}
+
+	if (signal_pending(current)) {
+		local_irq_enable();
+		preempt_enable();
+		r = -EINTR;
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		++vcpu->stat.signal_exits;
+		goto out;
+	}
+
+	/* A pending exception takes precedence over interrupt injection. */
+	if (vcpu->arch.exception.pending)
+		__queue_exception(vcpu);
+	else if (irqchip_in_kernel(vcpu->kvm))
+		kvm_x86_ops->inject_pending_irq(vcpu);
+	else
+		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+
+	kvm_lapic_sync_to_vapic(vcpu);
+
+	vcpu->guest_mode = 1;
+	kvm_guest_enter();
+
+	/* TLB flush requests are honoured after guest_mode has been set. */
+	if (vcpu->requests)
+		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+			kvm_x86_ops->tlb_flush(vcpu);
+
+	kvm_x86_ops->run(vcpu, kvm_run);
+
+	vcpu->guest_mode = 0;
+	local_irq_enable();
+
+	++vcpu->stat.exits;
+
+	/*
+	 * We must have an instruction between local_irq_enable() and
+	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
+	 * the interrupt shadow.  The stat.exits increment will do nicely.
+	 * But we need to prevent reordering, hence this barrier():
+	 */
+	barrier();
+
+	kvm_guest_exit();
+
+	preempt_enable();
+
+	/*
+	 * Profile KVM exit RIPs:
+	 */
+	if (unlikely(prof_on == KVM_PROFILING)) {
+		kvm_x86_ops->cache_regs(vcpu);
+		profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+	}
+
+	/* Drop the pending mark once the backend reports the exception injected. */
+	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
+		vcpu->arch.exception.pending = false;
+
+	kvm_lapic_sync_from_vapic(vcpu);
+
+	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+
+	if (r > 0) {
+		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+			r = -EINTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.request_irq_exits;
+			goto out;
+		}
+		if (!need_resched())
+			goto again;
+	}
+
+out:
+	/* r > 0 here means "reschedule, then keep running the guest". */
+	if (r > 0) {
+		kvm_resched(vcpu);
+		goto preempted;
+	}
+
+	post_kvm_run_save(vcpu, kvm_run);
+
+	vapic_exit(vcpu);
+
+	return r;
+}
+
+/*
+ * Run the vcpu on behalf of userspace.
+ *
+ * Before entering the run loop this finishes whatever the previous exit left
+ * outstanding: a pending PIO transfer, an MMIO access whose data userspace
+ * has placed in kvm_run->mmio.data, or a hypercall result in
+ * kvm_run->hypercall.ret.  The vcpu's signal mask, if one is active, is
+ * installed for the duration of the call.
+ *
+ * Returns -EAGAIN (after blocking) for a vcpu that is still uninitialized,
+ * an error from complete_pio(), 0 when a completed MMIO needs another round
+ * trip to userspace, or the result of __vcpu_run().
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r;
+	sigset_t sigsaved;
+
+	vcpu_load(vcpu);
+
+	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+		kvm_vcpu_block(vcpu);
+		vcpu_put(vcpu);
+		return -EAGAIN;
+	}
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+	/* re-sync apic's tpr */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		set_cr8(vcpu, kvm_run->cr8);
+
+	if (vcpu->arch.pio.cur_count) {
+		r = complete_pio(vcpu);
+		if (r)
+			goto out;
+	}
+	/*
+	 * Test the config symbol with #ifdef: it is not defined at all when
+	 * the option is off, so "#if CONFIG_HAS_IOMEM" would evaluate an
+	 * undefined identifier.
+	 */
+#ifdef CONFIG_HAS_IOMEM
+	if (vcpu->mmio_needed) {
+		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+		vcpu->mmio_read_completed = 1;
+		vcpu->mmio_needed = 0;
+		r = emulate_instruction(vcpu, kvm_run,
+					vcpu->arch.mmio_fault_cr2, 0,
+					EMULTYPE_NO_DECODE);
+		if (r == EMULATE_DO_MMIO) {
+			/*
+			 * Read-modify-write.  Back to userspace.
+			 */
+			r = 0;
+			goto out;
+		}
+	}
+#endif
+	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
+		kvm_x86_ops->cache_regs(vcpu);
+		vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
+		kvm_x86_ops->decache_regs(vcpu);
+	}
+
+	r = __vcpu_run(vcpu, kvm_run);
+
+out:
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	vcpu_put(vcpu);
+	return r;
+}
+
+/*
+ * Copy the guest's general purpose registers, rip and rflags into @regs.
+ * r8-r15 are filled in only on CONFIG_X86_64 builds.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	vcpu_load(vcpu);
+
+	/* refresh vcpu->arch.regs / rip before reading them */
+	kvm_x86_ops->cache_regs(vcpu);
+
+	regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
+	regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
+	regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
+	regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
+	regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
+	regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+#ifdef CONFIG_X86_64
+	regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+	regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+	regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+	regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+	regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+	regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+	regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+	regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+#endif
+
+	regs->rip = vcpu->arch.rip;
+	regs->rflags = kvm_x86_ops->get_rflags(vcpu);
+
+	/*
+	 * Don't leak debug flags in case they were set for guest debugging
+	 */
+	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Load the guest's general purpose registers, rip and rflags from @regs.
+ * r8-r15 are taken only on CONFIG_X86_64 builds.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	vcpu_load(vcpu);
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
+	vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
+	vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
+	vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
+	vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
+	vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
+	vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
+	vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+#ifdef CONFIG_X86_64
+	vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
+	vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
+	vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
+	vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
+	vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
+	vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
+	vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
+	vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+#endif
+
+	vcpu->arch.rip = regs->rip;
+	kvm_x86_ops->set_rflags(vcpu, regs->rflags);
+
+	/* push the updated vcpu->arch.regs / rip back to the backend */
+	kvm_x86_ops->decache_regs(vcpu);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Read guest segment register @seg into @var through the backend's
+ * get_segment hook.
+ */
+static void get_segment(struct kvm_vcpu *vcpu,
+			struct kvm_segment *var, int seg)
+{
+	/* no "return <expr>" here: the function returns void */
+	kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+/*
+ * Fetch the guest's CS segment and return its db and l fields through @db
+ * and @l.
+ */
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+	struct kvm_segment cs;
+
+	get_segment(vcpu, &cs, VCPU_SREG_CS);
+	*db = cs.db;
+	*l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+/*
+ * Fill @sregs with the guest's segment registers, descriptor tables, control
+ * registers, EFER, APIC base and pending-interrupt bitmap.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	struct descriptor_table dt;
+	int pending_vec;
+
+	vcpu_load(vcpu);
+
+	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+	kvm_x86_ops->get_idt(vcpu, &dt);
+	sregs->idt.limit = dt.limit;
+	sregs->idt.base = dt.base;
+	kvm_x86_ops->get_gdt(vcpu, &dt);
+	sregs->gdt.limit = dt.limit;
+	sregs->gdt.base = dt.base;
+
+	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+	sregs->cr0 = vcpu->arch.cr0;
+	sregs->cr2 = vcpu->arch.cr2;
+	sregs->cr3 = vcpu->arch.cr3;
+	sregs->cr4 = vcpu->arch.cr4;
+	sregs->cr8 = get_cr8(vcpu);
+	sregs->efer = vcpu->arch.shadow_efer;
+	sregs->apic_base = kvm_get_apic_base(vcpu);
+
+	/*
+	 * With an in-kernel irqchip the bitmap carries at most the single
+	 * vector reported by get_irq(); otherwise it is a copy of irq_pending.
+	 */
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		memset(sregs->interrupt_bitmap, 0,
+		       sizeof sregs->interrupt_bitmap);
+		pending_vec = kvm_x86_ops->get_irq(vcpu);
+		if (pending_vec >= 0)
+			set_bit(pending_vec,
+				(unsigned long *)sregs->interrupt_bitmap);
+	} else
+		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
+		       sizeof sregs->interrupt_bitmap);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Load guest segment register @seg from @var through the backend's
+ * set_segment hook.
+ */
+static void set_segment(struct kvm_vcpu *vcpu,
+			struct kvm_segment *var, int seg)
+{
+	/* no "return <expr>" here: the function returns void */
+	kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+/*
+ * Load descriptor tables, control registers, EFER, APIC base, pending
+ * interrupts and segment registers from @sregs.
+ *
+ * The MMU context is reset if cr0, cr3, cr4 or efer differ from the values
+ * currently cached in the vcpu.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	int mmu_reset_needed = 0;
+	int i, pending_vec, max_bits;
+	struct descriptor_table dt;
+
+	vcpu_load(vcpu);
+
+	dt.limit = sregs->idt.limit;
+	dt.base = sregs->idt.base;
+	kvm_x86_ops->set_idt(vcpu, &dt);
+	dt.limit = sregs->gdt.limit;
+	dt.base = sregs->gdt.base;
+	kvm_x86_ops->set_gdt(vcpu, &dt);
+
+	vcpu->arch.cr2 = sregs->cr2;
+	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+	vcpu->arch.cr3 = sregs->cr3;
+
+	set_cr8(vcpu, sregs->cr8);
+
+	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+#ifdef CONFIG_X86_64
+	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+#endif
+	kvm_set_apic_base(vcpu, sregs->apic_base);
+
+	/* bring vcpu->arch.cr4 up to date before comparing it below */
+	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+
+	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+	vcpu->arch.cr0 = sregs->cr0;
+	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+
+	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+	if (!is_long_mode(vcpu) && is_pae(vcpu))
+		load_pdptrs(vcpu, vcpu->arch.cr3);
+
+	if (mmu_reset_needed)
+		kvm_mmu_reset_context(vcpu);
+
+	if (!irqchip_in_kernel(vcpu->kvm)) {
+		/* take the bitmap as is and rebuild the per-word summary */
+		memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
+		       sizeof vcpu->arch.irq_pending);
+		vcpu->arch.irq_summary = 0;
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
+			if (vcpu->arch.irq_pending[i])
+				__set_bit(i, &vcpu->arch.irq_summary);
+	} else {
+		/* only the lowest set vector in the bitmap is re-queued */
+		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+		pending_vec = find_first_bit(
+			(const unsigned long *)sregs->interrupt_bitmap,
+			max_bits);
+		/* Only pending external irq is handled here */
+		if (pending_vec < max_bits) {
+			kvm_x86_ops->set_irq(vcpu, pending_vec);
+			pr_debug("Set back pending irq %d\n",
+				 pending_vec);
+		}
+	}
+
+	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Forward the guest-debug settings in @dbg to the backend's set_guest_debug
+ * hook, with the vcpu loaded, and return the hook's result.
+ */
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+				    struct kvm_debug_guest *dbg)
+{
+	int r;
+
+	vcpu_load(vcpu);
+
+	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+
+	vcpu_put(vcpu);
+
+	return r;
+}
+
+/*
+ * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
+ * we have asm/x86/processor.h
+ *
+ * The get_fpu/set_fpu ioctls in this file view the vcpu's guest_fx_image
+ * through this layout.
+ */
+struct fxsave {
+	u16	cwd;		/* exchanged with kvm_fpu.fcw */
+	u16	swd;		/* exchanged with kvm_fpu.fsw */
+	u16	twd;		/* exchanged with kvm_fpu.ftwx */
+	u16	fop;		/* exchanged with kvm_fpu.last_opcode */
+	u64	rip;		/* exchanged with kvm_fpu.last_ip */
+	u64	rdp;		/* exchanged with kvm_fpu.last_dp */
+	u32	mxcsr;
+	u32	mxcsr_mask;
+	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
+};
+
+/*
+ * Translate the guest virtual address in tr->linear_address to a guest
+ * physical address using the vcpu's current MMU.
+ *
+ * tr->valid tells whether a mapping was found.  tr->writeable and
+ * tr->usermode are not derived from the translation: they are always
+ * reported as 1 and 0 respectively.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+				    struct kvm_translation *tr)
+{
+	unsigned long gva = tr->linear_address;
+	gpa_t result;
+
+	vcpu_load(vcpu);
+
+	down_read(&current->mm->mmap_sem);
+	result = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	up_read(&current->mm->mmap_sem);
+
+	tr->physical_address = result;
+	tr->valid = result != UNMAPPED_GVA;
+	tr->writeable = 1;
+	tr->usermode = 0;
+
+	vcpu_put(vcpu);
+	return 0;
+}
+
+/*
+ * Copy the FPU state held in the vcpu's guest_fx_image into @fpu.
+ * Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	/* interpret the saved image through the local struct fxsave layout */
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fpu->fpr, fxsave->st_space, 128);
+	fpu->fcw = fxsave->cwd;
+	fpu->fsw = fxsave->swd;
+	fpu->ftwx = fxsave->twd;
+	fpu->last_opcode = fxsave->fop;
+	fpu->last_ip = fxsave->rip;
+	fpu->last_dp = fxsave->rdp;
+	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Overwrite the FPU state held in the vcpu's guest_fx_image with the values
+ * in @fpu.  Always returns 0.
+ */
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	/* interpret the saved image through the local struct fxsave layout */
+	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fxsave->st_space, fpu->fpr, 128);
+	fxsave->cwd = fpu->fcw;
+	fxsave->swd = fpu->fsw;
+	fxsave->twd = fpu->ftwx;
+	fxsave->fop = fpu->last_opcode;
+	fxsave->rip = fpu->last_ip;
+	fxsave->rdp = fpu->last_dp;
+	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Give the vcpu a freshly initialised guest FPU image and set CR0.ET in the
+ * cached guest cr0.
+ */
+void fx_init(struct kvm_vcpu *vcpu)
+{
+	unsigned after_mxcsr_mask;
+
+	/* Initialize guest FPU by resetting ours and saving into guest's */
+	preempt_disable();
+	fx_save(&vcpu->arch.host_fx_image);
+	fpu_init();
+	fx_save(&vcpu->arch.guest_fx_image);
+	fx_restore(&vcpu->arch.host_fx_image);
+	preempt_enable();
+
+	vcpu->arch.cr0 |= X86_CR0_ET;
+	/* set mxcsr to 0x1f80 and zero the image from st_space to its end */
+	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+}
+EXPORT_SYMBOL_GPL(fx_init);
+
+/*
+ * Switch the FPU to guest state: save the host image and restore the guest
+ * image.  Does nothing if the vcpu's FPU is inactive or the guest state is
+ * already loaded.
+ */
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->fpu_active && !vcpu->guest_fpu_loaded) {
+		vcpu->guest_fpu_loaded = 1;
+		fx_save(&vcpu->arch.host_fx_image);
+		fx_restore(&vcpu->arch.guest_fx_image);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
+
+/*
+ * Switch the FPU back to host state: save the guest image, restore the host
+ * image and count the reload.  Does nothing if guest state is not loaded.
+ */
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_fpu_loaded) {
+		vcpu->guest_fpu_loaded = 0;
+		fx_save(&vcpu->arch.guest_fx_image);
+		fx_restore(&vcpu->arch.host_fx_image);
+		++vcpu->stat.fpu_reload;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
+
+/* Free @vcpu through the backend's vcpu_free hook. */
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->vcpu_free(vcpu);
+}
+
+/* Create vcpu number @id for @kvm through the backend's vcpu_create hook. */
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+						unsigned int id)
+{
+	return kvm_x86_ops->vcpu_create(kvm, id);
+}
+
+/*
+ * Reset a newly created vcpu and set up its MMU.
+ *
+ * Returns 0 on success.  If either step yields a negative error, the vcpu is
+ * freed through the backend's vcpu_free hook and the error is returned.
+ */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	/* We do fxsave: this must be aligned. */
+	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
+	vcpu_load(vcpu);
+	err = kvm_arch_vcpu_reset(vcpu);
+	if (!err)
+		err = kvm_mmu_setup(vcpu);
+	vcpu_put(vcpu);
+
+	if (err >= 0)
+		return 0;
+
+	kvm_x86_ops->vcpu_free(vcpu);
+	return err;
+}
+
+/*
+ * Unload the vcpu's MMU (with the vcpu loaded) and then free the vcpu
+ * through the backend's vcpu_free hook.
+ */
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_mmu_unload(vcpu);
+	vcpu_put(vcpu);
+
+	kvm_x86_ops->vcpu_free(vcpu);
+}
+
+/* Reset @vcpu through the backend's vcpu_reset hook and return its result. */
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+/* Forward to the backend's hardware_enable hook; @garbage is passed through. */
+void kvm_arch_hardware_enable(void *garbage)
+{
+	kvm_x86_ops->hardware_enable(garbage);
+}
+
+/* Forward to the backend's hardware_disable hook; @garbage is passed through. */
+void kvm_arch_hardware_disable(void *garbage)
+{
+	kvm_x86_ops->hardware_disable(garbage);
+}
+
+/* Forward to the backend's hardware_setup hook and return its result. */
+int kvm_arch_hardware_setup(void)
+{
+	return kvm_x86_ops->hardware_setup();
+}
+
+/* Forward to the backend's hardware_unsetup hook. */
+void kvm_arch_hardware_unsetup(void)
+{
+	kvm_x86_ops->hardware_unsetup();
+}
+
+/* Forward to the backend's check_processor_compatibility hook with @rtn. */
+void kvm_arch_check_processor_compat(void *rtn)
+{
+	kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+/*
+ * Architecture-specific part of vcpu initialisation.
+ *
+ * Chooses the initial mp_state, allocates the zeroed page used as pio_data,
+ * creates the MMU and, when the irqchip is in the kernel, the local APIC.
+ * On failure everything set up so far is undone in reverse order.
+ *
+ * Returns 0 on success, -ENOMEM if the page allocation fails, or the error
+ * from kvm_mmu_create() / kvm_create_lapic().
+ */
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+	struct kvm *kvm;
+	int r;
+
+	BUG_ON(vcpu->kvm == NULL);
+	kvm = vcpu->kvm;
+
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	/*
+	 * With an in-kernel irqchip only vcpu 0 starts out runnable; the
+	 * others start as uninitialized.
+	 */
+	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+	else
+		vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		r = -ENOMEM;
+		goto fail;
+	}
+	vcpu->arch.pio_data = page_address(page);
+
+	r = kvm_mmu_create(vcpu);
+	if (r < 0)
+		goto fail_free_pio_data;
+
+	if (irqchip_in_kernel(kvm)) {
+		r = kvm_create_lapic(vcpu);
+		if (r < 0)
+			goto fail_mmu_destroy;
+	}
+
+	return 0;
+
+fail_mmu_destroy:
+	kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+	free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+	return r;
+}
+
+/*
+ * Release what kvm_arch_vcpu_init() set up: the local APIC, the MMU and the
+ * pio_data page.
+ */
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kvm_free_lapic(vcpu);
+	kvm_mmu_destroy(vcpu);
+	free_page((unsigned long)vcpu->arch.pio_data);
+}
+
+/*
+ * Allocate a zeroed struct kvm and initialise its list of active MMU pages.
+ * Returns the new VM, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct kvm *kvm_arch_create_vm(void)
+{
+	struct kvm *vm;
+
+	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+	if (vm == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&vm->arch.active_mmu_pages);
+	return vm;
+}
+
+/* Call kvm_mmu_unload() on @vcpu between vcpu_load() and vcpu_put(). */
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_mmu_unload(vcpu);
+	vcpu_put(vcpu);
+}
+
+/*
+ * Free every vcpu of @kvm in two passes: first unload the MMU of each
+ * existing vcpu, then free the vcpus and clear their slots.
+ */
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+	unsigned int n;
+
+	/*
+	 * Unpin any mmu pages first.
+	 */
+	for (n = 0; n < KVM_MAX_VCPUS; ++n) {
+		if (!kvm->vcpus[n])
+			continue;
+		kvm_unload_vcpu_mmu(kvm->vcpus[n]);
+	}
+
+	for (n = 0; n < KVM_MAX_VCPUS; ++n) {
+		if (!kvm->vcpus[n])
+			continue;
+		kvm_arch_vcpu_free(kvm->vcpus[n]);
+		kvm->vcpus[n] = NULL;
+	}
+}
+
+/*
+ * Tear down a VM: free the vpic and vioapic objects, all vcpus, the guest
+ * physical memory bookkeeping, and finally the struct kvm itself.
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+	kfree(kvm->arch.vpic);
+	kfree(kvm->arch.vioapic);
+	kvm_free_vcpus(kvm);
+	kvm_free_physmem(kvm);
+	kfree(kvm);
+}
+
+/*
+ * Architecture hook run when memory slot mem->slot is created, changed or
+ * removed.  @old is a copy of the slot's previous state.
+ *
+ * For slots whose backing memory is not supplied by userspace
+ * (!@user_alloc), an anonymous shared mapping is created for a new slot, or
+ * the previous kernel-created mapping is unmapped.  Afterwards the MMU page
+ * budget is recomputed unless userspace requested a fixed number, write
+ * access is removed from the slot and remote TLBs are flushed.
+ *
+ * Returns 0, or the error encoded in a failed do_mmap() result.
+ * NOTE(review): do_mmap()/do_munmap() are called without taking mmap_sem
+ * here; presumably the caller holds it for writing - confirm.
+ */
+int kvm_arch_set_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				struct kvm_memory_slot old,
+				int user_alloc)
+{
+	int npages = mem->memory_size >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+	/*
+	 * To keep backward compatibility with older userspace,
+	 * x86 needs to handle the !user_alloc case.
+	 */
+	if (!user_alloc) {
+		if (npages && !old.rmap) {
+			memslot->userspace_addr = do_mmap(NULL, 0,
+						     npages * PAGE_SIZE,
+						     PROT_READ | PROT_WRITE,
+						     MAP_SHARED | MAP_ANONYMOUS,
+						     0);
+
+			if (IS_ERR((void *)memslot->userspace_addr))
+				return PTR_ERR((void *)memslot->userspace_addr);
+		} else {
+			if (!old.user_alloc && old.rmap) {
+				int ret;
+
+				ret = do_munmap(current->mm, old.userspace_addr,
+						old.npages * PAGE_SIZE);
+				/* a failed unmap is only reported, not propagated */
+				if (ret < 0)
+					printk(KERN_WARNING
+				       "kvm_vm_ioctl_set_memory_region: "
+				       "failed to munmap memory\n");
+			}
+		}
+	}
+
+	if (!kvm->arch.n_requested_mmu_pages) {
+		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+	}
+
+	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+/*
+ * Return non-zero if the vcpu's mp_state is RUNNABLE or SIPI_RECEIVED,
+ * 0 otherwise.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+		return 1;
+
+	return vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+}
+
+/*
+ * Cross-CPU callback used by kvm_vcpu_kick().  It has no work of its own
+ * and only logs the vcpu pointer in DEBUG builds.
+ */
+static void vcpu_kick_intr(void *info)
+{
+#ifdef DEBUG
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
+#endif
+}
+
+/*
+ * Get a vcpu's attention: wake it if it is sleeping on its wait queue, and
+ * if it is currently marked as being in guest mode, run vcpu_kick_intr() on
+ * the CPU recorded in vcpu->cpu.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	/* sample the target CPU once, before anything else is done */
+	int ipi_pcpu = vcpu->cpu;
+
+	if (waitqueue_active(&vcpu->wq)) {
+		wake_up_interruptible(&vcpu->wq);
+		++vcpu->stat.halt_wakeup;
+	}
+	if (vcpu->guest_mode)
+		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
diff -puN /dev/null arch/x86/kvm/x86_emulate.c
--- /dev/null
+++ a/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
+/******************************************************************************
+ * x86_emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ *
+ *   Avi Kivity <avi@qumranet.com>
+ *   Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
+#include <linux/kvm_host.h>
+#define DPRINTF(x...) do {} while (0)
+#endif
+#include <linux/module.h>
+#include <asm/kvm_x86_emulate.h>
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp      (1<<0)	/* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<1)	/* Register operand. */
+#define DstMem      (3<<1)	/* Memory operand. */
+#define DstMask     (3<<1)
+/* Source operand type. */
+#define SrcNone     (0<<3)	/* No source operand. */
+#define SrcImplicit (0<<3)	/* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<3)	/* Register operand. */
+#define SrcMem      (2<<3)	/* Memory operand. */
+#define SrcMem16    (3<<3)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<3)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<3)	/* Immediate operand. */
+#define SrcImmByte  (6<<3)	/* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<3)
+/* Generic ModRM decode. */
+#define ModRM       (1<<6)
+/* Destination is only written; never read. */
+#define Mov         (1<<7)
+#define BitOp       (1<<8)	/* Set on twobyte_table[0xA3/0xAB/0xB3/0xBB]. */
+#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
+#define String      (1<<10)     /* String instruction (rep capable) */
+#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+
+static u16 opcode_table[256] = {
+	/* 0x00 - 0x07 */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x08 - 0x0F */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x10 - 0x17 */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x18 - 0x1F */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x20 - 0x27 */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	SrcImmByte, SrcImm, 0, 0,
+	/* 0x28 - 0x2F */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x30 - 0x37 */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x38 - 0x3F */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+	0, 0, 0, 0,
+	/* 0x40 - 0x47 */
+	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+	/* 0x48 - 0x4F */
+	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
+	/* 0x50 - 0x57 */
+	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+	/* 0x58 - 0x5F */
+	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+	/* 0x60 - 0x67 */
+	0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+	0, 0, 0, 0,
+	/* 0x68 - 0x6F */
+	0, 0, ImplicitOps | Mov | Stack, 0,
+	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
+	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+	/* 0x70 - 0x77 */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	/* 0x78 - 0x7F */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	/* 0x80 - 0x87 */
+	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+	/* 0x88 - 0x8F */
+	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+	/* 0x90 - 0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+	/* 0xA0 - 0xA7 */
+	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	/* 0xA8 - 0xAF */
+	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	/* 0xB0 - 0xBF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xC0 - 0xC7 */
+	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+	0, ImplicitOps | Stack, 0, 0,
+	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+	/* 0xC8 - 0xCF */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xD0 - 0xD7 */
+	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+	0, 0, 0, 0,
+	/* 0xD8 - 0xDF */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xE0 - 0xE7 */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xE8 - 0xEF */
+	ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+	0, 0, 0, 0,
+	/* 0xF0 - 0xF7 */
+	0, 0, 0, 0,
+	ImplicitOps, ImplicitOps,
+	ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	/* 0xF8 - 0xFF */
+	ImplicitOps, 0, ImplicitOps, ImplicitOps,
+	0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+};
+
+static u16 twobyte_table[256] = {
+	/* 0x00 - 0x0F */
+	0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+	/* 0x10 - 0x1F */
+	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x20 - 0x2F */
+	ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x30 - 0x3F */
+	ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x40 - 0x47 */
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	/* 0x48 - 0x4F */
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	/* 0x50 - 0x5F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x60 - 0x6F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x70 - 0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0x80 - 0x8F */
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	/* 0x90 - 0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xA0 - 0xA7 */
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+	/* 0xA8 - 0xAF */
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+	/* 0xB0 - 0xB7 */
+	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
+	    DstMem | SrcReg | ModRM | BitOp,
+	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+	    DstReg | SrcMem16 | ModRM | Mov,
+	/* 0xB8 - 0xBF */
+	0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+	    DstReg | SrcMem16 | ModRM | Mov,
+	/* 0xC0 - 0xCF */
+	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xD0 - 0xDF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xE0 - 0xEF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xF0 - 0xFF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k"		/* force 32-bit operand */
+#define _STK  "%%rsp"		/* stack pointer */
+#elif defined(__i386__)
+#define _LO32 ""		/* empty: no operand modifier is applied */
+#define _STK  "%%esp"		/* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp)					\
+	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+	"movl %"_sav",%"_LO32 _tmp"; "                                  \
+	"push %"_tmp"; "                                                \
+	"push %"_tmp"; "                                                \
+	"movl %"_msk",%"_LO32 _tmp"; "                                  \
+	"andl %"_LO32 _tmp",("_STK"); "                                 \
+	"pushf; "                                                       \
+	"notl %"_LO32 _tmp"; "                                          \
+	"andl %"_LO32 _tmp",("_STK"); "                                 \
+	"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "	\
+	"pop  %"_tmp"; "                                                \
+	"orl  %"_LO32 _tmp",("_STK"); "                                 \
+	"popf; "                                                        \
+	"pop  %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+	/* _sav |= EFLAGS & _msk; */		\
+	"pushf; "				\
+	"pop  %"_tmp"; "			\
+	"andl %"_msk",%"_LO32 _tmp"; "		\
+	"orl  %"_LO32 _tmp",%"_sav"; "
+
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+	do { 								    \
+		unsigned long _tmp;					    \
+									    \
+		switch ((_dst).bytes) {					    \
+		case 2:							    \
+			__asm__ __volatile__ (				    \
+				_PRE_EFLAGS("0", "4", "2")		    \
+				_op"w %"_wx"3,%1; "			    \
+				_POST_EFLAGS("0", "4", "2")		    \
+				: "=m" (_eflags), "=m" ((_dst).val),        \
+				  "=&r" (_tmp)				    \
+				: _wy ((_src).val), "i" (EFLAGS_MASK));     \
+			break;						    \
+		case 4:							    \
+			__asm__ __volatile__ (				    \
+				_PRE_EFLAGS("0", "4", "2")		    \
+				_op"l %"_lx"3,%1; "			    \
+				_POST_EFLAGS("0", "4", "2")		    \
+				: "=m" (_eflags), "=m" ((_dst).val),	    \
+				  "=&r" (_tmp)				    \
+				: _ly ((_src).val), "i" (EFLAGS_MASK));     \
+			break;						    \
+		case 8:							    \
+			__emulate_2op_8byte(_op, _src, _dst,		    \
+					    _eflags, _qx, _qy);		    \
+			break;						    \
+		}							    \
+	} while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+	do {								     \
+		unsigned long _tmp;					     \
+		switch ((_dst).bytes) {				             \
+		case 1:							     \
+			__asm__ __volatile__ (				     \
+				_PRE_EFLAGS("0", "4", "2")		     \
+				_op"b %"_bx"3,%1; "			     \
+				_POST_EFLAGS("0", "4", "2")		     \
+				: "=m" (_eflags), "=m" ((_dst).val),	     \
+				  "=&r" (_tmp)				     \
+				: _by ((_src).val), "i" (EFLAGS_MASK));      \
+			break;						     \
+		default:						     \
+			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
+					     _wx, _wy, _lx, _ly, _qx, _qy);  \
+			break;						     \
+		}							     \
+	} while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
+	__emulate_2op(_op, _src, _dst, _eflags,				\
+		      "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
+	__emulate_2op(_op, _src, _dst, _eflags,				\
+		      "b", "q", "w", "r", _LO32, "r", "", "r")
+
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
+	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
+			     "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+	do {								\
+		unsigned long _tmp;					\
+									\
+		switch ((_dst).bytes) {				        \
+		case 1:							\
+			__asm__ __volatile__ (				\
+				_PRE_EFLAGS("0", "3", "2")		\
+				_op"b %1; "				\
+				_POST_EFLAGS("0", "3", "2")		\
+				: "=m" (_eflags), "=m" ((_dst).val),	\
+				  "=&r" (_tmp)				\
+				: "i" (EFLAGS_MASK));			\
+			break;						\
+		case 2:							\
+			__asm__ __volatile__ (				\
+				_PRE_EFLAGS("0", "3", "2")		\
+				_op"w %1; "				\
+				_POST_EFLAGS("0", "3", "2")		\
+				: "=m" (_eflags), "=m" ((_dst).val),	\
+				  "=&r" (_tmp)				\
+				: "i" (EFLAGS_MASK));			\
+			break;						\
+		case 4:							\
+			__asm__ __volatile__ (				\
+				_PRE_EFLAGS("0", "3", "2")		\
+				_op"l %1; "				\
+				_POST_EFLAGS("0", "3", "2")		\
+				: "=m" (_eflags), "=m" ((_dst).val),	\
+				  "=&r" (_tmp)				\
+				: "i" (EFLAGS_MASK));			\
+			break;						\
+		case 8:							\
+			__emulate_1op_8byte(_op, _dst, _eflags);	\
+			break;						\
+		}							\
+	} while (0)
+
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(CONFIG_X86_64)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
+	do {								  \
+		__asm__ __volatile__ (					  \
+			_PRE_EFLAGS("0", "4", "2")			  \
+			_op"q %"_qx"3,%1; "				  \
+			_POST_EFLAGS("0", "4", "2")			  \
+			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+			: _qy ((_src).val), "i" (EFLAGS_MASK));		\
+	} while (0)
+
+#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
+	do {								  \
+		__asm__ __volatile__ (					  \
+			_PRE_EFLAGS("0", "3", "2")			  \
+			_op"q %1; "					  \
+			_POST_EFLAGS("0", "3", "2")			  \
+			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+			: "i" (EFLAGS_MASK));				  \
+	} while (0)
+
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif				/* __i386__ */
+
+/* Fetch insn bytes; uses caller's rc/ctxt/ops and jumps to 'done' on error. */
+#define insn_fetch(_type, _size, _eip)                                  \
+({	unsigned long _x;						\
+	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
+	if (rc != 0)							\
+		goto done;						\
+	(_eip) += (_size);						\
+	(_type)_x;							\
+})
+
+/* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg)						\
+	((c->ad_bytes == sizeof(unsigned long)) ? 			\
+		(reg) :	((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+#define register_address(base, reg)                                     \
+	((base) + address_mask(reg))
+#define register_address_increment(reg, inc)                            \
+	do {								\
+		/* signed type ensures sign extension to long */        \
+		int _inc = (inc);					\
+		if (c->ad_bytes == sizeof(unsigned long))		\
+			(reg) += _inc;					\
+		else							\
+			(reg) = ((reg) & 				\
+				 ~((1UL << (c->ad_bytes << 3)) - 1)) |	\
+				(((reg) + _inc) &			\
+				 ((1UL << (c->ad_bytes << 3)) - 1));	\
+	} while (0)
+
+#define JMP_REL(rel) 							\
+	do {								\
+		register_address_increment(c->eip, rel);		\
+	} while (0)
+
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops,
+			      unsigned long linear, u8 *dest)
+{
+	struct fetch_cache *fc = &ctxt->decode.fetch;
+	int rc;
+	int size;
+
+	if (linear < fc->start || linear >= fc->end) {	/* not cached */
+		size = min(15UL, PAGE_SIZE - offset_in_page(linear)); /* <= page end */
+		rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+		if (rc)
+			return rc;	/* cache bounds are left unchanged */
+		fc->start = linear;
+		fc->end = linear + size;
+	}
+	*dest = fc->data[linear - fc->start];
+	return 0;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 unsigned long eip, void *dest, unsigned size)
+{
+	int rc = 0;
+
+	eip += ctxt->cs_base;	/* add the CS base to form the fetch address */
+	while (size--) {	/* one byte per call, through the fetch cache */
+		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs: if nonzero, 4..7 decode as AH,CH,DH,BH (byte 1 of regs 0..3).
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+			     int highbyte_regs)
+{
+	void *p;
+
+	p = &regs[modrm_reg];	/* default: the full register slot */
+	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+		p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+	return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops,
+			   void *ptr,
+			   u16 *size, unsigned long *address, int op_bytes)
+{
+	int rc;
+
+	if (op_bytes == 2)	/* 2-byte operand size still reads 3 bytes */
+		op_bytes = 3;
+	*address = 0;		/* clear bytes the short read below won't fill */
+	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+			   ctxt->vcpu);
+	if (rc)
+		return rc;
+	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+			   ctxt->vcpu);
+	return rc;
+}
+
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+	int rc = 0;
+
+	switch ((condition & 15) >> 1) {	/* bits 3:1 pick the flag test */
+	case 0: /* o */
+		rc |= (flags & EFLG_OF);
+		break;
+	case 1: /* b/c/nae */
+		rc |= (flags & EFLG_CF);
+		break;
+	case 2: /* z/e */
+		rc |= (flags & EFLG_ZF);
+		break;
+	case 3: /* be/na */
+		rc |= (flags & (EFLG_CF|EFLG_ZF));
+		break;
+	case 4: /* s */
+		rc |= (flags & EFLG_SF);
+		break;
+	case 5: /* p/pe */
+		rc |= (flags & EFLG_PF);
+		break;
+	case 7: /* le/ng */
+		rc |= (flags & EFLG_ZF);
+		/* fall through */
+	case 6: /* l/nge */
+		rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); /* SF != OF */
+		break;
+	}
+
+	/* Odd condition identifiers (lsb == 1) have inverted sense. */
+	return (!!rc ^ (condition & 1));
+}
+
+static void decode_register_operand(struct operand *op,
+				    struct decode_cache *c,
+				    int inhibit_bytereg)
+{
+	unsigned reg = c->modrm_reg;
+	int highbyte_regs = c->rex_prefix == 0;	/* AH..BH only without REX */
+
+	if (!(c->d & ModRM))	/* no ModRM: low 3 opcode bits + REX bit 0 */
+		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+	op->type = OP_REG;
+	if ((c->d & ByteOp) && !inhibit_bytereg) {
+		op->ptr = decode_register(reg, c->regs, highbyte_regs);
+		op->val = *(u8 *)op->ptr;
+		op->bytes = 1;
+	} else {
+		op->ptr = decode_register(reg, c->regs, 0);
+		op->bytes = c->op_bytes;
+		switch (op->bytes) {	/* load exactly op->bytes of the reg */
+		case 2:
+			op->val = *(u16 *)op->ptr;
+			break;
+		case 4:
+			op->val = *(u32 *)op->ptr;
+			break;
+		case 8:
+			op->val = *(u64 *) op->ptr;
+			break;
+		}
+	}
+	op->orig_val = op->val;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+			struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u8 sib;
+	int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+	int rc = 0;
+
+	if (c->rex_prefix) {
+		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
+		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+	}
+
+	c->modrm = insn_fetch(u8, 1, c->eip);
+	c->modrm_mod |= (c->modrm & 0xc0) >> 6;	/* bits 7:6 */
+	c->modrm_reg |= (c->modrm & 0x38) >> 3;	/* bits 5:3, OR'd onto REX.R */
+	c->modrm_rm |= (c->modrm & 0x07);	/* bits 2:0, OR'd onto REX.B */
+	c->modrm_ea = 0;
+	c->use_modrm_ea = 1;
+
+	if (c->modrm_mod == 3) {	/* rm names a register: no address */
+		c->modrm_val = *(unsigned long *)
+			decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+		return rc;
+	}
+
+	if (c->ad_bytes == 2) {
+		unsigned bx = c->regs[VCPU_REGS_RBX];
+		unsigned bp = c->regs[VCPU_REGS_RBP];
+		unsigned si = c->regs[VCPU_REGS_RSI];
+		unsigned di = c->regs[VCPU_REGS_RDI];
+
+		/* 16-bit ModR/M decode. */
+		switch (c->modrm_mod) {
+		case 0:
+			if (c->modrm_rm == 6)	/* disp16 only, no base */
+				c->modrm_ea += insn_fetch(u16, 2, c->eip);
+			break;
+		case 1:
+			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			break;
+		case 2:
+			c->modrm_ea += insn_fetch(u16, 2, c->eip);
+			break;
+		}
+		switch (c->modrm_rm) {
+		case 0:
+			c->modrm_ea += bx + si;
+			break;
+		case 1:
+			c->modrm_ea += bx + di;
+			break;
+		case 2:
+			c->modrm_ea += bp + si;
+			break;
+		case 3:
+			c->modrm_ea += bp + di;
+			break;
+		case 4:
+			c->modrm_ea += si;
+			break;
+		case 5:
+			c->modrm_ea += di;
+			break;
+		case 6:
+			if (c->modrm_mod != 0)
+				c->modrm_ea += bp;
+			break;
+		case 7:
+			c->modrm_ea += bx;
+			break;
+		}
+		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+		    (c->modrm_rm == 6 && c->modrm_mod != 0))
+			if (!c->override_base)	/* bp-based: default to SS */
+				c->override_base = &ctxt->ss_base;
+		c->modrm_ea = (u16)c->modrm_ea;	/* wrap to 16 bits */
+	} else {
+		/* 32/64-bit ModR/M decode. */
+		switch (c->modrm_rm) {
+		case 4:
+		case 12:	/* rm 4, with or without REX.B: SIB follows */
+			sib = insn_fetch(u8, 1, c->eip);
+			index_reg |= (sib >> 3) & 7;
+			base_reg |= sib & 7;
+			scale = sib >> 6;
+
+			switch (base_reg) {
+			case 5:
+				if (c->modrm_mod != 0)
+					c->modrm_ea += c->regs[base_reg];
+				else	/* mod 0, base 5: disp32, no base */
+					c->modrm_ea +=
+						insn_fetch(s32, 4, c->eip);
+				break;
+			default:
+				c->modrm_ea += c->regs[base_reg];
+			}
+			switch (index_reg) {
+			case 4:	/* index 4: no index register is added */
+				break;
+			default:
+				c->modrm_ea += c->regs[index_reg] << scale;
+			}
+			break;
+		case 5:
+			if (c->modrm_mod != 0)
+				c->modrm_ea += c->regs[c->modrm_rm];
+			else if (ctxt->mode == X86EMUL_MODE_PROT64)
+				rip_relative = 1;	/* resolved below */
+			break;
+		default:
+			c->modrm_ea += c->regs[c->modrm_rm];
+			break;
+		}
+		switch (c->modrm_mod) {
+		case 0:
+			if (c->modrm_rm == 5)
+				c->modrm_ea += insn_fetch(s32, 4, c->eip);
+			break;
+		case 1:
+			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			break;
+		case 2:
+			c->modrm_ea += insn_fetch(s32, 4, c->eip);
+			break;
+		}
+	}
+	if (rip_relative) {	/* eip so far, plus the immediate's length */
+		c->modrm_ea += c->eip;
+		switch (c->d & SrcMask) {
+		case SrcImmByte:
+			c->modrm_ea += 1;
+			break;
+		case SrcImm:
+			if (c->d & ByteOp)
+				c->modrm_ea += 1;
+			else
+				if (c->op_bytes == 8)	/* imm is 4 bytes */
+					c->modrm_ea += 4;
+				else
+					c->modrm_ea += c->op_bytes;
+		}
+	}
+done:
+	return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+		      struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = 0;
+
+	switch (c->ad_bytes) {	/* the address is ad_bytes wide */
+	case 2:
+		c->modrm_ea = insn_fetch(u16, 2, c->eip);
+		break;
+	case 4:
+		c->modrm_ea = insn_fetch(u32, 4, c->eip);
+		break;
+	case 8:
+		c->modrm_ea = insn_fetch(u64, 8, c->eip);
+		break;
+	}
+done:	/* reached normally and from a failed insn_fetch() */
+	return rc;
+}
+
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = 0;
+	int mode = ctxt->mode;
+	int def_op_bytes, def_ad_bytes;
+
+	/* Shadow copy of register state. Committed on successful emulation. */
+
+	memset(c, 0, sizeof(struct decode_cache));
+	c->eip = ctxt->vcpu->arch.rip;
+	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+	switch (mode) {
+	case X86EMUL_MODE_REAL:
+	case X86EMUL_MODE_PROT16:
+		def_op_bytes = def_ad_bytes = 2;
+		break;
+	case X86EMUL_MODE_PROT32:
+		def_op_bytes = def_ad_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86EMUL_MODE_PROT64:
+		def_op_bytes = 4;
+		def_ad_bytes = 8;
+		break;
+#endif
+	default:
+		return -1;	/* mode not handled by this build */
+	}
+
+	c->op_bytes = def_op_bytes;
+	c->ad_bytes = def_ad_bytes;
+
+	/* Legacy prefixes. */
+	for (;;) {
+		switch (c->b = insn_fetch(u8, 1, c->eip)) {
+		case 0x66:	/* operand-size override */
+			/* switch between 2/4 bytes */
+			c->op_bytes = def_op_bytes ^ 6;
+			break;
+		case 0x67:	/* address-size override */
+			if (mode == X86EMUL_MODE_PROT64)
+				/* switch between 4/8 bytes */
+				c->ad_bytes = def_ad_bytes ^ 12;
+			else
+				/* switch between 2/4 bytes */
+				c->ad_bytes = def_ad_bytes ^ 6;
+			break;
+		case 0x2e:	/* CS override */
+			c->override_base = &ctxt->cs_base;
+			break;
+		case 0x3e:	/* DS override */
+			c->override_base = &ctxt->ds_base;
+			break;
+		case 0x26:	/* ES override */
+			c->override_base = &ctxt->es_base;
+			break;
+		case 0x64:	/* FS override */
+			c->override_base = &ctxt->fs_base;
+			break;
+		case 0x65:	/* GS override */
+			c->override_base = &ctxt->gs_base;
+			break;
+		case 0x36:	/* SS override */
+			c->override_base = &ctxt->ss_base;
+			break;
+		case 0x40 ... 0x4f: /* REX */
+			if (mode != X86EMUL_MODE_PROT64)
+				goto done_prefixes;	/* an opcode here */
+			c->rex_prefix = c->b;
+			continue;	/* skips the rex_prefix reset below */
+		case 0xf0:	/* LOCK */
+			c->lock_prefix = 1;
+			break;
+		case 0xf2:	/* REPNE/REPNZ */
+			c->rep_prefix = REPNE_PREFIX;
+			break;
+		case 0xf3:	/* REP/REPE/REPZ */
+			c->rep_prefix = REPE_PREFIX;
+			break;
+		default:
+			goto done_prefixes;
+		}
+
+		/* Any legacy prefix after a REX prefix nullifies its effect. */
+
+		c->rex_prefix = 0;
+	}
+
+done_prefixes:
+
+	/* REX prefix. */
+	if (c->rex_prefix)
+		if (c->rex_prefix & 8)
+			c->op_bytes = 8;	/* REX.W */
+
+	/* Opcode byte(s). */
+	c->d = opcode_table[c->b];
+	if (c->d == 0) {
+		/* Two-byte opcode? */
+		if (c->b == 0x0f) {
+			c->twobyte = 1;
+			c->b = insn_fetch(u8, 1, c->eip);
+			c->d = twobyte_table[c->b];
+		}
+
+		/* Unrecognised? */
+		if (c->d == 0) {
+			DPRINTF("Cannot emulate %02x\n", c->b);
+			return -1;
+		}
+	}
+
+	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+		c->op_bytes = 8;	/* stack ops are 8 bytes in 64-bit mode */
+
+	/* ModRM and SIB bytes. */
+	if (c->d & ModRM)
+		rc = decode_modrm(ctxt, ops);
+	else if (c->d & MemAbs)
+		rc = decode_abs(ctxt, ops);
+	if (rc)
+		goto done;
+
+	if (!c->override_base)	/* no segment chosen so far: use DS */
+		c->override_base = &ctxt->ds_base;
+	if (mode == X86EMUL_MODE_PROT64 &&	/* 64-bit: only FS/GS bases apply */
+	    c->override_base != &ctxt->fs_base &&
+	    c->override_base != &ctxt->gs_base)
+		c->override_base = NULL;
+
+	if (c->override_base)
+		c->modrm_ea += *c->override_base;
+
+	if (c->ad_bytes != 8)	/* truncate to 32 bits unless 8-byte addressing */
+		c->modrm_ea = (u32)c->modrm_ea;
+	/*
+	 * Decode and fetch the source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & SrcMask) {
+	case SrcNone:
+		break;
+	case SrcReg:
+		decode_register_operand(&c->src, c, 0);
+		break;
+	case SrcMem16:
+		c->src.bytes = 2;
+		goto srcmem_common;
+	case SrcMem32:
+		c->src.bytes = 4;
+		goto srcmem_common;
+	case SrcMem:
+		c->src.bytes = (c->d & ByteOp) ? 1 :
+							   c->op_bytes;
+		/* Don't fetch the address for invlpg: it could be unmapped. */
+		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+			break;
+	srcmem_common:
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->src.type = OP_REG;
+			break;
+		}
+		c->src.type = OP_MEM;	/* only the type is set here */
+		break;
+	case SrcImm:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if (c->src.bytes == 8)	/* 8-byte ops take a 4-byte immediate */
+			c->src.bytes = 4;
+		/* NB. Immediates are sign-extended as necessary. */
+		switch (c->src.bytes) {
+		case 1:
+			c->src.val = insn_fetch(s8, 1, c->eip);
+			break;
+		case 2:
+			c->src.val = insn_fetch(s16, 2, c->eip);
+			break;
+		case 4:
+			c->src.val = insn_fetch(s32, 4, c->eip);
+			break;
+		}
+		break;
+	case SrcImmByte:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = 1;
+		c->src.val = insn_fetch(s8, 1, c->eip);
+		break;
+	}
+
+	/* Decode and fetch the destination operand: register or memory. */
+	switch (c->d & DstMask) {
+	case ImplicitOps:
+		/* Special instructions do their own operand decoding. */
+		return 0;
+	case DstReg:	/* 0f b6/b7: never decode dst as a byte register */
+		decode_register_operand(&c->dst, c,
+			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+		break;
+	case DstMem:
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.type = OP_REG;
+			break;
+		}
+		c->dst.type = OP_MEM;
+		break;
+	}
+
+done:	/* NB: only X86EMUL_UNHANDLEABLE becomes -1; any other rc returns 0 */
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.type  = OP_MEM;	/* only dst is set up; no store done here */
+	c->dst.bytes = c->op_bytes;
+	c->dst.val = c->src.val;	/* the value to push is src.val */
+	register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+	c->dst.ptr = (void *) register_address(ctxt->ss_base,
+					       c->regs[VCPU_REGS_RSP]);
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	rc = ops->read_std(register_address(ctxt->ss_base,
+					    c->regs[VCPU_REGS_RSP]),
+			   &c->dst.val, c->dst.bytes, ctxt->vcpu);
+	if (rc != 0)	/* RSP is left unchanged when the read fails */
+		return rc;
+
+	register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+	return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	switch (c->modrm_reg) {	/* ModRM reg field picks the rotate/shift */
+	case 0:	/* rol */
+		emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+		break;
+	case 1:	/* ror */
+		emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+		break;
+	case 2:	/* rcl */
+		emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+		break;
+	case 3:	/* rcr */
+		emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+		break;
+	case 4:	/* sal/shl */
+	case 6:	/* sal/shl (handled identically to case 4) */
+		emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+		break;
+	case 5:	/* shr */
+		emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+		break;
+	case 7:	/* sar */
+		emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+		break;
+	}
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+			       struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = 0;
+
+	switch (c->modrm_reg) {
+	case 0 ... 1:	/* test */
+		/*
+		 * Special case in Grp3: test has an immediate
+		 * source operand.
+		 */
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if (c->src.bytes == 8)	/* 8-byte ops take a 4-byte immediate */
+			c->src.bytes = 4;
+		switch (c->src.bytes) {
+		case 1:
+			c->src.val = insn_fetch(s8, 1, c->eip);
+			break;
+		case 2:
+			c->src.val = insn_fetch(s16, 2, c->eip);
+			break;
+		case 4:
+			c->src.val = insn_fetch(s32, 4, c->eip);
+			break;
+		}
+		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+		break;
+	case 2:	/* not */
+		c->dst.val = ~c->dst.val;	/* done in C; eflags untouched */
+		break;
+	case 3:	/* neg */
+		emulate_1op("neg", c->dst, ctxt->eflags);
+		break;
+	default:
+		DPRINTF("Cannot emulate %02x\n", c->b);
+		rc = X86EMUL_UNHANDLEABLE;
+		break;
+	}
+done:	/* also the target of a failed insn_fetch() above */
+	return rc;
+}
+/* Grp4/Grp5: inc, dec, jmp abs (0xff only) or push of c->dst by modrm_reg. */
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+			       struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	switch (c->modrm_reg) {
+	case 0:	/* inc */
+		emulate_1op("inc", c->dst, ctxt->eflags);
+		break;
+	case 1:	/* dec */
+		emulate_1op("dec", c->dst, ctxt->eflags);
+		break;
+	case 4: /* jmp abs */
+		if (c->b == 0xff)
+			c->eip = c->dst.val;
+		else {
+			DPRINTF("Cannot emulate %02x\n", c->b);
+			return X86EMUL_UNHANDLEABLE;
+		}
+		break;
+	case 6:	/* push */
+
+		/* 64-bit mode: PUSH always pushes a 64-bit operand. */
+
+		if (ctxt->mode == X86EMUL_MODE_PROT64) {
+			c->dst.bytes = 8;
+			rc = ops->read_std((unsigned long)c->dst.ptr,
+					   &c->dst.val, 8, ctxt->vcpu);
+			if (rc != 0)
+				return rc;
+		}
+		register_address_increment(c->regs[VCPU_REGS_RSP],
+					   -c->dst.bytes);
+		rc = ops->write_emulated(register_address(ctxt->ss_base,
+				    c->regs[VCPU_REGS_RSP]), &c->dst.val,
+				    c->dst.bytes, ctxt->vcpu);
+		if (rc != 0)
+			return rc;
+		c->dst.type = OP_NONE;	/* stored to the stack above */
+		break;
+	default:
+		DPRINTF("Cannot emulate %02x\n", c->b);
+		return X86EMUL_UNHANDLEABLE;
+	}
+	return 0;
+}
+/* Grp9 (cmpxchg8b): 8-byte compare of memop with EDX:EAX, store of ECX:EBX. */
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+			       struct x86_emulate_ops *ops,
+			       unsigned long memop)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u64 old, new;
+	int rc;
+
+	rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+	if (rc != 0)
+		return rc;
+
+	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+		/* Mismatch: return the value read in EDX:EAX, clear ZF. */
+		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+		ctxt->eflags &= ~EFLG_ZF;
+
+	} else {
+		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+		       (u32) c->regs[VCPU_REGS_RBX];
+		/* Match: store ECX:EBX via the cmpxchg callback, set ZF. */
+		rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+		if (rc != 0)
+			return rc;
+		ctxt->eflags |= EFLG_ZF;
+	}
+	return 0;
+}
+/* Commit c->dst: to a register, or to memory (cmpxchg if lock_prefix). */
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+			    struct x86_emulate_ops *ops)
+{
+	int rc;
+	struct decode_cache *c = &ctxt->decode;
+
+	switch (c->dst.type) {
+	case OP_REG:
+		/* The 4-byte case *is* correct:
+		 * in 64-bit mode we zero-extend.
+		 */
+		switch (c->dst.bytes) {
+		case 1:
+			*(u8 *)c->dst.ptr = (u8)c->dst.val;
+			break;
+		case 2:
+			*(u16 *)c->dst.ptr = (u16)c->dst.val;
+			break;
+		case 4:
+			*c->dst.ptr = (u32)c->dst.val;
+			break;	/* 64b: zero-ext */
+		case 8:
+			*c->dst.ptr = c->dst.val;
+			break;
+		}
+		break;
+	case OP_MEM:
+		if (c->lock_prefix)
+			rc = ops->cmpxchg_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.orig_val,
+					&c->dst.val,
+					c->dst.bytes,
+					ctxt->vcpu);
+		else
+			rc = ops->write_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.val,
+					c->dst.bytes,
+					ctxt->vcpu);
+		if (rc != 0)
+			return rc;
+		break;
+	case OP_NONE:
+		/* no writeback */
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+/* Execute the decoded insn; returns 0, or -1 with c->eip rolled back. */
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+	unsigned long memop = 0;
+	u64 msr_data;
+	unsigned long saved_eip = 0;
+	struct decode_cache *c = &ctxt->decode;
+	int rc = 0;
+
+	/* Shadow copy of register state. Committed on successful emulation.
+	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+	 * modify them.
+	 */
+
+	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	saved_eip = c->eip;
+
+	if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+		memop = c->modrm_ea;
+
+	if (c->rep_prefix && (c->d & String)) {
+		/* All REP prefixes have the same first termination condition */
+		if (c->regs[VCPU_REGS_RCX] == 0) {
+			ctxt->vcpu->arch.rip = c->eip;
+			goto done;
+		}
+		/* The second termination condition only applies to REPE
+		 * and REPNE, and only for the opcodes checked below
+		 * (0xa6/0xa7 and 0xae/0xaf). For those, stop repeating
+		 * according to the prefix and the current ZF:
+		 * 	- if REPE/REPZ and ZF = 0 then done
+		 * 	- if REPNE/REPNZ and ZF = 1 then done
+		 */
+		if ((c->b == 0xa6) || (c->b == 0xa7) ||
+				(c->b == 0xae) || (c->b == 0xaf)) {
+			if ((c->rep_prefix == REPE_PREFIX) &&
+				((ctxt->eflags & EFLG_ZF) == 0)) {
+					ctxt->vcpu->arch.rip = c->eip;
+					goto done;
+			}
+			if ((c->rep_prefix == REPNE_PREFIX) &&
+				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+				ctxt->vcpu->arch.rip = c->eip;
+				goto done;
+			}
+		}
+		c->regs[VCPU_REGS_RCX]--;
+		c->eip = ctxt->vcpu->arch.rip;
+	}
+
+	if (c->src.type == OP_MEM) {
+		c->src.ptr = (unsigned long *)memop;
+		c->src.val = 0;
+		rc = ops->read_emulated((unsigned long)c->src.ptr,
+					&c->src.val,
+					c->src.bytes,
+					ctxt->vcpu);
+		if (rc != 0)
+			goto done;
+		c->src.orig_val = c->src.val;
+	}
+
+	if ((c->d & DstMask) == ImplicitOps)
+		goto special_insn;
+
+	/* Set up a memory destination; its old value is read unless Mov. */
+	if (c->dst.type == OP_MEM) {
+		c->dst.ptr = (unsigned long *)memop;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.val = 0;
+		if (c->d & BitOp) {
+			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+			c->dst.ptr = (void *)c->dst.ptr +
+						   (c->src.val & mask) / 8;
+		}
+		if (!(c->d & Mov) &&
+				   /* optimisation - avoid slow emulated read */
+		    ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+					   &c->dst.val,
+					  c->dst.bytes, ctxt->vcpu)) != 0))
+			goto done;
+	}
+	c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+	if (c->twobyte)
+		goto twobyte_insn;
+
+	switch (c->b) {
+	case 0x00 ... 0x05:
+	      add:		/* add */
+		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x08 ... 0x0d:
+	      or:		/* or */
+		emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x10 ... 0x15:
+	      adc:		/* adc */
+		emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x18 ... 0x1d:
+	      sbb:		/* sbb */
+		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x20 ... 0x23:
+	      and:		/* and */
+		emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x24:              /* and al imm8 */
+		c->dst.type = OP_REG;
+		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		c->dst.val = *(u8 *)c->dst.ptr;
+		c->dst.bytes = 1;
+		c->dst.orig_val = c->dst.val;
+		goto and;
+	case 0x25:              /* and ax imm16, or eax imm32 */
+		c->dst.type = OP_REG;
+		c->dst.bytes = c->op_bytes;
+		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		if (c->op_bytes == 2)
+			c->dst.val = *(u16 *)c->dst.ptr;
+		else
+			c->dst.val = *(u32 *)c->dst.ptr;
+		c->dst.orig_val = c->dst.val;
+		goto and;
+	case 0x28 ... 0x2d:
+	      sub:		/* sub */
+		emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x30 ... 0x35:
+	      xor:		/* xor */
+		emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x38 ... 0x3d:
+	      cmp:		/* cmp */
+		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x40 ... 0x47: /* inc r16/r32 */
+		emulate_1op("inc", c->dst, ctxt->eflags);
+		break;
+	case 0x48 ... 0x4f: /* dec r16/r32 */
+		emulate_1op("dec", c->dst, ctxt->eflags);
+		break;
+	case 0x50 ... 0x57:  /* push reg */
+		c->dst.type  = OP_MEM;
+		c->dst.bytes = c->op_bytes;
+		c->dst.val = c->src.val;
+		register_address_increment(c->regs[VCPU_REGS_RSP],
+					   -c->op_bytes);
+		c->dst.ptr = (void *) register_address(
+			ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+		break;
+	case 0x58 ... 0x5f: /* pop reg */
+	pop_instruction:
+		if ((rc = ops->read_std(register_address(ctxt->ss_base,
+			c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+			c->op_bytes, ctxt->vcpu)) != 0)
+			goto done;
+
+		register_address_increment(c->regs[VCPU_REGS_RSP],
+					   c->op_bytes);
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0x63:		/* movsxd */
+		if (ctxt->mode != X86EMUL_MODE_PROT64)
+			goto cannot_emulate;
+		c->dst.val = (s32) c->src.val;
+		break;
+	case 0x6a: /* push imm8 */
+		c->src.val = 0L;
+		c->src.val = insn_fetch(s8, 1, c->eip);
+		emulate_push(ctxt);
+		break;
+	case 0x6c:		/* insb */
+	case 0x6d:		/* insw/insd */
+		 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+				1,
+				(c->d & ByteOp) ? 1 : c->op_bytes,
+				c->rep_prefix ?
+				address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+				(ctxt->eflags & EFLG_DF),
+				register_address(ctxt->es_base,
+						 c->regs[VCPU_REGS_RDI]),
+				c->rep_prefix,
+				c->regs[VCPU_REGS_RDX]) == 0) {
+			c->eip = saved_eip;
+			return -1;
+		}
+		return 0;
+	case 0x6e:		/* outsb */
+	case 0x6f:		/* outsw/outsd */
+		if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+				0,
+				(c->d & ByteOp) ? 1 : c->op_bytes,
+				c->rep_prefix ?
+				address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+				(ctxt->eflags & EFLG_DF),
+				register_address(c->override_base ?
+							*c->override_base :
+							ctxt->ds_base,
+						 c->regs[VCPU_REGS_RSI]),
+				c->rep_prefix,
+				c->regs[VCPU_REGS_RDX]) == 0) {
+			c->eip = saved_eip;
+			return -1;
+		}
+		return 0;
+	case 0x70 ... 0x7f: /* jcc (short) */ {
+		int rel = insn_fetch(s8, 1, c->eip);
+
+		if (test_cc(c->b, ctxt->eflags))
+			JMP_REL(rel);
+		break;
+	}
+	case 0x80 ... 0x83:	/* Grp1 */
+		switch (c->modrm_reg) {
+		case 0:
+			goto add;
+		case 1:
+			goto or;
+		case 2:
+			goto adc;
+		case 3:
+			goto sbb;
+		case 4:
+			goto and;
+		case 5:
+			goto sub;
+		case 6:
+			goto xor;
+		case 7:
+			goto cmp;
+		}
+		break;
+	case 0x84 ... 0x85:	/* test */
+		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0x86 ... 0x87:	/* xchg */
+		/* Write back the register source. */
+		switch (c->dst.bytes) {
+		case 1:
+			*(u8 *) c->src.ptr = (u8) c->dst.val;
+			break;
+		case 2:
+			*(u16 *) c->src.ptr = (u16) c->dst.val;
+			break;
+		case 4:
+			*c->src.ptr = (u32) c->dst.val;
+			break;	/* 64b reg: zero-extend */
+		case 8:
+			*c->src.ptr = c->dst.val;
+			break;
+		}
+		/*
+		 * Write back the memory destination with implicit LOCK
+		 * prefix.
+		 */
+		c->dst.val = c->src.val;
+		c->lock_prefix = 1;
+		break;
+	case 0x88 ... 0x8b:	/* mov */
+		goto mov;
+	case 0x8d: /* lea r16/r32, m */
+		c->dst.val = c->modrm_val;
+		break;
+	case 0x8f:		/* pop (sole member of Grp1a) */
+		rc = emulate_grp1a(ctxt, ops);
+		if (rc != 0)
+			goto done;
+		break;
+	case 0x9c: /* pushf */
+		c->src.val =  (unsigned long) ctxt->eflags;
+		emulate_push(ctxt);
+		break;
+	case 0x9d: /* popf */
+		c->dst.ptr = (unsigned long *) &ctxt->eflags;
+		goto pop_instruction;
+	case 0xa0 ... 0xa1:	/* mov */
+		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+		c->dst.val = c->src.val;
+		break;
+	case 0xa2 ... 0xa3:	/* mov */
+		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+		break;
+	case 0xa4 ... 0xa5:	/* movs */
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)register_address(
+						   ctxt->es_base,
+						   c->regs[VCPU_REGS_RDI]);
+		if ((rc = ops->read_emulated(register_address(
+		      c->override_base ? *c->override_base :
+					ctxt->ds_base,
+					c->regs[VCPU_REGS_RSI]),
+					&c->dst.val,
+					c->dst.bytes, ctxt->vcpu)) != 0)
+			goto done;
+		register_address_increment(c->regs[VCPU_REGS_RSI],
+				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+							   : c->dst.bytes);
+		register_address_increment(c->regs[VCPU_REGS_RDI],
+				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+							   : c->dst.bytes);
+		break;
+	case 0xa6 ... 0xa7:	/* cmps */
+		c->src.type = OP_NONE; /* Disable writeback. */
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = (unsigned long *)register_address(
+				c->override_base ? *c->override_base :
+						   ctxt->ds_base,
+						   c->regs[VCPU_REGS_RSI]);
+		if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
+						&c->src.val,
+						c->src.bytes,
+						ctxt->vcpu)) != 0)
+			goto done;
+
+		c->dst.type = OP_NONE; /* Disable writeback. */
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)register_address(
+						   ctxt->es_base,
+						   c->regs[VCPU_REGS_RDI]);
+		if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+						&c->dst.val,
+						c->dst.bytes,
+						ctxt->vcpu)) != 0)
+			goto done;
+
+		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+		register_address_increment(c->regs[VCPU_REGS_RSI],
+				       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+								  : c->src.bytes);
+		register_address_increment(c->regs[VCPU_REGS_RDI],
+				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+								  : c->dst.bytes);
+
+		break;
+	case 0xaa ... 0xab:	/* stos */
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)register_address(
+						   ctxt->es_base,
+						   c->regs[VCPU_REGS_RDI]);
+		c->dst.val = c->regs[VCPU_REGS_RAX];
+		register_address_increment(c->regs[VCPU_REGS_RDI],
+				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+							   : c->dst.bytes);
+		break;
+	case 0xac ... 0xad:	/* lods */
+		c->dst.type = OP_REG;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+		if ((rc = ops->read_emulated(register_address(
+				c->override_base ? *c->override_base :
+						   ctxt->ds_base,
+						 c->regs[VCPU_REGS_RSI]),
+						 &c->dst.val,
+						 c->dst.bytes,
+						 ctxt->vcpu)) != 0)
+			goto done;
+		register_address_increment(c->regs[VCPU_REGS_RSI],
+				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+							   : c->dst.bytes);
+		break;
+	case 0xae ... 0xaf:	/* scas */
+		DPRINTF("Urk! I don't handle SCAS.\n");
+		goto cannot_emulate;
+	case 0xc0 ... 0xc1:	/* Grp2 */
+		emulate_grp2(ctxt);
+		break;
+	case 0xc3: /* ret */
+		c->dst.ptr = &c->eip;
+		goto pop_instruction;
+	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
+	mov:
+		c->dst.val = c->src.val;
+		break;
+	case 0xd0 ... 0xd1:	/* Grp2 */
+		c->src.val = 1;
+		emulate_grp2(ctxt);
+		break;
+	case 0xd2 ... 0xd3:	/* Grp2 */
+		c->src.val = c->regs[VCPU_REGS_RCX];
+		emulate_grp2(ctxt);
+		break;
+	case 0xe8: /* call (near) */ {
+		long int rel;
+		switch (c->op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, c->eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, c->eip);
+			break;
+		default:
+			DPRINTF("Call: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		c->src.val = (unsigned long) c->eip;
+		JMP_REL(rel);
+		c->op_bytes = c->ad_bytes;
+		emulate_push(ctxt);
+		break;
+	}
+	case 0xe9: /* jmp rel */
+	case 0xeb: /* jmp rel short */
+		JMP_REL(c->src.val);
+		c->dst.type = OP_NONE; /* Disable writeback. */
+		break;
+	case 0xf4:              /* hlt */
+		ctxt->vcpu->arch.halt_request = 1;
+		goto done;
+	case 0xf5:	/* cmc */
+		/* complement carry flag from eflags reg */
+		ctxt->eflags ^= EFLG_CF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xf6 ... 0xf7:	/* Grp3 */
+		rc = emulate_grp3(ctxt, ops);
+		if (rc != 0)
+			goto done;
+		break;
+	case 0xf8: /* clc */
+		ctxt->eflags &= ~EFLG_CF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfa: /* cli */
+		ctxt->eflags &= ~X86_EFLAGS_IF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfb: /* sti */
+		ctxt->eflags |= X86_EFLAGS_IF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfe ... 0xff:	/* Grp4/Grp5 */
+		rc = emulate_grp45(ctxt, ops);
+		if (rc != 0)
+			goto done;
+		break;
+	}
+
+writeback:
+	rc = writeback(ctxt, ops);
+	if (rc != 0)
+		goto done;
+
+	/* Commit shadow register state. */
+	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+	ctxt->vcpu->arch.rip = c->eip;
+
+done:
+	if (rc == X86EMUL_UNHANDLEABLE) {
+		c->eip = saved_eip;
+		return -1;
+	}
+	return 0;
+
+twobyte_insn:
+	switch (c->b) {
+	case 0x01: /* vmcall, lgdt, lidt/vmmcall, smsw, lmsw, invlpg */
+		switch (c->modrm_reg) {
+			u16 size;
+			unsigned long address;
+
+		case 0: /* vmcall */
+			if (c->modrm_mod != 3 || c->modrm_rm != 1)
+				goto cannot_emulate;
+
+			rc = kvm_fix_hypercall(ctxt->vcpu);
+			if (rc)
+				goto done;
+
+			kvm_emulate_hypercall(ctxt->vcpu);
+			break;
+		case 2: /* lgdt */
+			rc = read_descriptor(ctxt, ops, c->src.ptr,
+					     &size, &address, c->op_bytes);
+			if (rc)
+				goto done;
+			realmode_lgdt(ctxt->vcpu, size, address);
+			break;
+		case 3: /* lidt/vmmcall */
+			if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+				rc = kvm_fix_hypercall(ctxt->vcpu);
+				if (rc)
+					goto done;
+				kvm_emulate_hypercall(ctxt->vcpu);
+			} else {
+				rc = read_descriptor(ctxt, ops, c->src.ptr,
+						     &size, &address,
+						     c->op_bytes);
+				if (rc)
+					goto done;
+				realmode_lidt(ctxt->vcpu, size, address);
+			}
+			break;
+		case 4: /* smsw */
+			if (c->modrm_mod != 3)
+				goto cannot_emulate;
+			*(u16 *)&c->regs[c->modrm_rm]
+				= realmode_get_cr(ctxt->vcpu, 0);
+			break;
+		case 6: /* lmsw */
+			if (c->modrm_mod != 3)
+				goto cannot_emulate;
+			realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+						  &ctxt->eflags);
+			break;
+		case 7: /* invlpg */
+			emulate_invlpg(ctxt->vcpu, memop);
+			break;
+		default:
+			goto cannot_emulate;
+		}
+		/* Disable writeback. */
+		c->dst.type = OP_NONE;
+		break;
+	case 0x06:		/* clts */
+		emulate_clts(ctxt->vcpu);
+		c->dst.type = OP_NONE;
+		break;
+	case 0x08:		/* invd */
+	case 0x09:		/* wbinvd */
+	case 0x0d:		/* GrpP (prefetch) */
+	case 0x18:		/* Grp16 (prefetch/nop) */
+		c->dst.type = OP_NONE;
+		break;
+	case 0x20: /* mov from cr to reg */
+		if (c->modrm_mod != 3)
+			goto cannot_emulate;
+		c->regs[c->modrm_rm] =
+				realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+		c->dst.type = OP_NONE;	/* no writeback */
+		break;
+	case 0x21: /* mov from dr to reg */
+		if (c->modrm_mod != 3)
+			goto cannot_emulate;
+		rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+		if (rc)
+			goto cannot_emulate;
+		c->dst.type = OP_NONE;	/* no writeback */
+		break;
+	case 0x22: /* mov from reg to cr */
+		if (c->modrm_mod != 3)
+			goto cannot_emulate;
+		realmode_set_cr(ctxt->vcpu,
+				c->modrm_reg, c->modrm_val, &ctxt->eflags);
+		c->dst.type = OP_NONE;
+		break;
+	case 0x23: /* mov from reg to dr */
+		if (c->modrm_mod != 3)
+			goto cannot_emulate;
+		rc = emulator_set_dr(ctxt, c->modrm_reg,
+				     c->regs[c->modrm_rm]);
+		if (rc)
+			goto cannot_emulate;
+		c->dst.type = OP_NONE;	/* no writeback */
+		break;
+	case 0x30:
+		/* wrmsr */
+		msr_data = (u32)c->regs[VCPU_REGS_RAX]
+			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
+		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+		if (rc) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			c->eip = ctxt->vcpu->arch.rip;
+		}
+		rc = X86EMUL_CONTINUE;
+		c->dst.type = OP_NONE;
+		break;
+	case 0x32:
+		/* rdmsr */
+		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+		if (rc) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			c->eip = ctxt->vcpu->arch.rip;
+		} else {
+			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+		}
+		rc = X86EMUL_CONTINUE;
+		c->dst.type = OP_NONE;
+		break;
+	case 0x40 ... 0x4f:	/* cmov */
+		c->dst.val = c->dst.orig_val = c->src.val;
+		if (!test_cc(c->b, ctxt->eflags))
+			c->dst.type = OP_NONE; /* no writeback */
+		break;
+	case 0x80 ... 0x8f: /* jcc rel (jnz, etc.) */ {
+		long int rel;
+
+		switch (c->op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, c->eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, c->eip);
+			break;
+		case 8:
+			rel = insn_fetch(s64, 8, c->eip);
+			break;
+		default:
+			DPRINTF("jnz: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		if (test_cc(c->b, ctxt->eflags))
+			JMP_REL(rel);
+		c->dst.type = OP_NONE;
+		break;
+	}
+	case 0xa3:
+	      bt:		/* bt */
+		c->dst.type = OP_NONE;
+		/* only subword offset */
+		c->src.val &= (c->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0xab:
+	      bts:		/* bts */
+		/* only subword offset */
+		c->src.val &= (c->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0xb0 ... 0xb1:	/* cmpxchg */
+		/*
+		 * Save real source value, then compare EAX against
+		 * destination.
+		 */
+		c->src.orig_val = c->src.val;
+		c->src.val = c->regs[VCPU_REGS_RAX];
+		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+		if (ctxt->eflags & EFLG_ZF) {
+			/* Success: write back to memory. */
+			c->dst.val = c->src.orig_val;
+		} else {
+			/* Failure: write the value we saw to EAX. */
+			c->dst.type = OP_REG;
+			c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+		}
+		break;
+	case 0xb3:
+	      btr:		/* btr */
+		/* only subword offset */
+		c->src.val &= (c->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0xb6 ... 0xb7:	/* movzx */
+		c->dst.bytes = c->op_bytes;
+		c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+						       : (u16) c->src.val;
+		break;
+	case 0xba:		/* Grp8 */
+		switch (c->modrm_reg & 3) {
+		case 0:
+			goto bt;
+		case 1:
+			goto bts;
+		case 2:
+			goto btr;
+		case 3:
+			goto btc;
+		}
+		break;
+	case 0xbb:
+	      btc:		/* btc */
+		/* only subword offset */
+		c->src.val &= (c->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+		break;
+	case 0xbe ... 0xbf:	/* movsx */
+		c->dst.bytes = c->op_bytes;
+		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+							(s16) c->src.val;
+		break;
+	case 0xc3:		/* movnti */
+		c->dst.bytes = c->op_bytes;
+		c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+							(u64) c->src.val;
+		break;
+	case 0xc7:		/* Grp9 (cmpxchg8b) */
+		rc = emulate_grp9(ctxt, ops, memop);
+		if (rc != 0)
+			goto done;
+		c->dst.type = OP_NONE;
+		break;
+	}
+	goto writeback;
+
+cannot_emulate:
+	DPRINTF("Cannot emulate %02x\n", c->b);
+	c->eip = saved_eip;
+	return -1;
+}
diff -puN drivers/Kconfig~git-kvm drivers/Kconfig
--- a/drivers/Kconfig~git-kvm
+++ a/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff -puN drivers/Makefile~git-kvm drivers/Makefile
--- a/drivers/Makefile~git-kvm
+++ a/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI)		+= spi/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
-obj-$(CONFIG_KVM)		+= kvm/
 obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
diff -puN drivers/kvm/Kconfig~git-kvm /dev/null
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,54 +0,0 @@
-#
-# KVM configuration
-#
-menuconfig VIRTUALIZATION
-	bool "Virtualization"
-	depends on X86
-	default y
-	---help---
-	  Say Y here to get to see options for using your Linux host to run other
-	  operating systems inside virtual machines (guests).
-	  This option alone does not add any kernel code.
-
-	  If you say N, all options in this submenu will be skipped and disabled.
-
-if VIRTUALIZATION
-
-config KVM
-	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on X86 && EXPERIMENTAL
-	select PREEMPT_NOTIFIERS
-	select ANON_INODES
-	---help---
-	  Support hosting fully virtualized guest machines using hardware
-	  virtualization extensions.  You will need a fairly recent
-	  processor equipped with virtualization extensions. You will also
-	  need to select one or more of the processor modules below.
-
-	  This module provides access to the hardware capabilities through
-	  a character device node named /dev/kvm.
-
-	  To compile this as a module, choose M here: the module
-	  will be called kvm.
-
-	  If unsure, say N.
-
-config KVM_INTEL
-	tristate "KVM for Intel processors support"
-	depends on KVM
-	---help---
-	  Provides support for KVM on Intel processors equipped with the VT
-	  extensions.
-
-config KVM_AMD
-	tristate "KVM for AMD processors support"
-	depends on KVM
-	---help---
-	  Provides support for KVM on AMD processors equipped with the AMD-V
-	  (SVM) extensions.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/lguest/Kconfig
-
-endif # VIRTUALIZATION
diff -puN drivers/kvm/Makefile~git-kvm /dev/null
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff -puN drivers/kvm/i8259.c~git-kvm /dev/null
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * 8259 interrupt controller emulation
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- * Copyright (c) 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *   Port from Qemu.
- */
-#include <linux/mm.h>
-#include "irq.h"
-
-/*
- * set irq level. If an edge is detected, then the IRR is set to 1
- */
-static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
-{
-	int mask;
-	mask = 1 << irq;
-	if (s->elcr & mask)	/* level triggered */
-		if (level) {
-			s->irr |= mask;
-			s->last_irr |= mask;
-		} else {
-			s->irr &= ~mask;
-			s->last_irr &= ~mask;
-		}
-	else	/* edge triggered */
-		if (level) {
-			if ((s->last_irr & mask) == 0)
-				s->irr |= mask;
-			s->last_irr |= mask;
-		} else
-			s->last_irr &= ~mask;
-}
-
-/*
- * return the highest priority found in mask (highest = smallest
- * number). Return 8 if no irq
- */
-static inline int get_priority(struct kvm_kpic_state *s, int mask)
-{
-	int priority;
-	if (mask == 0)
-		return 8;
-	priority = 0;
-	while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
-		priority++;
-	return priority;
-}
-
-/*
- * return the pic wanted interrupt. return -1 if none
- */
-static int pic_get_irq(struct kvm_kpic_state *s)
-{
-	int mask, cur_priority, priority;
-
-	mask = s->irr & ~s->imr;
-	priority = get_priority(s, mask);
-	if (priority == 8)
-		return -1;
-	/*
-	 * compute current priority. If special fully nested mode on the
-	 * master, the IRQ coming from the slave is not taken into account
-	 * for the priority computation.
-	 */
-	mask = s->isr;
-	if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
-		mask &= ~(1 << 2);
-	cur_priority = get_priority(s, mask);
-	if (priority < cur_priority)
-		/*
-		 * higher priority found: an irq should be generated
-		 */
-		return (priority + s->priority_add) & 7;
-	else
-		return -1;
-}
-
-/*
- * raise irq to CPU if necessary. must be called every time the active
- * irq may change
- */
-static void pic_update_irq(struct kvm_pic *s)
-{
-	int irq2, irq;
-
-	irq2 = pic_get_irq(&s->pics[1]);
-	if (irq2 >= 0) {
-		/*
-		 * if irq request by slave pic, signal master PIC
-		 */
-		pic_set_irq1(&s->pics[0], 2, 1);
-		pic_set_irq1(&s->pics[0], 2, 0);
-	}
-	irq = pic_get_irq(&s->pics[0]);
-	if (irq >= 0)
-		s->irq_request(s->irq_request_opaque, 1);
-	else
-		s->irq_request(s->irq_request_opaque, 0);
-}
-
-void kvm_pic_update_irq(struct kvm_pic *s)
-{
-	pic_update_irq(s);
-}
-
-void kvm_pic_set_irq(void *opaque, int irq, int level)
-{
-	struct kvm_pic *s = opaque;
-
-	pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
-	pic_update_irq(s);
-}
-
-/*
- * acknowledge interrupt 'irq'
- */
-static inline void pic_intack(struct kvm_kpic_state *s, int irq)
-{
-	if (s->auto_eoi) {
-		if (s->rotate_on_auto_eoi)
-			s->priority_add = (irq + 1) & 7;
-	} else
-		s->isr |= (1 << irq);
-	/*
-	 * We don't clear a level sensitive interrupt here
-	 */
-	if (!(s->elcr & (1 << irq)))
-		s->irr &= ~(1 << irq);
-}
-
-int kvm_pic_read_irq(struct kvm_pic *s)
-{
-	int irq, irq2, intno;
-
-	irq = pic_get_irq(&s->pics[0]);
-	if (irq >= 0) {
-		pic_intack(&s->pics[0], irq);
-		if (irq == 2) {
-			irq2 = pic_get_irq(&s->pics[1]);
-			if (irq2 >= 0)
-				pic_intack(&s->pics[1], irq2);
-			else
-				/*
-				 * spurious IRQ on slave controller
-				 */
-				irq2 = 7;
-			intno = s->pics[1].irq_base + irq2;
-			irq = irq2 + 8;
-		} else
-			intno = s->pics[0].irq_base + irq;
-	} else {
-		/*
-		 * spurious IRQ on host controller
-		 */
-		irq = 7;
-		intno = s->pics[0].irq_base + irq;
-	}
-	pic_update_irq(s);
-
-	return intno;
-}
-
-static void pic_reset(void *opaque)
-{
-	struct kvm_kpic_state *s = opaque;
-
-	s->last_irr = 0;
-	s->irr = 0;
-	s->imr = 0;
-	s->isr = 0;
-	s->priority_add = 0;
-	s->irq_base = 0;
-	s->read_reg_select = 0;
-	s->poll = 0;
-	s->special_mask = 0;
-	s->init_state = 0;
-	s->auto_eoi = 0;
-	s->rotate_on_auto_eoi = 0;
-	s->special_fully_nested_mode = 0;
-	s->init4 = 0;
-}
-
-static void pic_ioport_write(void *opaque, u32 addr, u32 val)
-{
-	struct kvm_kpic_state *s = opaque;
-	int priority, cmd, irq;
-
-	addr &= 1;
-	if (addr == 0) {
-		if (val & 0x10) {
-			pic_reset(s);	/* init */
-			/*
-			 * deassert a pending interrupt
-			 */
-			s->pics_state->irq_request(s->pics_state->
-						   irq_request_opaque, 0);
-			s->init_state = 1;
-			s->init4 = val & 1;
-			if (val & 0x02)
-				printk(KERN_ERR "single mode not supported");
-			if (val & 0x08)
-				printk(KERN_ERR
-				       "level sensitive irq not supported");
-		} else if (val & 0x08) {
-			if (val & 0x04)
-				s->poll = 1;
-			if (val & 0x02)
-				s->read_reg_select = val & 1;
-			if (val & 0x40)
-				s->special_mask = (val >> 5) & 1;
-		} else {
-			cmd = val >> 5;
-			switch (cmd) {
-			case 0:
-			case 4:
-				s->rotate_on_auto_eoi = cmd >> 2;
-				break;
-			case 1:	/* end of interrupt */
-			case 5:
-				priority = get_priority(s, s->isr);
-				if (priority != 8) {
-					irq = (priority + s->priority_add) & 7;
-					s->isr &= ~(1 << irq);
-					if (cmd == 5)
-						s->priority_add = (irq + 1) & 7;
-					pic_update_irq(s->pics_state);
-				}
-				break;
-			case 3:
-				irq = val & 7;
-				s->isr &= ~(1 << irq);
-				pic_update_irq(s->pics_state);
-				break;
-			case 6:
-				s->priority_add = (val + 1) & 7;
-				pic_update_irq(s->pics_state);
-				break;
-			case 7:
-				irq = val & 7;
-				s->isr &= ~(1 << irq);
-				s->priority_add = (irq + 1) & 7;
-				pic_update_irq(s->pics_state);
-				break;
-			default:
-				break;	/* no operation */
-			}
-		}
-	} else
-		switch (s->init_state) {
-		case 0:		/* normal mode */
-			s->imr = val;
-			pic_update_irq(s->pics_state);
-			break;
-		case 1:
-			s->irq_base = val & 0xf8;
-			s->init_state = 2;
-			break;
-		case 2:
-			if (s->init4)
-				s->init_state = 3;
-			else
-				s->init_state = 0;
-			break;
-		case 3:
-			s->special_fully_nested_mode = (val >> 4) & 1;
-			s->auto_eoi = (val >> 1) & 1;
-			s->init_state = 0;
-			break;
-		}
-}
-
-static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
-{
-	int ret;
-
-	ret = pic_get_irq(s);
-	if (ret >= 0) {
-		if (addr1 >> 7) {
-			s->pics_state->pics[0].isr &= ~(1 << 2);
-			s->pics_state->pics[0].irr &= ~(1 << 2);
-		}
-		s->irr &= ~(1 << ret);
-		s->isr &= ~(1 << ret);
-		if (addr1 >> 7 || ret != 2)
-			pic_update_irq(s->pics_state);
-	} else {
-		ret = 0x07;
-		pic_update_irq(s->pics_state);
-	}
-
-	return ret;
-}
-
-static u32 pic_ioport_read(void *opaque, u32 addr1)
-{
-	struct kvm_kpic_state *s = opaque;
-	unsigned int addr;
-	int ret;
-
-	addr = addr1;
-	addr &= 1;
-	if (s->poll) {
-		ret = pic_poll_read(s, addr1);
-		s->poll = 0;
-	} else
-		if (addr == 0)
-			if (s->read_reg_select)
-				ret = s->isr;
-			else
-				ret = s->irr;
-		else
-			ret = s->imr;
-	return ret;
-}
-
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
-{
-	struct kvm_kpic_state *s = opaque;
-	s->elcr = val & s->elcr_mask;
-}
-
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
-{
-	struct kvm_kpic_state *s = opaque;
-	return s->elcr;
-}
-
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
-{
-	switch (addr) {
-	case 0x20:
-	case 0x21:
-	case 0xa0:
-	case 0xa1:
-	case 0x4d0:
-	case 0x4d1:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
-static void picdev_write(struct kvm_io_device *this,
-			 gpa_t addr, int len, const void *val)
-{
-	struct kvm_pic *s = this->private;
-	unsigned char data = *(unsigned char *)val;
-
-	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte write\n");
-		return;
-	}
-	switch (addr) {
-	case 0x20:
-	case 0x21:
-	case 0xa0:
-	case 0xa1:
-		pic_ioport_write(&s->pics[addr >> 7], addr, data);
-		break;
-	case 0x4d0:
-	case 0x4d1:
-		elcr_ioport_write(&s->pics[addr & 1], addr, data);
-		break;
-	}
-}
-
-static void picdev_read(struct kvm_io_device *this,
-			gpa_t addr, int len, void *val)
-{
-	struct kvm_pic *s = this->private;
-	unsigned char data = 0;
-
-	if (len != 1) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "PIC: non byte read\n");
-		return;
-	}
-	switch (addr) {
-	case 0x20:
-	case 0x21:
-	case 0xa0:
-	case 0xa1:
-		data = pic_ioport_read(&s->pics[addr >> 7], addr);
-		break;
-	case 0x4d0:
-	case 0x4d1:
-		data = elcr_ioport_read(&s->pics[addr & 1], addr);
-		break;
-	}
-	*(unsigned char *)val = data;
-}
-
-/*
- * callback when PIC0 irq status changed
- */
-static void pic_irq_request(void *opaque, int level)
-{
-	struct kvm *kvm = opaque;
-	struct kvm_vcpu *vcpu = kvm->vcpus[0];
-
-	pic_irqchip(kvm)->output = level;
-	if (vcpu)
-		kvm_vcpu_kick(vcpu);
-}
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
-{
-	struct kvm_pic *s;
-	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
-	if (!s)
-		return NULL;
-	s->pics[0].elcr_mask = 0xf8;
-	s->pics[1].elcr_mask = 0xde;
-	s->irq_request = pic_irq_request;
-	s->irq_request_opaque = kvm;
-	s->pics[0].pics_state = s;
-	s->pics[1].pics_state = s;
-
-	/*
-	 * Initialize PIO device
-	 */
-	s->dev.read = picdev_read;
-	s->dev.write = picdev_write;
-	s->dev.in_range = picdev_in_range;
-	s->dev.private = s;
-	kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
-	return s;
-}
diff -puN drivers/kvm/ioapic.c~git-kvm /dev/null
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- *  Copyright (C) 2001  MandrakeSoft S.A.
- *
- *    MandrakeSoft S.A.
- *    43, rue d'Aboukir
- *    75002 Paris - France
- *    http://www.linux-mandrake.com/
- *    http://www.mandrakesoft.com/
- *
- *  This library is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU Lesser General Public
- *  License as published by the Free Software Foundation; either
- *  version 2 of the License, or (at your option) any later version.
- *
- *  This library is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  Lesser General Public License for more details.
- *
- *  You should have received a copy of the GNU Lesser General Public
- *  License along with this library; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- *  Yunhong Jiang <yunhong.jiang@intel.com>
- *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *  Based on Xen 3.1 code.
- */
-
-#include "kvm.h"
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/io_apic.h>
-#include "irq.h"
-/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define ioapic_debug(fmt, arg...)
-static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
-
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
-					  unsigned long addr,
-					  unsigned long length)
-{
-	unsigned long result = 0;
-
-	switch (ioapic->ioregsel) {
-	case IOAPIC_REG_VERSION:
-		result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
-			  | (IOAPIC_VERSION_ID & 0xff));
-		break;
-
-	case IOAPIC_REG_APIC_ID:
-	case IOAPIC_REG_ARB_ID:
-		result = ((ioapic->id & 0xf) << 24);
-		break;
-
-	default:
-		{
-			u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
-			u64 redir_content;
-
-			ASSERT(redir_index < IOAPIC_NUM_PINS);
-
-			redir_content = ioapic->redirtbl[redir_index].bits;
-			result = (ioapic->ioregsel & 0x1) ?
-			    (redir_content >> 32) & 0xffffffff :
-			    redir_content & 0xffffffff;
-			break;
-		}
-	}
-
-	return result;
-}
-
-static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
-{
-	union ioapic_redir_entry *pent;
-
-	pent = &ioapic->redirtbl[idx];
-
-	if (!pent->fields.mask) {
-		ioapic_deliver(ioapic, idx);
-		if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-			pent->fields.remote_irr = 1;
-	}
-	if (!pent->fields.trig_mode)
-		ioapic->irr &= ~(1 << idx);
-}
-
-static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
-{
-	unsigned index;
-
-	switch (ioapic->ioregsel) {
-	case IOAPIC_REG_VERSION:
-		/* Writes are ignored. */
-		break;
-
-	case IOAPIC_REG_APIC_ID:
-		ioapic->id = (val >> 24) & 0xf;
-		break;
-
-	case IOAPIC_REG_ARB_ID:
-		break;
-
-	default:
-		index = (ioapic->ioregsel - 0x10) >> 1;
-
-		ioapic_debug("change redir index %x val %x", index, val);
-		if (index >= IOAPIC_NUM_PINS)
-			return;
-		if (ioapic->ioregsel & 1) {
-			ioapic->redirtbl[index].bits &= 0xffffffff;
-			ioapic->redirtbl[index].bits |= (u64) val << 32;
-		} else {
-			ioapic->redirtbl[index].bits &= ~0xffffffffULL;
-			ioapic->redirtbl[index].bits |= (u32) val;
-			ioapic->redirtbl[index].fields.remote_irr = 0;
-		}
-		if (ioapic->irr & (1 << index))
-			ioapic_service(ioapic, index);
-		break;
-	}
-}
-
-static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
-			   struct kvm_lapic *target,
-			   u8 vector, u8 trig_mode, u8 delivery_mode)
-{
-	ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
-		     delivery_mode);
-
-	ASSERT((delivery_mode == dest_Fixed) ||
-	       (delivery_mode == dest_LowestPrio));
-
-	kvm_apic_set_irq(target, vector, trig_mode);
-}
-
-static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				       u8 dest_mode)
-{
-	u32 mask = 0;
-	int i;
-	struct kvm *kvm = ioapic->kvm;
-	struct kvm_vcpu *vcpu;
-
-	ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
-
-	if (dest_mode == 0) {	/* Physical mode. */
-		if (dest == 0xFF) {	/* Broadcast. */
-			for (i = 0; i < KVM_MAX_VCPUS; ++i)
-				if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
-					mask |= 1 << i;
-			return mask;
-		}
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
-				if (vcpu->apic)
-					mask = 1 << i;
-				break;
-			}
-		}
-	} else if (dest != 0)	/* Logical mode, MDA non-zero. */
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (vcpu->apic &&
-			    kvm_apic_match_logical_addr(vcpu->apic, dest))
-				mask |= 1 << vcpu->vcpu_id;
-		}
-	ioapic_debug("mask %x", mask);
-	return mask;
-}
-
-static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
-{
-	u8 dest = ioapic->redirtbl[irq].fields.dest_id;
-	u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
-	u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
-	u8 vector = ioapic->redirtbl[irq].fields.vector;
-	u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
-	u32 deliver_bitmask;
-	struct kvm_lapic *target;
-	struct kvm_vcpu *vcpu;
-	int vcpu_id;
-
-	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
-		     "vector=%x trig_mode=%x",
-		     dest, dest_mode, delivery_mode, vector, trig_mode);
-
-	deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
-	if (!deliver_bitmask) {
-		ioapic_debug("no target on destination");
-		return;
-	}
-
-	switch (delivery_mode) {
-	case dest_LowestPrio:
-		target =
-		    kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
-		if (target != NULL)
-			ioapic_inj_irq(ioapic, target, vector,
-				       trig_mode, delivery_mode);
-		else
-			ioapic_debug("null round robin: "
-				     "mask=%x vector=%x delivery_mode=%x",
-				     deliver_bitmask, vector, dest_LowestPrio);
-		break;
-	case dest_Fixed:
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
-				target = vcpu->apic;
-				ioapic_inj_irq(ioapic, target, vector,
-					       trig_mode, delivery_mode);
-			}
-		}
-		break;
-
-		/* TODO: NMI */
-	default:
-		printk(KERN_WARNING "Unsupported delivery mode %d\n",
-		       delivery_mode);
-		break;
-	}
-}
-
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
-{
-	u32 old_irr = ioapic->irr;
-	u32 mask = 1 << irq;
-	union ioapic_redir_entry entry;
-
-	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
-		entry = ioapic->redirtbl[irq];
-		level ^= entry.fields.polarity;
-		if (!level)
-			ioapic->irr &= ~mask;
-		else {
-			ioapic->irr |= mask;
-			if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
-			    || !entry.fields.remote_irr)
-				ioapic_service(ioapic, irq);
-		}
-	}
-}
-
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
-{
-	int i;
-
-	for (i = 0; i < IOAPIC_NUM_PINS; i++)
-		if (ioapic->redirtbl[i].fields.vector == vector)
-			return i;
-	return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
-	struct kvm_ioapic *ioapic = kvm->vioapic;
-	union ioapic_redir_entry *ent;
-	int gsi;
-
-	gsi = get_eoi_gsi(ioapic, vector);
-	if (gsi == -1) {
-		printk(KERN_WARNING "Can't find redir item for %d EOI\n",
-		       vector);
-		return;
-	}
-
-	ent = &ioapic->redirtbl[gsi];
-	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
-
-	ent->fields.remote_irr = 0;
-	if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
-		ioapic_deliver(ioapic, gsi);
-}
-
-static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
-{
-	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-
-	return ((addr >= ioapic->base_address &&
-		 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
-}
-
-static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-			     void *val)
-{
-	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-	u32 result;
-
-	ioapic_debug("addr %lx", (unsigned long)addr);
-	ASSERT(!(addr & 0xf));	/* check alignment */
-
-	addr &= 0xff;
-	switch (addr) {
-	case IOAPIC_REG_SELECT:
-		result = ioapic->ioregsel;
-		break;
-
-	case IOAPIC_REG_WINDOW:
-		result = ioapic_read_indirect(ioapic, addr, len);
-		break;
-
-	default:
-		result = 0;
-		break;
-	}
-	switch (len) {
-	case 8:
-		*(u64 *) val = result;
-		break;
-	case 1:
-	case 2:
-	case 4:
-		memcpy(val, (char *)&result, len);
-		break;
-	default:
-		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
-	}
-}
-
-static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-			      const void *val)
-{
-	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-	u32 data;
-
-	ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
-		     addr, len, val);
-	ASSERT(!(addr & 0xf));	/* check alignment */
-	if (len == 4 || len == 8)
-		data = *(u32 *) val;
-	else {
-		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
-		return;
-	}
-
-	addr &= 0xff;
-	switch (addr) {
-	case IOAPIC_REG_SELECT:
-		ioapic->ioregsel = data;
-		break;
-
-	case IOAPIC_REG_WINDOW:
-		ioapic_write_indirect(ioapic, data);
-		break;
-
-	default:
-		break;
-	}
-}
-
-int kvm_ioapic_init(struct kvm *kvm)
-{
-	struct kvm_ioapic *ioapic;
-	int i;
-
-	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
-	if (!ioapic)
-		return -ENOMEM;
-	kvm->vioapic = ioapic;
-	for (i = 0; i < IOAPIC_NUM_PINS; i++)
-		ioapic->redirtbl[i].fields.mask = 1;
-	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
-	ioapic->dev.read = ioapic_mmio_read;
-	ioapic->dev.write = ioapic_mmio_write;
-	ioapic->dev.in_range = ioapic_in_range;
-	ioapic->dev.private = ioapic;
-	ioapic->kvm = kvm;
-	kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
-	return 0;
-}
diff -puN drivers/kvm/irq.c~git-kvm /dev/null
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * irq.c: API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#include <linux/module.h>
-
-#include "kvm.h"
-#include "irq.h"
-
-/*
- * check if there is pending interrupt without
- * intack.
- */
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
-	struct kvm_pic *s;
-
-	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
-		if (kvm_apic_accept_pic_intr(v)) {
-			s = pic_irqchip(v->kvm);	/* PIC */
-			return s->output;
-		} else
-			return 0;
-	}
-	return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
-
-/*
- * Read pending interrupt vector and intack.
- */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
-{
-	struct kvm_pic *s;
-	int vector;
-
-	vector = kvm_get_apic_interrupt(v);	/* APIC */
-	if (vector == -1) {
-		if (kvm_apic_accept_pic_intr(v)) {
-			s = pic_irqchip(v->kvm);
-			s->output = 0;		/* PIC */
-			vector = kvm_pic_read_irq(s);
-		}
-	}
-	return vector;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int ipi_pcpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-	if (vcpu->guest_mode)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	kvm_inject_apic_timer_irqs(vcpu);
-	/* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-	kvm_apic_timer_intr_post(vcpu, vec);
-	/* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff -puN drivers/kvm/irq.h~git-kvm /dev/null
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include "kvm.h"
-
-typedef void irq_request_func(void *opaque, int level);
-
-struct kvm_kpic_state {
-	u8 last_irr;	/* edge detection */
-	u8 irr;		/* interrupt request register */
-	u8 imr;		/* interrupt mask register */
-	u8 isr;		/* interrupt service register */
-	u8 priority_add;	/* highest irq priority */
-	u8 irq_base;
-	u8 read_reg_select;
-	u8 poll;
-	u8 special_mask;
-	u8 init_state;
-	u8 auto_eoi;
-	u8 rotate_on_auto_eoi;
-	u8 special_fully_nested_mode;
-	u8 init4;		/* true if 4 byte init */
-	u8 elcr;		/* PIIX edge/trigger selection */
-	u8 elcr_mask;
-	struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
-	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-	irq_request_func *irq_request;
-	void *irq_request_opaque;
-	int output;		/* intr from master PIC */
-	struct kvm_io_device dev;
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
-#define IOAPIC_EDGE_TRIG  0
-#define IOAPIC_LEVEL_TRIG 1
-
-#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
-#define IOAPIC_MEM_LENGTH            0x100
-
-/* Direct registers. */
-#define IOAPIC_REG_SELECT  0x00
-#define IOAPIC_REG_WINDOW  0x10
-#define IOAPIC_REG_EOI     0x40	/* IA64 IOSAPIC only */
-
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00	/* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID  0x02	/* x86 IOAPIC only */
-
-struct kvm_ioapic {
-	u64 base_address;
-	u32 ioregsel;
-	u32 id;
-	u32 irr;
-	u32 pad;
-	union ioapic_redir_entry {
-		u64 bits;
-		struct {
-			u8 vector;
-			u8 delivery_mode:3;
-			u8 dest_mode:1;
-			u8 delivery_status:1;
-			u8 polarity:1;
-			u8 remote_irr:1;
-			u8 trig_mode:1;
-			u8 mask:1;
-			u8 reserve:7;
-			u8 reserved[4];
-			u8 dest_id;
-		} fields;
-	} redirtbl[IOAPIC_NUM_PINS];
-	struct kvm_io_device dev;
-	struct kvm *kvm;
-};
-
-struct kvm_lapic {
-	unsigned long base_address;
-	struct kvm_io_device dev;
-	struct {
-		atomic_t pending;
-		s64 period;	/* unit: ns */
-		u32 divide_count;
-		ktime_t last_update;
-		struct hrtimer dev;
-	} timer;
-	struct kvm_vcpu *vcpu;
-	struct page *regs_page;
-	void *regs;
-};
-
-#ifdef DEBUG
-#define ASSERT(x)  							\
-do {									\
-	if (!(x)) {							\
-		printk(KERN_EMERG "assertion failed %s: %d: %s\n",	\
-		       __FILE__, __LINE__, #x);				\
-		BUG();							\
-	}								\
-} while (0)
-#else
-#define ASSERT(x) do { } while (0)
-#endif
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_free_apic(struct kvm_lapic *apic);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-
-#endif
diff -puN drivers/kvm/kvm.h~git-kvm /dev/null
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,796 +0,0 @@
-#ifndef __KVM_H
-#define __KVM_H
-
-/*
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
-
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
-
-#define KVM_GUEST_CR0_MASK \
-	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
-	 | X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON \
-	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
-	 | X86_CR0_MP)
-#define KVM_GUEST_CR4_MASK \
-	(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
-#define INVALID_PAGE (~(hpa_t)0)
-#define UNMAPPED_GVA (~(gpa_t)0)
-
-#define KVM_MAX_VCPUS 4
-#define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 8
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-
-#define DE_VECTOR 0
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
-#define KVM_PIO_PAGE_OFFSET 1
-
-/*
- * vcpu->requests bit members
- */
-#define KVM_TLB_FLUSH 0
-
-/*
- * Address types:
- *
- *  gva - guest virtual address
- *  gpa - guest physical address
- *  gfn - guest frame number
- *  hva - host virtual address
- *  hpa - host physical address
- *  hfn - host frame number
- */
-
-typedef unsigned long  gva_t;
-typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
-
-typedef unsigned long  hva_t;
-typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
-
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
-	u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
-	struct hlist_node link;
-};
-
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- *   bits 4:7 - page table level for this shadow (1-4)
- *   bits 8:9 - page table quadrant for 2-level guests
- *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- *   bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
- */
-union kvm_mmu_page_role {
-	unsigned word;
-	struct {
-		unsigned glevels : 4;
-		unsigned level : 4;
-		unsigned quadrant : 2;
-		unsigned pad_for_nice_hex_output : 6;
-		unsigned metaphysical : 1;
-		unsigned hugepage_access : 3;
-	};
-};
-
-struct kvm_mmu_page {
-	struct list_head link;
-	struct hlist_node hash_link;
-
-	/*
-	 * The following two entries are used to key the shadow page in the
-	 * hash table.
-	 */
-	gfn_t gfn;
-	union kvm_mmu_page_role role;
-
-	u64 *spt;
-	unsigned long slot_bitmap; /* One bit set per slot which has memory
-				    * in this shadow page.
-				    */
-	int multimapped;         /* More than one parent_pte? */
-	int root_count;          /* Currently serving as active root */
-	union {
-		u64 *parent_pte;               /* !multimapped */
-		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
-	};
-};
-
-struct kvm_vcpu;
-extern struct kmem_cache *kvm_vcpu_cache;
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-struct kvm_mmu {
-	void (*new_cr3)(struct kvm_vcpu *vcpu);
-	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-	void (*free)(struct kvm_vcpu *vcpu);
-	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
-	hpa_t root_hpa;
-	int root_level;
-	int shadow_root_level;
-
-	u64 *pae_root;
-};
-
-#define KVM_NR_MEM_OBJS 20
-
-struct kvm_mmu_memory_cache {
-	int nobjs;
-	void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_guest_debug {
-	int enabled;
-	unsigned long bp[4];
-	int singlestep;
-};
-
-enum {
-	VCPU_REGS_RAX = 0,
-	VCPU_REGS_RCX = 1,
-	VCPU_REGS_RDX = 2,
-	VCPU_REGS_RBX = 3,
-	VCPU_REGS_RSP = 4,
-	VCPU_REGS_RBP = 5,
-	VCPU_REGS_RSI = 6,
-	VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
-	VCPU_REGS_R8 = 8,
-	VCPU_REGS_R9 = 9,
-	VCPU_REGS_R10 = 10,
-	VCPU_REGS_R11 = 11,
-	VCPU_REGS_R12 = 12,
-	VCPU_REGS_R13 = 13,
-	VCPU_REGS_R14 = 14,
-	VCPU_REGS_R15 = 15,
-#endif
-	NR_VCPU_REGS
-};
-
-enum {
-	VCPU_SREG_CS,
-	VCPU_SREG_DS,
-	VCPU_SREG_ES,
-	VCPU_SREG_FS,
-	VCPU_SREG_GS,
-	VCPU_SREG_SS,
-	VCPU_SREG_TR,
-	VCPU_SREG_LDTR,
-};
-
-struct kvm_pio_request {
-	unsigned long count;
-	int cur_count;
-	struct page *guest_pages[2];
-	unsigned guest_page_offset;
-	int in;
-	int port;
-	int size;
-	int string;
-	int down;
-	int rep;
-};
-
-struct kvm_stat {
-	u32 pf_fixed;
-	u32 pf_guest;
-	u32 tlb_flush;
-	u32 invlpg;
-
-	u32 exits;
-	u32 io_exits;
-	u32 mmio_exits;
-	u32 signal_exits;
-	u32 irq_window_exits;
-	u32 halt_exits;
-	u32 halt_wakeup;
-	u32 request_irq_exits;
-	u32 irq_exits;
-	u32 light_exits;
-	u32 efer_reload;
-};
-
-struct kvm_io_device {
-	void (*read)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     void *val);
-	void (*write)(struct kvm_io_device *this,
-		      gpa_t addr,
-		      int len,
-		      const void *val);
-	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
-	void (*destructor)(struct kvm_io_device *this);
-
-	void             *private;
-};
-
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
-				     gpa_t addr,
-				     int len,
-				     void *val)
-{
-	dev->read(dev, addr, len, val);
-}
-
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
-				      gpa_t addr,
-				      int len,
-				      const void *val)
-{
-	dev->write(dev, addr, len, val);
-}
-
-static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
-{
-	return dev->in_range(dev, addr);
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
-	if (dev->destructor)
-		dev->destructor(dev);
-}
-
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
-struct kvm_io_bus {
-	int                   dev_count;
-#define NR_IOBUS_DEVS 6
-	struct kvm_io_device *devs[NR_IOBUS_DEVS];
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
-			     struct kvm_io_device *dev);
-
-struct kvm_vcpu {
-	struct kvm *kvm;
-	struct preempt_notifier preempt_notifier;
-	int vcpu_id;
-	struct mutex mutex;
-	int   cpu;
-	u64 host_tsc;
-	struct kvm_run *run;
-	int interrupt_window_open;
-	int guest_mode;
-	unsigned long requests;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
-	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
-	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
-
-	unsigned long cr0;
-	unsigned long cr2;
-	unsigned long cr3;
-	gpa_t para_state_gpa;
-	struct page *para_state_page;
-	gpa_t hypercall_gpa;
-	unsigned long cr4;
-	unsigned long cr8;
-	u64 pdptrs[4]; /* pae */
-	u64 shadow_efer;
-	u64 apic_base;
-	struct kvm_lapic *apic;    /* kernel irqchip context */
-#define VCPU_MP_STATE_RUNNABLE          0
-#define VCPU_MP_STATE_UNINITIALIZED     1
-#define VCPU_MP_STATE_INIT_RECEIVED     2
-#define VCPU_MP_STATE_SIPI_RECEIVED     3
-#define VCPU_MP_STATE_HALTED            4
-	int mp_state;
-	int sipi_vector;
-	u64 ia32_misc_enable_msr;
-
-	struct kvm_mmu mmu;
-
-	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
-	struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
-	struct kvm_mmu_memory_cache mmu_page_cache;
-	struct kvm_mmu_memory_cache mmu_page_header_cache;
-
-	gfn_t last_pt_write_gfn;
-	int   last_pt_write_count;
-
-	struct kvm_guest_debug guest_debug;
-
-	struct i387_fxsave_struct host_fx_image;
-	struct i387_fxsave_struct guest_fx_image;
-	int fpu_active;
-	int guest_fpu_loaded;
-
-	int mmio_needed;
-	int mmio_read_completed;
-	int mmio_is_write;
-	int mmio_size;
-	unsigned char mmio_data[8];
-	gpa_t mmio_phys_addr;
-	gva_t mmio_fault_cr2;
-	struct kvm_pio_request pio;
-	void *pio_data;
-	wait_queue_head_t wq;
-
-	int sigset_active;
-	sigset_t sigset;
-
-	struct kvm_stat stat;
-
-	struct {
-		int active;
-		u8 save_iopl;
-		struct kvm_save_segment {
-			u16 selector;
-			unsigned long base;
-			u32 limit;
-			u32 ar;
-		} tr, es, ds, fs, gs;
-	} rmode;
-	int halt_request; /* real mode on Intel only */
-
-	int cpuid_nent;
-	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
-};
-
-struct kvm_mem_alias {
-	gfn_t base_gfn;
-	unsigned long npages;
-	gfn_t target_gfn;
-};
-
-struct kvm_memory_slot {
-	gfn_t base_gfn;
-	unsigned long npages;
-	unsigned long flags;
-	struct page **phys_mem;
-	unsigned long *dirty_bitmap;
-};
-
-struct kvm {
-	struct mutex lock; /* protects everything except vcpus */
-	int naliases;
-	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-	int nmemslots;
-	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
-	/*
-	 * Hash table of struct kvm_mmu_page.
-	 */
-	struct list_head active_mmu_pages;
-	int n_free_mmu_pages;
-	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
-	unsigned long rmap_overflow;
-	struct list_head vm_list;
-	struct file *filp;
-	struct kvm_io_bus mmio_bus;
-	struct kvm_io_bus pio_bus;
-	struct kvm_pic *vpic;
-	struct kvm_ioapic *vioapic;
-	int round_robin_prev_vcpu;
-};
-
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
-	return kvm->vpic;
-}
-
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
-	return kvm->vioapic;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-	return pic_irqchip(kvm) != 0;
-}
-
-struct descriptor_table {
-	u16 limit;
-	unsigned long base;
-} __attribute__((packed));
-
-struct kvm_x86_ops {
-	int (*cpu_has_kvm_support)(void);          /* __init */
-	int (*disabled_by_bios)(void);             /* __init */
-	void (*hardware_enable)(void *dummy);      /* __init */
-	void (*hardware_disable)(void *dummy);
-	void (*check_processor_compatibility)(void *rtn);
-	int (*hardware_setup)(void);               /* __init */
-	void (*hardware_unsetup)(void);            /* __exit */
-
-	/* Create, but do not attach this VCPU */
-	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
-	void (*vcpu_free)(struct kvm_vcpu *vcpu);
-	void (*vcpu_reset)(struct kvm_vcpu *vcpu);
-
-	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
-	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
-	void (*vcpu_put)(struct kvm_vcpu *vcpu);
-	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
-
-	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
-			       struct kvm_debug_guest *dbg);
-	void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
-	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
-	void (*get_segment)(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg);
-	void (*set_segment)(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg);
-	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
-	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
-	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
-	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
-	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
-	void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
-		       int *exception);
-	void (*cache_regs)(struct kvm_vcpu *vcpu);
-	void (*decache_regs)(struct kvm_vcpu *vcpu);
-	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
-	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-
-	void (*tlb_flush)(struct kvm_vcpu *vcpu);
-	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
-				  unsigned long addr, u32 err_code);
-
-	void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
-
-	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
-	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
-	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
-				unsigned char *hypercall_addr);
-	int (*get_irq)(struct kvm_vcpu *vcpu);
-	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-				       struct kvm_run *run);
-};
-
-extern struct kvm_x86_ops *kvm_x86_ops;
-
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...)					\
- do {									\
-	if (printk_ratelimit())						\
-		printk(KERN_ERR "kvm: %i: cpu%i " fmt,			\
-		       current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while(0)
-
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
-		  struct module *module);
-void kvm_exit_x86(void);
-
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_zap_all(struct kvm *kvm);
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-
-extern hpa_t bad_page_address;
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
-
-enum emulation_result {
-	EMULATE_DONE,       /* no further processing */
-	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
-	EMULATE_FAIL,         /* can't emulate this instruction */
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
-			unsigned long cr2, u16 error_code);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags);
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-		     unsigned long *rflags);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-
-struct x86_emulate_ctxt;
-
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-		     int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-			   int size, unsigned long count, int down,
-			    gva_t address, int rep, unsigned port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
-		    unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
-		    unsigned long value);
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
-unsigned long get_cr8(struct kvm_vcpu *vcpu);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-
-void fx_init(struct kvm_vcpu *vcpu);
-
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
-
-int emulator_read_std(unsigned long addr,
-                      void *val,
-		      unsigned int bytes,
-		      struct kvm_vcpu *vcpu);
-int emulator_write_emulated(unsigned long addr,
-			    const void *val,
-			    unsigned int bytes,
-			    struct kvm_vcpu *vcpu);
-
-unsigned long segment_base(u16 selector);
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
-
-static inline void kvm_guest_enter(void)
-{
-	current->flags |= PF_VCPU;
-}
-
-static inline void kvm_guest_exit(void)
-{
-	current->flags &= ~PF_VCPU;
-}
-
-static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-				     u32 error_code)
-{
-	return vcpu->mmu.page_fault(vcpu, gva, error_code);
-}
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-		__kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
-	if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
-		return 0;
-
-	return kvm_mmu_load(vcpu);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	return vcpu->shadow_efer & EFER_LME;
-#else
-	return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
-	return vcpu->cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
-	return vcpu->cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
-	return vcpu->cr0 & X86_CR0_PG;
-}
-
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-	return slot - kvm->memslots;
-}
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
-{
-	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
-	return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline u16 read_fs(void)
-{
-	u16 seg;
-	asm ("mov %%fs, %0" : "=g"(seg));
-	return seg;
-}
-
-static inline u16 read_gs(void)
-{
-	u16 seg;
-	asm ("mov %%gs, %0" : "=g"(seg));
-	return seg;
-}
-
-static inline u16 read_ldt(void)
-{
-	u16 ldt;
-	asm ("sldt %0" : "=g"(ldt));
-	return ldt;
-}
-
-static inline void load_fs(u16 sel)
-{
-	asm ("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void load_gs(u16 sel)
-{
-	asm ("mov %0, %%gs" : : "rm"(sel));
-}
-
-#ifndef load_ldt
-static inline void load_ldt(u16 sel)
-{
-	asm ("lldt %0" : : "rm"(sel));
-}
-#endif
-
-static inline void get_idt(struct descriptor_table *table)
-{
-	asm ("sidt %0" : "=m"(*table));
-}
-
-static inline void get_gdt(struct descriptor_table *table)
-{
-	asm ("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long read_tr_base(void)
-{
-	u16 tr;
-	asm ("str %0" : "=g"(tr));
-	return segment_base(tr);
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long read_msr(unsigned long msr)
-{
-	u64 value;
-
-	rdmsrl(msr, value);
-	return value;
-}
-#endif
-
-static inline void fx_save(struct i387_fxsave_struct *image)
-{
-	asm ("fxsave (%0)":: "r" (image));
-}
-
-static inline void fx_restore(struct i387_fxsave_struct *image)
-{
-	asm ("fxrstor (%0)":: "r" (image));
-}
-
-static inline void fpu_init(void)
-{
-	asm ("finit");
-}
-
-static inline u32 get_rdx_init_val(void)
-{
-	return 0x600; /* P6 family */
-}
-
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-
-#define MSR_IA32_TIME_STAMP_COUNTER		0x010
-
-#define TSS_IOPB_BASE_OFFSET 0x66
-#define TSS_BASE_SIZE 0x68
-#define TSS_IOPB_SIZE (65536 / 8)
-#define TSS_REDIRECTION_SIZE (256 / 8)
-#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-
-#endif
diff -puN drivers/kvm/kvm_main.c~git-kvm /dev/null
--- a/drivers/kvm/kvm_main.c
+++ /dev/null
@@ -1,3628 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Avi Kivity   <avi@qumranet.com>
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86_emulate.h"
-#include "segment_descriptor.h"
-#include "irq.h"
-
-#include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
-#include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-
-static cpumask_t cpus_hardware_enabled;
-
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-
-static struct kvm_stats_debugfs_item {
-	const char *name;
-	int offset;
-	struct dentry *dentry;
-} debugfs_entries[] = {
-	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
-	{ "pf_guest", STAT_OFFSET(pf_guest) },
-	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
-	{ "invlpg", STAT_OFFSET(invlpg) },
-	{ "exits", STAT_OFFSET(exits) },
-	{ "io_exits", STAT_OFFSET(io_exits) },
-	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
-	{ "signal_exits", STAT_OFFSET(signal_exits) },
-	{ "irq_window", STAT_OFFSET(irq_window_exits) },
-	{ "halt_exits", STAT_OFFSET(halt_exits) },
-	{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
-	{ "request_irq", STAT_OFFSET(request_irq_exits) },
-	{ "irq_exits", STAT_OFFSET(irq_exits) },
-	{ "light_exits", STAT_OFFSET(light_exits) },
-	{ "efer_reload", STAT_OFFSET(efer_reload) },
-	{ NULL }
-};
-
-static struct dentry *debugfs_dir;
-
-#define MAX_IO_MSRS 256
-
-#define CR0_RESERVED_BITS						\
-	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
-			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
-			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS						\
-	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
-			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
-			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
-			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
-
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
-	struct segment_descriptor s;
-	u32 base_higher;
-	u32 pad_zero;
-};
-
-#endif
-
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
-			   unsigned long arg);
-
-unsigned long segment_base(u16 selector)
-{
-	struct descriptor_table gdt;
-	struct segment_descriptor *d;
-	unsigned long table_base;
-	typedef unsigned long ul;
-	unsigned long v;
-
-	if (selector == 0)
-		return 0;
-
-	asm ("sgdt %0" : "=m"(gdt));
-	table_base = gdt.base;
-
-	if (selector & 4) {           /* from ldt */
-		u16 ldt_selector;
-
-		asm ("sldt %0" : "=g"(ldt_selector));
-		table_base = segment_base(ldt_selector);
-	}
-	d = (struct segment_descriptor *)(table_base + (selector & ~7));
-	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
-#ifdef CONFIG_X86_64
-	if (d->system == 0
-	    && (d->type == 2 || d->type == 9 || d->type == 11))
-		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
-#endif
-	return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
-static inline int valid_vcpu(int n)
-{
-	return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 1;
-	fx_save(&vcpu->host_fx_image);
-	fx_restore(&vcpu->guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->guest_fpu_loaded)
-		return;
-
-	vcpu->guest_fpu_loaded = 0;
-	fx_save(&vcpu->guest_fx_image);
-	fx_restore(&vcpu->host_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put()
- */
-static void vcpu_load(struct kvm_vcpu *vcpu)
-{
-	int cpu;
-
-	mutex_lock(&vcpu->mutex);
-	cpu = get_cpu();
-	preempt_notifier_register(&vcpu->preempt_notifier);
-	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	put_cpu();
-}
-
-static void vcpu_put(struct kvm_vcpu *vcpu)
-{
-	preempt_disable();
-	kvm_x86_ops->vcpu_put(vcpu);
-	preempt_notifier_unregister(&vcpu->preempt_notifier);
-	preempt_enable();
-	mutex_unlock(&vcpu->mutex);
-}
-
-static void ack_flush(void *_completed)
-{
-}
-
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-	int i, cpu;
-	cpumask_t cpus;
-	struct kvm_vcpu *vcpu;
-
-	cpus_clear(cpus);
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		vcpu = kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-		if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
-			continue;
-		cpu = vcpu->cpu;
-		if (cpu != -1 && cpu != raw_smp_processor_id())
-			cpu_set(cpu, cpus);
-	}
-	smp_call_function_mask(cpus, ack_flush, NULL, 1);
-}
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
-{
-	struct page *page;
-	int r;
-
-	mutex_init(&vcpu->mutex);
-	vcpu->cpu = -1;
-	vcpu->mmu.root_hpa = INVALID_PAGE;
-	vcpu->kvm = kvm;
-	vcpu->vcpu_id = id;
-	if (!irqchip_in_kernel(kvm) || id == 0)
-		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
-	else
-		vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
-	init_waitqueue_head(&vcpu->wq);
-
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!page) {
-		r = -ENOMEM;
-		goto fail;
-	}
-	vcpu->run = page_address(page);
-
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!page) {
-		r = -ENOMEM;
-		goto fail_free_run;
-	}
-	vcpu->pio_data = page_address(page);
-
-	r = kvm_mmu_create(vcpu);
-	if (r < 0)
-		goto fail_free_pio_data;
-
-	return 0;
-
-fail_free_pio_data:
-	free_page((unsigned long)vcpu->pio_data);
-fail_free_run:
-	free_page((unsigned long)vcpu->run);
-fail:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
-	kvm_mmu_destroy(vcpu);
-	if (vcpu->apic)
-		hrtimer_cancel(&vcpu->apic->timer.dev);
-	kvm_free_apic(vcpu->apic);
-	free_page((unsigned long)vcpu->pio_data);
-	free_page((unsigned long)vcpu->run);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-
-static struct kvm *kvm_create_vm(void)
-{
-	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-	if (!kvm)
-		return ERR_PTR(-ENOMEM);
-
-	kvm_io_bus_init(&kvm->pio_bus);
-	mutex_init(&kvm->lock);
-	INIT_LIST_HEAD(&kvm->active_mmu_pages);
-	kvm_io_bus_init(&kvm->mmio_bus);
-	spin_lock(&kvm_lock);
-	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
-	return kvm;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
-				  struct kvm_memory_slot *dont)
-{
-	int i;
-
-	if (!dont || free->phys_mem != dont->phys_mem)
-		if (free->phys_mem) {
-			for (i = 0; i < free->npages; ++i)
-				if (free->phys_mem[i])
-					__free_page(free->phys_mem[i]);
-			vfree(free->phys_mem);
-		}
-
-	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-		vfree(free->dirty_bitmap);
-
-	free->phys_mem = NULL;
-	free->npages = 0;
-	free->dirty_bitmap = NULL;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
-	int i;
-
-	for (i = 0; i < kvm->nmemslots; ++i)
-		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
-}
-
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
-		if (vcpu->pio.guest_pages[i]) {
-			__free_page(vcpu->pio.guest_pages[i]);
-			vcpu->pio.guest_pages[i] = NULL;
-		}
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
-	vcpu_load(vcpu);
-	kvm_mmu_unload(vcpu);
-	vcpu_put(vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
-	unsigned int i;
-
-	/*
-	 * Unpin any mmu pages first.
-	 */
-	for (i = 0; i < KVM_MAX_VCPUS; ++i)
-		if (kvm->vcpus[i])
-			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
-
-}
-
-static void kvm_destroy_vm(struct kvm *kvm)
-{
-	spin_lock(&kvm_lock);
-	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
-	kvm_io_bus_destroy(&kvm->pio_bus);
-	kvm_io_bus_destroy(&kvm->mmio_bus);
-	kfree(kvm->vpic);
-	kfree(kvm->vioapic);
-	kvm_free_vcpus(kvm);
-	kvm_free_physmem(kvm);
-	kfree(kvm);
-}
-
-static int kvm_vm_release(struct inode *inode, struct file *filp)
-{
-	struct kvm *kvm = filp->private_data;
-
-	kvm_destroy_vm(kvm);
-	return 0;
-}
-
-static void inject_gp(struct kvm_vcpu *vcpu)
-{
-	kvm_x86_ops->inject_gp(vcpu, 0);
-}
-
-/*
- * Load the pae pdptrs.  Return true is they are all valid.
- */
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
-	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
-	int i;
-	u64 *pdpt;
-	int ret;
-	struct page *page;
-	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
-
-	mutex_lock(&vcpu->kvm->lock);
-	page = gfn_to_page(vcpu->kvm, pdpt_gfn);
-	if (!page) {
-		ret = 0;
-		goto out;
-	}
-
-	pdpt = kmap_atomic(page, KM_USER0);
-	memcpy(pdpte, pdpt+offset, sizeof(pdpte));
-	kunmap_atomic(pdpt, KM_USER0);
-
-	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
-			ret = 0;
-			goto out;
-		}
-	}
-	ret = 1;
-
-	memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
-out:
-	mutex_unlock(&vcpu->kvm->lock);
-
-	return ret;
-}
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	if (cr0 & CR0_RESERVED_BITS) {
-		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-		       cr0, vcpu->cr0);
-		inject_gp(vcpu);
-		return;
-	}
-
-	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
-		inject_gp(vcpu);
-		return;
-	}
-
-	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
-		       "and a clear PE flag\n");
-		inject_gp(vcpu);
-		return;
-	}
-
-	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
-		if ((vcpu->shadow_efer & EFER_LME)) {
-			int cs_db, cs_l;
-
-			if (!is_pae(vcpu)) {
-				printk(KERN_DEBUG "set_cr0: #GP, start paging "
-				       "in long mode while PAE is disabled\n");
-				inject_gp(vcpu);
-				return;
-			}
-			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-			if (cs_l) {
-				printk(KERN_DEBUG "set_cr0: #GP, start paging "
-				       "in long mode while CS.L == 1\n");
-				inject_gp(vcpu);
-				return;
-
-			}
-		} else
-#endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
-			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
-			       "reserved bits\n");
-			inject_gp(vcpu);
-			return;
-		}
-
-	}
-
-	kvm_x86_ops->set_cr0(vcpu, cr0);
-	vcpu->cr0 = cr0;
-
-	mutex_lock(&vcpu->kvm->lock);
-	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
-	return;
-}
-EXPORT_SYMBOL_GPL(set_cr0);
-
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
-	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_GPL(lmsw);
-
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-	if (cr4 & CR4_RESERVED_BITS) {
-		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
-		inject_gp(vcpu);
-		return;
-	}
-
-	if (is_long_mode(vcpu)) {
-		if (!(cr4 & X86_CR4_PAE)) {
-			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
-			       "in long mode\n");
-			inject_gp(vcpu);
-			return;
-		}
-	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
-		   && !load_pdptrs(vcpu, vcpu->cr3)) {
-		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
-		inject_gp(vcpu);
-		return;
-	}
-
-	if (cr4 & X86_CR4_VMXE) {
-		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
-		inject_gp(vcpu);
-		return;
-	}
-	kvm_x86_ops->set_cr4(vcpu, cr4);
-	vcpu->cr4 = cr4;
-	mutex_lock(&vcpu->kvm->lock);
-	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr4);
-
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
-			inject_gp(vcpu);
-			return;
-		}
-	} else {
-		if (is_pae(vcpu)) {
-			if (cr3 & CR3_PAE_RESERVED_BITS) {
-				printk(KERN_DEBUG
-				       "set_cr3: #GP, reserved bits\n");
-				inject_gp(vcpu);
-				return;
-			}
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
-				       "reserved bits\n");
-				inject_gp(vcpu);
-				return;
-			}
-		} else {
-			if (cr3 & CR3_NONPAE_RESERVED_BITS) {
-				printk(KERN_DEBUG
-				       "set_cr3: #GP, reserved bits\n");
-				inject_gp(vcpu);
-				return;
-			}
-		}
-	}
-
-	mutex_lock(&vcpu->kvm->lock);
-	/*
-	 * Does the new cr3 value map to physical memory? (Note, we
-	 * catch an invalid cr3 even in real-mode, because it would
-	 * cause trouble later on when we turn on paging anyway.)
-	 *
-	 * A real CPU would silently accept an invalid cr3 and would
-	 * attempt to use it - with largely undefined (and often hard
-	 * to debug) behavior on the guest side.
-	 */
-	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		inject_gp(vcpu);
-	else {
-		vcpu->cr3 = cr3;
-		vcpu->mmu.new_cr3(vcpu);
-	}
-	mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr3);
-
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-	if (cr8 & CR8_RESERVED_BITS) {
-		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
-		inject_gp(vcpu);
-		return;
-	}
-	if (irqchip_in_kernel(vcpu->kvm))
-		kvm_lapic_set_tpr(vcpu, cr8);
-	else
-		vcpu->cr8 = cr8;
-}
-EXPORT_SYMBOL_GPL(set_cr8);
-
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
-{
-	if (irqchip_in_kernel(vcpu->kvm))
-		return kvm_lapic_get_cr8(vcpu);
-	else
-		return vcpu->cr8;
-}
-EXPORT_SYMBOL_GPL(get_cr8);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
-	if (irqchip_in_kernel(vcpu->kvm))
-		return vcpu->apic_base;
-	else
-		return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
-	/* TODO: reserve bits check */
-	if (irqchip_in_kernel(vcpu->kvm))
-		kvm_lapic_set_base(vcpu, data);
-	else
-		vcpu->apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
-	unsigned after_mxcsr_mask;
-
-	/* Initialize guest FPU by resetting ours and saving into guest's */
-	preempt_disable();
-	fx_save(&vcpu->host_fx_image);
-	fpu_init();
-	fx_save(&vcpu->guest_fx_image);
-	fx_restore(&vcpu->host_fx_image);
-	preempt_enable();
-
-	vcpu->cr0 |= X86_CR0_ET;
-	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-	vcpu->guest_fx_image.mxcsr = 0x1f80;
-	memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
-	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-/*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
- */
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-					  struct kvm_memory_region *mem)
-{
-	int r;
-	gfn_t base_gfn;
-	unsigned long npages;
-	unsigned long i;
-	struct kvm_memory_slot *memslot;
-	struct kvm_memory_slot old, new;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (mem->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (mem->slot >= KVM_MEMORY_SLOTS)
-		goto out;
-	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-		goto out;
-
-	memslot = &kvm->memslots[mem->slot];
-	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-	npages = mem->memory_size >> PAGE_SHIFT;
-
-	if (!npages)
-		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
-	mutex_lock(&kvm->lock);
-
-	new = old = *memslot;
-
-	new.base_gfn = base_gfn;
-	new.npages = npages;
-	new.flags = mem->flags;
-
-	/* Disallow changing a memory slot's size. */
-	r = -EINVAL;
-	if (npages && old.npages && npages != old.npages)
-		goto out_unlock;
-
-	/* Check for overlaps */
-	r = -EEXIST;
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *s = &kvm->memslots[i];
-
-		if (s == memslot)
-			continue;
-		if (!((base_gfn + npages <= s->base_gfn) ||
-		      (base_gfn >= s->base_gfn + s->npages)))
-			goto out_unlock;
-	}
-
-	/* Deallocate if slot is being removed */
-	if (!npages)
-		new.phys_mem = NULL;
-
-	/* Free page dirty bitmap if unneeded */
-	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
-		new.dirty_bitmap = NULL;
-
-	r = -ENOMEM;
-
-	/* Allocate if a slot is being created */
-	if (npages && !new.phys_mem) {
-		new.phys_mem = vmalloc(npages * sizeof(struct page *));
-
-		if (!new.phys_mem)
-			goto out_unlock;
-
-		memset(new.phys_mem, 0, npages * sizeof(struct page *));
-		for (i = 0; i < npages; ++i) {
-			new.phys_mem[i] = alloc_page(GFP_HIGHUSER
-						     | __GFP_ZERO);
-			if (!new.phys_mem[i])
-				goto out_unlock;
-			set_page_private(new.phys_mem[i],0);
-		}
-	}
-
-	/* Allocate page dirty bitmap if needed */
-	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
-
-		new.dirty_bitmap = vmalloc(dirty_bytes);
-		if (!new.dirty_bitmap)
-			goto out_unlock;
-		memset(new.dirty_bitmap, 0, dirty_bytes);
-	}
-
-	if (mem->slot >= kvm->nmemslots)
-		kvm->nmemslots = mem->slot + 1;
-
-	*memslot = new;
-
-	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-	kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->lock);
-
-	kvm_free_physmem_slot(&old, &new);
-	return 0;
-
-out_unlock:
-	mutex_unlock(&kvm->lock);
-	kvm_free_physmem_slot(&new, &old);
-out:
-	return r;
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-				      struct kvm_dirty_log *log)
-{
-	struct kvm_memory_slot *memslot;
-	int r, i;
-	int n;
-	unsigned long any = 0;
-
-	mutex_lock(&kvm->lock);
-
-	r = -EINVAL;
-	if (log->slot >= KVM_MEMORY_SLOTS)
-		goto out;
-
-	memslot = &kvm->memslots[log->slot];
-	r = -ENOENT;
-	if (!memslot->dirty_bitmap)
-		goto out;
-
-	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-
-	for (i = 0; !any && i < n/sizeof(long); ++i)
-		any = memslot->dirty_bitmap[i];
-
-	r = -EFAULT;
-	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
-		goto out;
-
-	/* If nothing is dirty, don't bother messing with page tables. */
-	if (any) {
-		kvm_mmu_slot_remove_write_access(kvm, log->slot);
-		kvm_flush_remote_tlbs(kvm);
-		memset(memslot->dirty_bitmap, 0, n);
-	}
-
-	r = 0;
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-					 struct kvm_memory_alias *alias)
-{
-	int r, n;
-	struct kvm_mem_alias *p;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (alias->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->slot >= KVM_ALIAS_SLOTS)
-		goto out;
-	if (alias->guest_phys_addr + alias->memory_size
-	    < alias->guest_phys_addr)
-		goto out;
-	if (alias->target_phys_addr + alias->memory_size
-	    < alias->target_phys_addr)
-		goto out;
-
-	mutex_lock(&kvm->lock);
-
-	p = &kvm->aliases[alias->slot];
-	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-	p->npages = alias->memory_size >> PAGE_SHIFT;
-	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-
-	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-		if (kvm->aliases[n - 1].npages)
-			break;
-	kvm->naliases = n;
-
-	kvm_mmu_zap_all(kvm);
-
-	mutex_unlock(&kvm->lock);
-
-	return 0;
-
-out:
-	return r;
-}
-
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-	int r;
-
-	r = 0;
-	switch (chip->chip_id) {
-	case KVM_IRQCHIP_PIC_MASTER:
-		memcpy (&chip->chip.pic,
-			&pic_irqchip(kvm)->pics[0],
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_PIC_SLAVE:
-		memcpy (&chip->chip.pic,
-			&pic_irqchip(kvm)->pics[1],
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_IOAPIC:
-		memcpy (&chip->chip.ioapic,
-			ioapic_irqchip(kvm),
-			sizeof(struct kvm_ioapic_state));
-		break;
-	default:
-		r = -EINVAL;
-		break;
-	}
-	return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-	int r;
-
-	r = 0;
-	switch (chip->chip_id) {
-	case KVM_IRQCHIP_PIC_MASTER:
-		memcpy (&pic_irqchip(kvm)->pics[0],
-			&chip->chip.pic,
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_PIC_SLAVE:
-		memcpy (&pic_irqchip(kvm)->pics[1],
-			&chip->chip.pic,
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_IOAPIC:
-		memcpy (ioapic_irqchip(kvm),
-			&chip->chip.ioapic,
-			sizeof(struct kvm_ioapic_state));
-		break;
-	default:
-		r = -EINVAL;
-		break;
-	}
-	kvm_pic_update_irq(pic_irqchip(kvm));
-	return r;
-}
-
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-
-	for (i = 0; i < kvm->naliases; ++i) {
-		alias = &kvm->aliases[i];
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-
-	for (i = 0; i < kvm->nmemslots; ++i) {
-		struct kvm_memory_slot *memslot = &kvm->memslots[i];
-
-		if (gfn >= memslot->base_gfn
-		    && gfn < memslot->base_gfn + memslot->npages)
-			return memslot;
-	}
-	return NULL;
-}
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	gfn = unalias_gfn(kvm, gfn);
-	return __gfn_to_memslot(kvm, gfn);
-}
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_memory_slot *slot;
-
-	gfn = unalias_gfn(kvm, gfn);
-	slot = __gfn_to_memslot(kvm, gfn);
-	if (!slot)
-		return NULL;
-	return slot->phys_mem[gfn - slot->base_gfn];
-}
-EXPORT_SYMBOL_GPL(gfn_to_page);
-
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_memory_slot *memslot;
-
-	memslot = __gfn_to_memslot(kvm, gfn);
-	if (memslot && memslot->dirty_bitmap) {
-		unsigned long rel_gfn = gfn - memslot->base_gfn;
-
-		/* avoid RMW */
-		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
-			set_bit(rel_gfn, memslot->dirty_bitmap);
-	}
-}
-
-int emulator_read_std(unsigned long addr,
-			     void *val,
-			     unsigned int bytes,
-			     struct kvm_vcpu *vcpu)
-{
-	void *data = val;
-
-	while (bytes) {
-		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-		unsigned offset = addr & (PAGE_SIZE-1);
-		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
-		unsigned long pfn;
-		struct page *page;
-		void *page_virt;
-
-		if (gpa == UNMAPPED_GVA)
-			return X86EMUL_PROPAGATE_FAULT;
-		pfn = gpa >> PAGE_SHIFT;
-		page = gfn_to_page(vcpu->kvm, pfn);
-		if (!page)
-			return X86EMUL_UNHANDLEABLE;
-		page_virt = kmap_atomic(page, KM_USER0);
-
-		memcpy(data, page_virt + offset, tocopy);
-
-		kunmap_atomic(page_virt, KM_USER0);
-
-		bytes -= tocopy;
-		data += tocopy;
-		addr += tocopy;
-	}
-
-	return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(emulator_read_std);
-
-static int emulator_write_std(unsigned long addr,
-			      const void *val,
-			      unsigned int bytes,
-			      struct kvm_vcpu *vcpu)
-{
-	pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
-	return X86EMUL_UNHANDLEABLE;
-}
-
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr)
-{
-	struct kvm_io_device *dev;
-
-	if (vcpu->apic) {
-		dev = &vcpu->apic->dev;
-		if (dev->in_range(dev, addr))
-			return dev;
-	}
-	return NULL;
-}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-						gpa_t addr)
-{
-	struct kvm_io_device *dev;
-
-	dev = vcpu_find_pervcpu_dev(vcpu, addr);
-	if (dev == NULL)
-		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
-	return dev;
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-					       gpa_t addr)
-{
-	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
-}
-
-static int emulator_read_emulated(unsigned long addr,
-				  void *val,
-				  unsigned int bytes,
-				  struct kvm_vcpu *vcpu)
-{
-	struct kvm_io_device *mmio_dev;
-	gpa_t                 gpa;
-
-	if (vcpu->mmio_read_completed) {
-		memcpy(val, vcpu->mmio_data, bytes);
-		vcpu->mmio_read_completed = 0;
-		return X86EMUL_CONTINUE;
-	} else if (emulator_read_std(addr, val, bytes, vcpu)
-		   == X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
-
-	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-	if (gpa == UNMAPPED_GVA)
-		return X86EMUL_PROPAGATE_FAULT;
-
-	/*
-	 * Is this MMIO handled locally?
-	 */
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-	if (mmio_dev) {
-		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-		return X86EMUL_CONTINUE;
-	}
-
-	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 0;
-
-	return X86EMUL_UNHANDLEABLE;
-}
-
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       const void *val, int bytes)
-{
-	struct page *page;
-	void *virt;
-
-	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
-		return 0;
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	if (!page)
-		return 0;
-	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
-	virt = kmap_atomic(page, KM_USER0);
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-	memcpy(virt + offset_in_page(gpa), val, bytes);
-	kunmap_atomic(virt, KM_USER0);
-	return 1;
-}
-
-static int emulator_write_emulated_onepage(unsigned long addr,
-					   const void *val,
-					   unsigned int bytes,
-					   struct kvm_vcpu *vcpu)
-{
-	struct kvm_io_device *mmio_dev;
-	gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-
-	if (gpa == UNMAPPED_GVA) {
-		kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-
-	if (emulator_write_phys(vcpu, gpa, val, bytes))
-		return X86EMUL_CONTINUE;
-
-	/*
-	 * Is this MMIO handled locally?
-	 */
-	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-	if (mmio_dev) {
-		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-		return X86EMUL_CONTINUE;
-	}
-
-	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
-
-	return X86EMUL_CONTINUE;
-}
-
-int emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
-{
-	/* Crossing a page boundary? */
-	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-		int rc, now;
-
-		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		addr += now;
-		val += now;
-		bytes -= now;
-	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
-}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
-
-static int emulator_cmpxchg_emulated(unsigned long addr,
-				     const void *old,
-				     const void *new,
-				     unsigned int bytes,
-				     struct kvm_vcpu *vcpu)
-{
-	static int reported;
-
-	if (!reported) {
-		reported = 1;
-		printk(KERN_WARNING "kvm: emulating exchange as write\n");
-	}
-	return emulator_write_emulated(addr, new, bytes, vcpu);
-}
-
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-	return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
-
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
-	return X86EMUL_CONTINUE;
-}
-
-int emulate_clts(struct kvm_vcpu *vcpu)
-{
-	kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
-	return X86EMUL_CONTINUE;
-}
-
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
-{
-	struct kvm_vcpu *vcpu = ctxt->vcpu;
-
-	switch (dr) {
-	case 0 ... 3:
-		*dest = kvm_x86_ops->get_dr(vcpu, dr);
-		return X86EMUL_CONTINUE;
-	default:
-		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
-		return X86EMUL_UNHANDLEABLE;
-	}
-}
-
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
-{
-	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-	int exception;
-
-	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
-	if (exception) {
-		/* FIXME: better handling */
-		return X86EMUL_UNHANDLEABLE;
-	}
-	return X86EMUL_CONTINUE;
-}
-
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
-	static int reported;
-	u8 opcodes[4];
-	unsigned long rip = vcpu->rip;
-	unsigned long rip_linear;
-
-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	if (reported)
-		return;
-
-	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
-
-	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
-	reported = 1;
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-
-struct x86_emulate_ops emulate_ops = {
-	.read_std            = emulator_read_std,
-	.write_std           = emulator_write_std,
-	.read_emulated       = emulator_read_emulated,
-	.write_emulated      = emulator_write_emulated,
-	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu,
-			struct kvm_run *run,
-			unsigned long cr2,
-			u16 error_code)
-{
-	struct x86_emulate_ctxt emulate_ctxt;
-	int r;
-	int cs_db, cs_l;
-
-	vcpu->mmio_fault_cr2 = cr2;
-	kvm_x86_ops->cache_regs(vcpu);
-
-	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-	emulate_ctxt.vcpu = vcpu;
-	emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-	emulate_ctxt.cr2 = cr2;
-	emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_REAL : cs_l
-		? X86EMUL_MODE_PROT64 :	cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
-	if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
-		emulate_ctxt.cs_base = 0;
-		emulate_ctxt.ds_base = 0;
-		emulate_ctxt.es_base = 0;
-		emulate_ctxt.ss_base = 0;
-	} else {
-		emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
-		emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
-		emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
-		emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
-	}
-
-	emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
-	emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
-
-	vcpu->mmio_is_write = 0;
-	vcpu->pio.string = 0;
-	r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
-	if (vcpu->pio.string)
-		return EMULATE_DO_MMIO;
-
-	if ((r || vcpu->mmio_is_write) && run) {
-		run->exit_reason = KVM_EXIT_MMIO;
-		run->mmio.phys_addr = vcpu->mmio_phys_addr;
-		memcpy(run->mmio.data, vcpu->mmio_data, 8);
-		run->mmio.len = vcpu->mmio_size;
-		run->mmio.is_write = vcpu->mmio_is_write;
-	}
-
-	if (r) {
-		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			return EMULATE_DONE;
-		if (!vcpu->mmio_needed) {
-			kvm_report_emulation_failure(vcpu, "mmio");
-			return EMULATE_FAIL;
-		}
-		return EMULATE_DO_MMIO;
-	}
-
-	kvm_x86_ops->decache_regs(vcpu);
-	kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
-
-	if (vcpu->mmio_is_write) {
-		vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(&vcpu->wq, &wait);
-
-	/*
-	 * We will block until either an interrupt or a signal wakes us up
-	 */
-	while (!kvm_cpu_has_interrupt(vcpu)
-	       && !signal_pending(current)
-	       && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
-	       && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		vcpu_put(vcpu);
-		schedule();
-		vcpu_load(vcpu);
-	}
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&vcpu->wq, &wait);
-}
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
-	++vcpu->stat.halt_exits;
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		vcpu->mp_state = VCPU_MP_STATE_HALTED;
-		kvm_vcpu_block(vcpu);
-		if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
-			return -EINTR;
-		return 1;
-	} else {
-		vcpu->run->exit_reason = KVM_EXIT_HLT;
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
-
-	kvm_x86_ops->cache_regs(vcpu);
-	ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
-	if (is_long_mode(vcpu)) {
-		nr = vcpu->regs[VCPU_REGS_RAX];
-		a0 = vcpu->regs[VCPU_REGS_RDI];
-		a1 = vcpu->regs[VCPU_REGS_RSI];
-		a2 = vcpu->regs[VCPU_REGS_RDX];
-		a3 = vcpu->regs[VCPU_REGS_RCX];
-		a4 = vcpu->regs[VCPU_REGS_R8];
-		a5 = vcpu->regs[VCPU_REGS_R9];
-	} else
-#endif
-	{
-		nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
-		a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
-		a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
-		a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
-		a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
-		a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
-		a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
-	}
-	switch (nr) {
-	default:
-		run->hypercall.nr = nr;
-		run->hypercall.args[0] = a0;
-		run->hypercall.args[1] = a1;
-		run->hypercall.args[2] = a2;
-		run->hypercall.args[3] = a3;
-		run->hypercall.args[4] = a4;
-		run->hypercall.args[5] = a5;
-		run->hypercall.ret = ret;
-		run->hypercall.longmode = is_long_mode(vcpu);
-		kvm_x86_ops->decache_regs(vcpu);
-		return 0;
-	}
-	vcpu->regs[VCPU_REGS_RAX] = ret;
-	kvm_x86_ops->decache_regs(vcpu);
-	return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
-	struct descriptor_table dt = { limit, base };
-
-	kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
-	struct descriptor_table dt = { limit, base };
-
-	kvm_x86_ops->set_idt(vcpu, &dt);
-}
-
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags)
-{
-	lmsw(vcpu, msw);
-	*rflags = kvm_x86_ops->get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-	switch (cr) {
-	case 0:
-		return vcpu->cr0;
-	case 2:
-		return vcpu->cr2;
-	case 3:
-		return vcpu->cr3;
-	case 4:
-		return vcpu->cr4;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-		return 0;
-	}
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-		     unsigned long *rflags)
-{
-	switch (cr) {
-	case 0:
-		set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
-		*rflags = kvm_x86_ops->get_rflags(vcpu);
-		break;
-	case 2:
-		vcpu->cr2 = val;
-		break;
-	case 3:
-		set_cr3(vcpu, val);
-		break;
-	case 4:
-		set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-	}
-}
-
-/*
- * Register the para guest with the host:
- */
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
-{
-	struct kvm_vcpu_para_state *para_state;
-	hpa_t para_state_hpa, hypercall_hpa;
-	struct page *para_state_page;
-	unsigned char *hypercall;
-	gpa_t hypercall_gpa;
-
-	printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
-	printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
-
-	/*
-	 * Needs to be page aligned:
-	 */
-	if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
-		goto err_gp;
-
-	para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
-	printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
-	if (is_error_hpa(para_state_hpa))
-		goto err_gp;
-
-	mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
-	para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
-	para_state = kmap(para_state_page);
-
-	printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
-	printk(KERN_DEBUG "....           size: %d\n", para_state->size);
-
-	para_state->host_version = KVM_PARA_API_VERSION;
-	/*
-	 * We cannot support guests that try to register themselves
-	 * with a newer API version than the host supports:
-	 */
-	if (para_state->guest_version > KVM_PARA_API_VERSION) {
-		para_state->ret = -KVM_EINVAL;
-		goto err_kunmap_skip;
-	}
-
-	hypercall_gpa = para_state->hypercall_gpa;
-	hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
-	printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
-	if (is_error_hpa(hypercall_hpa)) {
-		para_state->ret = -KVM_EINVAL;
-		goto err_kunmap_skip;
-	}
-
-	printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
-	vcpu->para_state_page = para_state_page;
-	vcpu->para_state_gpa = para_state_gpa;
-	vcpu->hypercall_gpa = hypercall_gpa;
-
-	mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
-	hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
-				KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
-	kvm_x86_ops->patch_hypercall(vcpu, hypercall);
-	kunmap_atomic(hypercall, KM_USER1);
-
-	para_state->ret = 0;
-err_kunmap_skip:
-	kunmap(para_state_page);
-	return 0;
-err_gp:
-	return 1;
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
-	u64 data;
-
-	switch (msr) {
-	case 0xc0010010: /* SYSCFG */
-	case 0xc0010015: /* HWCR */
-	case MSR_IA32_PLATFORM_ID:
-	case MSR_IA32_P5_MC_ADDR:
-	case MSR_IA32_P5_MC_TYPE:
-	case MSR_IA32_MC0_CTL:
-	case MSR_IA32_MCG_STATUS:
-	case MSR_IA32_MCG_CAP:
-	case MSR_IA32_MC0_MISC:
-	case MSR_IA32_MC0_MISC+4:
-	case MSR_IA32_MC0_MISC+8:
-	case MSR_IA32_MC0_MISC+12:
-	case MSR_IA32_MC0_MISC+16:
-	case MSR_IA32_UCODE_REV:
-	case MSR_IA32_PERF_STATUS:
-	case MSR_IA32_EBL_CR_POWERON:
-		/* MTRR registers */
-	case 0xfe:
-	case 0x200 ... 0x2ff:
-		data = 0;
-		break;
-	case 0xcd: /* fsb frequency */
-		data = 3;
-		break;
-	case MSR_IA32_APICBASE:
-		data = kvm_get_apic_base(vcpu);
-		break;
-	case MSR_IA32_MISC_ENABLE:
-		data = vcpu->ia32_misc_enable_msr;
-		break;
-#ifdef CONFIG_X86_64
-	case MSR_EFER:
-		data = vcpu->shadow_efer;
-		break;
-#endif
-	default:
-		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-		return 1;
-	}
-	*pdata = data;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
-	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
-
-#ifdef CONFIG_X86_64
-
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-	if (efer & EFER_RESERVED_BITS) {
-		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
-		       efer);
-		inject_gp(vcpu);
-		return;
-	}
-
-	if (is_paging(vcpu)
-	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
-		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
-		inject_gp(vcpu);
-		return;
-	}
-
-	kvm_x86_ops->set_efer(vcpu, efer);
-
-	efer &= ~EFER_LMA;
-	efer |= vcpu->shadow_efer & EFER_LMA;
-
-	vcpu->shadow_efer = efer;
-}
-
-#endif
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
-	switch (msr) {
-#ifdef CONFIG_X86_64
-	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
-#endif
-	case MSR_IA32_MC0_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-		       __FUNCTION__, data);
-		break;
-	case MSR_IA32_MCG_STATUS:
-		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-			__FUNCTION__, data);
-		break;
-	case MSR_IA32_UCODE_REV:
-	case MSR_IA32_UCODE_WRITE:
-	case 0x200 ... 0x2ff: /* MTRRs */
-		break;
-	case MSR_IA32_APICBASE:
-		kvm_set_apic_base(vcpu, data);
-		break;
-	case MSR_IA32_MISC_ENABLE:
-		vcpu->ia32_misc_enable_msr = data;
-		break;
-	/*
-	 * This is the 'probe whether the host is KVM' logic:
-	 */
-	case MSR_KVM_API_MAGIC:
-		return vcpu_register_para(vcpu, data);
-
-	default:
-		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
-	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-}
-
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
-	if (!need_resched())
-		return;
-	cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
-	int i;
-	u32 function;
-	struct kvm_cpuid_entry *e, *best;
-
-	kvm_x86_ops->cache_regs(vcpu);
-	function = vcpu->regs[VCPU_REGS_RAX];
-	vcpu->regs[VCPU_REGS_RAX] = 0;
-	vcpu->regs[VCPU_REGS_RBX] = 0;
-	vcpu->regs[VCPU_REGS_RCX] = 0;
-	vcpu->regs[VCPU_REGS_RDX] = 0;
-	best = NULL;
-	for (i = 0; i < vcpu->cpuid_nent; ++i) {
-		e = &vcpu->cpuid_entries[i];
-		if (e->function == function) {
-			best = e;
-			break;
-		}
-		/*
-		 * Both basic or both extended?
-		 */
-		if (((e->function ^ function) & 0x80000000) == 0)
-			if (!best || e->function > best->function)
-				best = e;
-	}
-	if (best) {
-		vcpu->regs[VCPU_REGS_RAX] = best->eax;
-		vcpu->regs[VCPU_REGS_RBX] = best->ebx;
-		vcpu->regs[VCPU_REGS_RCX] = best->ecx;
-		vcpu->regs[VCPU_REGS_RDX] = best->edx;
-	}
-	kvm_x86_ops->decache_regs(vcpu);
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-	void *p = vcpu->pio_data;
-	void *q;
-	unsigned bytes;
-	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
-
-	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
-		 PAGE_KERNEL);
-	if (!q) {
-		free_pio_guest_pages(vcpu);
-		return -ENOMEM;
-	}
-	q += vcpu->pio.guest_page_offset;
-	bytes = vcpu->pio.size * vcpu->pio.cur_count;
-	if (vcpu->pio.in)
-		memcpy(q, p, bytes);
-	else
-		memcpy(p, q, bytes);
-	q -= vcpu->pio.guest_page_offset;
-	vunmap(q);
-	free_pio_guest_pages(vcpu);
-	return 0;
-}
-
-static int complete_pio(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->pio;
-	long delta;
-	int r;
-
-	kvm_x86_ops->cache_regs(vcpu);
-
-	if (!io->string) {
-		if (io->in)
-			memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
-			       io->size);
-	} else {
-		if (io->in) {
-			r = pio_copy_data(vcpu);
-			if (r) {
-				kvm_x86_ops->cache_regs(vcpu);
-				return r;
-			}
-		}
-
-		delta = 1;
-		if (io->rep) {
-			delta *= io->cur_count;
-			/*
-			 * The size of the register should really depend on
-			 * current address size.
-			 */
-			vcpu->regs[VCPU_REGS_RCX] -= delta;
-		}
-		if (io->down)
-			delta = -delta;
-		delta *= io->size;
-		if (io->in)
-			vcpu->regs[VCPU_REGS_RDI] += delta;
-		else
-			vcpu->regs[VCPU_REGS_RSI] += delta;
-	}
-
-	kvm_x86_ops->decache_regs(vcpu);
-
-	io->count -= io->cur_count;
-	io->cur_count = 0;
-
-	return 0;
-}
-
-static void kernel_pio(struct kvm_io_device *pio_dev,
-		       struct kvm_vcpu *vcpu,
-		       void *pd)
-{
-	/* TODO: String I/O for in kernel device */
-
-	mutex_lock(&vcpu->kvm->lock);
-	if (vcpu->pio.in)
-		kvm_iodevice_read(pio_dev, vcpu->pio.port,
-				  vcpu->pio.size,
-				  pd);
-	else
-		kvm_iodevice_write(pio_dev, vcpu->pio.port,
-				   vcpu->pio.size,
-				   pd);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-static void pio_string_write(struct kvm_io_device *pio_dev,
-			     struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->pio;
-	void *pd = vcpu->pio_data;
-	int i;
-
-	mutex_lock(&vcpu->kvm->lock);
-	for (i = 0; i < io->cur_count; i++) {
-		kvm_iodevice_write(pio_dev, io->port,
-				   io->size,
-				   pd);
-		pd += io->size;
-	}
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-		  int size, unsigned port)
-{
-	struct kvm_io_device *pio_dev;
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
-	vcpu->run->io.port = vcpu->pio.port = port;
-	vcpu->pio.in = in;
-	vcpu->pio.string = 0;
-	vcpu->pio.down = 0;
-	vcpu->pio.guest_page_offset = 0;
-	vcpu->pio.rep = 0;
-
-	kvm_x86_ops->cache_regs(vcpu);
-	memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
-	kvm_x86_ops->decache_regs(vcpu);
-
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-	pio_dev = vcpu_find_pio_dev(vcpu, port);
-	if (pio_dev) {
-		kernel_pio(pio_dev, vcpu, vcpu->pio_data);
-		complete_pio(vcpu);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
-		  int size, unsigned long count, int down,
-		  gva_t address, int rep, unsigned port)
-{
-	unsigned now, in_page;
-	int i, ret = 0;
-	int nr_pages = 1;
-	struct page *page;
-	struct kvm_io_device *pio_dev;
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
-	vcpu->run->io.port = vcpu->pio.port = port;
-	vcpu->pio.in = in;
-	vcpu->pio.string = 1;
-	vcpu->pio.down = down;
-	vcpu->pio.guest_page_offset = offset_in_page(address);
-	vcpu->pio.rep = rep;
-
-	if (!count) {
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
-	if (!down)
-		in_page = PAGE_SIZE - offset_in_page(address);
-	else
-		in_page = offset_in_page(address) + size;
-	now = min(count, (unsigned long)in_page / size);
-	if (!now) {
-		/*
-		 * String I/O straddles page boundary.  Pin two guest pages
-		 * so that we satisfy atomicity constraints.  Do just one
-		 * transaction to avoid complexity.
-		 */
-		nr_pages = 2;
-		now = 1;
-	}
-	if (down) {
-		/*
-		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-		 */
-		pr_unimpl(vcpu, "guest string pio down\n");
-		inject_gp(vcpu);
-		return 1;
-	}
-	vcpu->run->io.count = now;
-	vcpu->pio.cur_count = now;
-
-	if (vcpu->pio.cur_count == vcpu->pio.count)
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-	for (i = 0; i < nr_pages; ++i) {
-		mutex_lock(&vcpu->kvm->lock);
-		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
-		if (page)
-			get_page(page);
-		vcpu->pio.guest_pages[i] = page;
-		mutex_unlock(&vcpu->kvm->lock);
-		if (!page) {
-			inject_gp(vcpu);
-			free_pio_guest_pages(vcpu);
-			return 1;
-		}
-	}
-
-	pio_dev = vcpu_find_pio_dev(vcpu, port);
-	if (!vcpu->pio.in) {
-		/* string PIO write */
-		ret = pio_copy_data(vcpu);
-		if (ret >= 0 && pio_dev) {
-			pio_string_write(pio_dev, vcpu);
-			complete_pio(vcpu);
-			if (vcpu->pio.count == 0)
-				ret = 1;
-		}
-	} else if (pio_dev)
-		pr_unimpl(vcpu, "no string pio read support yet, "
-		       "port %x size %d count %ld\n",
-			port, size, count);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
-					  struct kvm_run *kvm_run)
-{
-	return (!vcpu->irq_summary &&
-		kvm_run->request_interrupt_window &&
-		vcpu->interrupt_window_open &&
-		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
-			      struct kvm_run *kvm_run)
-{
-	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = get_cr8(vcpu);
-	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
-		kvm_run->ready_for_interrupt_injection = 1;
-	else
-		kvm_run->ready_for_interrupt_injection =
-					(vcpu->interrupt_window_open &&
-					 vcpu->irq_summary == 0);
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	int r;
-
-	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
-		printk("vcpu %d received sipi with vector # %x\n",
-		       vcpu->vcpu_id, vcpu->sipi_vector);
-		kvm_lapic_reset(vcpu);
-		kvm_x86_ops->vcpu_reset(vcpu);
-		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
-	}
-
-preempted:
-	if (vcpu->guest_debug.enabled)
-		kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
-	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
-		goto out;
-
-	preempt_disable();
-
-	kvm_x86_ops->prepare_guest_switch(vcpu);
-	kvm_load_guest_fpu(vcpu);
-
-	local_irq_disable();
-
-	if (signal_pending(current)) {
-		local_irq_enable();
-		preempt_enable();
-		r = -EINTR;
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		++vcpu->stat.signal_exits;
-		goto out;
-	}
-
-	if (irqchip_in_kernel(vcpu->kvm))
-		kvm_x86_ops->inject_pending_irq(vcpu);
-	else if (!vcpu->mmio_read_completed)
-		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
-
-	vcpu->guest_mode = 1;
-	kvm_guest_enter();
-
-	if (vcpu->requests)
-		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
-			kvm_x86_ops->tlb_flush(vcpu);
-
-	kvm_x86_ops->run(vcpu, kvm_run);
-
-	vcpu->guest_mode = 0;
-	local_irq_enable();
-
-	++vcpu->stat.exits;
-
-	/*
-	 * We must have an instruction between local_irq_enable() and
-	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
-	 * the interrupt shadow.  The stat.exits increment will do nicely.
-	 * But we need to prevent reordering, hence this barrier():
-	 */
-	barrier();
-
-	kvm_guest_exit();
-
-	preempt_enable();
-
-	/*
-	 * Profile KVM exit RIPs:
-	 */
-	if (unlikely(prof_on == KVM_PROFILING)) {
-		kvm_x86_ops->cache_regs(vcpu);
-		profile_hit(KVM_PROFILING, (void *)vcpu->rip);
-	}
-
-	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
-
-	if (r > 0) {
-		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-			r = -EINTR;
-			kvm_run->exit_reason = KVM_EXIT_INTR;
-			++vcpu->stat.request_irq_exits;
-			goto out;
-		}
-		if (!need_resched()) {
-			++vcpu->stat.light_exits;
-			goto again;
-		}
-	}
-
-out:
-	if (r > 0) {
-		kvm_resched(vcpu);
-		goto preempted;
-	}
-
-	post_kvm_run_save(vcpu, kvm_run);
-
-	return r;
-}
-
-
-static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	int r;
-	sigset_t sigsaved;
-
-	vcpu_load(vcpu);
-
-	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
-		kvm_vcpu_block(vcpu);
-		vcpu_put(vcpu);
-		return -EAGAIN;
-	}
-
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
-	/* re-sync apic's tpr */
-	if (!irqchip_in_kernel(vcpu->kvm))
-		set_cr8(vcpu, kvm_run->cr8);
-
-	if (vcpu->pio.cur_count) {
-		r = complete_pio(vcpu);
-		if (r)
-			goto out;
-	}
-
-	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-		vcpu->mmio_read_completed = 1;
-		vcpu->mmio_needed = 0;
-		r = emulate_instruction(vcpu, kvm_run,
-					vcpu->mmio_fault_cr2, 0);
-		if (r == EMULATE_DO_MMIO) {
-			/*
-			 * Read-modify-write.  Back to userspace.
-			 */
-			r = 0;
-			goto out;
-		}
-	}
-
-	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
-		kvm_x86_ops->cache_regs(vcpu);
-		vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
-		kvm_x86_ops->decache_regs(vcpu);
-	}
-
-	r = __vcpu_run(vcpu, kvm_run);
-
-out:
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	vcpu_put(vcpu);
-	return r;
-}
-
-static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
-				   struct kvm_regs *regs)
-{
-	vcpu_load(vcpu);
-
-	kvm_x86_ops->cache_regs(vcpu);
-
-	regs->rax = vcpu->regs[VCPU_REGS_RAX];
-	regs->rbx = vcpu->regs[VCPU_REGS_RBX];
-	regs->rcx = vcpu->regs[VCPU_REGS_RCX];
-	regs->rdx = vcpu->regs[VCPU_REGS_RDX];
-	regs->rsi = vcpu->regs[VCPU_REGS_RSI];
-	regs->rdi = vcpu->regs[VCPU_REGS_RDI];
-	regs->rsp = vcpu->regs[VCPU_REGS_RSP];
-	regs->rbp = vcpu->regs[VCPU_REGS_RBP];
-#ifdef CONFIG_X86_64
-	regs->r8 = vcpu->regs[VCPU_REGS_R8];
-	regs->r9 = vcpu->regs[VCPU_REGS_R9];
-	regs->r10 = vcpu->regs[VCPU_REGS_R10];
-	regs->r11 = vcpu->regs[VCPU_REGS_R11];
-	regs->r12 = vcpu->regs[VCPU_REGS_R12];
-	regs->r13 = vcpu->regs[VCPU_REGS_R13];
-	regs->r14 = vcpu->regs[VCPU_REGS_R14];
-	regs->r15 = vcpu->regs[VCPU_REGS_R15];
-#endif
-
-	regs->rip = vcpu->rip;
-	regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
-	/*
-	 * Don't leak debug flags in case they were set for guest debugging
-	 */
-	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
-		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
-				   struct kvm_regs *regs)
-{
-	vcpu_load(vcpu);
-
-	vcpu->regs[VCPU_REGS_RAX] = regs->rax;
-	vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
-	vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
-	vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
-	vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
-	vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
-	vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
-	vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
-#ifdef CONFIG_X86_64
-	vcpu->regs[VCPU_REGS_R8] = regs->r8;
-	vcpu->regs[VCPU_REGS_R9] = regs->r9;
-	vcpu->regs[VCPU_REGS_R10] = regs->r10;
-	vcpu->regs[VCPU_REGS_R11] = regs->r11;
-	vcpu->regs[VCPU_REGS_R12] = regs->r12;
-	vcpu->regs[VCPU_REGS_R13] = regs->r13;
-	vcpu->regs[VCPU_REGS_R14] = regs->r14;
-	vcpu->regs[VCPU_REGS_R15] = regs->r15;
-#endif
-
-	vcpu->rip = regs->rip;
-	kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
-	kvm_x86_ops->decache_regs(vcpu);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static void get_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
-{
-	return kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-				    struct kvm_sregs *sregs)
-{
-	struct descriptor_table dt;
-	int pending_vec;
-
-	vcpu_load(vcpu);
-
-	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
-	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
-	kvm_x86_ops->get_idt(vcpu, &dt);
-	sregs->idt.limit = dt.limit;
-	sregs->idt.base = dt.base;
-	kvm_x86_ops->get_gdt(vcpu, &dt);
-	sregs->gdt.limit = dt.limit;
-	sregs->gdt.base = dt.base;
-
-	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-	sregs->cr0 = vcpu->cr0;
-	sregs->cr2 = vcpu->cr2;
-	sregs->cr3 = vcpu->cr3;
-	sregs->cr4 = vcpu->cr4;
-	sregs->cr8 = get_cr8(vcpu);
-	sregs->efer = vcpu->shadow_efer;
-	sregs->apic_base = kvm_get_apic_base(vcpu);
-
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		memset(sregs->interrupt_bitmap, 0,
-		       sizeof sregs->interrupt_bitmap);
-		pending_vec = kvm_x86_ops->get_irq(vcpu);
-		if (pending_vec >= 0)
-			set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
-	} else
-		memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
-		       sizeof sregs->interrupt_bitmap);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static void set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
-{
-	return kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-				    struct kvm_sregs *sregs)
-{
-	int mmu_reset_needed = 0;
-	int i, pending_vec, max_bits;
-	struct descriptor_table dt;
-
-	vcpu_load(vcpu);
-
-	dt.limit = sregs->idt.limit;
-	dt.base = sregs->idt.base;
-	kvm_x86_ops->set_idt(vcpu, &dt);
-	dt.limit = sregs->gdt.limit;
-	dt.base = sregs->gdt.base;
-	kvm_x86_ops->set_gdt(vcpu, &dt);
-
-	vcpu->cr2 = sregs->cr2;
-	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
-	vcpu->cr3 = sregs->cr3;
-
-	set_cr8(vcpu, sregs->cr8);
-
-	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
-	kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
-	kvm_set_apic_base(vcpu, sregs->apic_base);
-
-	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
-	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
-	vcpu->cr0 = sregs->cr0;
-	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
-
-	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
-	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-	if (!is_long_mode(vcpu) && is_pae(vcpu))
-		load_pdptrs(vcpu, vcpu->cr3);
-
-	if (mmu_reset_needed)
-		kvm_mmu_reset_context(vcpu);
-
-	if (!irqchip_in_kernel(vcpu->kvm)) {
-		memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
-		       sizeof vcpu->irq_pending);
-		vcpu->irq_summary = 0;
-		for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
-			if (vcpu->irq_pending[i])
-				__set_bit(i, &vcpu->irq_summary);
-	} else {
-		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
-		pending_vec = find_first_bit(
-			(const unsigned long *)sregs->interrupt_bitmap,
-			max_bits);
-		/* Only pending external irq is handled here */
-		if (pending_vec < max_bits) {
-			kvm_x86_ops->set_irq(vcpu, pending_vec);
-			printk("Set back pending irq %d\n", pending_vec);
-		}
-	}
-
-	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
-	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-	struct kvm_segment cs;
-
-	get_segment(vcpu, &cs, VCPU_SREG_CS);
-	*db = cs.db;
-	*l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
-	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-	MSR_K6_STAR,
-#ifdef CONFIG_X86_64
-	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
-	MSR_IA32_TIME_STAMP_COUNTER,
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
-	MSR_IA32_MISC_ENABLE,
-};
-
-static __init void kvm_init_msr_list(void)
-{
-	u32 dummy[2];
-	unsigned i, j;
-
-	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
-		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
-			continue;
-		if (j < i)
-			msrs_to_save[j] = msrs_to_save[i];
-		j++;
-	}
-	num_msrs_to_save = j;
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
-	return kvm_set_msr(vcpu, index, *data);
-}
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
-		    struct kvm_msr_entry *entries,
-		    int (*do_msr)(struct kvm_vcpu *vcpu,
-				  unsigned index, u64 *data))
-{
-	int i;
-
-	vcpu_load(vcpu);
-
-	for (i = 0; i < msrs->nmsrs; ++i)
-		if (do_msr(vcpu, entries[i].index, &entries[i].data))
-			break;
-
-	vcpu_put(vcpu);
-
-	return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
-		  int (*do_msr)(struct kvm_vcpu *vcpu,
-				unsigned index, u64 *data),
-		  int writeback)
-{
-	struct kvm_msrs msrs;
-	struct kvm_msr_entry *entries;
-	int r, n;
-	unsigned size;
-
-	r = -EFAULT;
-	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
-		goto out;
-
-	r = -E2BIG;
-	if (msrs.nmsrs >= MAX_IO_MSRS)
-		goto out;
-
-	r = -ENOMEM;
-	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-	entries = vmalloc(size);
-	if (!entries)
-		goto out;
-
-	r = -EFAULT;
-	if (copy_from_user(entries, user_msrs->entries, size))
-		goto out_free;
-
-	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
-	if (r < 0)
-		goto out_free;
-
-	r = -EFAULT;
-	if (writeback && copy_to_user(user_msrs->entries, entries, size))
-		goto out_free;
-
-	r = n;
-
-out_free:
-	vfree(entries);
-out:
-	return r;
-}
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
-				    struct kvm_translation *tr)
-{
-	unsigned long vaddr = tr->linear_address;
-	gpa_t gpa;
-
-	vcpu_load(vcpu);
-	mutex_lock(&vcpu->kvm->lock);
-	gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
-	tr->physical_address = gpa;
-	tr->valid = gpa != UNMAPPED_GVA;
-	tr->writeable = 1;
-	tr->usermode = 0;
-	mutex_unlock(&vcpu->kvm->lock);
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
-				    struct kvm_interrupt *irq)
-{
-	if (irq->irq < 0 || irq->irq >= 256)
-		return -EINVAL;
-	if (irqchip_in_kernel(vcpu->kvm))
-		return -ENXIO;
-	vcpu_load(vcpu);
-
-	set_bit(irq->irq, vcpu->irq_pending);
-	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-				      struct kvm_debug_guest *dbg)
-{
-	int r;
-
-	vcpu_load(vcpu);
-
-	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
-	vcpu_put(vcpu);
-
-	return r;
-}
-
-static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
-				    unsigned long address,
-				    int *type)
-{
-	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
-	unsigned long pgoff;
-	struct page *page;
-
-	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (pgoff == 0)
-		page = virt_to_page(vcpu->run);
-	else if (pgoff == KVM_PIO_PAGE_OFFSET)
-		page = virt_to_page(vcpu->pio_data);
-	else
-		return NOPAGE_SIGBUS;
-	get_page(page);
-	if (type != NULL)
-		*type = VM_FAULT_MINOR;
-
-	return page;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
-	.nopage = kvm_vcpu_nopage,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_ops = &kvm_vcpu_vm_ops;
-	return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
-	struct kvm_vcpu *vcpu = filp->private_data;
-
-	fput(vcpu->kvm->filp);
-	return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
-	.release        = kvm_vcpu_release,
-	.unlocked_ioctl = kvm_vcpu_ioctl,
-	.compat_ioctl   = kvm_vcpu_ioctl,
-	.mmap           = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
-	int fd, r;
-	struct inode *inode;
-	struct file *file;
-
-	r = anon_inode_getfd(&fd, &inode, &file,
-			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
-	if (r)
-		return r;
-	atomic_inc(&vcpu->kvm->filp->f_count);
-	return fd;
-}
-
-/*
- * Creates some virtual cpus.  Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
-	int r;
-	struct kvm_vcpu *vcpu;
-
-	if (!valid_vcpu(n))
-		return -EINVAL;
-
-	vcpu = kvm_x86_ops->vcpu_create(kvm, n);
-	if (IS_ERR(vcpu))
-		return PTR_ERR(vcpu);
-
-	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
-	/* We do fxsave: this must be aligned. */
-	BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
-
-	vcpu_load(vcpu);
-	r = kvm_mmu_setup(vcpu);
-	vcpu_put(vcpu);
-	if (r < 0)
-		goto free_vcpu;
-
-	mutex_lock(&kvm->lock);
-	if (kvm->vcpus[n]) {
-		r = -EEXIST;
-		mutex_unlock(&kvm->lock);
-		goto mmu_unload;
-	}
-	kvm->vcpus[n] = vcpu;
-	mutex_unlock(&kvm->lock);
-
-	/* Now it's all set up, let userspace reach it */
-	r = create_vcpu_fd(vcpu);
-	if (r < 0)
-		goto unlink;
-	return r;
-
-unlink:
-	mutex_lock(&kvm->lock);
-	kvm->vcpus[n] = NULL;
-	mutex_unlock(&kvm->lock);
-
-mmu_unload:
-	vcpu_load(vcpu);
-	kvm_mmu_unload(vcpu);
-	vcpu_put(vcpu);
-
-free_vcpu:
-	kvm_x86_ops->vcpu_free(vcpu);
-	return r;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-	u64 efer;
-	int i;
-	struct kvm_cpuid_entry *e, *entry;
-
-	rdmsrl(MSR_EFER, efer);
-	entry = NULL;
-	for (i = 0; i < vcpu->cpuid_nent; ++i) {
-		e = &vcpu->cpuid_entries[i];
-		if (e->function == 0x80000001) {
-			entry = e;
-			break;
-		}
-	}
-	if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
-		entry->edx &= ~(1 << 20);
-		printk(KERN_INFO "kvm: guest NX capability removed\n");
-	}
-}
-
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid *cpuid,
-				    struct kvm_cpuid_entry __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(&vcpu->cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-		goto out;
-	vcpu->cpuid_nent = cpuid->nent;
-	cpuid_fix_nx_cap(vcpu);
-	return 0;
-
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
-	if (sigset) {
-		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		vcpu->sigset_active = 1;
-		vcpu->sigset = *sigset;
-	} else
-		vcpu->sigset_active = 0;
-	return 0;
-}
-
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-	u16	cwd;
-	u16	swd;
-	u16	twd;
-	u16	fop;
-	u64	rip;
-	u64	rdp;
-	u32	mxcsr;
-	u32	mxcsr_mask;
-	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
-	vcpu_load(vcpu);
-
-	memcpy(fpu->fpr, fxsave->st_space, 128);
-	fpu->fcw = fxsave->cwd;
-	fpu->fsw = fxsave->swd;
-	fpu->ftwx = fxsave->twd;
-	fpu->last_opcode = fxsave->fop;
-	fpu->last_ip = fxsave->rip;
-	fpu->last_dp = fxsave->rdp;
-	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
-	vcpu_load(vcpu);
-
-	memcpy(fxsave->st_space, fpu->fpr, 128);
-	fxsave->cwd = fpu->fcw;
-	fxsave->swd = fpu->fsw;
-	fxsave->twd = fpu->ftwx;
-	fxsave->fop = fpu->last_opcode;
-	fxsave->rip = fpu->last_ip;
-	fxsave->rdp = fpu->last_dp;
-	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
-				    struct kvm_lapic_state *s)
-{
-	vcpu_load(vcpu);
-	memcpy(s->regs, vcpu->apic->regs, sizeof *s);
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
-				    struct kvm_lapic_state *s)
-{
-	vcpu_load(vcpu);
-	memcpy(vcpu->apic->regs, s->regs, sizeof *s);
-	kvm_apic_post_state_restore(vcpu);
-	vcpu_put(vcpu);
-
-	return 0;
-}
-
-static long kvm_vcpu_ioctl(struct file *filp,
-			   unsigned int ioctl, unsigned long arg)
-{
-	struct kvm_vcpu *vcpu = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	int r = -EINVAL;
-
-	switch (ioctl) {
-	case KVM_RUN:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
-		break;
-	case KVM_GET_REGS: {
-		struct kvm_regs kvm_regs;
-
-		memset(&kvm_regs, 0, sizeof kvm_regs);
-		r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_SET_REGS: {
-		struct kvm_regs kvm_regs;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
-			goto out;
-		r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_GET_SREGS: {
-		struct kvm_sregs kvm_sregs;
-
-		memset(&kvm_sregs, 0, sizeof kvm_sregs);
-		r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_SET_SREGS: {
-		struct kvm_sregs kvm_sregs;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
-			goto out;
-		r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_TRANSLATE: {
-		struct kvm_translation tr;
-
-		r = -EFAULT;
-		if (copy_from_user(&tr, argp, sizeof tr))
-			goto out;
-		r = kvm_vcpu_ioctl_translate(vcpu, &tr);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &tr, sizeof tr))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_INTERRUPT: {
-		struct kvm_interrupt irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&irq, argp, sizeof irq))
-			goto out;
-		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_DEBUG_GUEST: {
-		struct kvm_debug_guest dbg;
-
-		r = -EFAULT;
-		if (copy_from_user(&dbg, argp, sizeof dbg))
-			goto out;
-		r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_GET_MSRS:
-		r = msr_io(vcpu, argp, kvm_get_msr, 1);
-		break;
-	case KVM_SET_MSRS:
-		r = msr_io(vcpu, argp, do_set_msr, 0);
-		break;
-	case KVM_SET_CPUID: {
-		struct kvm_cpuid __user *cpuid_arg = argp;
-		struct kvm_cpuid cpuid;
-
-		r = -EFAULT;
-		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-			goto out;
-		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_SET_SIGNAL_MASK: {
-		struct kvm_signal_mask __user *sigmask_arg = argp;
-		struct kvm_signal_mask kvm_sigmask;
-		sigset_t sigset, *p;
-
-		p = NULL;
-		if (argp) {
-			r = -EFAULT;
-			if (copy_from_user(&kvm_sigmask, argp,
-					   sizeof kvm_sigmask))
-				goto out;
-			r = -EINVAL;
-			if (kvm_sigmask.len != sizeof sigset)
-				goto out;
-			r = -EFAULT;
-			if (copy_from_user(&sigset, sigmask_arg->sigset,
-					   sizeof sigset))
-				goto out;
-			p = &sigset;
-		}
-		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
-		break;
-	}
-	case KVM_GET_FPU: {
-		struct kvm_fpu fpu;
-
-		memset(&fpu, 0, sizeof fpu);
-		r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &fpu, sizeof fpu))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_SET_FPU: {
-		struct kvm_fpu fpu;
-
-		r = -EFAULT;
-		if (copy_from_user(&fpu, argp, sizeof fpu))
-			goto out;
-		r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_GET_LAPIC: {
-		struct kvm_lapic_state lapic;
-
-		memset(&lapic, 0, sizeof lapic);
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &lapic, sizeof lapic))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_SET_LAPIC: {
-		struct kvm_lapic_state lapic;
-
-		r = -EFAULT;
-		if (copy_from_user(&lapic, argp, sizeof lapic))
-			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	default:
-		;
-	}
-out:
-	return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
-			   unsigned int ioctl, unsigned long arg)
-{
-	struct kvm *kvm = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	int r = -EINVAL;
-
-	switch (ioctl) {
-	case KVM_CREATE_VCPU:
-		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
-		if (r < 0)
-			goto out;
-		break;
-	case KVM_SET_MEMORY_REGION: {
-		struct kvm_memory_region kvm_mem;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_GET_DIRTY_LOG: {
-		struct kvm_dirty_log log;
-
-		r = -EFAULT;
-		if (copy_from_user(&log, argp, sizeof log))
-			goto out;
-		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_SET_MEMORY_ALIAS: {
-		struct kvm_memory_alias alias;
-
-		r = -EFAULT;
-		if (copy_from_user(&alias, argp, sizeof alias))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_CREATE_IRQCHIP:
-		r = -ENOMEM;
-		kvm->vpic = kvm_create_pic(kvm);
-		if (kvm->vpic) {
-			r = kvm_ioapic_init(kvm);
-			if (r) {
-				kfree(kvm->vpic);
-				kvm->vpic = NULL;
-				goto out;
-			}
-		}
-		else
-			goto out;
-		break;
-	case KVM_IRQ_LINE: {
-		struct kvm_irq_level irq_event;
-
-		r = -EFAULT;
-		if (copy_from_user(&irq_event, argp, sizeof irq_event))
-			goto out;
-		if (irqchip_in_kernel(kvm)) {
-			mutex_lock(&kvm->lock);
-			if (irq_event.irq < 16)
-				kvm_pic_set_irq(pic_irqchip(kvm),
-					irq_event.irq,
-					irq_event.level);
-			kvm_ioapic_set_irq(kvm->vioapic,
-					irq_event.irq,
-					irq_event.level);
-			mutex_unlock(&kvm->lock);
-			r = 0;
-		}
-		break;
-	}
-	case KVM_GET_IRQCHIP: {
-		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
-
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
-			goto out;
-		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, &chip, sizeof chip))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_SET_IRQCHIP: {
-		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
-
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
-			goto out;
-		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
-		if (r)
-			goto out;
-		r = 0;
-		break;
-	}
-	default:
-		;
-	}
-out:
-	return r;
-}
-
-static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
-				  unsigned long address,
-				  int *type)
-{
-	struct kvm *kvm = vma->vm_file->private_data;
-	unsigned long pgoff;
-	struct page *page;
-
-	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	page = gfn_to_page(kvm, pgoff);
-	if (!page)
-		return NOPAGE_SIGBUS;
-	get_page(page);
-	if (type != NULL)
-		*type = VM_FAULT_MINOR;
-
-	return page;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
-	.nopage = kvm_vm_nopage,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_ops = &kvm_vm_vm_ops;
-	return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
-	.release        = kvm_vm_release,
-	.unlocked_ioctl = kvm_vm_ioctl,
-	.compat_ioctl   = kvm_vm_ioctl,
-	.mmap           = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
-	int fd, r;
-	struct inode *inode;
-	struct file *file;
-	struct kvm *kvm;
-
-	kvm = kvm_create_vm();
-	if (IS_ERR(kvm))
-		return PTR_ERR(kvm);
-	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
-	if (r) {
-		kvm_destroy_vm(kvm);
-		return r;
-	}
-
-	kvm->filp = file;
-
-	return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
-			  unsigned int ioctl, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	long r = -EINVAL;
-
-	switch (ioctl) {
-	case KVM_GET_API_VERSION:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = KVM_API_VERSION;
-		break;
-	case KVM_CREATE_VM:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = kvm_dev_ioctl_create_vm();
-		break;
-	case KVM_GET_MSR_INDEX_LIST: {
-		struct kvm_msr_list __user *user_msr_list = argp;
-		struct kvm_msr_list msr_list;
-		unsigned n;
-
-		r = -EFAULT;
-		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
-			goto out;
-		n = msr_list.nmsrs;
-		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
-		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
-			goto out;
-		r = -E2BIG;
-		if (n < num_msrs_to_save)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
-				 num_msrs_to_save * sizeof(u32)))
-			goto out;
-		if (copy_to_user(user_msr_list->indices
-				 + num_msrs_to_save * sizeof(u32),
-				 &emulated_msrs,
-				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
-			goto out;
-		r = 0;
-		break;
-	}
-	case KVM_CHECK_EXTENSION: {
-		int ext = (long)argp;
-
-		switch (ext) {
-		case KVM_CAP_IRQCHIP:
-		case KVM_CAP_HLT:
-			r = 1;
-			break;
-		default:
-			r = 0;
-			break;
-		}
-		break;
-	}
-	case KVM_GET_VCPU_MMAP_SIZE:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = 2 * PAGE_SIZE;
-		break;
-	default:
-		;
-	}
-out:
-	return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
-	.unlocked_ioctl = kvm_dev_ioctl,
-	.compat_ioctl   = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
-	KVM_MINOR,
-	"kvm",
-	&kvm_chardev_ops,
-};
-
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-static void decache_vcpus_on_cpu(int cpu)
-{
-	struct kvm *vm;
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	spin_lock(&kvm_lock);
-	list_for_each_entry(vm, &vm_list, vm_list)
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = vm->vcpus[i];
-			if (!vcpu)
-				continue;
-			/*
-			 * If the vcpu is locked, then it is running on some
-			 * other cpu and therefore it is not cached on the
-			 * cpu in question.
-			 *
-			 * If it's not locked, check the last cpu it executed
-			 * on.
-			 */
-			if (mutex_trylock(&vcpu->mutex)) {
-				if (vcpu->cpu == cpu) {
-					kvm_x86_ops->vcpu_decache(vcpu);
-					vcpu->cpu = -1;
-				}
-				mutex_unlock(&vcpu->mutex);
-			}
-		}
-	spin_unlock(&kvm_lock);
-}
-
-static void hardware_enable(void *junk)
-{
-	int cpu = raw_smp_processor_id();
-
-	if (cpu_isset(cpu, cpus_hardware_enabled))
-		return;
-	cpu_set(cpu, cpus_hardware_enabled);
-	kvm_x86_ops->hardware_enable(NULL);
-}
-
-static void hardware_disable(void *junk)
-{
-	int cpu = raw_smp_processor_id();
-
-	if (!cpu_isset(cpu, cpus_hardware_enabled))
-		return;
-	cpu_clear(cpu, cpus_hardware_enabled);
-	decache_vcpus_on_cpu(cpu);
-	kvm_x86_ops->hardware_disable(NULL);
-}
-
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
-			   void *v)
-{
-	int cpu = (long)v;
-
-	switch (val) {
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
-		       cpu);
-		hardware_disable(NULL);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
-		       cpu);
-		smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
-		break;
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
-		       cpu);
-		smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
-                       void *v)
-{
-	if (val == SYS_RESTART) {
-		/*
-		 * Some (well, at least mine) BIOSes hang on reboot if
-		 * in vmx root mode.
-		 */
-		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-		on_each_cpu(hardware_disable, NULL, 0, 1);
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_reboot_notifier = {
-	.notifier_call = kvm_reboot,
-	.priority = 0,
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
-	memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
-{
-	int i;
-
-	for (i = 0; i < bus->dev_count; i++) {
-		struct kvm_io_device *pos = bus->devs[i];
-
-		kvm_iodevice_destructor(pos);
-	}
-}
-
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
-{
-	int i;
-
-	for (i = 0; i < bus->dev_count; i++) {
-		struct kvm_io_device *pos = bus->devs[i];
-
-		if (pos->in_range(pos, addr))
-			return pos;
-	}
-
-	return NULL;
-}
-
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
-{
-	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
-
-	bus->devs[bus->dev_count++] = dev;
-}
-
-static struct notifier_block kvm_cpu_notifier = {
-	.notifier_call = kvm_cpu_hotplug,
-	.priority = 20, /* must be > scheduler priority */
-};
-
-static u64 stat_get(void *_offset)
-{
-	unsigned offset = (long)_offset;
-	u64 total = 0;
-	struct kvm *kvm;
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (vcpu)
-				total += *(u32 *)((void *)vcpu + offset);
-		}
-	spin_unlock(&kvm_lock);
-	return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
-
-static __init void kvm_init_debug(void)
-{
-	struct kvm_stats_debugfs_item *p;
-
-	debugfs_dir = debugfs_create_dir("kvm", NULL);
-	for (p = debugfs_entries; p->name; ++p)
-		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
-						(void *)(long)p->offset,
-						&stat_fops);
-}
-
-static void kvm_exit_debug(void)
-{
-	struct kvm_stats_debugfs_item *p;
-
-	for (p = debugfs_entries; p->name; ++p)
-		debugfs_remove(p->dentry);
-	debugfs_remove(debugfs_dir);
-}
-
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
-{
-	hardware_disable(NULL);
-	return 0;
-}
-
-static int kvm_resume(struct sys_device *dev)
-{
-	hardware_enable(NULL);
-	return 0;
-}
-
-static struct sysdev_class kvm_sysdev_class = {
-	.name = "kvm",
-	.suspend = kvm_suspend,
-	.resume = kvm_resume,
-};
-
-static struct sys_device kvm_sysdev = {
-	.id = 0,
-	.cls = &kvm_sysdev_class,
-};
-
-hpa_t bad_page_address;
-
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
-{
-	return container_of(pn, struct kvm_vcpu, preempt_notifier);
-}
-
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
-{
-	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
-	kvm_x86_ops->vcpu_load(vcpu, cpu);
-}
-
-static void kvm_sched_out(struct preempt_notifier *pn,
-			  struct task_struct *next)
-{
-	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
-	kvm_x86_ops->vcpu_put(vcpu);
-}
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
-		  struct module *module)
-{
-	int r;
-	int cpu;
-
-	if (kvm_x86_ops) {
-		printk(KERN_ERR "kvm: already loaded the other module\n");
-		return -EEXIST;
-	}
-
-	if (!ops->cpu_has_kvm_support()) {
-		printk(KERN_ERR "kvm: no hardware support\n");
-		return -EOPNOTSUPP;
-	}
-	if (ops->disabled_by_bios()) {
-		printk(KERN_ERR "kvm: disabled by bios\n");
-		return -EOPNOTSUPP;
-	}
-
-	kvm_x86_ops = ops;
-
-	r = kvm_x86_ops->hardware_setup();
-	if (r < 0)
-		goto out;
-
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu,
-				kvm_x86_ops->check_processor_compatibility,
-				&r, 0, 1);
-		if (r < 0)
-			goto out_free_0;
-	}
-
-	on_each_cpu(hardware_enable, NULL, 0, 1);
-	r = register_cpu_notifier(&kvm_cpu_notifier);
-	if (r)
-		goto out_free_1;
-	register_reboot_notifier(&kvm_reboot_notifier);
-
-	r = sysdev_class_register(&kvm_sysdev_class);
-	if (r)
-		goto out_free_2;
-
-	r = sysdev_register(&kvm_sysdev);
-	if (r)
-		goto out_free_3;
-
-	/* A kmem cache lets us meet the alignment requirements of fx_save. */
-	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
-					   __alignof__(struct kvm_vcpu), 0, 0);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto out_free_4;
-	}
-
-	kvm_chardev_ops.owner = module;
-
-	r = misc_register(&kvm_dev);
-	if (r) {
-		printk (KERN_ERR "kvm: misc device register failed\n");
-		goto out_free;
-	}
-
-	kvm_preempt_ops.sched_in = kvm_sched_in;
-	kvm_preempt_ops.sched_out = kvm_sched_out;
-
-	return r;
-
-out_free:
-	kmem_cache_destroy(kvm_vcpu_cache);
-out_free_4:
-	sysdev_unregister(&kvm_sysdev);
-out_free_3:
-	sysdev_class_unregister(&kvm_sysdev_class);
-out_free_2:
-	unregister_reboot_notifier(&kvm_reboot_notifier);
-	unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_1:
-	on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_0:
-	kvm_x86_ops->hardware_unsetup();
-out:
-	kvm_x86_ops = NULL;
-	return r;
-}
-
-void kvm_exit_x86(void)
-{
-	misc_deregister(&kvm_dev);
-	kmem_cache_destroy(kvm_vcpu_cache);
-	sysdev_unregister(&kvm_sysdev);
-	sysdev_class_unregister(&kvm_sysdev_class);
-	unregister_reboot_notifier(&kvm_reboot_notifier);
-	unregister_cpu_notifier(&kvm_cpu_notifier);
-	on_each_cpu(hardware_disable, NULL, 0, 1);
-	kvm_x86_ops->hardware_unsetup();
-	kvm_x86_ops = NULL;
-}
-
-static __init int kvm_init(void)
-{
-	static struct page *bad_page;
-	int r;
-
-	r = kvm_mmu_module_init();
-	if (r)
-		goto out4;
-
-	kvm_init_debug();
-
-	kvm_init_msr_list();
-
-	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
-		r = -ENOMEM;
-		goto out;
-	}
-
-	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
-	memset(__va(bad_page_address), 0, PAGE_SIZE);
-
-	return 0;
-
-out:
-	kvm_exit_debug();
-	kvm_mmu_module_exit();
-out4:
-	return r;
-}
-
-static __exit void kvm_exit(void)
-{
-	kvm_exit_debug();
-	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
-	kvm_mmu_module_exit();
-}
-
-module_init(kvm_init)
-module_exit(kvm_exit)
-
-EXPORT_SYMBOL_GPL(kvm_init_x86);
-EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff -puN drivers/kvm/kvm_svm.h~git-kvm /dev/null
--- a/drivers/kvm/kvm_svm.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <asm/msr.h>
-
-#include "svm.h"
-#include "kvm.h"
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-	MSR_FS_BASE,
-#endif
-	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define NUM_DB_REGS 4
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
-	struct kvm_vcpu vcpu;
-	struct vmcb *vmcb;
-	unsigned long vmcb_pa;
-	struct svm_cpu_data *svm_data;
-	uint64_t asid_generation;
-
-	unsigned long db_regs[NUM_DB_REGS];
-
-	u64 next_rip;
-
-	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-	u64 host_gs_base;
-	unsigned long host_cr2;
-	unsigned long host_db_regs[NUM_DB_REGS];
-	unsigned long host_dr6;
-	unsigned long host_dr7;
-};
-
-#endif
-
diff -puN drivers/kvm/lapic.c~git-kvm /dev/null
--- a/drivers/kvm/lapic.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-
-/*
- * Local APIC virtualization
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright (C) 2007 Novell
- * Copyright (C) 2007 Intel
- *
- * Authors:
- *   Dor Laor <dor.laor@qumranet.com>
- *   Gregory Haskins <ghaskins@novell.com>
- *   Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *
- * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- */
-
-#include "kvm.h"
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/atomic.h>
-#include <asm/div64.h>
-#include "irq.h"
-
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
-
-#define APIC_BUS_CYCLE_NS 1
-
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
-
-#define APIC_LVT_NUM			6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION			(0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH		(1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK			0xc0000
-#define APIC_DEST_NOSHORT		0x0
-#define APIC_DEST_MASK			0x800
-#define MAX_APIC_VECTOR			256
-
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
-	return *((u32 *) (apic->regs + reg_off));
-}
-
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
-{
-	*((u32 *) (apic->regs + reg_off)) = val;
-}
-
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
-	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
-	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_set_vector(int vec, void *bitmap)
-{
-	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_clear_vector(int vec, void *bitmap)
-{
-	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
-	return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
-}
-
-static inline int  apic_sw_enabled(struct kvm_lapic *apic)
-{
-	return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
-}
-
-static inline int apic_enabled(struct kvm_lapic *apic)
-{
-	return apic_sw_enabled(apic) &&	apic_hw_enabled(apic);
-}
-
-#define LVT_MASK	\
-	(APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
-
-#define LINT_MASK	\
-	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
-	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
-	return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
-static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
-{
-	return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
-}
-
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
-{
-	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
-}
-
-static inline int apic_lvtt_period(struct kvm_lapic *apic)
-{
-	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
-	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
-	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
-	LINT_MASK, LINT_MASK,	/* LVT0-1 */
-	LVT_MASK		/* LVTERR */
-};
-
-static int find_highest_vector(void *bitmap)
-{
-	u32 *word = bitmap;
-	int word_offset = MAX_APIC_VECTOR >> 5;
-
-	while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
-		continue;
-
-	if (likely(!word_offset && !word[0]))
-		return -1;
-	else
-		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
-}
-
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
-{
-	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
-{
-	apic_clear_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline int apic_find_highest_irr(struct kvm_lapic *apic)
-{
-	int result;
-
-	result = find_highest_vector(apic->regs + APIC_IRR);
-	ASSERT(result == -1 || result >= 16);
-
-	return result;
-}
-
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-	int highest_irr;
-
-	if (!apic)
-		return 0;
-	highest_irr = apic_find_highest_irr(apic);
-
-	return highest_irr;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
-{
-	if (!apic_test_and_set_irr(vec, apic)) {
-		/* a new pending irq is set in IRR */
-		if (trig)
-			apic_set_vector(vec, apic->regs + APIC_TMR);
-		else
-			apic_clear_vector(vec, apic->regs + APIC_TMR);
-		kvm_vcpu_kick(apic->vcpu);
-		return 1;
-	}
-	return 0;
-}
-
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
-	int result;
-
-	result = find_highest_vector(apic->regs + APIC_ISR);
-	ASSERT(result == -1 || result >= 16);
-
-	return result;
-}
-
-static void apic_update_ppr(struct kvm_lapic *apic)
-{
-	u32 tpr, isrv, ppr;
-	int isr;
-
-	tpr = apic_get_reg(apic, APIC_TASKPRI);
-	isr = apic_find_highest_isr(apic);
-	isrv = (isr != -1) ? isr : 0;
-
-	if ((tpr & 0xf0) >= (isrv & 0xf0))
-		ppr = tpr & 0xff;
-	else
-		ppr = isrv & 0xf0;
-
-	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
-		   apic, ppr, isr, isrv);
-
-	apic_set_reg(apic, APIC_PROCPRI, ppr);
-}
-
-static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
-{
-	apic_set_reg(apic, APIC_TASKPRI, tpr);
-	apic_update_ppr(apic);
-}
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
-{
-	return kvm_apic_id(apic) == dest;
-}
-
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
-{
-	int result = 0;
-	u8 logical_id;
-
-	logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
-
-	switch (apic_get_reg(apic, APIC_DFR)) {
-	case APIC_DFR_FLAT:
-		if (logical_id & mda)
-			result = 1;
-		break;
-	case APIC_DFR_CLUSTER:
-		if (((logical_id >> 4) == (mda >> 0x4))
-		    && (logical_id & mda & 0xf))
-			result = 1;
-		break;
-	default:
-		printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
-		       apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
-		break;
-	}
-
-	return result;
-}
-
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
-			   int short_hand, int dest, int dest_mode)
-{
-	int result = 0;
-	struct kvm_lapic *target = vcpu->apic;
-
-	apic_debug("target %p, source %p, dest 0x%x, "
-		   "dest_mode 0x%x, short_hand 0x%x",
-		   target, source, dest, dest_mode, short_hand);
-
-	ASSERT(!target);
-	switch (short_hand) {
-	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0) {
-			/* Physical mode. */
-			if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
-				result = 1;
-		} else
-			/* Logical mode. */
-			result = kvm_apic_match_logical_addr(target, dest);
-		break;
-	case APIC_DEST_SELF:
-		if (target == source)
-			result = 1;
-		break;
-	case APIC_DEST_ALLINC:
-		result = 1;
-		break;
-	case APIC_DEST_ALLBUT:
-		if (target != source)
-			result = 1;
-		break;
-	default:
-		printk(KERN_WARNING "Bad dest shorthand value %x\n",
-		       short_hand);
-		break;
-	}
-
-	return result;
-}
-
-/*
- * Add a pending IRQ into lapic.
- * Return 1 if successfully added and 0 if discarded.
- */
-static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-			     int vector, int level, int trig_mode)
-{
-	int orig_irr, result = 0;
-	struct kvm_vcpu *vcpu = apic->vcpu;
-
-	switch (delivery_mode) {
-	case APIC_DM_FIXED:
-	case APIC_DM_LOWEST:
-		/* FIXME add logic for vcpu on reset */
-		if (unlikely(!apic_enabled(apic)))
-			break;
-
-		orig_irr = apic_test_and_set_irr(vector, apic);
-		if (orig_irr && trig_mode) {
-			apic_debug("level trig mode repeatedly for vector %d",
-				   vector);
-			break;
-		}
-
-		if (trig_mode) {
-			apic_debug("level trig mode for vector %d", vector);
-			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
-			apic_clear_vector(vector, apic->regs + APIC_TMR);
-
-		if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
-			kvm_vcpu_kick(vcpu);
-		else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
-			vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
-		}
-
-		result = (orig_irr == 0);
-		break;
-
-	case APIC_DM_REMRD:
-		printk(KERN_DEBUG "Ignoring delivery mode 3\n");
-		break;
-
-	case APIC_DM_SMI:
-		printk(KERN_DEBUG "Ignoring guest SMI\n");
-		break;
-	case APIC_DM_NMI:
-		printk(KERN_DEBUG "Ignoring guest NMI\n");
-		break;
-
-	case APIC_DM_INIT:
-		if (level) {
-			if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
-				printk(KERN_DEBUG
-				       "INIT on a runnable vcpu %d\n",
-				       vcpu->vcpu_id);
-			vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
-			kvm_vcpu_kick(vcpu);
-		} else {
-			printk(KERN_DEBUG
-			       "Ignoring de-assert INIT to vcpu %d\n",
-			       vcpu->vcpu_id);
-		}
-
-		break;
-
-	case APIC_DM_STARTUP:
-		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-		       vcpu->vcpu_id, vector);
-		if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-			vcpu->sipi_vector = vector;
-			vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
-		}
-		break;
-
-	default:
-		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
-		       delivery_mode);
-		break;
-	}
-	return result;
-}
-
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap)
-{
-	int vcpu_id;
-	int last;
-	int next;
-	struct kvm_lapic *apic;
-
-	last = kvm->round_robin_prev_vcpu;
-	next = last;
-
-	do {
-		if (++next == KVM_MAX_VCPUS)
-			next = 0;
-		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
-			continue;
-		apic = kvm->vcpus[next]->apic;
-		if (apic && apic_enabled(apic))
-			break;
-		apic = NULL;
-	} while (next != last);
-	kvm->round_robin_prev_vcpu = next;
-
-	if (!apic) {
-		vcpu_id = ffs(bitmap) - 1;
-		if (vcpu_id < 0) {
-			vcpu_id = 0;
-			printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-		}
-		apic = kvm->vcpus[vcpu_id]->apic;
-	}
-
-	return apic;
-}
-
-static void apic_set_eoi(struct kvm_lapic *apic)
-{
-	int vector = apic_find_highest_isr(apic);
-
-	/*
-	 * Not every write EOI will has corresponding ISR,
-	 * one example is when Kernel check timer on setup_IO_APIC
-	 */
-	if (vector == -1)
-		return;
-
-	apic_clear_vector(vector, apic->regs + APIC_ISR);
-	apic_update_ppr(apic);
-
-	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
-}
-
-static void apic_send_ipi(struct kvm_lapic *apic)
-{
-	u32 icr_low = apic_get_reg(apic, APIC_ICR);
-	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
-
-	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
-	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
-	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
-	unsigned int level = icr_low & APIC_INT_ASSERT;
-	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
-	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
-	unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
-	struct kvm_lapic *target;
-	struct kvm_vcpu *vcpu;
-	unsigned long lpr_map = 0;
-	int i;
-
-	apic_debug("icr_high 0x%x, icr_low 0x%x, "
-		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
-		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-		   icr_high, icr_low, short_hand, dest,
-		   trig_mode, level, dest_mode, delivery_mode, vector);
-
-	for (i = 0; i < KVM_MAX_VCPUS; i++) {
-		vcpu = apic->vcpu->kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-
-		if (vcpu->apic &&
-		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
-			if (delivery_mode == APIC_DM_LOWEST)
-				set_bit(vcpu->vcpu_id, &lpr_map);
-			else
-				__apic_accept_irq(vcpu->apic, delivery_mode,
-						  vector, level, trig_mode);
-		}
-	}
-
-	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
-		if (target != NULL)
-			__apic_accept_irq(target, delivery_mode,
-					  vector, level, trig_mode);
-	}
-}
-
-static u32 apic_get_tmcct(struct kvm_lapic *apic)
-{
-	u64 counter_passed;
-	ktime_t passed, now;
-	u32 tmcct;
-
-	ASSERT(apic != NULL);
-
-	now = apic->timer.dev.base->get_time();
-	tmcct = apic_get_reg(apic, APIC_TMICT);
-
-	/* if initial count is 0, current count should also be 0 */
-	if (tmcct == 0)
-		return 0;
-
-	if (unlikely(ktime_to_ns(now) <=
-		ktime_to_ns(apic->timer.last_update))) {
-		/* Wrap around */
-		passed = ktime_add(( {
-				    (ktime_t) {
-				    .tv64 = KTIME_MAX -
-				    (apic->timer.last_update).tv64}; }
-				   ), now);
-		apic_debug("time elapsed\n");
-	} else
-		passed = ktime_sub(now, apic->timer.last_update);
-
-	counter_passed = div64_64(ktime_to_ns(passed),
-				  (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-
-	if (counter_passed > tmcct) {
-		if (unlikely(!apic_lvtt_period(apic))) {
-			/* one-shot timers stick at 0 until reset */
-			tmcct = 0;
-		} else {
-			/*
-			 * periodic timers reset to APIC_TMICT when they
-			 * hit 0. The while loop simulates this happening N
-			 * times. (counter_passed %= tmcct) would also work,
-			 * but might be slower or not work on 32-bit??
-			 */
-			while (counter_passed > tmcct)
-				counter_passed -= tmcct;
-			tmcct -= counter_passed;
-		}
-	} else {
-		tmcct -= counter_passed;
-	}
-
-	return tmcct;
-}
-
-static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
-{
-	u32 val = 0;
-
-	if (offset >= LAPIC_MMIO_LENGTH)
-		return 0;
-
-	switch (offset) {
-	case APIC_ARBPRI:
-		printk(KERN_WARNING "Access APIC ARBPRI register "
-		       "which is for P6\n");
-		break;
-
-	case APIC_TMCCT:	/* Timer CCR */
-		val = apic_get_tmcct(apic);
-		break;
-
-	default:
-		apic_update_ppr(apic);
-		val = apic_get_reg(apic, offset);
-		break;
-	}
-
-	return val;
-}
-
-static void apic_mmio_read(struct kvm_io_device *this,
-			   gpa_t address, int len, void *data)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-	unsigned int offset = address - apic->base_address;
-	unsigned char alignment = offset & 0xf;
-	u32 result;
-
-	if ((alignment + len) > 4) {
-		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
-		       (unsigned long)address, len);
-		return;
-	}
-	result = __apic_read(apic, offset & ~0xf);
-
-	switch (len) {
-	case 1:
-	case 2:
-	case 4:
-		memcpy(data, (char *)&result + alignment, len);
-		break;
-	default:
-		printk(KERN_ERR "Local APIC read with len = %x, "
-		       "should be 1,2, or 4 instead\n", len);
-		break;
-	}
-}
-
-static void update_divide_count(struct kvm_lapic *apic)
-{
-	u32 tmp1, tmp2, tdcr;
-
-	tdcr = apic_get_reg(apic, APIC_TDCR);
-	tmp1 = tdcr & 0xf;
-	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
-	apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
-
-	apic_debug("timer divide count is 0x%x\n",
-				   apic->timer.divide_count);
-}
-
-static void start_apic_timer(struct kvm_lapic *apic)
-{
-	ktime_t now = apic->timer.dev.base->get_time();
-
-	apic->timer.last_update = now;
-
-	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
-	atomic_set(&apic->timer.pending, 0);
-	hrtimer_start(&apic->timer.dev,
-		      ktime_add_ns(now, apic->timer.period),
-		      HRTIMER_MODE_ABS);
-
-	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
-			   PRIx64 ", "
-			   "timer initial count 0x%x, period %lldns, "
-			   "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
-			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
-			   apic_get_reg(apic, APIC_TMICT),
-			   apic->timer.period,
-			   ktime_to_ns(ktime_add_ns(now,
-					apic->timer.period)));
-}
-
-static void apic_mmio_write(struct kvm_io_device *this,
-			    gpa_t address, int len, const void *data)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-	unsigned int offset = address - apic->base_address;
-	unsigned char alignment = offset & 0xf;
-	u32 val;
-
-	/*
-	 * APIC register must be aligned on 128-bits boundary.
-	 * 32/64/128 bits registers must be accessed thru 32 bits.
-	 * Refer SDM 8.4.1
-	 */
-	if (len != 4 || alignment) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "apic write: bad size=%d %lx\n",
-			       len, (long)address);
-		return;
-	}
-
-	val = *(u32 *) data;
-
-	/* too common printing */
-	if (offset != APIC_EOI)
-		apic_debug("%s: offset 0x%x with length 0x%x, and value is "
-			   "0x%x\n", __FUNCTION__, offset, len, val);
-
-	offset &= 0xff0;
-
-	switch (offset) {
-	case APIC_ID:		/* Local APIC ID */
-		apic_set_reg(apic, APIC_ID, val);
-		break;
-
-	case APIC_TASKPRI:
-		apic_set_tpr(apic, val & 0xff);
-		break;
-
-	case APIC_EOI:
-		apic_set_eoi(apic);
-		break;
-
-	case APIC_LDR:
-		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
-		break;
-
-	case APIC_DFR:
-		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
-		break;
-
-	case APIC_SPIV:
-		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
-		if (!(val & APIC_SPIV_APIC_ENABLED)) {
-			int i;
-			u32 lvt_val;
-
-			for (i = 0; i < APIC_LVT_NUM; i++) {
-				lvt_val = apic_get_reg(apic,
-						       APIC_LVTT + 0x10 * i);
-				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
-					     lvt_val | APIC_LVT_MASKED);
-			}
-			atomic_set(&apic->timer.pending, 0);
-
-		}
-		break;
-
-	case APIC_ICR:
-		/* No delay here, so we always clear the pending bit */
-		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
-		apic_send_ipi(apic);
-		break;
-
-	case APIC_ICR2:
-		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
-		break;
-
-	case APIC_LVTT:
-	case APIC_LVTTHMR:
-	case APIC_LVTPC:
-	case APIC_LVT0:
-	case APIC_LVT1:
-	case APIC_LVTERR:
-		/* TODO: Check vector */
-		if (!apic_sw_enabled(apic))
-			val |= APIC_LVT_MASKED;
-
-		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
-		apic_set_reg(apic, offset, val);
-
-		break;
-
-	case APIC_TMICT:
-		hrtimer_cancel(&apic->timer.dev);
-		apic_set_reg(apic, APIC_TMICT, val);
-		start_apic_timer(apic);
-		return;
-
-	case APIC_TDCR:
-		if (val & 4)
-			printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
-		apic_set_reg(apic, APIC_TDCR, val);
-		update_divide_count(apic);
-		break;
-
-	default:
-		apic_debug("Local APIC Write to read-only register %x\n",
-			   offset);
-		break;
-	}
-
-}
-
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-	int ret = 0;
-
-
-	if (apic_hw_enabled(apic) &&
-	    (addr >= apic->base_address) &&
-	    (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
-		ret = 1;
-
-	return ret;
-}
-
-void kvm_free_apic(struct kvm_lapic *apic)
-{
-	if (!apic)
-		return;
-
-	hrtimer_cancel(&apic->timer.dev);
-
-	if (apic->regs_page) {
-		__free_page(apic->regs_page);
-		apic->regs_page = 0;
-	}
-
-	kfree(apic);
-}
-
-/*
- *----------------------------------------------------------------------
- * LAPIC interface
- *----------------------------------------------------------------------
- */
-
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-
-	if (!apic)
-		return;
-	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
-}
-
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-	u64 tpr;
-
-	if (!apic)
-		return 0;
-	tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
-
-	return (tpr & 0xf0) >> 4;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
-
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-
-	if (!apic) {
-		value |= MSR_IA32_APICBASE_BSP;
-		vcpu->apic_base = value;
-		return;
-	}
-	if (apic->vcpu->vcpu_id)
-		value &= ~MSR_IA32_APICBASE_BSP;
-
-	vcpu->apic_base = value;
-	apic->base_address = apic->vcpu->apic_base &
-			     MSR_IA32_APICBASE_BASE;
-
-	/* with FSB delivery interrupt, we can restart APIC functionality */
-	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-		   "0x%lx.\n", apic->apic_base, apic->base_address);
-
-}
-
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
-	return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic;
-	int i;
-
-	apic_debug("%s\n", __FUNCTION__);
-
-	ASSERT(vcpu);
-	apic = vcpu->apic;
-	ASSERT(apic != NULL);
-
-	/* Stop the timer in case it's a reset to an active apic */
-	hrtimer_cancel(&apic->timer.dev);
-
-	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
-	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-
-	for (i = 0; i < APIC_LVT_NUM; i++)
-		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
-	apic_set_reg(apic, APIC_LVT0,
-		     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
-	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
-	apic_set_reg(apic, APIC_SPIV, 0xff);
-	apic_set_reg(apic, APIC_TASKPRI, 0);
-	apic_set_reg(apic, APIC_LDR, 0);
-	apic_set_reg(apic, APIC_ESR, 0);
-	apic_set_reg(apic, APIC_ICR, 0);
-	apic_set_reg(apic, APIC_ICR2, 0);
-	apic_set_reg(apic, APIC_TDCR, 0);
-	apic_set_reg(apic, APIC_TMICT, 0);
-	for (i = 0; i < 8; i++) {
-		apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
-		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
-		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
-	}
-	update_divide_count(apic);
-	atomic_set(&apic->timer.pending, 0);
-	if (vcpu->vcpu_id == 0)
-		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
-	apic_update_ppr(apic);
-
-	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
-		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
-		   vcpu, kvm_apic_id(apic),
-		   vcpu->apic_base, apic->base_address);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-	int ret = 0;
-
-	if (!apic)
-		return 0;
-	ret = apic_enabled(apic);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
-
-/*
- *----------------------------------------------------------------------
- * timer interface
- *----------------------------------------------------------------------
- */
-
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
-{
-	int result = 0;
-	wait_queue_head_t *q = &apic->vcpu->wq;
-
-	atomic_inc(&apic->timer.pending);
-	if (waitqueue_active(q))
-	{
-		apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
-		wake_up_interruptible(q);
-	}
-	if (apic_lvtt_period(apic)) {
-		result = 1;
-		apic->timer.dev.expires = ktime_add_ns(
-					apic->timer.dev.expires,
-					apic->timer.period);
-	}
-	return result;
-}
-
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
-{
-	int vector;
-
-	vector = apic_lvt_vector(apic, APIC_LVTT);
-	return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
-}
-
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
-	struct kvm_lapic *apic;
-	int restart_timer = 0;
-
-	apic = container_of(data, struct kvm_lapic, timer.dev);
-
-	restart_timer = __apic_timer_fn(apic);
-
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
-
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic;
-
-	ASSERT(vcpu != NULL);
-	apic_debug("apic_init %d\n", vcpu->vcpu_id);
-
-	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
-	if (!apic)
-		goto nomem;
-
-	vcpu->apic = apic;
-
-	apic->regs_page = alloc_page(GFP_KERNEL);
-	if (apic->regs_page == NULL) {
-		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
-		       vcpu->vcpu_id);
-		goto nomem;
-	}
-	apic->regs = page_address(apic->regs_page);
-	memset(apic->regs, 0, PAGE_SIZE);
-	apic->vcpu = vcpu;
-
-	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	apic->timer.dev.function = apic_timer_fn;
-	apic->base_address = APIC_DEFAULT_PHYS_BASE;
-	vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
-
-	kvm_lapic_reset(vcpu);
-	apic->dev.read = apic_mmio_read;
-	apic->dev.write = apic_mmio_write;
-	apic->dev.in_range = apic_mmio_range;
-	apic->dev.private = apic;
-
-	return 0;
-nomem:
-	kvm_free_apic(apic);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
-
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->apic;
-	int highest_irr;
-
-	if (!apic || !apic_enabled(apic))
-		return -1;
-
-	apic_update_ppr(apic);
-	highest_irr = apic_find_highest_irr(apic);
-	if ((highest_irr == -1) ||
-	    ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
-		return -1;
-	return highest_irr;
-}
-
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
-{
-	u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
-	int r = 0;
-
-	if (vcpu->vcpu_id == 0) {
-		if (!apic_hw_enabled(vcpu->apic))
-			r = 1;
-		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
-		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-			r = 1;
-	}
-	return r;
-}
-
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->apic;
-
-	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
-		atomic_read(&apic->timer.pending) > 0) {
-		if (__inject_apic_timer_irq(apic))
-			atomic_dec(&apic->timer.pending);
-	}
-}
-
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-	struct kvm_lapic *apic = vcpu->apic;
-
-	if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
-		apic->timer.last_update = ktime_add_ns(
-				apic->timer.last_update,
-				apic->timer.period);
-}
-
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
-{
-	int vector = kvm_apic_has_interrupt(vcpu);
-	struct kvm_lapic *apic = vcpu->apic;
-
-	if (vector == -1)
-		return -1;
-
-	apic_set_vector(vector, apic->regs + APIC_ISR);
-	apic_update_ppr(apic);
-	apic_clear_irr(vector, apic);
-	return vector;
-}
-
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->apic;
-
-	apic->base_address = vcpu->apic_base &
-			     MSR_IA32_APICBASE_BASE;
-	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-	apic_update_ppr(apic);
-	hrtimer_cancel(&apic->timer.dev);
-	update_divide_count(apic);
-	start_apic_timer(apic);
-}
-
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->apic;
-	struct hrtimer *timer;
-
-	if (!apic)
-		return;
-
-	timer = &apic->timer.dev;
-	if (hrtimer_cancel(timer))
-		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
-}
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff -puN drivers/kvm/mmu.c~git-kvm /dev/null
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "vmx.h"
-#include "kvm.h"
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 1;
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x)							\
-	if (!(x)) {							\
-		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
-		       __FILE__, __LINE__, #x);				\
-	}
-#endif
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_MASK (1ULL << 63)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
-		( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
-
-#define PT64_LEVEL_MASK(level) \
-		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
-	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
-		( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
-
-#define PT32_LEVEL_MASK(level) \
-		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
-	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
-	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
-	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-struct kvm_rmap_desc {
-	u64 *shadow_ptes[RMAP_EXT];
-	struct kvm_rmap_desc *more;
-};
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
-	return vcpu->cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
-	return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
-	return vcpu->shadow_efer & EFER_NX;
-}
-
-static int is_present_pte(unsigned long pte)
-{
-	return pte & PT_PRESENT_MASK;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
-	return pte & PT_WRITABLE_MASK;
-}
-
-static int is_io_pte(unsigned long pte)
-{
-	return pte & PT_SHADOW_IO_MARK;
-}
-
-static int is_rmap_pte(u64 pte)
-{
-	return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
-		== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
-	set_64bit((unsigned long *)sptep, spte);
-#else
-	set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-				  struct kmem_cache *base_cache, int min)
-{
-	void *obj;
-
-	if (cache->nobjs >= min)
-		return 0;
-	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
-		if (!obj)
-			return -ENOMEM;
-		cache->objects[cache->nobjs++] = obj;
-	}
-	return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-	while (mc->nobjs)
-		kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
-				       int min)
-{
-	struct page *page;
-
-	if (cache->nobjs >= min)
-		return 0;
-	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-		set_page_private(page, 0);
-		cache->objects[cache->nobjs++] = page_address(page);
-	}
-	return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
-	while (mc->nobjs)
-		free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
-	int r;
-
-	kvm_mmu_free_some_pages(vcpu);
-	r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
-				   pte_chain_cache, 4);
-	if (r)
-		goto out;
-	r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
-				   rmap_desc_cache, 1);
-	if (r)
-		goto out;
-	r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
-	if (r)
-		goto out;
-	r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
-				   mmu_page_header_cache, 4);
-out:
-	return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-	mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
-	mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
-	mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
-	mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-				    size_t size)
-{
-	void *p;
-
-	BUG_ON(!mc->nobjs);
-	p = mc->objects[--mc->nobjs];
-	memset(p, 0, size);
-	return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-	return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
-				      sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
-	kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
-	return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
-				      sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-	kfree(rd);
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
- *
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
- */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
-{
-	struct page *page;
-	struct kvm_rmap_desc *desc;
-	int i;
-
-	if (!is_rmap_pte(*spte))
-		return;
-	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-	if (!page_private(page)) {
-		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-		set_page_private(page,(unsigned long)spte);
-	} else if (!(page_private(page) & 1)) {
-		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-		desc = mmu_alloc_rmap_desc(vcpu);
-		desc->shadow_ptes[0] = (u64 *)page_private(page);
-		desc->shadow_ptes[1] = spte;
-		set_page_private(page,(unsigned long)desc | 1);
-	} else {
-		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-		desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
-			desc = desc->more;
-		if (desc->shadow_ptes[RMAP_EXT-1]) {
-			desc->more = mmu_alloc_rmap_desc(vcpu);
-			desc = desc->more;
-		}
-		for (i = 0; desc->shadow_ptes[i]; ++i)
-			;
-		desc->shadow_ptes[i] = spte;
-	}
-}
-
-static void rmap_desc_remove_entry(struct page *page,
-				   struct kvm_rmap_desc *desc,
-				   int i,
-				   struct kvm_rmap_desc *prev_desc)
-{
-	int j;
-
-	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
-		;
-	desc->shadow_ptes[i] = desc->shadow_ptes[j];
-	desc->shadow_ptes[j] = NULL;
-	if (j != 0)
-		return;
-	if (!prev_desc && !desc->more)
-		set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
-	else
-		if (prev_desc)
-			prev_desc->more = desc->more;
-		else
-			set_page_private(page,(unsigned long)desc->more | 1);
-	mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(u64 *spte)
-{
-	struct page *page;
-	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
-	int i;
-
-	if (!is_rmap_pte(*spte))
-		return;
-	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-	if (!page_private(page)) {
-		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
-		BUG();
-	} else if (!(page_private(page) & 1)) {
-		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
-		if ((u64 *)page_private(page) != spte) {
-			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-			       spte, *spte);
-			BUG();
-		}
-		set_page_private(page,0);
-	} else {
-		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
-		desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-		prev_desc = NULL;
-		while (desc) {
-			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
-				if (desc->shadow_ptes[i] == spte) {
-					rmap_desc_remove_entry(page,
-							       desc, i,
-							       prev_desc);
-					return;
-				}
-			prev_desc = desc;
-			desc = desc->more;
-		}
-		BUG();
-	}
-}
-
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct page *page;
-	struct kvm_rmap_desc *desc;
-	u64 *spte;
-
-	page = gfn_to_page(kvm, gfn);
-	BUG_ON(!page);
-
-	while (page_private(page)) {
-		if (!(page_private(page) & 1))
-			spte = (u64 *)page_private(page);
-		else {
-			desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-			spte = desc->shadow_ptes[0];
-		}
-		BUG_ON(!spte);
-		BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
-		       != page_to_pfn(page));
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		BUG_ON(!(*spte & PT_WRITABLE_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-		rmap_remove(spte);
-		set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	}
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
-	u64 *pos;
-	u64 *end;
-
-	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-		if (*pos != 0) {
-			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
-			       pos, *pos);
-			return 0;
-		}
-	return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm,
-			      struct kvm_mmu_page *page_head)
-{
-	ASSERT(is_empty_shadow_page(page_head->spt));
-	list_del(&page_head->link);
-	__free_page(virt_to_page(page_head->spt));
-	kfree(page_head);
-	++kvm->n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
-	return gfn;
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-					       u64 *parent_pte)
-{
-	struct kvm_mmu_page *page;
-
-	if (!vcpu->kvm->n_free_mmu_pages)
-		return NULL;
-
-	page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
-				      sizeof *page);
-	page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
-	set_page_private(virt_to_page(page->spt), (unsigned long)page);
-	list_add(&page->link, &vcpu->kvm->active_mmu_pages);
-	ASSERT(is_empty_shadow_page(page->spt));
-	page->slot_bitmap = 0;
-	page->multimapped = 0;
-	page->parent_pte = parent_pte;
-	--vcpu->kvm->n_free_mmu_pages;
-	return page;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *page, u64 *parent_pte)
-{
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
-	if (!parent_pte)
-		return;
-	if (!page->multimapped) {
-		u64 *old = page->parent_pte;
-
-		if (!old) {
-			page->parent_pte = parent_pte;
-			return;
-		}
-		page->multimapped = 1;
-		pte_chain = mmu_alloc_pte_chain(vcpu);
-		INIT_HLIST_HEAD(&page->parent_ptes);
-		hlist_add_head(&pte_chain->link, &page->parent_ptes);
-		pte_chain->parent_ptes[0] = old;
-	}
-	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
-		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-			continue;
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-			if (!pte_chain->parent_ptes[i]) {
-				pte_chain->parent_ptes[i] = parent_pte;
-				return;
-			}
-	}
-	pte_chain = mmu_alloc_pte_chain(vcpu);
-	BUG_ON(!pte_chain);
-	hlist_add_head(&pte_chain->link, &page->parent_ptes);
-	pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
-				       u64 *parent_pte)
-{
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
-	if (!page->multimapped) {
-		BUG_ON(page->parent_pte != parent_pte);
-		page->parent_pte = NULL;
-		return;
-	}
-	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			if (pte_chain->parent_ptes[i] != parent_pte)
-				continue;
-			while (i + 1 < NR_PTE_CHAIN_ENTRIES
-				&& pte_chain->parent_ptes[i + 1]) {
-				pte_chain->parent_ptes[i]
-					= pte_chain->parent_ptes[i + 1];
-				++i;
-			}
-			pte_chain->parent_ptes[i] = NULL;
-			if (i == 0) {
-				hlist_del(&pte_chain->link);
-				mmu_free_pte_chain(pte_chain);
-				if (hlist_empty(&page->parent_ptes)) {
-					page->multimapped = 0;
-					page->parent_pte = NULL;
-				}
-			}
-			return;
-		}
-	BUG();
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
-						gfn_t gfn)
-{
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *page;
-	struct hlist_node *node;
-
-	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-	bucket = &vcpu->kvm->mmu_page_hash[index];
-	hlist_for_each_entry(page, node, bucket, hash_link)
-		if (page->gfn == gfn && !page->role.metaphysical) {
-			pgprintk("%s: found role %x\n",
-				 __FUNCTION__, page->role.word);
-			return page;
-		}
-	return NULL;
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
-					     gfn_t gfn,
-					     gva_t gaddr,
-					     unsigned level,
-					     int metaphysical,
-					     unsigned hugepage_access,
-					     u64 *parent_pte)
-{
-	union kvm_mmu_page_role role;
-	unsigned index;
-	unsigned quadrant;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *page;
-	struct hlist_node *node;
-
-	role.word = 0;
-	role.glevels = vcpu->mmu.root_level;
-	role.level = level;
-	role.metaphysical = metaphysical;
-	role.hugepage_access = hugepage_access;
-	if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
-		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
-		role.quadrant = quadrant;
-	}
-	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
-		 gfn, role.word);
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-	bucket = &vcpu->kvm->mmu_page_hash[index];
-	hlist_for_each_entry(page, node, bucket, hash_link)
-		if (page->gfn == gfn && page->role.word == role.word) {
-			mmu_page_add_parent_pte(vcpu, page, parent_pte);
-			pgprintk("%s: found\n", __FUNCTION__);
-			return page;
-		}
-	page = kvm_mmu_alloc_page(vcpu, parent_pte);
-	if (!page)
-		return page;
-	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
-	page->gfn = gfn;
-	page->role = role;
-	hlist_add_head(&page->hash_link, bucket);
-	if (!metaphysical)
-		rmap_write_protect(vcpu, gfn);
-	return page;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
-					 struct kvm_mmu_page *page)
-{
-	unsigned i;
-	u64 *pt;
-	u64 ent;
-
-	pt = page->spt;
-
-	if (page->role.level == PT_PAGE_TABLE_LEVEL) {
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (pt[i] & PT_PRESENT_MASK)
-				rmap_remove(&pt[i]);
-			pt[i] = 0;
-		}
-		kvm_flush_remote_tlbs(kvm);
-		return;
-	}
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		ent = pt[i];
-
-		pt[i] = 0;
-		if (!(ent & PT_PRESENT_MASK))
-			continue;
-		ent &= PT64_BASE_ADDR_MASK;
-		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
-	}
-	kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *page,
-			     u64 *parent_pte)
-{
-	mmu_page_remove_parent_pte(page, parent_pte);
-}
-
-static void kvm_mmu_zap_page(struct kvm *kvm,
-			     struct kvm_mmu_page *page)
-{
-	u64 *parent_pte;
-
-	while (page->multimapped || page->parent_pte) {
-		if (!page->multimapped)
-			parent_pte = page->parent_pte;
-		else {
-			struct kvm_pte_chain *chain;
-
-			chain = container_of(page->parent_ptes.first,
-					     struct kvm_pte_chain, link);
-			parent_pte = chain->parent_ptes[0];
-		}
-		BUG_ON(!parent_pte);
-		kvm_mmu_put_page(page, parent_pte);
-		set_shadow_pte(parent_pte, 0);
-	}
-	kvm_mmu_page_unlink_children(kvm, page);
-	if (!page->root_count) {
-		hlist_del(&page->hash_link);
-		kvm_mmu_free_page(kvm, page);
-	} else
-		list_move(&page->link, &kvm->active_mmu_pages);
-}
-
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *page;
-	struct hlist_node *node, *n;
-	int r;
-
-	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-	r = 0;
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-	bucket = &vcpu->kvm->mmu_page_hash[index];
-	hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
-		if (page->gfn == gfn && !page->role.metaphysical) {
-			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
-				 page->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, page);
-			r = 1;
-		}
-	return r;
-}
-
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	struct kvm_mmu_page *page;
-
-	while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
-		pgprintk("%s: zap %lx %x\n",
-			 __FUNCTION__, gfn, page->role.word);
-		kvm_mmu_zap_page(vcpu->kvm, page);
-	}
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
-{
-	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
-	struct kvm_mmu_page *page_head = page_header(__pa(pte));
-
-	__set_bit(slot, &page_head->slot_bitmap);
-}
-
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-	hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
-	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
-}
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-	struct page *page;
-
-	ASSERT((gpa & HPA_ERR_MASK) == 0);
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	if (!page)
-		return gpa | HPA_ERR_MASK;
-	return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
-		| (gpa & (PAGE_SIZE-1));
-}
-
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-	if (gpa == UNMAPPED_GVA)
-		return UNMAPPED_GVA;
-	return gpa_to_hpa(vcpu, gpa);
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-	if (gpa == UNMAPPED_GVA)
-		return NULL;
-	return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
-{
-	int level = PT32E_ROOT_LEVEL;
-	hpa_t table_addr = vcpu->mmu.root_hpa;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(v, level);
-		u64 *table;
-		u64 pte;
-
-		ASSERT(VALID_PAGE(table_addr));
-		table = __va(table_addr);
-
-		if (level == 1) {
-			pte = table[index];
-			if (is_present_pte(pte) && is_writeble_pte(pte))
-				return 0;
-			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
-			page_header_update_slot(vcpu->kvm, table, v);
-			table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-								PT_USER_MASK;
-			rmap_add(vcpu, &table[index]);
-			return 0;
-		}
-
-		if (table[index] == 0) {
-			struct kvm_mmu_page *new_table;
-			gfn_t pseudo_gfn;
-
-			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-						     v, level - 1,
-						     1, 0, &table[index]);
-			if (!new_table) {
-				pgprintk("nonpaging_map: ENOMEM\n");
-				return -ENOMEM;
-			}
-
-			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-				| PT_WRITABLE_MASK | PT_USER_MASK;
-		}
-		table_addr = table[index] & PT64_BASE_ADDR_MASK;
-	}
-}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_mmu_page *page;
-
-	if (!VALID_PAGE(vcpu->mmu.root_hpa))
-		return;
-#ifdef CONFIG_X86_64
-	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->mmu.root_hpa;
-
-		page = page_header(root);
-		--page->root_count;
-		vcpu->mmu.root_hpa = INVALID_PAGE;
-		return;
-	}
-#endif
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->mmu.pae_root[i];
-
-		if (root) {
-			root &= PT64_BASE_ADDR_MASK;
-			page = page_header(root);
-			--page->root_count;
-		}
-		vcpu->mmu.pae_root[i] = INVALID_PAGE;
-	}
-	vcpu->mmu.root_hpa = INVALID_PAGE;
-}
-
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-	int i;
-	gfn_t root_gfn;
-	struct kvm_mmu_page *page;
-
-	root_gfn = vcpu->cr3 >> PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->mmu.root_hpa;
-
-		ASSERT(!VALID_PAGE(root));
-		page = kvm_mmu_get_page(vcpu, root_gfn, 0,
-					PT64_ROOT_LEVEL, 0, 0, NULL);
-		root = __pa(page->spt);
-		++page->root_count;
-		vcpu->mmu.root_hpa = root;
-		return;
-	}
-#endif
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->mmu.pae_root[i];
-
-		ASSERT(!VALID_PAGE(root));
-		if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
-			if (!is_present_pte(vcpu->pdptrs[i])) {
-				vcpu->mmu.pae_root[i] = 0;
-				continue;
-			}
-			root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
-		} else if (vcpu->mmu.root_level == 0)
-			root_gfn = 0;
-		page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-					PT32_ROOT_LEVEL, !is_paging(vcpu),
-					0, NULL);
-		root = __pa(page->spt);
-		++page->root_count;
-		vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
-	}
-	vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-	return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-			       u32 error_code)
-{
-	gpa_t addr = gva;
-	hpa_t paddr;
-	int r;
-
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		return r;
-
-	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
-
-
-	paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
-	if (is_error_hpa(paddr))
-		return 1;
-
-	return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
-	mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu *context = &vcpu->mmu;
-
-	context->new_cr3 = nonpaging_new_cr3;
-	context->page_fault = nonpaging_page_fault;
-	context->gva_to_gpa = nonpaging_gva_to_gpa;
-	context->free = nonpaging_free;
-	context->root_level = 0;
-	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
-	return 0;
-}
-
-static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
-	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
-	mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-			      u64 addr,
-			      u32 err_code)
-{
-	kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
-	nonpaging_free(vcpu);
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
-	struct kvm_mmu *context = &vcpu->mmu;
-
-	ASSERT(is_pae(vcpu));
-	context->new_cr3 = paging_new_cr3;
-	context->page_fault = paging64_page_fault;
-	context->gva_to_gpa = paging64_gva_to_gpa;
-	context->free = paging_free;
-	context->root_level = level;
-	context->shadow_root_level = level;
-	context->root_hpa = INVALID_PAGE;
-	return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
-	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu *context = &vcpu->mmu;
-
-	context->new_cr3 = paging_new_cr3;
-	context->page_fault = paging32_page_fault;
-	context->gva_to_gpa = paging32_gva_to_gpa;
-	context->free = paging_free;
-	context->root_level = PT32_ROOT_LEVEL;
-	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
-	return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
-	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-	if (!is_paging(vcpu))
-		return nonpaging_init_context(vcpu);
-	else if (is_long_mode(vcpu))
-		return paging64_init_context(vcpu);
-	else if (is_pae(vcpu))
-		return paging32E_init_context(vcpu);
-	else
-		return paging32_init_context(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->mmu.root_hpa)) {
-		vcpu->mmu.free(vcpu);
-		vcpu->mmu.root_hpa = INVALID_PAGE;
-	}
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
-	destroy_kvm_mmu(vcpu);
-	return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
-	int r;
-
-	mutex_lock(&vcpu->kvm->lock);
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		goto out;
-	mmu_alloc_roots(vcpu);
-	kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
-	kvm_mmu_flush_tlb(vcpu);
-out:
-	mutex_unlock(&vcpu->kvm->lock);
-	return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
-	mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *page,
-				  u64 *spte)
-{
-	u64 pte;
-	struct kvm_mmu_page *child;
-
-	pte = *spte;
-	if (is_present_pte(pte)) {
-		if (page->role.level == PT_PAGE_TABLE_LEVEL)
-			rmap_remove(spte);
-		else {
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, spte);
-		}
-	}
-	set_shadow_pte(spte, 0);
-	kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *page,
-				  u64 *spte,
-				  const void *new, int bytes)
-{
-	if (page->role.level != PT_PAGE_TABLE_LEVEL)
-		return;
-
-	if (page->role.glevels == PT32_ROOT_LEVEL)
-		paging32_update_pte(vcpu, page, spte, new, bytes);
-	else
-		paging64_update_pte(vcpu, page, spte, new, bytes);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
-{
-	gfn_t gfn = gpa >> PAGE_SHIFT;
-	struct kvm_mmu_page *page;
-	struct hlist_node *node, *n;
-	struct hlist_head *bucket;
-	unsigned index;
-	u64 *spte;
-	unsigned offset = offset_in_page(gpa);
-	unsigned pte_size;
-	unsigned page_offset;
-	unsigned misaligned;
-	unsigned quadrant;
-	int level;
-	int flooded = 0;
-	int npte;
-
-	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
-	if (gfn == vcpu->last_pt_write_gfn) {
-		++vcpu->last_pt_write_count;
-		if (vcpu->last_pt_write_count >= 3)
-			flooded = 1;
-	} else {
-		vcpu->last_pt_write_gfn = gfn;
-		vcpu->last_pt_write_count = 1;
-	}
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-	bucket = &vcpu->kvm->mmu_page_hash[index];
-	hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
-		if (page->gfn != gfn || page->role.metaphysical)
-			continue;
-		pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
-		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-		misaligned |= bytes < 4;
-		if (misaligned || flooded) {
-			/*
-			 * Misaligned accesses are too much trouble to fix
-			 * up; also, they usually indicate a page is not used
-			 * as a page table.
-			 *
-			 * If we're seeing too many writes to a page,
-			 * it may no longer be a page table, or we may be
-			 * forking, in which case it is better to unmap the
-			 * page.
-			 */
-			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-				 gpa, bytes, page->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, page);
-			continue;
-		}
-		page_offset = offset;
-		level = page->role.level;
-		npte = 1;
-		if (page->role.glevels == PT32_ROOT_LEVEL) {
-			page_offset <<= 1;	/* 32->64 */
-			/*
-			 * A 32-bit pde maps 4MB while the shadow pdes map
-			 * only 2MB.  So we need to double the offset again
-			 * and zap two pdes instead of one.
-			 */
-			if (level == PT32_ROOT_LEVEL) {
-				page_offset &= ~7; /* kill rounding error */
-				page_offset <<= 1;
-				npte = 2;
-			}
-			quadrant = page_offset >> PAGE_SHIFT;
-			page_offset &= ~PAGE_MASK;
-			if (quadrant != page->role.quadrant)
-				continue;
-		}
-		spte = &page->spt[page_offset / sizeof(*spte)];
-		while (npte--) {
-			mmu_pte_write_zap_pte(vcpu, page, spte);
-			mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
-			++spte;
-		}
-	}
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-	return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
-}
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-	while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
-		struct kvm_mmu_page *page;
-
-		page = container_of(vcpu->kvm->active_mmu_pages.prev,
-				    struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, page);
-	}
-}
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *page;
-
-	while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
-		page = container_of(vcpu->kvm->active_mmu_pages.next,
-				    struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, page);
-	}
-	free_page((unsigned long)vcpu->mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
-	struct page *page;
-	int i;
-
-	ASSERT(vcpu);
-
-	vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
-
-	/*
-	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-	 * Therefore we need to allocate shadow page tables in the first
-	 * 4GB of memory, which happens to fit the DMA32 zone.
-	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-	if (!page)
-		goto error_1;
-	vcpu->mmu.pae_root = page_address(page);
-	for (i = 0; i < 4; ++i)
-		vcpu->mmu.pae_root[i] = INVALID_PAGE;
-
-	return 0;
-
-error_1:
-	free_mmu_pages(vcpu);
-	return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-	return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-	return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-
-	destroy_kvm_mmu(vcpu);
-	free_mmu_pages(vcpu);
-	mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
-	struct kvm_mmu_page *page;
-
-	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
-		int i;
-		u64 *pt;
-
-		if (!test_bit(slot, &page->slot_bitmap))
-			continue;
-
-		pt = page->spt;
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-			/* avoid RMW */
-			if (pt[i] & PT_WRITABLE_MASK) {
-				rmap_remove(&pt[i]);
-				pt[i] &= ~PT_WRITABLE_MASK;
-			}
-	}
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-	struct kvm_mmu_page *page, *node;
-
-	list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, page);
-
-	kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_module_exit(void)
-{
-	if (pte_chain_cache)
-		kmem_cache_destroy(pte_chain_cache);
-	if (rmap_desc_cache)
-		kmem_cache_destroy(rmap_desc_cache);
-	if (mmu_page_header_cache)
-		kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
-	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-					    sizeof(struct kvm_pte_chain),
-					    0, 0, NULL);
-	if (!pte_chain_cache)
-		goto nomem;
-	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-					    sizeof(struct kvm_rmap_desc),
-					    0, 0, NULL);
-	if (!rmap_desc_cache)
-		goto nomem;
-
-	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
-						  sizeof(struct kvm_mmu_page),
-						  0, 0, NULL);
-	if (!mmu_page_header_cache)
-		goto nomem;
-
-	return 0;
-
-nomem:
-	kvm_mmu_module_exit();
-	return -ENOMEM;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-	gva = (long long)(gva << 16) >> 16;
-#endif
-	return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 ent = pt[i];
-
-		if (!(ent & PT_PRESENT_MASK))
-			continue;
-
-		va = canonicalize(va);
-		if (level > 1)
-			audit_mappings_page(vcpu, ent, va, level - 1);
-		else {
-			gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
-			hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
-			if ((ent & PT_PRESENT_MASK)
-			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx\n",
-				       audit_msg, vcpu->mmu.root_level,
-				       va, gpa, hpa, ent);
-		}
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-	int nmaps = 0;
-	int i, j, k;
-
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
-		struct kvm_rmap_desc *d;
-
-		for (j = 0; j < m->npages; ++j) {
-			struct page *page = m->phys_mem[j];
-
-			if (!page->private)
-				continue;
-			if (!(page->private & 1)) {
-				++nmaps;
-				continue;
-			}
-			d = (struct kvm_rmap_desc *)(page->private & ~1ul);
-			while (d) {
-				for (k = 0; k < RMAP_EXT; ++k)
-					if (d->shadow_ptes[k])
-						++nmaps;
-					else
-						break;
-				d = d->more;
-			}
-		}
-	}
-	return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
-	int nmaps = 0;
-	struct kvm_mmu_page *page;
-	int i;
-
-	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-		u64 *pt = page->spt;
-
-		if (page->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			u64 ent = pt[i];
-
-			if (!(ent & PT_PRESENT_MASK))
-				continue;
-			if (!(ent & PT_WRITABLE_MASK))
-				continue;
-			++nmaps;
-		}
-	}
-	return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	int n_rmap = count_rmaps(vcpu);
-	int n_actual = count_writable_mappings(vcpu);
-
-	if (n_rmap != n_actual)
-		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-		       __FUNCTION__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *page;
-
-	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-		hfn_t hfn;
-		struct page *pg;
-
-		if (page->role.metaphysical)
-			continue;
-
-		hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
-			>> PAGE_SHIFT;
-		pg = pfn_to_page(hfn);
-		if (pg->private)
-			printk(KERN_ERR "%s: (%s) shadow page has writable"
-			       " mappings: gfn %lx role %x\n",
-			       __FUNCTION__, audit_msg, page->gfn,
-			       page->role.word);
-	}
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
-	int olddbg = dbg;
-
-	dbg = 0;
-	audit_msg = msg;
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	audit_mappings(vcpu);
-	dbg = olddbg;
-}
-
-#endif
diff -puN drivers/kvm/paging_tmpl.h~git-kvm /dev/null
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
-	#define pt_element_t u64
-	#define guest_walker guest_walker64
-	#define FNAME(name) paging##64_##name
-	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
-	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
-	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
-	#ifdef CONFIG_X86_64
-	#define PT_MAX_FULL_LEVELS 4
-	#else
-	#define PT_MAX_FULL_LEVELS 2
-	#endif
-#elif PTTYPE == 32
-	#define pt_element_t u32
-	#define guest_walker guest_walker32
-	#define FNAME(name) paging##32_##name
-	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
-	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
-	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
-	#define PT_MAX_FULL_LEVELS 2
-#else
-	#error Invalid PTTYPE value
-#endif
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
-	int level;
-	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
-	pt_element_t *table;
-	pt_element_t pte;
-	pt_element_t *ptep;
-	struct page *page;
-	int index;
-	pt_element_t inherited_ar;
-	gfn_t gfn;
-	u32 error_code;
-};
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-			    struct kvm_vcpu *vcpu, gva_t addr,
-			    int write_fault, int user_fault, int fetch_fault)
-{
-	hpa_t hpa;
-	struct kvm_memory_slot *slot;
-	pt_element_t *ptep;
-	pt_element_t root;
-	gfn_t table_gfn;
-
-	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
-	walker->level = vcpu->mmu.root_level;
-	walker->table = NULL;
-	walker->page = NULL;
-	walker->ptep = NULL;
-	root = vcpu->cr3;
-#if PTTYPE == 64
-	if (!is_long_mode(vcpu)) {
-		walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
-		root = *walker->ptep;
-		walker->pte = root;
-		if (!(root & PT_PRESENT_MASK))
-			goto not_present;
-		--walker->level;
-	}
-#endif
-	table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-	walker->table_gfn[walker->level - 1] = table_gfn;
-	pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-		 walker->level - 1, table_gfn);
-	slot = gfn_to_memslot(vcpu->kvm, table_gfn);
-	hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
-	walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
-	walker->table = kmap_atomic(walker->page, KM_USER0);
-
-	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
-	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
-
-	for (;;) {
-		int index = PT_INDEX(addr, walker->level);
-		hpa_t paddr;
-
-		ptep = &walker->table[index];
-		walker->index = index;
-		ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
-		       ((unsigned long)ptep & PAGE_MASK));
-
-		if (!is_present_pte(*ptep))
-			goto not_present;
-
-		if (write_fault && !is_writeble_pte(*ptep))
-			if (user_fault || is_write_protection(vcpu))
-				goto access_error;
-
-		if (user_fault && !(*ptep & PT_USER_MASK))
-			goto access_error;
-
-#if PTTYPE == 64
-		if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
-			goto access_error;
-#endif
-
-		if (!(*ptep & PT_ACCESSED_MASK)) {
-			mark_page_dirty(vcpu->kvm, table_gfn);
-			*ptep |= PT_ACCESSED_MASK;
-		}
-
-		if (walker->level == PT_PAGE_TABLE_LEVEL) {
-			walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			break;
-		}
-
-		if (walker->level == PT_DIRECTORY_LEVEL
-		    && (*ptep & PT_PAGE_SIZE_MASK)
-		    && (PTTYPE == 64 || is_pse(vcpu))) {
-			walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
-			break;
-		}
-
-		walker->inherited_ar &= walker->table[index];
-		table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		kunmap_atomic(walker->table, KM_USER0);
-		paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
-		walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
-		walker->table = kmap_atomic(walker->page, KM_USER0);
-		--walker->level;
-		walker->table_gfn[walker->level - 1 ] = table_gfn;
-		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-			 walker->level - 1, table_gfn);
-	}
-	walker->pte = *ptep;
-	if (walker->page)
-		walker->ptep = NULL;
-	if (walker->table)
-		kunmap_atomic(walker->table, KM_USER0);
-	pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
-	return 1;
-
-not_present:
-	walker->error_code = 0;
-	goto err;
-
-access_error:
-	walker->error_code = PFERR_PRESENT_MASK;
-
-err:
-	if (write_fault)
-		walker->error_code |= PFERR_WRITE_MASK;
-	if (user_fault)
-		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault)
-		walker->error_code |= PFERR_FETCH_MASK;
-	if (walker->table)
-		kunmap_atomic(walker->table, KM_USER0);
-	return 0;
-}
-
-static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
-					struct guest_walker *walker)
-{
-	mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
-}
-
-static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
-				  u64 *shadow_pte,
-				  gpa_t gaddr,
-				  pt_element_t gpte,
-				  u64 access_bits,
-				  int user_fault,
-				  int write_fault,
-				  int *ptwrite,
-				  struct guest_walker *walker,
-				  gfn_t gfn)
-{
-	hpa_t paddr;
-	int dirty = gpte & PT_DIRTY_MASK;
-	u64 spte = *shadow_pte;
-	int was_rmapped = is_rmap_pte(spte);
-
-	pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __FUNCTION__, spte, (u64)gpte, access_bits,
-		 write_fault, user_fault, gfn);
-
-	if (write_fault && !dirty) {
-		pt_element_t *guest_ent, *tmp = NULL;
-
-		if (walker->ptep)
-			guest_ent = walker->ptep;
-		else {
-			tmp = kmap_atomic(walker->page, KM_USER0);
-			guest_ent = &tmp[walker->index];
-		}
-
-		*guest_ent |= PT_DIRTY_MASK;
-		if (!walker->ptep)
-			kunmap_atomic(tmp, KM_USER0);
-		dirty = 1;
-		FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
-	}
-
-	spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
-	spte |= gpte & PT64_NX_MASK;
-	if (!dirty)
-		access_bits &= ~PT_WRITABLE_MASK;
-
-	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
-	spte |= PT_PRESENT_MASK;
-	if (access_bits & PT_USER_MASK)
-		spte |= PT_USER_MASK;
-
-	if (is_error_hpa(paddr)) {
-		spte |= gaddr;
-		spte |= PT_SHADOW_IO_MARK;
-		spte &= ~PT_PRESENT_MASK;
-		set_shadow_pte(shadow_pte, spte);
-		return;
-	}
-
-	spte |= paddr;
-
-	if ((access_bits & PT_WRITABLE_MASK)
-	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
-
-		spte |= PT_WRITABLE_MASK;
-		if (user_fault) {
-			mmu_unshadow(vcpu, gfn);
-			goto unshadowed;
-		}
-
-		shadow = kvm_mmu_lookup_page(vcpu, gfn);
-		if (shadow) {
-			pgprintk("%s: found shadow page for %lx, marking ro\n",
-				 __FUNCTION__, gfn);
-			access_bits &= ~PT_WRITABLE_MASK;
-			if (is_writeble_pte(spte)) {
-				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-			}
-			if (write_fault)
-				*ptwrite = 1;
-		}
-	}
-
-unshadowed:
-
-	if (access_bits & PT_WRITABLE_MASK)
-		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-
-	set_shadow_pte(shadow_pte, spte);
-	page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
-	if (!was_rmapped)
-		rmap_add(vcpu, shadow_pte);
-}
-
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-			   u64 *shadow_pte, u64 access_bits,
-			   int user_fault, int write_fault, int *ptwrite,
-			   struct guest_walker *walker, gfn_t gfn)
-{
-	access_bits &= gpte;
-	FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
-			      gpte, access_bits, user_fault, write_fault,
-			      ptwrite, walker, gfn);
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-			      u64 *spte, const void *pte, int bytes)
-{
-	pt_element_t gpte;
-
-	if (bytes < sizeof(pt_element_t))
-		return;
-	gpte = *(const pt_element_t *)pte;
-	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
-		return;
-	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-	FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
-		       0, NULL, NULL,
-		       (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
-}
-
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
-			   u64 *shadow_pte, u64 access_bits,
-			   int user_fault, int write_fault, int *ptwrite,
-			   struct guest_walker *walker, gfn_t gfn)
-{
-	gpa_t gaddr;
-
-	access_bits &= gpde;
-	gaddr = (gpa_t)gfn << PAGE_SHIFT;
-	if (PTTYPE == 32 && is_cpuid_PSE36())
-		gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
-			(32 - PT32_DIR_PSE36_SHIFT);
-	FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
-			      gpde, access_bits, user_fault, write_fault,
-			      ptwrite, walker, gfn);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int *ptwrite)
-{
-	hpa_t shadow_addr;
-	int level;
-	u64 *shadow_ent;
-	u64 *prev_shadow_ent = NULL;
-
-	if (!is_present_pte(walker->pte))
-		return NULL;
-
-	shadow_addr = vcpu->mmu.root_hpa;
-	level = vcpu->mmu.shadow_root_level;
-	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
-		shadow_addr &= PT64_BASE_ADDR_MASK;
-		--level;
-	}
-
-	for (; ; level--) {
-		u32 index = SHADOW_PT_INDEX(addr, level);
-		struct kvm_mmu_page *shadow_page;
-		u64 shadow_pte;
-		int metaphysical;
-		gfn_t table_gfn;
-		unsigned hugepage_access = 0;
-
-		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
-			if (level == PT_PAGE_TABLE_LEVEL)
-				break;
-			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-			prev_shadow_ent = shadow_ent;
-			continue;
-		}
-
-		if (level == PT_PAGE_TABLE_LEVEL)
-			break;
-
-		if (level - 1 == PT_PAGE_TABLE_LEVEL
-		    && walker->level == PT_DIRECTORY_LEVEL) {
-			metaphysical = 1;
-			hugepage_access = walker->pte;
-			hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
-			if (walker->pte & PT64_NX_MASK)
-				hugepage_access |= (1 << 2);
-			hugepage_access >>= PT_WRITABLE_SHIFT;
-			table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-		} else {
-			metaphysical = 0;
-			table_gfn = walker->table_gfn[level - 2];
-		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       metaphysical, hugepage_access,
-					       shadow_ent);
-		shadow_addr = __pa(shadow_page->spt);
-		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		*shadow_ent = shadow_pte;
-		prev_shadow_ent = shadow_ent;
-	}
-
-	if (walker->level == PT_DIRECTORY_LEVEL) {
-		FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
-			       walker->inherited_ar, user_fault, write_fault,
-			       ptwrite, walker, walker->gfn);
-	} else {
-		ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-		FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
-			       walker->inherited_ar, user_fault, write_fault,
-			       ptwrite, walker, walker->gfn);
-	}
-	return shadow_ent;
-}
-
-/*
- * Page fault handler.  There are several causes for a page fault:
- *   - there is no shadow pte for the guest pte
- *   - write access through a shadow pte marked read only so that we can set
- *     the dirty bit
- *   - write access to a shadow pte marked read only so we can update the page
- *     dirty bitmap, when userspace requests it
- *   - mmio access; in this case we will never install a present shadow pte
- *   - normal guest page fault due to the guest pte marked not present, not
- *     writable, or not executable
- *
- *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- *           a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
-			       u32 error_code)
-{
-	int write_fault = error_code & PFERR_WRITE_MASK;
-	int user_fault = error_code & PFERR_USER_MASK;
-	int fetch_fault = error_code & PFERR_FETCH_MASK;
-	struct guest_walker walker;
-	u64 *shadow_pte;
-	int write_pt = 0;
-	int r;
-
-	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
-	kvm_mmu_audit(vcpu, "pre page fault");
-
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		return r;
-
-	/*
-	 * Look up the shadow pte for the faulting address.
-	 */
-	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-			     fetch_fault);
-
-	/*
-	 * The page is not mapped by the guest.  Let the guest handle it.
-	 */
-	if (!r) {
-		pgprintk("%s: guest page fault\n", __FUNCTION__);
-		inject_page_fault(vcpu, addr, walker.error_code);
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
-		return 0;
-	}
-
-	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt);
-	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
-		 shadow_pte, *shadow_pte, write_pt);
-
-	if (!write_pt)
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
-
-	/*
-	 * mmio: emulate if accessible, otherwise its a guest fault.
-	 */
-	if (is_io_pte(*shadow_pte))
-		return 1;
-
-	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
-	return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-	struct guest_walker walker;
-	gpa_t gpa = UNMAPPED_GVA;
-	int r;
-
-	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
-	if (r) {
-		gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
-		gpa |= vaddr & ~PAGE_MASK;
-	}
-
-	return gpa;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_MAX_FULL_LEVELS
diff -puN drivers/kvm/segment_descriptor.h~git-kvm /dev/null
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,17 +0,0 @@
-struct segment_descriptor {
-	u16 limit_low;
-	u16 base_low;
-	u8  base_mid;
-	u8  type : 4;
-	u8  system : 1;
-	u8  dpl : 2;
-	u8  present : 1;
-	u8  limit_high : 4;
-	u8  avl : 1;
-	u8  long_mode : 1;
-	u8  default_op : 1;
-	u8  granularity : 1;
-	u8  base_high;
-} __attribute__((packed));
-
-
diff -puN drivers/kvm/svm.c~git-kvm /dev/null
--- a/drivers/kvm/svm.c
+++ /dev/null
@@ -1,1754 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm_svm.h"
-#include "x86_emulate.h"
-#include "irq.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
-#define DR7_GD_MASK (1 << 13)
-#define DR6_BD_MASK (1 << 13)
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
-
-#define SVM_FEATURE_NPT  (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
-
-static void kvm_reput_irq(struct vcpu_svm *svm);
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
-	return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-unsigned long iopm_base;
-unsigned long msrpm_base;
-
-struct kvm_ldttss_desc {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-	u32 base3;
-	u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
-	int cpu;
-
-	u64 asid_generation;
-	u32 max_asid;
-	u32 next_asid;
-	struct kvm_ldttss_desc *tss_desc;
-
-	struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
-
-struct svm_init_data {
-	int cpu;
-	int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-#define MAX_INST_SIZE 15
-
-static inline u32 svm_has(u32 feat)
-{
-	return svm_features & feat;
-}
-
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-	return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-	set_bit(irq, vcpu->irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
-}
-
-static inline void clgi(void)
-{
-	asm volatile (SVM_CLGI);
-}
-
-static inline void stgi(void)
-{
-	asm volatile (SVM_STGI);
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
-	asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
-}
-
-static inline unsigned long kvm_read_cr2(void)
-{
-	unsigned long cr2;
-
-	asm volatile ("mov %%cr2, %0" : "=r" (cr2));
-	return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
-	asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
-static inline unsigned long read_dr6(void)
-{
-	unsigned long dr6;
-
-	asm volatile ("mov %%dr6, %0" : "=r" (dr6));
-	return dr6;
-}
-
-static inline void write_dr6(unsigned long val)
-{
-	asm volatile ("mov %0, %%dr6" :: "r" (val));
-}
-
-static inline unsigned long read_dr7(void)
-{
-	unsigned long dr7;
-
-	asm volatile ("mov %%dr7, %0" : "=r" (dr7));
-	return dr7;
-}
-
-static inline void write_dr7(unsigned long val)
-{
-	asm volatile ("mov %0, %%dr7" :: "r" (val));
-}
-
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
-	to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
-	force_new_asid(vcpu);
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-	if (!(efer & KVM_EFER_LMA))
-		efer &= ~KVM_EFER_LME;
-
-	to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
-	vcpu->shadow_efer = efer;
-}
-
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm->vmcb->control.event_inj =		SVM_EVTINJ_VALID |
-						SVM_EVTINJ_VALID_ERR |
-						SVM_EVTINJ_TYPE_EXEPT |
-						GP_VECTOR;
-	svm->vmcb->control.event_inj_err = error_code;
-}
-
-static void inject_ud(struct kvm_vcpu *vcpu)
-{
-	to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-						SVM_EVTINJ_TYPE_EXEPT |
-						UD_VECTOR;
-}
-
-static int is_page_fault(uint32_t info)
-{
-	info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-	return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
-}
-
-static int is_external_interrupt(u32 info)
-{
-	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	if (!svm->next_rip) {
-		printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
-		return;
-	}
-	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
-		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-		       __FUNCTION__,
-		       svm->vmcb->save.rip,
-		       svm->next_rip);
-	}
-
-	vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
-	vcpu->interrupt_window_open = 1;
-}
-
-static int has_svm(void)
-{
-	uint32_t eax, ebx, ecx, edx;
-
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-		printk(KERN_INFO "has_svm: not amd\n");
-		return 0;
-	}
-
-	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-	if (eax < SVM_CPUID_FUNC) {
-		printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
-		return 0;
-	}
-
-	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-		printk(KERN_DEBUG "has_svm: svm not available\n");
-		return 0;
-	}
-	return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
-	struct svm_cpu_data *svm_data
-		= per_cpu(svm_data, raw_smp_processor_id());
-
-	if (svm_data) {
-		uint64_t efer;
-
-		wrmsrl(MSR_VM_HSAVE_PA, 0);
-		rdmsrl(MSR_EFER, efer);
-		wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
-		per_cpu(svm_data, raw_smp_processor_id()) = NULL;
-		__free_page(svm_data->save_area);
-		kfree(svm_data);
-	}
-}
-
-static void svm_hardware_enable(void *garbage)
-{
-
-	struct svm_cpu_data *svm_data;
-	uint64_t efer;
-#ifdef CONFIG_X86_64
-	struct desc_ptr gdt_descr;
-#else
-	struct Xgt_desc_struct gdt_descr;
-#endif
-	struct desc_struct *gdt;
-	int me = raw_smp_processor_id();
-
-	if (!has_svm()) {
-		printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
-		return;
-	}
-	svm_data = per_cpu(svm_data, me);
-
-	if (!svm_data) {
-		printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
-		       me);
-		return;
-	}
-
-	svm_data->asid_generation = 1;
-	svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
-	svm_data->next_asid = svm_data->max_asid + 1;
-	svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
-	asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
-	gdt = (struct desc_struct *)gdt_descr.address;
-	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
-	rdmsrl(MSR_EFER, efer);
-	wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
-
-	wrmsrl(MSR_VM_HSAVE_PA,
-	       page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
-}
-
-static int svm_cpu_init(int cpu)
-{
-	struct svm_cpu_data *svm_data;
-	int r;
-
-	svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
-	if (!svm_data)
-		return -ENOMEM;
-	svm_data->cpu = cpu;
-	svm_data->save_area = alloc_page(GFP_KERNEL);
-	r = -ENOMEM;
-	if (!svm_data->save_area)
-		goto err_1;
-
-	per_cpu(svm_data, cpu) = svm_data;
-
-	return 0;
-
-err_1:
-	kfree(svm_data);
-	return r;
-
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
-				 int read, int write)
-{
-	int i;
-
-	for (i = 0; i < NUM_MSR_MAPS; i++) {
-		if (msr >= msrpm_ranges[i] &&
-		    msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
-			u32 msr_offset = (i * MSRS_IN_RANGE + msr -
-					  msrpm_ranges[i]) * 2;
-
-			u32 *base = msrpm + (msr_offset / 32);
-			u32 msr_shift = msr_offset % 32;
-			u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
-			*base = (*base & ~(0x3 << msr_shift)) |
-				(mask << msr_shift);
-			return;
-		}
-	}
-	BUG();
-}
-
-static __init int svm_hardware_setup(void)
-{
-	int cpu;
-	struct page *iopm_pages;
-	struct page *msrpm_pages;
-	void *iopm_va, *msrpm_va;
-	int r;
-
-	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
-	if (!iopm_pages)
-		return -ENOMEM;
-
-	iopm_va = page_address(iopm_pages);
-	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
-	clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
-	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
-
-	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
-
-	r = -ENOMEM;
-	if (!msrpm_pages)
-		goto err_1;
-
-	msrpm_va = page_address(msrpm_pages);
-	memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-	msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-	set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
-	set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
-
-	for_each_online_cpu(cpu) {
-		r = svm_cpu_init(cpu);
-		if (r)
-			goto err_2;
-	}
-	return 0;
-
-err_2:
-	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-	msrpm_base = 0;
-err_1:
-	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
-	iopm_base = 0;
-	return r;
-}
-
-static __exit void svm_hardware_unsetup(void)
-{
-	__free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
-	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
-	iopm_base = msrpm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
-	seg->selector = 0;
-	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
-		SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
-	seg->limit = 0xffff;
-	seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
-	seg->selector = 0;
-	seg->attrib = SVM_SELECTOR_P_MASK | type;
-	seg->limit = 0xffff;
-	seg->base = 0;
-}
-
-static void init_vmcb(struct vmcb *vmcb)
-{
-	struct vmcb_control_area *control = &vmcb->control;
-	struct vmcb_save_area *save = &vmcb->save;
-
-	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
-					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK;
-
-	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
-					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK;
-
-	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
-					INTERCEPT_DR1_MASK |
-					INTERCEPT_DR2_MASK |
-					INTERCEPT_DR3_MASK;
-
-	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
-					INTERCEPT_DR1_MASK |
-					INTERCEPT_DR2_MASK |
-					INTERCEPT_DR3_MASK |
-					INTERCEPT_DR5_MASK |
-					INTERCEPT_DR7_MASK;
-
-	control->intercept_exceptions = 1 << PF_VECTOR;
-
-
-	control->intercept = 	(1ULL << INTERCEPT_INTR) |
-				(1ULL << INTERCEPT_NMI) |
-				(1ULL << INTERCEPT_SMI) |
-		/*
-		 * selective cr0 intercept bug?
-		 *    	0:   0f 22 d8                mov    %eax,%cr3
-		 *	3:   0f 20 c0                mov    %cr0,%eax
-		 *	6:   0d 00 00 00 80          or     $0x80000000,%eax
-		 *	b:   0f 22 c0                mov    %eax,%cr0
-		 * set cr3 ->interception
-		 * get cr0 ->interception
-		 * set cr0 -> no interception
-		 */
-		/*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
-				(1ULL << INTERCEPT_CPUID) |
-				(1ULL << INTERCEPT_INVD) |
-				(1ULL << INTERCEPT_HLT) |
-				(1ULL << INTERCEPT_INVLPGA) |
-				(1ULL << INTERCEPT_IOIO_PROT) |
-				(1ULL << INTERCEPT_MSR_PROT) |
-				(1ULL << INTERCEPT_TASK_SWITCH) |
-				(1ULL << INTERCEPT_SHUTDOWN) |
-				(1ULL << INTERCEPT_VMRUN) |
-				(1ULL << INTERCEPT_VMMCALL) |
-				(1ULL << INTERCEPT_VMLOAD) |
-				(1ULL << INTERCEPT_VMSAVE) |
-				(1ULL << INTERCEPT_STGI) |
-				(1ULL << INTERCEPT_CLGI) |
-				(1ULL << INTERCEPT_SKINIT) |
-				(1ULL << INTERCEPT_WBINVD) |
-				(1ULL << INTERCEPT_MONITOR) |
-				(1ULL << INTERCEPT_MWAIT);
-
-	control->iopm_base_pa = iopm_base;
-	control->msrpm_base_pa = msrpm_base;
-	control->tsc_offset = 0;
-	control->int_ctl = V_INTR_MASKING_MASK;
-
-	init_seg(&save->es);
-	init_seg(&save->ss);
-	init_seg(&save->ds);
-	init_seg(&save->fs);
-	init_seg(&save->gs);
-
-	save->cs.selector = 0xf000;
-	/* Executable/Readable Code Segment */
-	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
-		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
-	save->cs.limit = 0xffff;
-	/*
-	 * cs.base should really be 0xffff0000, but vmx can't handle that, so
-	 * be consistent with it.
-	 *
-	 * Replace when we have real mode working for vmx.
-	 */
-	save->cs.base = 0xf0000;
-
-	save->gdtr.limit = 0xffff;
-	save->idtr.limit = 0xffff;
-
-	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
-	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
-	save->efer = MSR_EFER_SVME_MASK;
-
-        save->dr6 = 0xffff0ff0;
-	save->dr7 = 0x400;
-	save->rflags = 2;
-	save->rip = 0x0000fff0;
-
-	/*
-	 * cr0 val on cpu init should be 0x60000010, we enable cpu
-	 * cache by default. the orderly way is to enable cache in bios.
-	 */
-	save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
-	save->cr4 = X86_CR4_PAE;
-	/* rdx = ?? */
-}
-
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	init_vmcb(svm->vmcb);
-
-	if (vcpu->vcpu_id != 0) {
-		svm->vmcb->save.rip = 0;
-		svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
-		svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
-	}
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
-	struct vcpu_svm *svm;
-	struct page *page;
-	int err;
-
-	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	if (!svm) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = kvm_vcpu_init(&svm->vcpu, kvm, id);
-	if (err)
-		goto free_svm;
-
-	if (irqchip_in_kernel(kvm)) {
-		err = kvm_create_lapic(&svm->vcpu);
-		if (err < 0)
-			goto free_svm;
-	}
-
-	page = alloc_page(GFP_KERNEL);
-	if (!page) {
-		err = -ENOMEM;
-		goto uninit;
-	}
-
-	svm->vmcb = page_address(page);
-	clear_page(svm->vmcb);
-	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
-	svm->asid_generation = 0;
-	memset(svm->db_regs, 0, sizeof(svm->db_regs));
-	init_vmcb(svm->vmcb);
-
-	fx_init(&svm->vcpu);
-	svm->vcpu.fpu_active = 1;
-	svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-	if (svm->vcpu.vcpu_id == 0)
-		svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
-
-	return &svm->vcpu;
-
-uninit:
-	kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
-	kmem_cache_free(kvm_vcpu_cache, svm);
-out:
-	return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	int i;
-
-	if (unlikely(cpu != vcpu->cpu)) {
-		u64 tsc_this, delta;
-
-		/*
-		 * Make sure that the guest sees a monotonically
-		 * increasing TSC.
-		 */
-		rdtscll(tsc_this);
-		delta = vcpu->host_tsc - tsc_this;
-		svm->vmcb->control.tsc_offset += delta;
-		vcpu->cpu = cpu;
-		kvm_migrate_apic_timer(vcpu);
-	}
-
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	int i;
-
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-	rdtscll(vcpu->host_tsc);
-	kvm_put_guest_fpu(vcpu);
-}
-
-static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->rip;
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
-	return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-	to_svm(vcpu)->vmcb->save.rflags = rflags;
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
-	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
-	switch (seg) {
-	case VCPU_SREG_CS: return &save->cs;
-	case VCPU_SREG_DS: return &save->ds;
-	case VCPU_SREG_ES: return &save->es;
-	case VCPU_SREG_FS: return &save->fs;
-	case VCPU_SREG_GS: return &save->gs;
-	case VCPU_SREG_SS: return &save->ss;
-	case VCPU_SREG_TR: return &save->tr;
-	case VCPU_SREG_LDTR: return &save->ldtr;
-	}
-	BUG();
-	return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-	struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-	return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg)
-{
-	struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-	var->base = s->base;
-	var->limit = s->limit;
-	var->selector = s->selector;
-	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
-	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
-	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
-	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
-	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
-	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
-	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
-	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
-	var->unusable = !var->present;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	dt->limit = svm->vmcb->save.idtr.limit;
-	dt->base = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm->vmcb->save.idtr.limit = dt->limit;
-	svm->vmcb->save.idtr.base = dt->base ;
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	dt->limit = svm->vmcb->save.gdtr.limit;
-	dt->base = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm->vmcb->save.gdtr.limit = dt->limit;
-	svm->vmcb->save.gdtr.base = dt->base ;
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-#ifdef CONFIG_X86_64
-	if (vcpu->shadow_efer & KVM_EFER_LME) {
-		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-			vcpu->shadow_efer |= KVM_EFER_LMA;
-			svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
-		}
-
-		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
-			vcpu->shadow_efer &= ~KVM_EFER_LMA;
-			svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
-		}
-	}
-#endif
-	if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
-		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-		vcpu->fpu_active = 1;
-	}
-
-	vcpu->cr0 = cr0;
-	cr0 |= X86_CR0_PG | X86_CR0_WP;
-	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-	svm->vmcb->save.cr0 = cr0;
-}
-
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-       vcpu->cr4 = cr4;
-       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb_seg *s = svm_seg(vcpu, seg);
-
-	s->base = var->base;
-	s->limit = var->limit;
-	s->selector = var->selector;
-	if (var->unusable)
-		s->attrib = 0;
-	else {
-		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
-		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
-		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
-		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
-		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
-		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
-		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
-		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
-	}
-	if (seg == VCPU_SREG_CS)
-		svm->vmcb->save.cpl
-			= (svm->vmcb->save.cs.attrib
-			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
-
-}
-
-/* FIXME:
-
-	svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
-	svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-
-*/
-
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
-	return -EOPNOTSUPP;
-}
-
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
-	if (is_external_interrupt(exit_int_info))
-		return exit_int_info & SVM_EVTINJ_VEC_MASK;
-	return -1;
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
-{
-	if (svm_data->next_asid > svm_data->max_asid) {
-		++svm_data->asid_generation;
-		svm_data->next_asid = 1;
-		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
-	}
-
-	svm->vcpu.cpu = svm_data->cpu;
-	svm->asid_generation = svm_data->asid_generation;
-	svm->vmcb->control.asid = svm_data->next_asid++;
-}
-
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
-{
-	return to_svm(vcpu)->db_regs[dr];
-}
-
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
-		       int *exception)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	*exception = 0;
-
-	if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
-		svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
-		svm->vmcb->save.dr6 |= DR6_BD_MASK;
-		*exception = DB_VECTOR;
-		return;
-	}
-
-	switch (dr) {
-	case 0 ... 3:
-		svm->db_regs[dr] = value;
-		return;
-	case 4 ... 5:
-		if (vcpu->cr4 & X86_CR4_DE) {
-			*exception = UD_VECTOR;
-			return;
-		}
-	case 7: {
-		if (value & ~((1ULL << 32) - 1)) {
-			*exception = GP_VECTOR;
-			return;
-		}
-		svm->vmcb->save.dr7 = value;
-		return;
-	}
-	default:
-		printk(KERN_DEBUG "%s: unexpected dr %u\n",
-		       __FUNCTION__, dr);
-		*exception = UD_VECTOR;
-		return;
-	}
-}
-
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-	struct kvm *kvm = svm->vcpu.kvm;
-	u64 fault_address;
-	u32 error_code;
-	enum emulation_result er;
-	int r;
-
-	if (!irqchip_in_kernel(kvm) &&
-		is_external_interrupt(exit_int_info))
-		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-
-	mutex_lock(&kvm->lock);
-
-	fault_address  = svm->vmcb->control.exit_info_2;
-	error_code = svm->vmcb->control.exit_info_1;
-	r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-	if (r < 0) {
-		mutex_unlock(&kvm->lock);
-		return r;
-	}
-	if (!r) {
-		mutex_unlock(&kvm->lock);
-		return 1;
-	}
-	er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
-				 error_code);
-	mutex_unlock(&kvm->lock);
-
-	switch (er) {
-	case EMULATE_DONE:
-		return 1;
-	case EMULATE_DO_MMIO:
-		++svm->vcpu.stat.mmio_exits;
-		return 0;
-	case EMULATE_FAIL:
-		kvm_report_emulation_failure(&svm->vcpu, "pagetable");
-		break;
-	default:
-		BUG();
-	}
-
-	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-	return 0;
-}
-
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-	if (!(svm->vcpu.cr0 & X86_CR0_TS))
-		svm->vmcb->save.cr0 &= ~X86_CR0_TS;
-	svm->vcpu.fpu_active = 1;
-
-	return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	/*
-	 * VMCB is undefined after a SHUTDOWN intercept
-	 * so reinitialize it.
-	 */
-	clear_page(svm->vmcb);
-	init_vmcb(svm->vmcb);
-
-	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
-	return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
-	int size, down, in, string, rep;
-	unsigned port;
-
-	++svm->vcpu.stat.io_exits;
-
-	svm->next_rip = svm->vmcb->control.exit_info_2;
-
-	string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
-	if (string) {
-		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
-
-	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
-	port = io_info >> 16;
-	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-	rep = (io_info & SVM_IOIO_REP_MASK) != 0;
-	down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
-
-	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
-}
-
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	svm->next_rip = svm->vmcb->save.rip + 1;
-	skip_emulated_instruction(&svm->vcpu);
-	return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	svm->next_rip = svm->vmcb->save.rip + 3;
-	skip_emulated_instruction(&svm->vcpu);
-	return kvm_hypercall(&svm->vcpu, kvm_run);
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm,
-				   struct kvm_run *kvm_run)
-{
-	inject_ud(&svm->vcpu);
-	return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm,
-				    struct kvm_run *kvm_run)
-{
-	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
-	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-	return 0;
-}
-
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	svm->next_rip = svm->vmcb->save.rip + 2;
-	kvm_emulate_cpuid(&svm->vcpu);
-	return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm,
-				   struct kvm_run *kvm_run)
-{
-	if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
-	return 1;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	switch (ecx) {
-	case MSR_IA32_TIME_STAMP_COUNTER: {
-		u64 tsc;
-
-		rdtscll(tsc);
-		*data = svm->vmcb->control.tsc_offset + tsc;
-		break;
-	}
-	case MSR_K6_STAR:
-		*data = svm->vmcb->save.star;
-		break;
-#ifdef CONFIG_X86_64
-	case MSR_LSTAR:
-		*data = svm->vmcb->save.lstar;
-		break;
-	case MSR_CSTAR:
-		*data = svm->vmcb->save.cstar;
-		break;
-	case MSR_KERNEL_GS_BASE:
-		*data = svm->vmcb->save.kernel_gs_base;
-		break;
-	case MSR_SYSCALL_MASK:
-		*data = svm->vmcb->save.sfmask;
-		break;
-#endif
-	case MSR_IA32_SYSENTER_CS:
-		*data = svm->vmcb->save.sysenter_cs;
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		*data = svm->vmcb->save.sysenter_eip;
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		*data = svm->vmcb->save.sysenter_esp;
-		break;
-	default:
-		return kvm_get_msr_common(vcpu, ecx, data);
-	}
-	return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
-	u64 data;
-
-	if (svm_get_msr(&svm->vcpu, ecx, &data))
-		svm_inject_gp(&svm->vcpu, 0);
-	else {
-		svm->vmcb->save.rax = data & 0xffffffff;
-		svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
-		svm->next_rip = svm->vmcb->save.rip + 2;
-		skip_emulated_instruction(&svm->vcpu);
-	}
-	return 1;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	switch (ecx) {
-	case MSR_IA32_TIME_STAMP_COUNTER: {
-		u64 tsc;
-
-		rdtscll(tsc);
-		svm->vmcb->control.tsc_offset = data - tsc;
-		break;
-	}
-	case MSR_K6_STAR:
-		svm->vmcb->save.star = data;
-		break;
-#ifdef CONFIG_X86_64
-	case MSR_LSTAR:
-		svm->vmcb->save.lstar = data;
-		break;
-	case MSR_CSTAR:
-		svm->vmcb->save.cstar = data;
-		break;
-	case MSR_KERNEL_GS_BASE:
-		svm->vmcb->save.kernel_gs_base = data;
-		break;
-	case MSR_SYSCALL_MASK:
-		svm->vmcb->save.sfmask = data;
-		break;
-#endif
-	case MSR_IA32_SYSENTER_CS:
-		svm->vmcb->save.sysenter_cs = data;
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		svm->vmcb->save.sysenter_eip = data;
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		svm->vmcb->save.sysenter_esp = data;
-		break;
-	default:
-		return kvm_set_msr_common(vcpu, ecx, data);
-	}
-	return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
-	u64 data = (svm->vmcb->save.rax & -1u)
-		| ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
-	svm->next_rip = svm->vmcb->save.rip + 2;
-	if (svm_set_msr(&svm->vcpu, ecx, data))
-		svm_inject_gp(&svm->vcpu, 0);
-	else
-		skip_emulated_instruction(&svm->vcpu);
-	return 1;
-}
-
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
-	if (svm->vmcb->control.exit_info_1)
-		return wrmsr_interception(svm, kvm_run);
-	else
-		return rdmsr_interception(svm, kvm_run);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm,
-				   struct kvm_run *kvm_run)
-{
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
-	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-	/*
-	 * If the user space waits to inject interrupts, exit as soon as
-	 * possible
-	 */
-	if (kvm_run->request_interrupt_window &&
-	    !svm->vcpu.irq_summary) {
-		++svm->vcpu.stat.irq_window_exits;
-		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		return 0;
-	}
-
-	return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
-				      struct kvm_run *kvm_run) = {
-	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
-	/* for now: */
-	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
-	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
-	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR1]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
-	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
-	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
-	[SVM_EXIT_INTR] 			= nop_on_interception,
-	[SVM_EXIT_NMI]				= nop_on_interception,
-	[SVM_EXIT_SMI]				= nop_on_interception,
-	[SVM_EXIT_INIT]				= nop_on_interception,
-	[SVM_EXIT_VINTR]			= interrupt_window_interception,
-	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
-	[SVM_EXIT_CPUID]			= cpuid_interception,
-	[SVM_EXIT_INVD]                         = emulate_on_interception,
-	[SVM_EXIT_HLT]				= halt_interception,
-	[SVM_EXIT_INVLPG]			= emulate_on_interception,
-	[SVM_EXIT_INVLPGA]			= invalid_op_interception,
-	[SVM_EXIT_IOIO] 		  	= io_interception,
-	[SVM_EXIT_MSR]				= msr_interception,
-	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
-	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
-	[SVM_EXIT_VMRUN]			= invalid_op_interception,
-	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
-	[SVM_EXIT_VMLOAD]			= invalid_op_interception,
-	[SVM_EXIT_VMSAVE]			= invalid_op_interception,
-	[SVM_EXIT_STGI]				= invalid_op_interception,
-	[SVM_EXIT_CLGI]				= invalid_op_interception,
-	[SVM_EXIT_SKINIT]			= invalid_op_interception,
-	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
-	[SVM_EXIT_MONITOR]			= invalid_op_interception,
-	[SVM_EXIT_MWAIT]			= invalid_op_interception,
-};
-
-
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	u32 exit_code = svm->vmcb->control.exit_code;
-
-	kvm_reput_irq(svm);
-
-	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
-		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		kvm_run->fail_entry.hardware_entry_failure_reason
-			= svm->vmcb->control.exit_code;
-		return 0;
-	}
-
-	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
-	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
-		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
-		       "exit_code 0x%x\n",
-		       __FUNCTION__, svm->vmcb->control.exit_int_info,
-		       exit_code);
-
-	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-	    || svm_exit_handlers[exit_code] == 0) {
-		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-		kvm_run->hw.hardware_exit_reason = exit_code;
-		return 0;
-	}
-
-	return svm_exit_handlers[exit_code](svm, kvm_run);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
-	int cpu = raw_smp_processor_id();
-
-	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-	svm_data->tss_desc->type = 9; //available 32/64-bit TSS
-	load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
-	int cpu = raw_smp_processor_id();
-
-	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-
-	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-	if (svm->vcpu.cpu != cpu ||
-	    svm->asid_generation != svm_data->asid_generation)
-		new_asid(svm, svm_data);
-}
-
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
-	struct vmcb_control_area *control;
-
-	control = &svm->vmcb->control;
-	control->int_vector = irq;
-	control->int_ctl &= ~V_INTR_PRIO_MASK;
-	control->int_ctl |= V_IRQ_MASK |
-		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm_inject_irq(svm, irq);
-}
-
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb *vmcb = svm->vmcb;
-	int intr_vector = -1;
-
-	kvm_inject_pending_timer_irqs(vcpu);
-	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
-	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
-		intr_vector = vmcb->control.exit_int_info &
-			      SVM_EVTINJ_VEC_MASK;
-		vmcb->control.exit_int_info = 0;
-		svm_inject_irq(svm, intr_vector);
-		return;
-	}
-
-	if (vmcb->control.int_ctl & V_IRQ_MASK)
-		return;
-
-	if (!kvm_cpu_has_interrupt(vcpu))
-		return;
-
-	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
-	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
-		/* unable to deliver irq, set pending irq */
-		vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
-		svm_inject_irq(svm, 0x0);
-		return;
-	}
-	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
-	intr_vector = kvm_cpu_get_interrupt(vcpu);
-	svm_inject_irq(svm, intr_vector);
-	kvm_timer_intr_post(vcpu, intr_vector);
-}
-
-static void kvm_reput_irq(struct vcpu_svm *svm)
-{
-	struct vmcb_control_area *control = &svm->vmcb->control;
-
-	if ((control->int_ctl & V_IRQ_MASK)
-	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
-		control->int_ctl &= ~V_IRQ_MASK;
-		push_irq(&svm->vcpu, control->int_vector);
-	}
-
-	svm->vcpu.interrupt_window_open =
-		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
-}
-
-static void svm_do_inject_vector(struct vcpu_svm *svm)
-{
-	struct kvm_vcpu *vcpu = &svm->vcpu;
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-	svm_inject_irq(svm, irq);
-}
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb_control_area *control = &svm->vmcb->control;
-
-	svm->vcpu.interrupt_window_open =
-		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
-
-	if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
-		svm_do_inject_vector(svm);
-
-	/*
-	 * Interrupts blocked.  Wait for unblock.
-	 */
-	if (!svm->vcpu.interrupt_window_open &&
-	    (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
-		control->intercept |= 1ULL << INTERCEPT_VINTR;
-	} else
-		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
-}
-
-static void save_db_regs(unsigned long *db_regs)
-{
-	asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
-	asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
-	asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
-	asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
-}
-
-static void load_db_regs(unsigned long *db_regs)
-{
-	asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
-	asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
-	asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
-	asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	force_new_asid(vcpu);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	u16 fs_selector;
-	u16 gs_selector;
-	u16 ldt_selector;
-
-	pre_svm_run(svm);
-
-	save_host_msrs(vcpu);
-	fs_selector = read_fs();
-	gs_selector = read_gs();
-	ldt_selector = read_ldt();
-	svm->host_cr2 = kvm_read_cr2();
-	svm->host_dr6 = read_dr6();
-	svm->host_dr7 = read_dr7();
-	svm->vmcb->save.cr2 = vcpu->cr2;
-
-	if (svm->vmcb->save.dr7 & 0xff) {
-		write_dr7(0);
-		save_db_regs(svm->host_db_regs);
-		load_db_regs(svm->db_regs);
-	}
-
-	clgi();
-
-	local_irq_enable();
-
-	asm volatile (
-#ifdef CONFIG_X86_64
-		"push %%rbx; push %%rcx; push %%rdx;"
-		"push %%rsi; push %%rdi; push %%rbp;"
-		"push %%r8;  push %%r9;  push %%r10; push %%r11;"
-		"push %%r12; push %%r13; push %%r14; push %%r15;"
-#else
-		"push %%ebx; push %%ecx; push %%edx;"
-		"push %%esi; push %%edi; push %%ebp;"
-#endif
-
-#ifdef CONFIG_X86_64
-		"mov %c[rbx](%[svm]), %%rbx \n\t"
-		"mov %c[rcx](%[svm]), %%rcx \n\t"
-		"mov %c[rdx](%[svm]), %%rdx \n\t"
-		"mov %c[rsi](%[svm]), %%rsi \n\t"
-		"mov %c[rdi](%[svm]), %%rdi \n\t"
-		"mov %c[rbp](%[svm]), %%rbp \n\t"
-		"mov %c[r8](%[svm]),  %%r8  \n\t"
-		"mov %c[r9](%[svm]),  %%r9  \n\t"
-		"mov %c[r10](%[svm]), %%r10 \n\t"
-		"mov %c[r11](%[svm]), %%r11 \n\t"
-		"mov %c[r12](%[svm]), %%r12 \n\t"
-		"mov %c[r13](%[svm]), %%r13 \n\t"
-		"mov %c[r14](%[svm]), %%r14 \n\t"
-		"mov %c[r15](%[svm]), %%r15 \n\t"
-#else
-		"mov %c[rbx](%[svm]), %%ebx \n\t"
-		"mov %c[rcx](%[svm]), %%ecx \n\t"
-		"mov %c[rdx](%[svm]), %%edx \n\t"
-		"mov %c[rsi](%[svm]), %%esi \n\t"
-		"mov %c[rdi](%[svm]), %%edi \n\t"
-		"mov %c[rbp](%[svm]), %%ebp \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-		/* Enter guest mode */
-		"push %%rax \n\t"
-		"mov %c[vmcb](%[svm]), %%rax \n\t"
-		SVM_VMLOAD "\n\t"
-		SVM_VMRUN "\n\t"
-		SVM_VMSAVE "\n\t"
-		"pop %%rax \n\t"
-#else
-		/* Enter guest mode */
-		"push %%eax \n\t"
-		"mov %c[vmcb](%[svm]), %%eax \n\t"
-		SVM_VMLOAD "\n\t"
-		SVM_VMRUN "\n\t"
-		SVM_VMSAVE "\n\t"
-		"pop %%eax \n\t"
-#endif
-
-		/* Save guest registers, load host registers */
-#ifdef CONFIG_X86_64
-		"mov %%rbx, %c[rbx](%[svm]) \n\t"
-		"mov %%rcx, %c[rcx](%[svm]) \n\t"
-		"mov %%rdx, %c[rdx](%[svm]) \n\t"
-		"mov %%rsi, %c[rsi](%[svm]) \n\t"
-		"mov %%rdi, %c[rdi](%[svm]) \n\t"
-		"mov %%rbp, %c[rbp](%[svm]) \n\t"
-		"mov %%r8,  %c[r8](%[svm]) \n\t"
-		"mov %%r9,  %c[r9](%[svm]) \n\t"
-		"mov %%r10, %c[r10](%[svm]) \n\t"
-		"mov %%r11, %c[r11](%[svm]) \n\t"
-		"mov %%r12, %c[r12](%[svm]) \n\t"
-		"mov %%r13, %c[r13](%[svm]) \n\t"
-		"mov %%r14, %c[r14](%[svm]) \n\t"
-		"mov %%r15, %c[r15](%[svm]) \n\t"
-
-		"pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-		"pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-		"pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-		"pop  %%rdx; pop  %%rcx; pop  %%rbx; \n\t"
-#else
-		"mov %%ebx, %c[rbx](%[svm]) \n\t"
-		"mov %%ecx, %c[rcx](%[svm]) \n\t"
-		"mov %%edx, %c[rdx](%[svm]) \n\t"
-		"mov %%esi, %c[rsi](%[svm]) \n\t"
-		"mov %%edi, %c[rdi](%[svm]) \n\t"
-		"mov %%ebp, %c[rbp](%[svm]) \n\t"
-
-		"pop  %%ebp; pop  %%edi; pop  %%esi;"
-		"pop  %%edx; pop  %%ecx; pop  %%ebx; \n\t"
-#endif
-		:
-		: [svm]"a"(svm),
-		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-		  [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
-		  [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
-		  [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
-		  [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
-		  [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
-		  [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
-		  ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
-		  [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
-		  [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
-		  [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
-		  [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
-		  [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
-		  [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
-		  [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
-#endif
-		: "cc", "memory" );
-
-	if ((svm->vmcb->save.dr7 & 0xff))
-		load_db_regs(svm->host_db_regs);
-
-	vcpu->cr2 = svm->vmcb->save.cr2;
-
-	write_dr6(svm->host_dr6);
-	write_dr7(svm->host_dr7);
-	kvm_write_cr2(svm->host_cr2);
-
-	load_fs(fs_selector);
-	load_gs(gs_selector);
-	load_ldt(ldt_selector);
-	load_host_msrs(vcpu);
-
-	reload_tss(vcpu);
-
-	local_irq_disable();
-
-	stgi();
-
-	svm->next_rip = 0;
-}
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm->vmcb->save.cr3 = root;
-	force_new_asid(vcpu);
-
-	if (vcpu->fpu_active) {
-		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
-		svm->vmcb->save.cr0 |= X86_CR0_TS;
-		vcpu->fpu_active = 0;
-	}
-}
-
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
-				  unsigned long  addr,
-				  uint32_t err_code)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-
-	++vcpu->stat.pf_guest;
-
-	if (is_page_fault(exit_int_info)) {
-
-		svm->vmcb->control.event_inj_err = 0;
-		svm->vmcb->control.event_inj = 	SVM_EVTINJ_VALID |
-						SVM_EVTINJ_VALID_ERR |
-						SVM_EVTINJ_TYPE_EXEPT |
-						DF_VECTOR;
-		return;
-	}
-	vcpu->cr2 = addr;
-	svm->vmcb->save.cr2 = addr;
-	svm->vmcb->control.event_inj = 	SVM_EVTINJ_VALID |
-					SVM_EVTINJ_VALID_ERR |
-					SVM_EVTINJ_TYPE_EXEPT |
-					PF_VECTOR;
-	svm->vmcb->control.event_inj_err = err_code;
-}
-
-
-static int is_disabled(void)
-{
-	u64 vm_cr;
-
-	rdmsrl(MSR_VM_CR, vm_cr);
-	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
-		return 1;
-
-	return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-	/*
-	 * Patch in the VMMCALL instruction:
-	 */
-	hypercall[0] = 0x0f;
-	hypercall[1] = 0x01;
-	hypercall[2] = 0xd9;
-	hypercall[3] = 0xc3;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
-	*(int *)rtn = 0;
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
-	.cpu_has_kvm_support = has_svm,
-	.disabled_by_bios = is_disabled,
-	.hardware_setup = svm_hardware_setup,
-	.hardware_unsetup = svm_hardware_unsetup,
-	.check_processor_compatibility = svm_check_processor_compat,
-	.hardware_enable = svm_hardware_enable,
-	.hardware_disable = svm_hardware_disable,
-
-	.vcpu_create = svm_create_vcpu,
-	.vcpu_free = svm_free_vcpu,
-	.vcpu_reset = svm_vcpu_reset,
-
-	.prepare_guest_switch = svm_prepare_guest_switch,
-	.vcpu_load = svm_vcpu_load,
-	.vcpu_put = svm_vcpu_put,
-	.vcpu_decache = svm_vcpu_decache,
-
-	.set_guest_debug = svm_guest_debug,
-	.get_msr = svm_get_msr,
-	.set_msr = svm_set_msr,
-	.get_segment_base = svm_get_segment_base,
-	.get_segment = svm_get_segment,
-	.set_segment = svm_set_segment,
-	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
-	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
-	.set_cr0 = svm_set_cr0,
-	.set_cr3 = svm_set_cr3,
-	.set_cr4 = svm_set_cr4,
-	.set_efer = svm_set_efer,
-	.get_idt = svm_get_idt,
-	.set_idt = svm_set_idt,
-	.get_gdt = svm_get_gdt,
-	.set_gdt = svm_set_gdt,
-	.get_dr = svm_get_dr,
-	.set_dr = svm_set_dr,
-	.cache_regs = svm_cache_regs,
-	.decache_regs = svm_decache_regs,
-	.get_rflags = svm_get_rflags,
-	.set_rflags = svm_set_rflags,
-
-	.tlb_flush = svm_flush_tlb,
-	.inject_page_fault = svm_inject_page_fault,
-
-	.inject_gp = svm_inject_gp,
-
-	.run = svm_vcpu_run,
-	.handle_exit = handle_exit,
-	.skip_emulated_instruction = skip_emulated_instruction,
-	.patch_hypercall = svm_patch_hypercall,
-	.get_irq = svm_get_irq,
-	.set_irq = svm_set_irq,
-	.inject_pending_irq = svm_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
-};
-
-static int __init svm_init(void)
-{
-	return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
-			      THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
-	kvm_exit_x86();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff -puN drivers/kvm/svm.h~git-kvm /dev/null
--- a/drivers/kvm/svm.h
+++ /dev/null
@@ -1,324 +0,0 @@
-#ifndef __SVM_H
-#define __SVM_H
-
-enum {
-	INTERCEPT_INTR,
-	INTERCEPT_NMI,
-	INTERCEPT_SMI,
-	INTERCEPT_INIT,
-	INTERCEPT_VINTR,
-	INTERCEPT_SELECTIVE_CR0,
-	INTERCEPT_STORE_IDTR,
-	INTERCEPT_STORE_GDTR,
-	INTERCEPT_STORE_LDTR,
-	INTERCEPT_STORE_TR,
-	INTERCEPT_LOAD_IDTR,
-	INTERCEPT_LOAD_GDTR,
-	INTERCEPT_LOAD_LDTR,
-	INTERCEPT_LOAD_TR,
-	INTERCEPT_RDTSC,
-	INTERCEPT_RDPMC,
-	INTERCEPT_PUSHF,
-	INTERCEPT_POPF,
-	INTERCEPT_CPUID,
-	INTERCEPT_RSM,
-	INTERCEPT_IRET,
-	INTERCEPT_INTn,
-	INTERCEPT_INVD,
-	INTERCEPT_PAUSE,
-	INTERCEPT_HLT,
-	INTERCEPT_INVLPG,
-	INTERCEPT_INVLPGA,
-	INTERCEPT_IOIO_PROT,
-	INTERCEPT_MSR_PROT,
-	INTERCEPT_TASK_SWITCH,
-	INTERCEPT_FERR_FREEZE,
-	INTERCEPT_SHUTDOWN,
-	INTERCEPT_VMRUN,
-	INTERCEPT_VMMCALL,
-	INTERCEPT_VMLOAD,
-	INTERCEPT_VMSAVE,
-	INTERCEPT_STGI,
-	INTERCEPT_CLGI,
-	INTERCEPT_SKINIT,
-	INTERCEPT_RDTSCP,
-	INTERCEPT_ICEBP,
-	INTERCEPT_WBINVD,
-	INTERCEPT_MONITOR,
-	INTERCEPT_MWAIT,
-	INTERCEPT_MWAIT_COND,
-};
-
-
-struct __attribute__ ((__packed__)) vmcb_control_area {
-	u16 intercept_cr_read;
-	u16 intercept_cr_write;
-	u16 intercept_dr_read;
-	u16 intercept_dr_write;
-	u32 intercept_exceptions;
-	u64 intercept;
-	u8 reserved_1[44];
-	u64 iopm_base_pa;
-	u64 msrpm_base_pa;
-	u64 tsc_offset;
-	u32 asid;
-	u8 tlb_ctl;
-	u8 reserved_2[3];
-	u32 int_ctl;
-	u32 int_vector;
-	u32 int_state;
-	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
-	u64 exit_info_1;
-	u64 exit_info_2;
-	u32 exit_int_info;
-	u32 exit_int_info_err;
-	u64 nested_ctl;
-	u8 reserved_4[16];
-	u32 event_inj;
-	u32 event_inj_err;
-	u64 nested_cr3;
-	u64 lbr_ctl;
-	u8 reserved_5[832];
-};
-
-
-#define TLB_CONTROL_DO_NOTHING 0
-#define TLB_CONTROL_FLUSH_ALL_ASID 1
-
-#define V_TPR_MASK 0x0f
-
-#define V_IRQ_SHIFT 8
-#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
-
-#define V_INTR_PRIO_SHIFT 16
-#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
-
-#define V_IGN_TPR_SHIFT 20
-#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
-
-#define V_INTR_MASKING_SHIFT 24
-#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
-
-#define SVM_INTERRUPT_SHADOW_MASK 1
-
-#define SVM_IOIO_STR_SHIFT 2
-#define SVM_IOIO_REP_SHIFT 3
-#define SVM_IOIO_SIZE_SHIFT 4
-#define SVM_IOIO_ASIZE_SHIFT 7
-
-#define SVM_IOIO_TYPE_MASK 1
-#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
-#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
-#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
-#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-
-struct __attribute__ ((__packed__)) vmcb_seg {
-	u16 selector;
-	u16 attrib;
-	u32 limit;
-	u64 base;
-};
-
-struct __attribute__ ((__packed__)) vmcb_save_area {
-	struct vmcb_seg es;
-	struct vmcb_seg cs;
-	struct vmcb_seg ss;
-	struct vmcb_seg ds;
-	struct vmcb_seg fs;
-	struct vmcb_seg gs;
-	struct vmcb_seg gdtr;
-	struct vmcb_seg ldtr;
-	struct vmcb_seg idtr;
-	struct vmcb_seg tr;
-	u8 reserved_1[43];
-	u8 cpl;
-	u8 reserved_2[4];
-	u64 efer;
-	u8 reserved_3[112];
-	u64 cr4;
-	u64 cr3;
-	u64 cr0;
-	u64 dr7;
-	u64 dr6;
-	u64 rflags;
-	u64 rip;
-	u8 reserved_4[88];
-	u64 rsp;
-	u8 reserved_5[24];
-	u64 rax;
-	u64 star;
-	u64 lstar;
-	u64 cstar;
-	u64 sfmask;
-	u64 kernel_gs_base;
-	u64 sysenter_cs;
-	u64 sysenter_esp;
-	u64 sysenter_eip;
-	u64 cr2;
-	u8 reserved_6[32];
-	u64 g_pat;
-	u64 dbgctl;
-	u64 br_from;
-	u64 br_to;
-	u64 last_excp_from;
-	u64 last_excp_to;
-};
-
-struct __attribute__ ((__packed__)) vmcb {
-	struct vmcb_control_area control;
-	struct vmcb_save_area save;
-};
-
-#define SVM_CPUID_FEATURE_SHIFT 2
-#define SVM_CPUID_FUNC 0x8000000a
-
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR       0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
-#define SVM_VM_CR_SVM_DISABLE 4
-
-#define SVM_SELECTOR_S_SHIFT 4
-#define SVM_SELECTOR_DPL_SHIFT 5
-#define SVM_SELECTOR_P_SHIFT 7
-#define SVM_SELECTOR_AVL_SHIFT 8
-#define SVM_SELECTOR_L_SHIFT 9
-#define SVM_SELECTOR_DB_SHIFT 10
-#define SVM_SELECTOR_G_SHIFT 11
-
-#define SVM_SELECTOR_TYPE_MASK (0xf)
-#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
-#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
-#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
-#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
-#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
-#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
-#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
-
-#define SVM_SELECTOR_WRITE_MASK (1 << 1)
-#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
-#define SVM_SELECTOR_CODE_MASK (1 << 3)
-
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
-
-#define SVM_EVTINJ_VEC_MASK 0xff
-
-#define SVM_EVTINJ_TYPE_SHIFT 8
-#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_VALID (1 << 31)
-#define SVM_EVTINJ_VALID_ERR (1 << 11)
-
-#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
-
-#define	SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
-#define	SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
-#define	SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
-#define	SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
-
-#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
-#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
-
-#define	SVM_EXIT_READ_CR0 	0x000
-#define	SVM_EXIT_READ_CR3 	0x003
-#define	SVM_EXIT_READ_CR4 	0x004
-#define	SVM_EXIT_READ_CR8 	0x008
-#define	SVM_EXIT_WRITE_CR0 	0x010
-#define	SVM_EXIT_WRITE_CR3 	0x013
-#define	SVM_EXIT_WRITE_CR4 	0x014
-#define	SVM_EXIT_WRITE_CR8 	0x018
-#define	SVM_EXIT_READ_DR0 	0x020
-#define	SVM_EXIT_READ_DR1 	0x021
-#define	SVM_EXIT_READ_DR2 	0x022
-#define	SVM_EXIT_READ_DR3 	0x023
-#define	SVM_EXIT_READ_DR4 	0x024
-#define	SVM_EXIT_READ_DR5 	0x025
-#define	SVM_EXIT_READ_DR6 	0x026
-#define	SVM_EXIT_READ_DR7 	0x027
-#define	SVM_EXIT_WRITE_DR0 	0x030
-#define	SVM_EXIT_WRITE_DR1 	0x031
-#define	SVM_EXIT_WRITE_DR2 	0x032
-#define	SVM_EXIT_WRITE_DR3 	0x033
-#define	SVM_EXIT_WRITE_DR4 	0x034
-#define	SVM_EXIT_WRITE_DR5 	0x035
-#define	SVM_EXIT_WRITE_DR6 	0x036
-#define	SVM_EXIT_WRITE_DR7 	0x037
-#define SVM_EXIT_EXCP_BASE      0x040
-#define SVM_EXIT_INTR		0x060
-#define SVM_EXIT_NMI		0x061
-#define SVM_EXIT_SMI		0x062
-#define SVM_EXIT_INIT		0x063
-#define SVM_EXIT_VINTR		0x064
-#define SVM_EXIT_CR0_SEL_WRITE	0x065
-#define SVM_EXIT_IDTR_READ	0x066
-#define SVM_EXIT_GDTR_READ	0x067
-#define SVM_EXIT_LDTR_READ	0x068
-#define SVM_EXIT_TR_READ	0x069
-#define SVM_EXIT_IDTR_WRITE	0x06a
-#define SVM_EXIT_GDTR_WRITE	0x06b
-#define SVM_EXIT_LDTR_WRITE	0x06c
-#define SVM_EXIT_TR_WRITE	0x06d
-#define SVM_EXIT_RDTSC		0x06e
-#define SVM_EXIT_RDPMC		0x06f
-#define SVM_EXIT_PUSHF		0x070
-#define SVM_EXIT_POPF		0x071
-#define SVM_EXIT_CPUID		0x072
-#define SVM_EXIT_RSM		0x073
-#define SVM_EXIT_IRET		0x074
-#define SVM_EXIT_SWINT		0x075
-#define SVM_EXIT_INVD		0x076
-#define SVM_EXIT_PAUSE		0x077
-#define SVM_EXIT_HLT		0x078
-#define SVM_EXIT_INVLPG		0x079
-#define SVM_EXIT_INVLPGA	0x07a
-#define SVM_EXIT_IOIO		0x07b
-#define SVM_EXIT_MSR		0x07c
-#define SVM_EXIT_TASK_SWITCH	0x07d
-#define SVM_EXIT_FERR_FREEZE	0x07e
-#define SVM_EXIT_SHUTDOWN	0x07f
-#define SVM_EXIT_VMRUN		0x080
-#define SVM_EXIT_VMMCALL	0x081
-#define SVM_EXIT_VMLOAD		0x082
-#define SVM_EXIT_VMSAVE		0x083
-#define SVM_EXIT_STGI		0x084
-#define SVM_EXIT_CLGI		0x085
-#define SVM_EXIT_SKINIT		0x086
-#define SVM_EXIT_RDTSCP		0x087
-#define SVM_EXIT_ICEBP		0x088
-#define SVM_EXIT_WBINVD		0x089
-#define SVM_EXIT_MONITOR	0x08a
-#define SVM_EXIT_MWAIT		0x08b
-#define SVM_EXIT_MWAIT_COND	0x08c
-#define SVM_EXIT_NPF  		0x400
-
-#define SVM_EXIT_ERR		-1
-
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
-
-#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
-#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
-#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
-#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
-#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
-#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
-
-#endif
-
diff -puN drivers/kvm/vmx.c~git-kvm /dev/null
--- a/drivers/kvm/vmx.c
+++ /dev/null
@@ -1,2566 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Avi Kivity   <avi@qumranet.com>
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86_emulate.h"
-#include "irq.h"
-#include "vmx.h"
-#include "segment_descriptor.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/io.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-struct vmcs {
-	u32 revision_id;
-	u32 abort;
-	char data[0];
-};
-
-struct vcpu_vmx {
-	struct kvm_vcpu       vcpu;
-	int                   launched;
-	u8                    fail;
-	struct kvm_msr_entry *guest_msrs;
-	struct kvm_msr_entry *host_msrs;
-	int                   nmsrs;
-	int                   save_nmsrs;
-	int                   msr_offset_efer;
-#ifdef CONFIG_X86_64
-	int                   msr_offset_kernel_gs_base;
-#endif
-	struct vmcs          *vmcs;
-	struct {
-		int           loaded;
-		u16           fs_sel, gs_sel, ldt_sel;
-		int           gs_ldt_reload_needed;
-		int           fs_reload_needed;
-	}host_state;
-
-};
-
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
-	return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
-static int init_rmode_tss(struct kvm *kvm);
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
-
-static struct vmcs_config {
-	int size;
-	int order;
-	u32 revision_id;
-	u32 pin_based_exec_ctrl;
-	u32 cpu_based_exec_ctrl;
-	u32 vmexit_ctrl;
-	u32 vmentry_ctrl;
-} vmcs_config;
-
-#define VMX_SEGMENT_FIELD(seg)					\
-	[VCPU_SREG_##seg] = {                                   \
-		.selector = GUEST_##seg##_SELECTOR,		\
-		.base = GUEST_##seg##_BASE,		   	\
-		.limit = GUEST_##seg##_LIMIT,		   	\
-		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
-	}
-
-static struct kvm_vmx_segment_field {
-	unsigned selector;
-	unsigned base;
-	unsigned limit;
-	unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
-	VMX_SEGMENT_FIELD(CS),
-	VMX_SEGMENT_FIELD(DS),
-	VMX_SEGMENT_FIELD(ES),
-	VMX_SEGMENT_FIELD(FS),
-	VMX_SEGMENT_FIELD(GS),
-	VMX_SEGMENT_FIELD(SS),
-	VMX_SEGMENT_FIELD(TR),
-	VMX_SEGMENT_FIELD(LDTR),
-};
-
-/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
- * away by decrementing the array size.
- */
-static const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
-	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
-#endif
-	MSR_EFER, MSR_K6_STAR,
-};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		rdmsrl(e[i].index, e[i].data);
-}
-
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
-{
-	return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
-}
-
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
-{
-	int efer_offset = vmx->msr_offset_efer;
-	return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
-		msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-}
-
-static inline int is_page_fault(u32 intr_info)
-{
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_no_device(u32 intr_info)
-{
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_external_interrupt(u32 intr_info)
-{
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static inline int cpu_has_vmx_tpr_shadow(void)
-{
-	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
-}
-
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
-{
-	return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
-}
-
-static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
-	int i;
-
-	for (i = 0; i < vmx->nmsrs; ++i)
-		if (vmx->guest_msrs[i].index == msr)
-			return i;
-	return -1;
-}
-
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
-	int i;
-
-	i = __find_msr_index(vmx, msr);
-	if (i >= 0)
-		return &vmx->guest_msrs[i];
-	return NULL;
-}
-
-static void vmcs_clear(struct vmcs *vmcs)
-{
-	u64 phys_addr = __pa(vmcs);
-	u8 error;
-
-	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
-		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-		      : "cc", "memory");
-	if (error)
-		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
-		       vmcs, phys_addr);
-}
-
-static void __vcpu_clear(void *arg)
-{
-	struct vcpu_vmx *vmx = arg;
-	int cpu = raw_smp_processor_id();
-
-	if (vmx->vcpu.cpu == cpu)
-		vmcs_clear(vmx->vmcs);
-	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
-		per_cpu(current_vmcs, cpu) = NULL;
-	rdtscll(vmx->vcpu.host_tsc);
-}
-
-static void vcpu_clear(struct vcpu_vmx *vmx)
-{
-	if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
-		smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
-					 vmx, 0, 1);
-	else
-		__vcpu_clear(vmx);
-	vmx->launched = 0;
-}
-
-static unsigned long vmcs_readl(unsigned long field)
-{
-	unsigned long value;
-
-	asm volatile (ASM_VMX_VMREAD_RDX_RAX
-		      : "=a"(value) : "d"(field) : "cc");
-	return value;
-}
-
-static u16 vmcs_read16(unsigned long field)
-{
-	return vmcs_readl(field);
-}
-
-static u32 vmcs_read32(unsigned long field)
-{
-	return vmcs_readl(field);
-}
-
-static u64 vmcs_read64(unsigned long field)
-{
-#ifdef CONFIG_X86_64
-	return vmcs_readl(field);
-#else
-	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
-#endif
-}
-
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
-	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
-	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
-	dump_stack();
-}
-
-static void vmcs_writel(unsigned long field, unsigned long value)
-{
-	u8 error;
-
-	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
-		       : "=q"(error) : "a"(value), "d"(field) : "cc" );
-	if (unlikely(error))
-		vmwrite_error(field, value);
-}
-
-static void vmcs_write16(unsigned long field, u16 value)
-{
-	vmcs_writel(field, value);
-}
-
-static void vmcs_write32(unsigned long field, u32 value)
-{
-	vmcs_writel(field, value);
-}
-
-static void vmcs_write64(unsigned long field, u64 value)
-{
-#ifdef CONFIG_X86_64
-	vmcs_writel(field, value);
-#else
-	vmcs_writel(field, value);
-	asm volatile ("");
-	vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static void vmcs_clear_bits(unsigned long field, u32 mask)
-{
-	vmcs_writel(field, vmcs_readl(field) & ~mask);
-}
-
-static void vmcs_set_bits(unsigned long field, u32 mask)
-{
-	vmcs_writel(field, vmcs_readl(field) | mask);
-}
-
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
-	u32 eb;
-
-	eb = 1u << PF_VECTOR;
-	if (!vcpu->fpu_active)
-		eb |= 1u << NM_VECTOR;
-	if (vcpu->guest_debug.enabled)
-		eb |= 1u << 1;
-	if (vcpu->rmode.active)
-		eb = ~0;
-	vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-static void reload_tss(void)
-{
-#ifndef CONFIG_X86_64
-
-	/*
-	 * VT restores TR but not its size.  Useless.
-	 */
-	struct descriptor_table gdt;
-	struct segment_descriptor *descs;
-
-	get_gdt(&gdt);
-	descs = (void *)gdt.base;
-	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
-	load_TR_desc();
-#endif
-}
-
-static void load_transition_efer(struct vcpu_vmx *vmx)
-{
-	u64 trans_efer;
-	int efer_offset = vmx->msr_offset_efer;
-
-	trans_efer = vmx->host_msrs[efer_offset].data;
-	trans_efer &= ~EFER_SAVE_RESTORE_BITS;
-	trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-	wrmsrl(MSR_EFER, trans_efer);
-	vmx->vcpu.stat.efer_reload++;
-}
-
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->host_state.loaded)
-		return;
-
-	vmx->host_state.loaded = 1;
-	/*
-	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
-	 * allow segment selectors with cpl > 0 or ti == 1.
-	 */
-	vmx->host_state.ldt_sel = read_ldt();
-	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
-	vmx->host_state.fs_sel = read_fs();
-	if (!(vmx->host_state.fs_sel & 7)) {
-		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
-		vmx->host_state.fs_reload_needed = 0;
-	} else {
-		vmcs_write16(HOST_FS_SELECTOR, 0);
-		vmx->host_state.fs_reload_needed = 1;
-	}
-	vmx->host_state.gs_sel = read_gs();
-	if (!(vmx->host_state.gs_sel & 7))
-		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
-	else {
-		vmcs_write16(HOST_GS_SELECTOR, 0);
-		vmx->host_state.gs_ldt_reload_needed = 1;
-	}
-
-#ifdef CONFIG_X86_64
-	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
-	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
-	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
-	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
-
-#ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu)) {
-		save_msrs(vmx->host_msrs +
-			  vmx->msr_offset_kernel_gs_base, 1);
-	}
-#endif
-	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	if (msr_efer_need_save_restore(vmx))
-		load_transition_efer(vmx);
-}
-
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
-	unsigned long flags;
-
-	if (!vmx->host_state.loaded)
-		return;
-
-	vmx->host_state.loaded = 0;
-	if (vmx->host_state.fs_reload_needed)
-		load_fs(vmx->host_state.fs_sel);
-	if (vmx->host_state.gs_ldt_reload_needed) {
-		load_ldt(vmx->host_state.ldt_sel);
-		/*
-		 * If we have to reload gs, we must take care to
-		 * preserve our gs base.
-		 */
-		local_irq_save(flags);
-		load_gs(vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
-		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
-#endif
-		local_irq_restore(flags);
-	}
-	reload_tss();
-	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-	if (msr_efer_need_save_restore(vmx))
-		load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 phys_addr = __pa(vmx->vmcs);
-	u64 tsc_this, delta;
-
-	if (vcpu->cpu != cpu) {
-		vcpu_clear(vmx);
-		kvm_migrate_apic_timer(vcpu);
-	}
-
-	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
-		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
-	}
-
-	if (vcpu->cpu != cpu) {
-		struct descriptor_table dt;
-		unsigned long sysenter_esp;
-
-		vcpu->cpu = cpu;
-		/*
-		 * Linux uses per-cpu TSS and GDT, so set these when switching
-		 * processors.
-		 */
-		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
-		get_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
-
-		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
-		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-		/*
-		 * Make sure the time stamp counter is monotonous.
-		 */
-		rdtscll(tsc_this);
-		delta = vcpu->host_tsc - tsc_this;
-		vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
-	}
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	vmx_load_host_state(to_vmx(vcpu));
-	kvm_put_guest_fpu(vcpu);
-}
-
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->fpu_active)
-		return;
-	vcpu->fpu_active = 1;
-	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
-	if (vcpu->cr0 & X86_CR0_TS)
-		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-	update_exception_bitmap(vcpu);
-}
-
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->fpu_active)
-		return;
-	vcpu->fpu_active = 0;
-	vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-	update_exception_bitmap(vcpu);
-}
-
-static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-	vcpu_clear(to_vmx(vcpu));
-}
-
-static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
-	return vmcs_readl(GUEST_RFLAGS);
-}
-
-static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-	if (vcpu->rmode.active)
-		rflags |= IOPL_MASK | X86_EFLAGS_VM;
-	vmcs_writel(GUEST_RFLAGS, rflags);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-	unsigned long rip;
-	u32 interruptibility;
-
-	rip = vmcs_readl(GUEST_RIP);
-	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-	vmcs_writel(GUEST_RIP, rip);
-
-	/*
-	 * We emulated an instruction, so temporary interrupt blocking
-	 * should be removed, if set.
-	 */
-	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	if (interruptibility & 3)
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     interruptibility & ~3);
-	vcpu->interrupt_window_open = 1;
-}
-
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
-{
-	printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
-	       vmcs_readl(GUEST_RIP));
-	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-		     GP_VECTOR |
-		     INTR_TYPE_EXCEPTION |
-		     INTR_INFO_DELIEVER_CODE_MASK |
-		     INTR_INFO_VALID_MASK);
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-#ifdef CONFIG_X86_64
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
-	struct kvm_msr_entry tmp;
-
-	tmp = vmx->guest_msrs[to];
-	vmx->guest_msrs[to] = vmx->guest_msrs[from];
-	vmx->guest_msrs[from] = tmp;
-	tmp = vmx->host_msrs[to];
-	vmx->host_msrs[to] = vmx->host_msrs[from];
-	vmx->host_msrs[from] = tmp;
-}
-#endif
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
-	int save_nmsrs;
-
-	save_nmsrs = 0;
-#ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu)) {
-		int index;
-
-		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_LSTAR);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_CSTAR);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		/*
-		 * MSR_K6_STAR is only needed on long mode guests, and only
-		 * if efer.sce is enabled.
-		 */
-		index = __find_msr_index(vmx, MSR_K6_STAR);
-		if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
-			move_msr_up(vmx, index, save_nmsrs++);
-	}
-#endif
-	vmx->save_nmsrs = save_nmsrs;
-
-#ifdef CONFIG_X86_64
-	vmx->msr_offset_kernel_gs_base =
-		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
-	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
-}
-
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset    -- 21.3
- */
-static u64 guest_read_tsc(void)
-{
-	u64 host_tsc, tsc_offset;
-
-	rdtscll(host_tsc);
-	tsc_offset = vmcs_read64(TSC_OFFSET);
-	return host_tsc + tsc_offset;
-}
-
-/*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
- */
-static void guest_write_tsc(u64 guest_tsc)
-{
-	u64 host_tsc;
-
-	rdtscll(host_tsc);
-	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
-}
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
-	u64 data;
-	struct kvm_msr_entry *msr;
-
-	if (!pdata) {
-		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
-		return -EINVAL;
-	}
-
-	switch (msr_index) {
-#ifdef CONFIG_X86_64
-	case MSR_FS_BASE:
-		data = vmcs_readl(GUEST_FS_BASE);
-		break;
-	case MSR_GS_BASE:
-		data = vmcs_readl(GUEST_GS_BASE);
-		break;
-	case MSR_EFER:
-		return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
-	case MSR_IA32_TIME_STAMP_COUNTER:
-		data = guest_read_tsc();
-		break;
-	case MSR_IA32_SYSENTER_CS:
-		data = vmcs_read32(GUEST_SYSENTER_CS);
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		data = vmcs_readl(GUEST_SYSENTER_EIP);
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		data = vmcs_readl(GUEST_SYSENTER_ESP);
-		break;
-	default:
-		msr = find_msr_entry(to_vmx(vcpu), msr_index);
-		if (msr) {
-			data = msr->data;
-			break;
-		}
-		return kvm_get_msr_common(vcpu, msr_index, pdata);
-	}
-
-	*pdata = data;
-	return 0;
-}
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_msr_entry *msr;
-	int ret = 0;
-
-	switch (msr_index) {
-#ifdef CONFIG_X86_64
-	case MSR_EFER:
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
-		if (vmx->host_state.loaded)
-			load_transition_efer(vmx);
-		break;
-	case MSR_FS_BASE:
-		vmcs_writel(GUEST_FS_BASE, data);
-		break;
-	case MSR_GS_BASE:
-		vmcs_writel(GUEST_GS_BASE, data);
-		break;
-#endif
-	case MSR_IA32_SYSENTER_CS:
-		vmcs_write32(GUEST_SYSENTER_CS, data);
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		vmcs_writel(GUEST_SYSENTER_EIP, data);
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		vmcs_writel(GUEST_SYSENTER_ESP, data);
-		break;
-	case MSR_IA32_TIME_STAMP_COUNTER:
-		guest_write_tsc(data);
-		break;
-	default:
-		msr = find_msr_entry(vmx, msr_index);
-		if (msr) {
-			msr->data = data;
-			if (vmx->host_state.loaded)
-				load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-			break;
-		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
-	}
-
-	return ret;
-}
-
-/*
- * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
-	vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-	vcpu->rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs.  Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
-{
-	vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
-	vmcs_writel(GUEST_RIP, vcpu->rip);
-}
-
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
-	unsigned long dr7 = 0x400;
-	int old_singlestep;
-
-	old_singlestep = vcpu->guest_debug.singlestep;
-
-	vcpu->guest_debug.enabled = dbg->enabled;
-	if (vcpu->guest_debug.enabled) {
-		int i;
-
-		dr7 |= 0x200;  /* exact */
-		for (i = 0; i < 4; ++i) {
-			if (!dbg->breakpoints[i].enabled)
-				continue;
-			vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
-			dr7 |= 2 << (i*2);    /* global enable */
-			dr7 |= 0 << (i*4+16); /* execution breakpoint */
-		}
-
-		vcpu->guest_debug.singlestep = dbg->singlestep;
-	} else
-		vcpu->guest_debug.singlestep = 0;
-
-	if (old_singlestep && !vcpu->guest_debug.singlestep) {
-		unsigned long flags;
-
-		flags = vmcs_readl(GUEST_RFLAGS);
-		flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-		vmcs_writel(GUEST_RFLAGS, flags);
-	}
-
-	update_exception_bitmap(vcpu);
-	vmcs_writel(GUEST_DR7, dr7);
-
-	return 0;
-}
-
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
-	u32 idtv_info_field;
-
-	idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	if (idtv_info_field & INTR_INFO_VALID_MASK) {
-		if (is_external_interrupt(idtv_info_field))
-			return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-		else
-			printk("pending exception: not handled yet\n");
-	}
-	return -1;
-}
-
-static __init int cpu_has_kvm_support(void)
-{
-	unsigned long ecx = cpuid_ecx(1);
-	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
-	u64 msr;
-
-	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    == MSR_IA32_FEATURE_CONTROL_LOCKED;
-	/* locked but not enabled */
-}
-
-static void hardware_enable(void *garbage)
-{
-	int cpu = raw_smp_processor_id();
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old;
-
-	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    != (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
-	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
-		      : "memory", "cc");
-}
-
-static void hardware_disable(void *garbage)
-{
-	asm volatile (ASM_VMX_VMXOFF : : : "cc");
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-				      u32 msr, u32* result)
-{
-	u32 vmx_msr_low, vmx_msr_high;
-	u32 ctl = ctl_min | ctl_opt;
-
-	rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
-	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
-	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
-
-	/* Ensure minimum (required) set of control bits are supported. */
-	if (ctl_min & ~ctl)
-		return -EIO;
-
-	*result = ctl;
-	return 0;
-}
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-{
-	u32 vmx_msr_low, vmx_msr_high;
-	u32 min, opt;
-	u32 _pin_based_exec_control = 0;
-	u32 _cpu_based_exec_control = 0;
-	u32 _vmexit_control = 0;
-	u32 _vmentry_control = 0;
-
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = 0;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
-	min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
-	      CPU_BASED_CR8_LOAD_EXITING |
-	      CPU_BASED_CR8_STORE_EXITING |
-#endif
-	      CPU_BASED_USE_IO_BITMAPS |
-	      CPU_BASED_MOV_DR_EXITING |
-	      CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
-	opt = CPU_BASED_TPR_SHADOW;
-#else
-	opt = 0;
-#endif
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
-				&_cpu_based_exec_control) < 0)
-		return -EIO;
-#ifdef CONFIG_X86_64
-	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
-		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
-					   ~CPU_BASED_CR8_STORE_EXITING;
-#endif
-
-	min = 0;
-#ifdef CONFIG_X86_64
-	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
-	opt = 0;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
-				&_vmexit_control) < 0)
-		return -EIO;
-
-	min = opt = 0;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
-				&_vmentry_control) < 0)
-		return -EIO;
-
-	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
-	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
-	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
-		return -EIO;
-
-#ifdef CONFIG_X86_64
-	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
-	if (vmx_msr_high & (1u<<16))
-		return -EIO;
-#endif
-
-	/* Require Write-Back (WB) memory type for VMCS accesses. */
-	if (((vmx_msr_high >> 18) & 15) != 6)
-		return -EIO;
-
-	vmcs_conf->size = vmx_msr_high & 0x1fff;
-	vmcs_conf->order = get_order(vmcs_config.size);
-	vmcs_conf->revision_id = vmx_msr_low;
-
-	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
-	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
-	vmcs_conf->vmexit_ctrl         = _vmexit_control;
-	vmcs_conf->vmentry_ctrl        = _vmentry_control;
-
-	return 0;
-}
-
-static struct vmcs *alloc_vmcs_cpu(int cpu)
-{
-	int node = cpu_to_node(cpu);
-	struct page *pages;
-	struct vmcs *vmcs;
-
-	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
-	if (!pages)
-		return NULL;
-	vmcs = page_address(pages);
-	memset(vmcs, 0, vmcs_config.size);
-	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
-	return vmcs;
-}
-
-static struct vmcs *alloc_vmcs(void)
-{
-	return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
-static void free_vmcs(struct vmcs *vmcs)
-{
-	free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-static void free_kvm_area(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		free_vmcs(per_cpu(vmxarea, cpu));
-}
-
-static __init int alloc_kvm_area(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct vmcs *vmcs;
-
-		vmcs = alloc_vmcs_cpu(cpu);
-		if (!vmcs) {
-			free_kvm_area();
-			return -ENOMEM;
-		}
-
-		per_cpu(vmxarea, cpu) = vmcs;
-	}
-	return 0;
-}
-
-static __init int hardware_setup(void)
-{
-	if (setup_vmcs_config(&vmcs_config) < 0)
-		return -EIO;
-	return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
-	free_kvm_area();
-}
-
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
-		vmcs_write16(sf->selector, save->selector);
-		vmcs_writel(sf->base, save->base);
-		vmcs_write32(sf->limit, save->limit);
-		vmcs_write32(sf->ar_bytes, save->ar);
-	} else {
-		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
-			<< AR_DPL_SHIFT;
-		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
-	}
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
-	unsigned long flags;
-
-	vcpu->rmode.active = 0;
-
-	vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
-	vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
-	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
-
-	flags = vmcs_readl(GUEST_RFLAGS);
-	flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
-	flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
-	vmcs_writel(GUEST_RFLAGS, flags);
-
-	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
-			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
-	update_exception_bitmap(vcpu);
-
-	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
-	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
-	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
-	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
-
-	vmcs_write16(GUEST_SS_SELECTOR, 0);
-	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
-	vmcs_write16(GUEST_CS_SELECTOR,
-		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
-	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-}
-
-static gva_t rmode_tss_base(struct kvm* kvm)
-{
-	gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
-	return base_gfn << PAGE_SHIFT;
-}
-
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	save->selector = vmcs_read16(sf->selector);
-	save->base = vmcs_readl(sf->base);
-	save->limit = vmcs_read32(sf->limit);
-	save->ar = vmcs_read32(sf->ar_bytes);
-	vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
-	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0xf3);
-}
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
-	unsigned long flags;
-
-	vcpu->rmode.active = 1;
-
-	vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
-	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
-	vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
-	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
-	vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
-	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-	flags = vmcs_readl(GUEST_RFLAGS);
-	vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
-
-	flags |= IOPL_MASK | X86_EFLAGS_VM;
-
-	vmcs_writel(GUEST_RFLAGS, flags);
-	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-	update_exception_bitmap(vcpu);
-
-	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
-	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
-	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
-
-	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
-	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
-		vmcs_writel(GUEST_CS_BASE, 0xf0000);
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
-
-	fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
-	fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
-	fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
-	fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
-
-	kvm_mmu_reset_context(vcpu);
-	init_rmode_tss(vcpu->kvm);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
-	u32 guest_tr_ar;
-
-	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
-	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
-		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-		       __FUNCTION__);
-		vmcs_write32(GUEST_TR_AR_BYTES,
-			     (guest_tr_ar & ~AR_TYPE_MASK)
-			     | AR_TYPE_BUSY_64_TSS);
-	}
-
-	vcpu->shadow_efer |= EFER_LMA;
-
-	find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
-	vmcs_write32(VM_ENTRY_CONTROLS,
-		     vmcs_read32(VM_ENTRY_CONTROLS)
-		     | VM_ENTRY_IA32E_MODE);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
-	vcpu->shadow_efer &= ~EFER_LMA;
-
-	vmcs_write32(VM_ENTRY_CONTROLS,
-		     vmcs_read32(VM_ENTRY_CONTROLS)
-		     & ~VM_ENTRY_IA32E_MODE);
-}
-
-#endif
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-	vcpu->cr4 &= KVM_GUEST_CR4_MASK;
-	vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
-}
-
-static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	vmx_fpu_deactivate(vcpu);
-
-	if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
-		enter_pmode(vcpu);
-
-	if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
-		enter_rmode(vcpu);
-
-#ifdef CONFIG_X86_64
-	if (vcpu->shadow_efer & EFER_LME) {
-		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
-			enter_lmode(vcpu);
-		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
-			exit_lmode(vcpu);
-	}
-#endif
-
-	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0,
-		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
-	vcpu->cr0 = cr0;
-
-	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
-		vmx_fpu_activate(vcpu);
-}
-
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-	vmcs_writel(GUEST_CR3, cr3);
-	if (vcpu->cr0 & X86_CR0_PE)
-		vmx_fpu_deactivate(vcpu);
-}
-
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-	vmcs_writel(CR4_READ_SHADOW, cr4);
-	vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
-	vcpu->cr4 = cr4;
-}
-
-#ifdef CONFIG_X86_64
-
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
-	vcpu->shadow_efer = efer;
-	if (efer & EFER_LMA) {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-				     vmcs_read32(VM_ENTRY_CONTROLS) |
-				     VM_ENTRY_IA32E_MODE);
-		msr->data = efer;
-
-	} else {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-				     vmcs_read32(VM_ENTRY_CONTROLS) &
-				     ~VM_ENTRY_IA32E_MODE);
-
-		msr->data = efer & ~EFER_LME;
-	}
-	setup_msrs(vmx);
-}
-
-#endif
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	return vmcs_readl(sf->base);
-}
-
-static void vmx_get_segment(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-	u32 ar;
-
-	var->base = vmcs_readl(sf->base);
-	var->limit = vmcs_read32(sf->limit);
-	var->selector = vmcs_read16(sf->selector);
-	ar = vmcs_read32(sf->ar_bytes);
-	if (ar & AR_UNUSABLE_MASK)
-		ar = 0;
-	var->type = ar & 15;
-	var->s = (ar >> 4) & 1;
-	var->dpl = (ar >> 5) & 3;
-	var->present = (ar >> 7) & 1;
-	var->avl = (ar >> 12) & 1;
-	var->l = (ar >> 13) & 1;
-	var->db = (ar >> 14) & 1;
-	var->g = (ar >> 15) & 1;
-	var->unusable = (ar >> 16) & 1;
-}
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
-	u32 ar;
-
-	if (var->unusable)
-		ar = 1 << 16;
-	else {
-		ar = var->type & 15;
-		ar |= (var->s & 1) << 4;
-		ar |= (var->dpl & 3) << 5;
-		ar |= (var->present & 1) << 7;
-		ar |= (var->avl & 1) << 12;
-		ar |= (var->l & 1) << 13;
-		ar |= (var->db & 1) << 14;
-		ar |= (var->g & 1) << 15;
-	}
-	if (ar == 0) /* a 0 value means unusable */
-		ar = AR_UNUSABLE_MASK;
-
-	return ar;
-}
-
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
-			    struct kvm_segment *var, int seg)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-	u32 ar;
-
-	if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
-		vcpu->rmode.tr.selector = var->selector;
-		vcpu->rmode.tr.base = var->base;
-		vcpu->rmode.tr.limit = var->limit;
-		vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
-		return;
-	}
-	vmcs_writel(sf->base, var->base);
-	vmcs_write32(sf->limit, var->limit);
-	vmcs_write16(sf->selector, var->selector);
-	if (vcpu->rmode.active && var->s) {
-		/*
-		 * Hack real-mode segments into vm86 compatibility.
-		 */
-		if (var->base == 0xffff0000 && var->selector == 0xf000)
-			vmcs_writel(sf->base, 0xf0000);
-		ar = 0xf3;
-	} else
-		ar = vmx_segment_access_rights(var);
-	vmcs_write32(sf->ar_bytes, ar);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-	u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
-
-	*db = (ar >> 14) & 1;
-	*l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_IDTR_BASE, dt->base);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
-	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_GDTR_BASE, dt->base);
-}
-
-static int init_rmode_tss(struct kvm* kvm)
-{
-	struct page *p1, *p2, *p3;
-	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-	char *page;
-
-	p1 = gfn_to_page(kvm, fn++);
-	p2 = gfn_to_page(kvm, fn++);
-	p3 = gfn_to_page(kvm, fn);
-
-	if (!p1 || !p2 || !p3) {
-		kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
-		return 0;
-	}
-
-	page = kmap_atomic(p1, KM_USER0);
-	clear_page(page);
-	*(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	kunmap_atomic(page, KM_USER0);
-
-	page = kmap_atomic(p2, KM_USER0);
-	clear_page(page);
-	kunmap_atomic(page, KM_USER0);
-
-	page = kmap_atomic(p3, KM_USER0);
-	clear_page(page);
-	*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
-	kunmap_atomic(page, KM_USER0);
-
-	return 1;
-}
-
-static void seg_setup(int seg)
-{
-	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	vmcs_write16(sf->selector, 0);
-	vmcs_writel(sf->base, 0);
-	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0x93);
-}
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
-	u32 host_sysenter_cs;
-	u32 junk;
-	unsigned long a;
-	struct descriptor_table dt;
-	int i;
-	int ret = 0;
-	unsigned long kvm_vmx_return;
-	u64 msr;
-	u32 exec_control;
-
-	if (!init_rmode_tss(vmx->vcpu.kvm)) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	vmx->vcpu.rmode.active = 0;
-
-	vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	set_cr8(&vmx->vcpu, 0);
-	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-	if (vmx->vcpu.vcpu_id == 0)
-		msr |= MSR_IA32_APICBASE_BSP;
-	kvm_set_apic_base(&vmx->vcpu, msr);
-
-	fx_init(&vmx->vcpu);
-
-	/*
-	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
-	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
-	 */
-	if (vmx->vcpu.vcpu_id == 0) {
-		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
-	} else {
-		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
-		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
-	}
-	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-
-	seg_setup(VCPU_SREG_DS);
-	seg_setup(VCPU_SREG_ES);
-	seg_setup(VCPU_SREG_FS);
-	seg_setup(VCPU_SREG_GS);
-	seg_setup(VCPU_SREG_SS);
-
-	vmcs_write16(GUEST_TR_SELECTOR, 0);
-	vmcs_writel(GUEST_TR_BASE, 0);
-	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
-	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
-	vmcs_writel(GUEST_LDTR_BASE, 0);
-	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
-	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
-	vmcs_write32(GUEST_SYSENTER_CS, 0);
-	vmcs_writel(GUEST_SYSENTER_ESP, 0);
-	vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
-	vmcs_writel(GUEST_RFLAGS, 0x02);
-	if (vmx->vcpu.vcpu_id == 0)
-		vmcs_writel(GUEST_RIP, 0xfff0);
-	else
-		vmcs_writel(GUEST_RIP, 0);
-	vmcs_writel(GUEST_RSP, 0);
-
-	//todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
-	vmcs_writel(GUEST_DR7, 0x400);
-
-	vmcs_writel(GUEST_GDTR_BASE, 0);
-	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
-	vmcs_writel(GUEST_IDTR_BASE, 0);
-	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
-	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
-	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
-	/* I/O */
-	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
-	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
-
-	guest_write_tsc(0);
-
-	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
-	/* Special registers */
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
-	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
-
-	exec_control = vmcs_config.cpu_based_exec_ctrl;
-	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
-		exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_STORE_EXITING |
-				CPU_BASED_CR8_LOAD_EXITING;
-#endif
-	}
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
-	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
-
-	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
-	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
-	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
-
-	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
-	vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
-	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_FS_BASE, a);
-	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
-	rdmsrl(MSR_GS_BASE, a);
-	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
-	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
-	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
-	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-	get_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
-
-	asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
-	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
-	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
-	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
-
-	for (i = 0; i < NR_VMX_MSR; ++i) {
-		u32 index = vmx_msr_index[i];
-		u32 data_low, data_high;
-		u64 data;
-		int j = vmx->nmsrs;
-
-		if (rdmsr_safe(index, &data_low, &data_high) < 0)
-			continue;
-		if (wrmsr_safe(index, data_low, data_high) < 0)
-			continue;
-		data = data_low | ((u64)data_high << 32);
-		vmx->host_msrs[j].index = index;
-		vmx->host_msrs[j].reserved = 0;
-		vmx->host_msrs[j].data = data;
-		vmx->guest_msrs[j] = vmx->host_msrs[j];
-		++vmx->nmsrs;
-	}
-
-	setup_msrs(vmx);
-
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
-	/* 22.2.1, 20.8.1 */
-	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
-
-#ifdef CONFIG_X86_64
-	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-	if (vm_need_tpr_shadow(vmx->vcpu.kvm))
-		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-			     page_to_phys(vmx->vcpu.apic->regs_page));
-	vmcs_write32(TPR_THRESHOLD, 0);
-#endif
-
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
-	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
-
-	vmx->vcpu.cr0 = 0x60000010;
-	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
-	vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
-	vmx_set_efer(&vmx->vcpu, 0);
-#endif
-	vmx_fpu_activate(&vmx->vcpu);
-	update_exception_bitmap(&vmx->vcpu);
-
-	return 0;
-
-out:
-	return ret;
-}
-
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	vmx_vcpu_setup(vmx);
-}
-
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
-{
-	u16 ent[2];
-	u16 cs;
-	u16 ip;
-	unsigned long flags;
-	unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
-	u16 sp =  vmcs_readl(GUEST_RSP);
-	u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
-
-	if (sp > ss_limit || sp < 6 ) {
-		vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
-			    __FUNCTION__,
-			    vmcs_readl(GUEST_RSP),
-			    vmcs_readl(GUEST_SS_BASE),
-			    vmcs_read32(GUEST_SS_LIMIT));
-		return;
-	}
-
-	if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
-							X86EMUL_CONTINUE) {
-		vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
-		return;
-	}
-
-	flags =  vmcs_readl(GUEST_RFLAGS);
-	cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
-	ip =  vmcs_readl(GUEST_RIP);
-
-
-	if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
-	    emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
-	    emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
-		vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
-		return;
-	}
-
-	vmcs_writel(GUEST_RFLAGS, flags &
-		    ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
-	vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
-	vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
-	vmcs_writel(GUEST_RIP, ent[0]);
-	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
-{
-	if (vcpu->rmode.active) {
-		inject_rmode_irq(vcpu, irq);
-		return;
-	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-	vmx_inject_irq(vcpu, irq);
-}
-
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	u32 cpu_based_vm_exec_control;
-
-	vcpu->interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-
-	if (vcpu->interrupt_window_open &&
-	    vcpu->irq_summary &&
-	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
-		kvm_do_inject_irq(vcpu);
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	if (!vcpu->interrupt_window_open &&
-	    (vcpu->irq_summary || kvm_run->request_interrupt_window))
-		/*
-		 * Interrupts blocked.  Wait for unblock.
-		 */
-		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	else
-		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
-	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
-	set_debugreg(dbg->bp[0], 0);
-	set_debugreg(dbg->bp[1], 1);
-	set_debugreg(dbg->bp[2], 2);
-	set_debugreg(dbg->bp[3], 3);
-
-	if (dbg->singlestep) {
-		unsigned long flags;
-
-		flags = vmcs_readl(GUEST_RFLAGS);
-		flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-		vmcs_writel(GUEST_RFLAGS, flags);
-	}
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
-				  int vec, u32 err_code)
-{
-	if (!vcpu->rmode.active)
-		return 0;
-
-	/*
-	 * Instruction with address size override prefix opcode 0x67
-	 * Cause the #SS fault with 0 error code in VM86 mode.
-	 */
-	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-		if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
-			return 1;
-	return 0;
-}
-
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	u32 intr_info, error_code;
-	unsigned long cr2, rip;
-	u32 vect_info;
-	enum emulation_result er;
-	int r;
-
-	vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-						!is_page_fault(intr_info)) {
-		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-		       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
-	}
-
-	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
-		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		set_bit(irq, vcpu->irq_pending);
-		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
-	}
-
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
-		return 1;  /* already handled by vmx_vcpu_run() */
-
-	if (is_no_device(intr_info)) {
-		vmx_fpu_activate(vcpu);
-		return 1;
-	}
-
-	error_code = 0;
-	rip = vmcs_readl(GUEST_RIP);
-	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
-		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-	if (is_page_fault(intr_info)) {
-		cr2 = vmcs_readl(EXIT_QUALIFICATION);
-
-		mutex_lock(&vcpu->kvm->lock);
-		r = kvm_mmu_page_fault(vcpu, cr2, error_code);
-		if (r < 0) {
-			mutex_unlock(&vcpu->kvm->lock);
-			return r;
-		}
-		if (!r) {
-			mutex_unlock(&vcpu->kvm->lock);
-			return 1;
-		}
-
-		er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
-		mutex_unlock(&vcpu->kvm->lock);
-
-		switch (er) {
-		case EMULATE_DONE:
-			return 1;
-		case EMULATE_DO_MMIO:
-			++vcpu->stat.mmio_exits;
-			return 0;
-		 case EMULATE_FAIL:
-			kvm_report_emulation_failure(vcpu, "pagetable");
-			break;
-		default:
-			BUG();
-		}
-	}
-
-	if (vcpu->rmode.active &&
-	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
-								error_code)) {
-		if (vcpu->halt_request) {
-			vcpu->halt_request = 0;
-			return kvm_emulate_halt(vcpu);
-		}
-		return 1;
-	}
-
-	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
-		kvm_run->exit_reason = KVM_EXIT_DEBUG;
-		return 0;
-	}
-	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
-	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
-	kvm_run->ex.error_code = error_code;
-	return 0;
-}
-
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
-				     struct kvm_run *kvm_run)
-{
-	++vcpu->stat.irq_exits;
-	return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
-	return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	unsigned long exit_qualification;
-	int size, down, in, string, rep;
-	unsigned port;
-
-	++vcpu->stat.io_exits;
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	string = (exit_qualification & 16) != 0;
-
-	if (string) {
-		if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
-
-	size = (exit_qualification & 7) + 1;
-	in = (exit_qualification & 8) != 0;
-	down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
-	rep = (exit_qualification & 32) != 0;
-	port = exit_qualification >> 16;
-
-	return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-	/*
-	 * Patch in the VMCALL instruction:
-	 */
-	hypercall[0] = 0x0f;
-	hypercall[1] = 0x01;
-	hypercall[2] = 0xc1;
-	hypercall[3] = 0xc3;
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	unsigned long exit_qualification;
-	int cr;
-	int reg;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	cr = exit_qualification & 15;
-	reg = (exit_qualification >> 8) & 15;
-	switch ((exit_qualification >> 4) & 3) {
-	case 0: /* mov to cr */
-		switch (cr) {
-		case 0:
-			vcpu_load_rsp_rip(vcpu);
-			set_cr0(vcpu, vcpu->regs[reg]);
-			skip_emulated_instruction(vcpu);
-			return 1;
-		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			set_cr3(vcpu, vcpu->regs[reg]);
-			skip_emulated_instruction(vcpu);
-			return 1;
-		case 4:
-			vcpu_load_rsp_rip(vcpu);
-			set_cr4(vcpu, vcpu->regs[reg]);
-			skip_emulated_instruction(vcpu);
-			return 1;
-		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			set_cr8(vcpu, vcpu->regs[reg]);
-			skip_emulated_instruction(vcpu);
-			kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-			return 0;
-		};
-		break;
-	case 2: /* clts */
-		vcpu_load_rsp_rip(vcpu);
-		vmx_fpu_deactivate(vcpu);
-		vcpu->cr0 &= ~X86_CR0_TS;
-		vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
-		vmx_fpu_activate(vcpu);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	case 1: /*mov from cr*/
-		switch (cr) {
-		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->regs[reg] = vcpu->cr3;
-			vcpu_put_rsp_rip(vcpu);
-			skip_emulated_instruction(vcpu);
-			return 1;
-		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->regs[reg] = get_cr8(vcpu);
-			vcpu_put_rsp_rip(vcpu);
-			skip_emulated_instruction(vcpu);
-			return 1;
-		}
-		break;
-	case 3: /* lmsw */
-		lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
-
-		skip_emulated_instruction(vcpu);
-		return 1;
-	default:
-		break;
-	}
-	kvm_run->exit_reason = 0;
-	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
-	       (int)(exit_qualification >> 4) & 3, cr);
-	return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	unsigned long exit_qualification;
-	unsigned long val;
-	int dr, reg;
-
-	/*
-	 * FIXME: this code assumes the host is debugging the guest.
-	 *        need to deal with guest debugging itself too.
-	 */
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	dr = exit_qualification & 7;
-	reg = (exit_qualification >> 8) & 15;
-	vcpu_load_rsp_rip(vcpu);
-	if (exit_qualification & 16) {
-		/* mov from dr */
-		switch (dr) {
-		case 6:
-			val = 0xffff0ff0;
-			break;
-		case 7:
-			val = 0x400;
-			break;
-		default:
-			val = 0;
-		}
-		vcpu->regs[reg] = val;
-	} else {
-		/* mov to dr */
-	}
-	vcpu_put_rsp_rip(vcpu);
-	skip_emulated_instruction(vcpu);
-	return 1;
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	kvm_emulate_cpuid(vcpu);
-	return 1;
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
-	u64 data;
-
-	if (vmx_get_msr(vcpu, ecx, &data)) {
-		vmx_inject_gp(vcpu, 0);
-		return 1;
-	}
-
-	/* FIXME: handling of bits 32:63 of rax, rdx */
-	vcpu->regs[VCPU_REGS_RAX] = data & -1u;
-	vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
-	skip_emulated_instruction(vcpu);
-	return 1;
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
-	u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
-		| ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
-
-	if (vmx_set_msr(vcpu, ecx, data) != 0) {
-		vmx_inject_gp(vcpu, 0);
-		return 1;
-	}
-
-	skip_emulated_instruction(vcpu);
-	return 1;
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
-				      struct kvm_run *kvm_run)
-{
-	return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
-				   struct kvm_run *kvm_run)
-{
-	u32 cpu_based_vm_exec_control;
-
-	/* clear pending irq */
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-	/*
-	 * If the user space waits to inject interrupts, exit as soon as
-	 * possible
-	 */
-	if (kvm_run->request_interrupt_window &&
-	    !vcpu->irq_summary) {
-		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		++vcpu->stat.irq_window_exits;
-		return 0;
-	}
-	return 1;
-}
-
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	skip_emulated_instruction(vcpu);
-	return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	skip_emulated_instruction(vcpu);
-	return kvm_hypercall(vcpu, kvm_run);
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
-				      struct kvm_run *kvm_run) = {
-	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
-	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
-	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
-	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
-	[EXIT_REASON_CR_ACCESS]               = handle_cr,
-	[EXIT_REASON_DR_ACCESS]               = handle_dr,
-	[EXIT_REASON_CPUID]                   = handle_cpuid,
-	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
-	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
-	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
-	[EXIT_REASON_HLT]                     = handle_halt,
-	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold
-};
-
-static const int kvm_vmx_max_exit_handlers =
-	ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-/*
- * The guest has exited.  See if we can fix it or if we need userspace
- * assistance.
- */
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-	u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (unlikely(vmx->fail)) {
-		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		kvm_run->fail_entry.hardware_entry_failure_reason
-			= vmcs_read32(VM_INSTRUCTION_ERROR);
-		return 0;
-	}
-
-	if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
-				exit_reason != EXIT_REASON_EXCEPTION_NMI )
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-		       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
-	if (exit_reason < kvm_vmx_max_exit_handlers
-	    && kvm_vmx_exit_handlers[exit_reason])
-		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
-	else {
-		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-		kvm_run->hw.hardware_exit_reason = exit_reason;
-	}
-	return 0;
-}
-
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
-{
-	int max_irr, tpr;
-
-	if (!vm_need_tpr_shadow(vcpu->kvm))
-		return;
-
-	if (!kvm_lapic_enabled(vcpu) ||
-	    ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
-		vmcs_write32(TPR_THRESHOLD, 0);
-		return;
-	}
-
-	tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
-	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-{
-	u32 idtv_info_field, intr_info_field;
-	int has_ext_irq, interrupt_window_open;
-	int vector;
-
-	kvm_inject_pending_timer_irqs(vcpu);
-	update_tpr_threshold(vcpu);
-
-	has_ext_irq = kvm_cpu_has_interrupt(vcpu);
-	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-	idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	if (intr_info_field & INTR_INFO_VALID_MASK) {
-		if (idtv_info_field & INTR_INFO_VALID_MASK) {
-			/* TODO: fault when IDT_Vectoring */
-			printk(KERN_ERR "Fault when IDT_Vectoring\n");
-		}
-		if (has_ext_irq)
-			enable_irq_window(vcpu);
-		return;
-	}
-	if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
-		if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
-			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-				vmcs_read32(IDT_VECTORING_ERROR_CODE));
-		if (unlikely(has_ext_irq))
-			enable_irq_window(vcpu);
-		return;
-	}
-	if (!has_ext_irq)
-		return;
-	interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-	if (interrupt_window_open) {
-		vector = kvm_cpu_get_interrupt(vcpu);
-		vmx_inject_irq(vcpu, vector);
-		kvm_timer_intr_post(vcpu, vector);
-	} else
-		enable_irq_window(vcpu);
-}
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 intr_info;
-
-	/*
-	 * Loading guest fpu may have cleared host cr0.ts
-	 */
-	vmcs_writel(HOST_CR0, read_cr0());
-
-	asm (
-		/* Store host registers */
-#ifdef CONFIG_X86_64
-		"push %%rax; push %%rbx; push %%rdx;"
-		"push %%rsi; push %%rdi; push %%rbp;"
-		"push %%r8;  push %%r9;  push %%r10; push %%r11;"
-		"push %%r12; push %%r13; push %%r14; push %%r15;"
-		"push %%rcx \n\t"
-		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
-#else
-		"pusha; push %%ecx \n\t"
-		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
-#endif
-		/* Check if vmlaunch of vmresume is needed */
-		"cmp $0, %1 \n\t"
-		/* Load guest registers.  Don't clobber flags. */
-#ifdef CONFIG_X86_64
-		"mov %c[cr2](%3), %%rax \n\t"
-		"mov %%rax, %%cr2 \n\t"
-		"mov %c[rax](%3), %%rax \n\t"
-		"mov %c[rbx](%3), %%rbx \n\t"
-		"mov %c[rdx](%3), %%rdx \n\t"
-		"mov %c[rsi](%3), %%rsi \n\t"
-		"mov %c[rdi](%3), %%rdi \n\t"
-		"mov %c[rbp](%3), %%rbp \n\t"
-		"mov %c[r8](%3),  %%r8  \n\t"
-		"mov %c[r9](%3),  %%r9  \n\t"
-		"mov %c[r10](%3), %%r10 \n\t"
-		"mov %c[r11](%3), %%r11 \n\t"
-		"mov %c[r12](%3), %%r12 \n\t"
-		"mov %c[r13](%3), %%r13 \n\t"
-		"mov %c[r14](%3), %%r14 \n\t"
-		"mov %c[r15](%3), %%r15 \n\t"
-		"mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
-#else
-		"mov %c[cr2](%3), %%eax \n\t"
-		"mov %%eax,   %%cr2 \n\t"
-		"mov %c[rax](%3), %%eax \n\t"
-		"mov %c[rbx](%3), %%ebx \n\t"
-		"mov %c[rdx](%3), %%edx \n\t"
-		"mov %c[rsi](%3), %%esi \n\t"
-		"mov %c[rdi](%3), %%edi \n\t"
-		"mov %c[rbp](%3), %%ebp \n\t"
-		"mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
-#endif
-		/* Enter guest mode */
-		"jne .Llaunched \n\t"
-		ASM_VMX_VMLAUNCH "\n\t"
-		"jmp .Lkvm_vmx_return \n\t"
-		".Llaunched: " ASM_VMX_VMRESUME "\n\t"
-		".Lkvm_vmx_return: "
-		/* Save guest registers, load host registers, keep flags */
-#ifdef CONFIG_X86_64
-		"xchg %3,     (%%rsp) \n\t"
-		"mov %%rax, %c[rax](%3) \n\t"
-		"mov %%rbx, %c[rbx](%3) \n\t"
-		"pushq (%%rsp); popq %c[rcx](%3) \n\t"
-		"mov %%rdx, %c[rdx](%3) \n\t"
-		"mov %%rsi, %c[rsi](%3) \n\t"
-		"mov %%rdi, %c[rdi](%3) \n\t"
-		"mov %%rbp, %c[rbp](%3) \n\t"
-		"mov %%r8,  %c[r8](%3) \n\t"
-		"mov %%r9,  %c[r9](%3) \n\t"
-		"mov %%r10, %c[r10](%3) \n\t"
-		"mov %%r11, %c[r11](%3) \n\t"
-		"mov %%r12, %c[r12](%3) \n\t"
-		"mov %%r13, %c[r13](%3) \n\t"
-		"mov %%r14, %c[r14](%3) \n\t"
-		"mov %%r15, %c[r15](%3) \n\t"
-		"mov %%cr2, %%rax   \n\t"
-		"mov %%rax, %c[cr2](%3) \n\t"
-		"mov (%%rsp), %3 \n\t"
-
-		"pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-		"pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-		"pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-		"pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
-#else
-		"xchg %3, (%%esp) \n\t"
-		"mov %%eax, %c[rax](%3) \n\t"
-		"mov %%ebx, %c[rbx](%3) \n\t"
-		"pushl (%%esp); popl %c[rcx](%3) \n\t"
-		"mov %%edx, %c[rdx](%3) \n\t"
-		"mov %%esi, %c[rsi](%3) \n\t"
-		"mov %%edi, %c[rdi](%3) \n\t"
-		"mov %%ebp, %c[rbp](%3) \n\t"
-		"mov %%cr2, %%eax  \n\t"
-		"mov %%eax, %c[cr2](%3) \n\t"
-		"mov (%%esp), %3 \n\t"
-
-		"pop %%ecx; popa \n\t"
-#endif
-		"setbe %0 \n\t"
-	      : "=q" (vmx->fail)
-	      : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
-		"c"(vcpu),
-		[rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
-		[rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
-		[rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
-		[rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
-		[rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
-		[rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
-		[rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
-		[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
-		[r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
-		[r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
-		[r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
-		[r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
-		[r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
-		[r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
-		[r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
-#endif
-		[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
-	      : "cc", "memory" );
-
-	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
-
-	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-	vmx->launched = 1;
-
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
-		asm("int $2");
-}
-
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
-				  unsigned long addr,
-				  u32 err_code)
-{
-	u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-	++vcpu->stat.pf_guest;
-
-	if (is_page_fault(vect_info)) {
-		printk(KERN_DEBUG "inject_page_fault: "
-		       "double fault 0x%lx @ 0x%lx\n",
-		       addr, vmcs_readl(GUEST_RIP));
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     DF_VECTOR |
-			     INTR_TYPE_EXCEPTION |
-			     INTR_INFO_DELIEVER_CODE_MASK |
-			     INTR_INFO_VALID_MASK);
-		return;
-	}
-	vcpu->cr2 = addr;
-	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-		     PF_VECTOR |
-		     INTR_TYPE_EXCEPTION |
-		     INTR_INFO_DELIEVER_CODE_MASK |
-		     INTR_INFO_VALID_MASK);
-
-}
-
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->vmcs) {
-		on_each_cpu(__vcpu_clear, vmx, 0, 1);
-		free_vmcs(vmx->vmcs);
-		vmx->vmcs = NULL;
-	}
-}
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	vmx_free_vmcs(vcpu);
-	kfree(vmx->host_msrs);
-	kfree(vmx->guest_msrs);
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
-	int err;
-	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	int cpu;
-
-	if (!vmx)
-		return ERR_PTR(-ENOMEM);
-
-	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
-
-	if (irqchip_in_kernel(kvm)) {
-		err = kvm_create_lapic(&vmx->vcpu);
-		if (err < 0)
-			goto free_vcpu;
-	}
-
-	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!vmx->guest_msrs) {
-		err = -ENOMEM;
-		goto uninit_vcpu;
-	}
-
-	vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!vmx->host_msrs)
-		goto free_guest_msrs;
-
-	vmx->vmcs = alloc_vmcs();
-	if (!vmx->vmcs)
-		goto free_msrs;
-
-	vmcs_clear(vmx->vmcs);
-
-	cpu = get_cpu();
-	vmx_vcpu_load(&vmx->vcpu, cpu);
-	err = vmx_vcpu_setup(vmx);
-	vmx_vcpu_put(&vmx->vcpu);
-	put_cpu();
-	if (err)
-		goto free_vmcs;
-
-	return &vmx->vcpu;
-
-free_vmcs:
-	free_vmcs(vmx->vmcs);
-free_msrs:
-	kfree(vmx->host_msrs);
-free_guest_msrs:
-	kfree(vmx->guest_msrs);
-uninit_vcpu:
-	kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vmx);
-	return ERR_PTR(err);
-}
-
-static void __init vmx_check_processor_compat(void *rtn)
-{
-	struct vmcs_config vmcs_conf;
-
-	*(int *)rtn = 0;
-	if (setup_vmcs_config(&vmcs_conf) < 0)
-		*(int *)rtn = -EIO;
-	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
-		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
-				smp_processor_id());
-		*(int *)rtn = -EIO;
-	}
-}
-
-static struct kvm_x86_ops vmx_x86_ops = {
-	.cpu_has_kvm_support = cpu_has_kvm_support,
-	.disabled_by_bios = vmx_disabled_by_bios,
-	.hardware_setup = hardware_setup,
-	.hardware_unsetup = hardware_unsetup,
-	.check_processor_compatibility = vmx_check_processor_compat,
-	.hardware_enable = hardware_enable,
-	.hardware_disable = hardware_disable,
-
-	.vcpu_create = vmx_create_vcpu,
-	.vcpu_free = vmx_free_vcpu,
-	.vcpu_reset = vmx_vcpu_reset,
-
-	.prepare_guest_switch = vmx_save_host_state,
-	.vcpu_load = vmx_vcpu_load,
-	.vcpu_put = vmx_vcpu_put,
-	.vcpu_decache = vmx_vcpu_decache,
-
-	.set_guest_debug = set_guest_debug,
-	.guest_debug_pre = kvm_guest_debug_pre,
-	.get_msr = vmx_get_msr,
-	.set_msr = vmx_set_msr,
-	.get_segment_base = vmx_get_segment_base,
-	.get_segment = vmx_get_segment,
-	.set_segment = vmx_set_segment,
-	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
-	.set_cr0 = vmx_set_cr0,
-	.set_cr3 = vmx_set_cr3,
-	.set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
-	.set_efer = vmx_set_efer,
-#endif
-	.get_idt = vmx_get_idt,
-	.set_idt = vmx_set_idt,
-	.get_gdt = vmx_get_gdt,
-	.set_gdt = vmx_set_gdt,
-	.cache_regs = vcpu_load_rsp_rip,
-	.decache_regs = vcpu_put_rsp_rip,
-	.get_rflags = vmx_get_rflags,
-	.set_rflags = vmx_set_rflags,
-
-	.tlb_flush = vmx_flush_tlb,
-	.inject_page_fault = vmx_inject_page_fault,
-
-	.inject_gp = vmx_inject_gp,
-
-	.run = vmx_vcpu_run,
-	.handle_exit = kvm_handle_exit,
-	.skip_emulated_instruction = skip_emulated_instruction,
-	.patch_hypercall = vmx_patch_hypercall,
-	.get_irq = vmx_get_irq,
-	.set_irq = vmx_inject_irq,
-	.inject_pending_irq = vmx_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
-};
-
-static int __init vmx_init(void)
-{
-	void *iova;
-	int r;
-
-	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-	if (!vmx_io_bitmap_a)
-		return -ENOMEM;
-
-	vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-	if (!vmx_io_bitmap_b) {
-		r = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * Allow direct access to the PC debug port (it is often used for I/O
-	 * delays, but the vmexits simply slow things down).
-	 */
-	iova = kmap(vmx_io_bitmap_a);
-	memset(iova, 0xff, PAGE_SIZE);
-	clear_bit(0x80, iova);
-	kunmap(vmx_io_bitmap_a);
-
-	iova = kmap(vmx_io_bitmap_b);
-	memset(iova, 0xff, PAGE_SIZE);
-	kunmap(vmx_io_bitmap_b);
-
-	r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
-	if (r)
-		goto out1;
-
-	return 0;
-
-out1:
-	__free_page(vmx_io_bitmap_b);
-out:
-	__free_page(vmx_io_bitmap_a);
-	return r;
-}
-
-static void __exit vmx_exit(void)
-{
-	__free_page(vmx_io_bitmap_b);
-	__free_page(vmx_io_bitmap_a);
-
-	kvm_exit_x86();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
diff -puN drivers/kvm/vmx.h~git-kvm /dev/null
--- a/drivers/kvm/vmx.h
+++ /dev/null
@@ -1,310 +0,0 @@
-#ifndef VMX_H
-#define VMX_H
-
-/*
- * vmx.h: VMX Architecture related definitions
- * Copyright (c) 2004, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * A few random additions are:
- * Copyright (C) 2006 Qumranet
- *    Avi Kivity <avi@qumranet.com>
- *    Yaniv Kamay <yaniv@qumranet.com>
- *
- */
-
-#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
-#define CPU_BASED_HLT_EXITING                   0x00000080
-#define CPU_BASED_INVLPG_EXITING                0x00000200
-#define CPU_BASED_MWAIT_EXITING                 0x00000400
-#define CPU_BASED_RDPMC_EXITING                 0x00000800
-#define CPU_BASED_RDTSC_EXITING                 0x00001000
-#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
-#define CPU_BASED_CR8_STORE_EXITING             0x00100000
-#define CPU_BASED_TPR_SHADOW                    0x00200000
-#define CPU_BASED_MOV_DR_EXITING                0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
-#define CPU_BASED_USE_IO_BITMAPS                0x02000000
-#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
-#define CPU_BASED_MONITOR_EXITING               0x20000000
-#define CPU_BASED_PAUSE_EXITING                 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
-
-#define PIN_BASED_EXT_INTR_MASK                 0x00000001
-#define PIN_BASED_NMI_EXITING                   0x00000008
-#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
-
-#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
-#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
-
-#define VM_ENTRY_IA32E_MODE                     0x00000200
-#define VM_ENTRY_SMM                            0x00000400
-#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
-
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-
-/* VMCS Encodings */
-enum vmcs_field {
-	GUEST_ES_SELECTOR               = 0x00000800,
-	GUEST_CS_SELECTOR               = 0x00000802,
-	GUEST_SS_SELECTOR               = 0x00000804,
-	GUEST_DS_SELECTOR               = 0x00000806,
-	GUEST_FS_SELECTOR               = 0x00000808,
-	GUEST_GS_SELECTOR               = 0x0000080a,
-	GUEST_LDTR_SELECTOR             = 0x0000080c,
-	GUEST_TR_SELECTOR               = 0x0000080e,
-	HOST_ES_SELECTOR                = 0x00000c00,
-	HOST_CS_SELECTOR                = 0x00000c02,
-	HOST_SS_SELECTOR                = 0x00000c04,
-	HOST_DS_SELECTOR                = 0x00000c06,
-	HOST_FS_SELECTOR                = 0x00000c08,
-	HOST_GS_SELECTOR                = 0x00000c0a,
-	HOST_TR_SELECTOR                = 0x00000c0c,
-	IO_BITMAP_A                     = 0x00002000,
-	IO_BITMAP_A_HIGH                = 0x00002001,
-	IO_BITMAP_B                     = 0x00002002,
-	IO_BITMAP_B_HIGH                = 0x00002003,
-	MSR_BITMAP                      = 0x00002004,
-	MSR_BITMAP_HIGH                 = 0x00002005,
-	VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
-	VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
-	VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
-	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
-	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
-	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
-	TSC_OFFSET                      = 0x00002010,
-	TSC_OFFSET_HIGH                 = 0x00002011,
-	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
-	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
-	VMCS_LINK_POINTER               = 0x00002800,
-	VMCS_LINK_POINTER_HIGH          = 0x00002801,
-	GUEST_IA32_DEBUGCTL             = 0x00002802,
-	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
-	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
-	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
-	EXCEPTION_BITMAP                = 0x00004004,
-	PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
-	PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
-	CR3_TARGET_COUNT                = 0x0000400a,
-	VM_EXIT_CONTROLS                = 0x0000400c,
-	VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
-	VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
-	VM_ENTRY_CONTROLS               = 0x00004012,
-	VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
-	VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
-	VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
-	VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
-	TPR_THRESHOLD                   = 0x0000401c,
-	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
-	VM_INSTRUCTION_ERROR            = 0x00004400,
-	VM_EXIT_REASON                  = 0x00004402,
-	VM_EXIT_INTR_INFO               = 0x00004404,
-	VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
-	IDT_VECTORING_INFO_FIELD        = 0x00004408,
-	IDT_VECTORING_ERROR_CODE        = 0x0000440a,
-	VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
-	VMX_INSTRUCTION_INFO            = 0x0000440e,
-	GUEST_ES_LIMIT                  = 0x00004800,
-	GUEST_CS_LIMIT                  = 0x00004802,
-	GUEST_SS_LIMIT                  = 0x00004804,
-	GUEST_DS_LIMIT                  = 0x00004806,
-	GUEST_FS_LIMIT                  = 0x00004808,
-	GUEST_GS_LIMIT                  = 0x0000480a,
-	GUEST_LDTR_LIMIT                = 0x0000480c,
-	GUEST_TR_LIMIT                  = 0x0000480e,
-	GUEST_GDTR_LIMIT                = 0x00004810,
-	GUEST_IDTR_LIMIT                = 0x00004812,
-	GUEST_ES_AR_BYTES               = 0x00004814,
-	GUEST_CS_AR_BYTES               = 0x00004816,
-	GUEST_SS_AR_BYTES               = 0x00004818,
-	GUEST_DS_AR_BYTES               = 0x0000481a,
-	GUEST_FS_AR_BYTES               = 0x0000481c,
-	GUEST_GS_AR_BYTES               = 0x0000481e,
-	GUEST_LDTR_AR_BYTES             = 0x00004820,
-	GUEST_TR_AR_BYTES               = 0x00004822,
-	GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
-	GUEST_ACTIVITY_STATE            = 0X00004826,
-	GUEST_SYSENTER_CS               = 0x0000482A,
-	HOST_IA32_SYSENTER_CS           = 0x00004c00,
-	CR0_GUEST_HOST_MASK             = 0x00006000,
-	CR4_GUEST_HOST_MASK             = 0x00006002,
-	CR0_READ_SHADOW                 = 0x00006004,
-	CR4_READ_SHADOW                 = 0x00006006,
-	CR3_TARGET_VALUE0               = 0x00006008,
-	CR3_TARGET_VALUE1               = 0x0000600a,
-	CR3_TARGET_VALUE2               = 0x0000600c,
-	CR3_TARGET_VALUE3               = 0x0000600e,
-	EXIT_QUALIFICATION              = 0x00006400,
-	GUEST_LINEAR_ADDRESS            = 0x0000640a,
-	GUEST_CR0                       = 0x00006800,
-	GUEST_CR3                       = 0x00006802,
-	GUEST_CR4                       = 0x00006804,
-	GUEST_ES_BASE                   = 0x00006806,
-	GUEST_CS_BASE                   = 0x00006808,
-	GUEST_SS_BASE                   = 0x0000680a,
-	GUEST_DS_BASE                   = 0x0000680c,
-	GUEST_FS_BASE                   = 0x0000680e,
-	GUEST_GS_BASE                   = 0x00006810,
-	GUEST_LDTR_BASE                 = 0x00006812,
-	GUEST_TR_BASE                   = 0x00006814,
-	GUEST_GDTR_BASE                 = 0x00006816,
-	GUEST_IDTR_BASE                 = 0x00006818,
-	GUEST_DR7                       = 0x0000681a,
-	GUEST_RSP                       = 0x0000681c,
-	GUEST_RIP                       = 0x0000681e,
-	GUEST_RFLAGS                    = 0x00006820,
-	GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
-	GUEST_SYSENTER_ESP              = 0x00006824,
-	GUEST_SYSENTER_EIP              = 0x00006826,
-	HOST_CR0                        = 0x00006c00,
-	HOST_CR3                        = 0x00006c02,
-	HOST_CR4                        = 0x00006c04,
-	HOST_FS_BASE                    = 0x00006c06,
-	HOST_GS_BASE                    = 0x00006c08,
-	HOST_TR_BASE                    = 0x00006c0a,
-	HOST_GDTR_BASE                  = 0x00006c0c,
-	HOST_IDTR_BASE                  = 0x00006c0e,
-	HOST_IA32_SYSENTER_ESP          = 0x00006c10,
-	HOST_IA32_SYSENTER_EIP          = 0x00006c12,
-	HOST_RSP                        = 0x00006c14,
-	HOST_RIP                        = 0x00006c16,
-};
-
-#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
-
-#define EXIT_REASON_EXCEPTION_NMI       0
-#define EXIT_REASON_EXTERNAL_INTERRUPT  1
-#define EXIT_REASON_TRIPLE_FAULT        2
-
-#define EXIT_REASON_PENDING_INTERRUPT   7
-
-#define EXIT_REASON_TASK_SWITCH         9
-#define EXIT_REASON_CPUID               10
-#define EXIT_REASON_HLT                 12
-#define EXIT_REASON_INVLPG              14
-#define EXIT_REASON_RDPMC               15
-#define EXIT_REASON_RDTSC               16
-#define EXIT_REASON_VMCALL              18
-#define EXIT_REASON_VMCLEAR             19
-#define EXIT_REASON_VMLAUNCH            20
-#define EXIT_REASON_VMPTRLD             21
-#define EXIT_REASON_VMPTRST             22
-#define EXIT_REASON_VMREAD              23
-#define EXIT_REASON_VMRESUME            24
-#define EXIT_REASON_VMWRITE             25
-#define EXIT_REASON_VMOFF               26
-#define EXIT_REASON_VMON                27
-#define EXIT_REASON_CR_ACCESS           28
-#define EXIT_REASON_DR_ACCESS           29
-#define EXIT_REASON_IO_INSTRUCTION      30
-#define EXIT_REASON_MSR_READ            31
-#define EXIT_REASON_MSR_WRITE           32
-#define EXIT_REASON_MWAIT_INSTRUCTION   36
-#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
-
-/*
- * Interruption-information format
- */
-#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
-#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
-#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
-
-#define VECTORING_INFO_VECTOR_MASK           	INTR_INFO_VECTOR_MASK
-#define VECTORING_INFO_TYPE_MASK        	INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK    	INTR_INFO_DELIEVER_CODE_MASK
-#define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
-
-#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
-#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
-
-/*
- * Exit Qualifications for MOV for Control Register Access
- */
-#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control register */
-#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose register */
-#define LMSW_SOURCE_DATA_SHIFT 16
-#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
-#define REG_EAX                         (0 << 8)
-#define REG_ECX                         (1 << 8)
-#define REG_EDX                         (2 << 8)
-#define REG_EBX                         (3 << 8)
-#define REG_ESP                         (4 << 8)
-#define REG_EBP                         (5 << 8)
-#define REG_ESI                         (6 << 8)
-#define REG_EDI                         (7 << 8)
-#define REG_R8                         (8 << 8)
-#define REG_R9                         (9 << 8)
-#define REG_R10                        (10 << 8)
-#define REG_R11                        (11 << 8)
-#define REG_R12                        (12 << 8)
-#define REG_R13                        (13 << 8)
-#define REG_R14                        (14 << 8)
-#define REG_R15                        (15 << 8)
-
-/*
- * Exit Qualifications for MOV for Debug Register Access
- */
-#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug register */
-#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
-#define TYPE_MOV_TO_DR                  (0 << 4)
-#define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose register */
-
-
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
-
-#define MSR_IA32_VMX_BASIC                      0x480
-#define MSR_IA32_VMX_PINBASED_CTLS              0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
-#define MSR_IA32_VMX_EXIT_CTLS                  0x483
-#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
-#define MSR_IA32_VMX_MISC                       0x485
-#define MSR_IA32_VMX_CR0_FIXED0                 0x486
-#define MSR_IA32_VMX_CR0_FIXED1                 0x487
-#define MSR_IA32_VMX_CR4_FIXED0                 0x488
-#define MSR_IA32_VMX_CR4_FIXED1                 0x489
-#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
-#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
-
-#define MSR_IA32_FEATURE_CONTROL                0x3a
-#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
-
-#endif
diff -puN drivers/kvm/x86_emulate.c~git-kvm /dev/null
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- *   Avi Kivity <avi@qumranet.com>
- *   Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#else
-#include "kvm.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<0)	/* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)	/* Register operand. */
-#define DstMem      (3<<1)	/* Memory operand. */
-#define DstMask     (3<<1)
-/* Source operand type. */
-#define SrcNone     (0<<3)	/* No source operand. */
-#define SrcImplicit (0<<3)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)	/* Register operand. */
-#define SrcMem      (2<<3)	/* Memory operand. */
-#define SrcMem16    (3<<3)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<3)	/* Immediate operand. */
-#define SrcImmByte  (6<<3)	/* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
-/* Generic ModRM decode. */
-#define ModRM       (1<<6)
-/* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-
-static u8 opcode_table[256] = {
-	/* 0x00 - 0x07 */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x08 - 0x0F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x10 - 0x17 */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x18 - 0x1F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x20 - 0x27 */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	SrcImmByte, SrcImm, 0, 0,
-	/* 0x28 - 0x2F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x30 - 0x37 */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x38 - 0x3F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
-	/* 0x40 - 0x4F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x50 - 0x57 */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x58 - 0x5F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x60 - 0x67 */
-	0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-	0, 0, 0, 0,
-	/* 0x68 - 0x6F */
-	0, 0, ImplicitOps|Mov, 0,
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
-	/* 0x70 - 0x77 */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x78 - 0x7F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x80 - 0x87 */
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	/* 0x88 - 0x8F */
-	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
-	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
-	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
-	/* 0xA0 - 0xA7 */
-	ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
-	ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
-	ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-	ByteOp | ImplicitOps, ImplicitOps,
-	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-	ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-	ByteOp | ImplicitOps, ImplicitOps,
-	/* 0xB0 - 0xBF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xC0 - 0xC7 */
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-	0, ImplicitOps, 0, 0,
-	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
-	/* 0xC8 - 0xCF */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xD0 - 0xD7 */
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	0, 0, 0, 0,
-	/* 0xD8 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE0 - 0xE7 */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE8 - 0xEF */
-	ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
-	/* 0xF0 - 0xF7 */
-	0, 0, 0, 0,
-	ImplicitOps, 0,
-	ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
-	/* 0xF8 - 0xFF */
-	0, 0, 0, 0,
-	0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-};
-
-static u16 twobyte_table[256] = {
-	/* 0x00 - 0x0F */
-	0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
-	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
-	/* 0x10 - 0x1F */
-	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x20 - 0x2F */
-	ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x30 - 0x3F */
-	ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x40 - 0x47 */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	/* 0x48 - 0x4F */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	/* 0x50 - 0x5F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x60 - 0x6F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x70 - 0x7F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x80 - 0x8F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xA0 - 0xA7 */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-	/* 0xA8 - 0xAF */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-	/* 0xB0 - 0xB7 */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
-	    DstMem | SrcReg | ModRM | BitOp,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
-	/* 0xB8 - 0xBF */
-	0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
-	/* 0xC0 - 0xCF */
-	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xD0 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE0 - 0xEF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xF0 - 0xFF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
-	enum { OP_REG, OP_MEM, OP_IMM } type;
-	unsigned int bytes;
-	unsigned long val, orig_val, *ptr;
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"		/* force 32-bit operand */
-#define _STK  "%%rsp"		/* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""		/* force 32-bit operand */
-#define _STK  "%%esp"		/* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
-	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */	\
-	"push %"_sav"; "					\
-	"movl %"_msk",%"_LO32 _tmp"; "				\
-	"andl %"_LO32 _tmp",("_STK"); "				\
-	"pushf; "						\
-	"notl %"_LO32 _tmp"; "					\
-	"andl %"_LO32 _tmp",("_STK"); "				\
-	"pop  %"_tmp"; "					\
-	"orl  %"_LO32 _tmp",("_STK"); "				\
-	"popf; "						\
-	/* _sav &= ~msk; */					\
-	"movl %"_msk",%"_LO32 _tmp"; "				\
-	"notl %"_LO32 _tmp"; "					\
-	"andl %"_LO32 _tmp",%"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-	/* _sav |= EFLAGS & _msk; */		\
-	"pushf; "				\
-	"pop  %"_tmp"; "			\
-	"andl %"_msk",%"_LO32 _tmp"; "		\
-	"orl  %"_LO32 _tmp",%"_sav"; "
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do { 								    \
-		unsigned long _tmp;					    \
-									    \
-		switch ((_dst).bytes) {					    \
-		case 2:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0","4","2")		    \
-				_op"w %"_wx"3,%1; "			    \
-				_POST_EFLAGS("0","4","2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),        \
-				  "=&r" (_tmp)				    \
-				: _wy ((_src).val), "i" (EFLAGS_MASK) );    \
-			break;						    \
-		case 4:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0","4","2")		    \
-				_op"l %"_lx"3,%1; "			    \
-				_POST_EFLAGS("0","4","2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),	    \
-				  "=&r" (_tmp)				    \
-				: _ly ((_src).val), "i" (EFLAGS_MASK) );    \
-			break;						    \
-		case 8:							    \
-			__emulate_2op_8byte(_op, _src, _dst,		    \
-					    _eflags, _qx, _qy);		    \
-			break;						    \
-		}							    \
-	} while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do {								     \
-		unsigned long _tmp;					     \
-		switch ( (_dst).bytes )					     \
-		{							     \
-		case 1:							     \
-			__asm__ __volatile__ (				     \
-				_PRE_EFLAGS("0","4","2")		     \
-				_op"b %"_bx"3,%1; "			     \
-				_POST_EFLAGS("0","4","2")		     \
-				: "=m" (_eflags), "=m" ((_dst).val),	     \
-				  "=&r" (_tmp)				     \
-				: _by ((_src).val), "i" (EFLAGS_MASK) );     \
-			break;						     \
-		default:						     \
-			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
-					     _wx, _wy, _lx, _ly, _qx, _qy);  \
-			break;						     \
-		}							     \
-	} while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
-	__emulate_2op(_op, _src, _dst, _eflags,				\
-		      "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
-	__emulate_2op(_op, _src, _dst, _eflags,				\
-		      "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
-	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
-			     "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
-	do {								\
-		unsigned long _tmp;					\
-									\
-		switch ( (_dst).bytes )					\
-		{							\
-		case 1:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0","3","2")		\
-				_op"b %1; "				\
-				_POST_EFLAGS("0","3","2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK) );			\
-			break;						\
-		case 2:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0","3","2")		\
-				_op"w %1; "				\
-				_POST_EFLAGS("0","3","2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK) );			\
-			break;						\
-		case 4:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0","3","2")		\
-				_op"l %1; "				\
-				_POST_EFLAGS("0","3","2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK) );			\
-			break;						\
-		case 8:							\
-			__emulate_1op_8byte(_op, _dst, _eflags);	\
-			break;						\
-		}							\
-	} while (0)
-
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0","4","2")			  \
-			_op"q %"_qx"3,%1; "				  \
-			_POST_EFLAGS("0","4","2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: _qy ((_src).val), "i" (EFLAGS_MASK) );	  \
-	} while (0)
-
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0","3","2")			  \
-			_op"q %1; "					  \
-			_POST_EFLAGS("0","3","2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: "i" (EFLAGS_MASK) );				  \
-	} while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif				/* __i386__ */
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({	unsigned long _x;						\
-	rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x,	\
-                                                  (_size), ctxt->vcpu); \
-	if ( rc != 0 )							\
-		goto done;						\
-	(_eip) += (_size);						\
-	(_type)_x;							\
-})
-
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg)						\
-	((ad_bytes == sizeof(unsigned long)) ? 				\
-		(reg) :	((reg) & ((1UL << (ad_bytes << 3)) - 1)))
-#define register_address(base, reg)                                     \
-	((base) + address_mask(reg))
-#define register_address_increment(reg, inc)                            \
-	do {								\
-		/* signed type ensures sign extension to long */        \
-		int _inc = (inc);					\
-		if ( ad_bytes == sizeof(unsigned long) )		\
-			(reg) += _inc;					\
-		else							\
-			(reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
-			   (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
-	} while (0)
-
-#define JMP_REL(rel) 							\
-	do {								\
-		register_address_increment(_eip, rel);			\
-	} while (0)
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
-			     int highbyte_regs)
-{
-	void *p;
-
-	p = &regs[modrm_reg];
-	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
-		p = (unsigned char *)&regs[modrm_reg & 3] + 1;
-	return p;
-}
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops,
-			   void *ptr,
-			   u16 *size, unsigned long *address, int op_bytes)
-{
-	int rc;
-
-	if (op_bytes == 2)
-		op_bytes = 3;
-	*address = 0;
-	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
-			   ctxt->vcpu);
-	if (rc)
-		return rc;
-	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-			   ctxt->vcpu);
-	return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
-	int rc = 0;
-
-	switch ((condition & 15) >> 1) {
-	case 0: /* o */
-		rc |= (flags & EFLG_OF);
-		break;
-	case 1: /* b/c/nae */
-		rc |= (flags & EFLG_CF);
-		break;
-	case 2: /* z/e */
-		rc |= (flags & EFLG_ZF);
-		break;
-	case 3: /* be/na */
-		rc |= (flags & (EFLG_CF|EFLG_ZF));
-		break;
-	case 4: /* s */
-		rc |= (flags & EFLG_SF);
-		break;
-	case 5: /* p/pe */
-		rc |= (flags & EFLG_PF);
-		break;
-	case 7: /* le/ng */
-		rc |= (flags & EFLG_ZF);
-		/* fall through */
-	case 6: /* l/nge */
-		rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
-		break;
-	}
-
-	/* Odd condition identifiers (lsb == 1) have inverted sense. */
-	return (!!rc ^ (condition & 1));
-}
-
-int
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
-	unsigned d;
-	u8 b, sib, twobyte = 0, rex_prefix = 0;
-	u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
-	unsigned long *override_base = NULL;
-	unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
-	int rc = 0;
-	struct operand src, dst;
-	unsigned long cr2 = ctxt->cr2;
-	int mode = ctxt->mode;
-	unsigned long modrm_ea;
-	int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
-	int no_wb = 0;
-	u64 msr_data;
-
-	/* Shadow copy of register state. Committed on successful emulation. */
-	unsigned long _regs[NR_VCPU_REGS];
-	unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
-	unsigned long modrm_val = 0;
-
-	memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
-
-	switch (mode) {
-	case X86EMUL_MODE_REAL:
-	case X86EMUL_MODE_PROT16:
-		op_bytes = ad_bytes = 2;
-		break;
-	case X86EMUL_MODE_PROT32:
-		op_bytes = ad_bytes = 4;
-		break;
-#ifdef CONFIG_X86_64
-	case X86EMUL_MODE_PROT64:
-		op_bytes = 4;
-		ad_bytes = 8;
-		break;
-#endif
-	default:
-		return -1;
-	}
-
-	/* Legacy prefixes. */
-	for (i = 0; i < 8; i++) {
-		switch (b = insn_fetch(u8, 1, _eip)) {
-		case 0x66:	/* operand-size override */
-			op_bytes ^= 6;	/* switch between 2/4 bytes */
-			break;
-		case 0x67:	/* address-size override */
-			if (mode == X86EMUL_MODE_PROT64)
-				ad_bytes ^= 12;	/* switch between 4/8 bytes */
-			else
-				ad_bytes ^= 6;	/* switch between 2/4 bytes */
-			break;
-		case 0x2e:	/* CS override */
-			override_base = &ctxt->cs_base;
-			break;
-		case 0x3e:	/* DS override */
-			override_base = &ctxt->ds_base;
-			break;
-		case 0x26:	/* ES override */
-			override_base = &ctxt->es_base;
-			break;
-		case 0x64:	/* FS override */
-			override_base = &ctxt->fs_base;
-			break;
-		case 0x65:	/* GS override */
-			override_base = &ctxt->gs_base;
-			break;
-		case 0x36:	/* SS override */
-			override_base = &ctxt->ss_base;
-			break;
-		case 0xf0:	/* LOCK */
-			lock_prefix = 1;
-			break;
-		case 0xf2:	/* REPNE/REPNZ */
-		case 0xf3:	/* REP/REPE/REPZ */
-			rep_prefix = 1;
-			break;
-		default:
-			goto done_prefixes;
-		}
-	}
-
-done_prefixes:
-
-	/* REX prefix. */
-	if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
-		rex_prefix = b;
-		if (b & 8)
-			op_bytes = 8;	/* REX.W */
-		modrm_reg = (b & 4) << 1;	/* REX.R */
-		index_reg = (b & 2) << 2; /* REX.X */
-		modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
-		b = insn_fetch(u8, 1, _eip);
-	}
-
-	/* Opcode byte(s). */
-	d = opcode_table[b];
-	if (d == 0) {
-		/* Two-byte opcode? */
-		if (b == 0x0f) {
-			twobyte = 1;
-			b = insn_fetch(u8, 1, _eip);
-			d = twobyte_table[b];
-		}
-
-		/* Unrecognised? */
-		if (d == 0)
-			goto cannot_emulate;
-	}
-
-	/* ModRM and SIB bytes. */
-	if (d & ModRM) {
-		modrm = insn_fetch(u8, 1, _eip);
-		modrm_mod |= (modrm & 0xc0) >> 6;
-		modrm_reg |= (modrm & 0x38) >> 3;
-		modrm_rm |= (modrm & 0x07);
-		modrm_ea = 0;
-		use_modrm_ea = 1;
-
-		if (modrm_mod == 3) {
-			modrm_val = *(unsigned long *)
-				decode_register(modrm_rm, _regs, d & ByteOp);
-			goto modrm_done;
-		}
-
-		if (ad_bytes == 2) {
-			unsigned bx = _regs[VCPU_REGS_RBX];
-			unsigned bp = _regs[VCPU_REGS_RBP];
-			unsigned si = _regs[VCPU_REGS_RSI];
-			unsigned di = _regs[VCPU_REGS_RDI];
-
-			/* 16-bit ModR/M decode. */
-			switch (modrm_mod) {
-			case 0:
-				if (modrm_rm == 6)
-					modrm_ea += insn_fetch(u16, 2, _eip);
-				break;
-			case 1:
-				modrm_ea += insn_fetch(s8, 1, _eip);
-				break;
-			case 2:
-				modrm_ea += insn_fetch(u16, 2, _eip);
-				break;
-			}
-			switch (modrm_rm) {
-			case 0:
-				modrm_ea += bx + si;
-				break;
-			case 1:
-				modrm_ea += bx + di;
-				break;
-			case 2:
-				modrm_ea += bp + si;
-				break;
-			case 3:
-				modrm_ea += bp + di;
-				break;
-			case 4:
-				modrm_ea += si;
-				break;
-			case 5:
-				modrm_ea += di;
-				break;
-			case 6:
-				if (modrm_mod != 0)
-					modrm_ea += bp;
-				break;
-			case 7:
-				modrm_ea += bx;
-				break;
-			}
-			if (modrm_rm == 2 || modrm_rm == 3 ||
-			    (modrm_rm == 6 && modrm_mod != 0))
-				if (!override_base)
-					override_base = &ctxt->ss_base;
-			modrm_ea = (u16)modrm_ea;
-		} else {
-			/* 32/64-bit ModR/M decode. */
-			switch (modrm_rm) {
-			case 4:
-			case 12:
-				sib = insn_fetch(u8, 1, _eip);
-				index_reg |= (sib >> 3) & 7;
-				base_reg |= sib & 7;
-				scale = sib >> 6;
-
-				switch (base_reg) {
-				case 5:
-					if (modrm_mod != 0)
-						modrm_ea += _regs[base_reg];
-					else
-						modrm_ea += insn_fetch(s32, 4, _eip);
-					break;
-				default:
-					modrm_ea += _regs[base_reg];
-				}
-				switch (index_reg) {
-				case 4:
-					break;
-				default:
-					modrm_ea += _regs[index_reg] << scale;
-
-				}
-				break;
-			case 5:
-				if (modrm_mod != 0)
-					modrm_ea += _regs[modrm_rm];
-				else if (mode == X86EMUL_MODE_PROT64)
-					rip_relative = 1;
-				break;
-			default:
-				modrm_ea += _regs[modrm_rm];
-				break;
-			}
-			switch (modrm_mod) {
-			case 0:
-				if (modrm_rm == 5)
-					modrm_ea += insn_fetch(s32, 4, _eip);
-				break;
-			case 1:
-				modrm_ea += insn_fetch(s8, 1, _eip);
-				break;
-			case 2:
-				modrm_ea += insn_fetch(s32, 4, _eip);
-				break;
-			}
-		}
-		if (!override_base)
-			override_base = &ctxt->ds_base;
-		if (mode == X86EMUL_MODE_PROT64 &&
-		    override_base != &ctxt->fs_base &&
-		    override_base != &ctxt->gs_base)
-			override_base = NULL;
-
-		if (override_base)
-			modrm_ea += *override_base;
-
-		if (rip_relative) {
-			modrm_ea += _eip;
-			switch (d & SrcMask) {
-			case SrcImmByte:
-				modrm_ea += 1;
-				break;
-			case SrcImm:
-				if (d & ByteOp)
-					modrm_ea += 1;
-				else
-					if (op_bytes == 8)
-						modrm_ea += 4;
-					else
-						modrm_ea += op_bytes;
-			}
-		}
-		if (ad_bytes != 8)
-			modrm_ea = (u32)modrm_ea;
-		cr2 = modrm_ea;
-	modrm_done:
-		;
-	}
-
-	/*
-	 * Decode and fetch the source operand: register, memory
-	 * or immediate.
-	 */
-	switch (d & SrcMask) {
-	case SrcNone:
-		break;
-	case SrcReg:
-		src.type = OP_REG;
-		if (d & ByteOp) {
-			src.ptr = decode_register(modrm_reg, _regs,
-						  (rex_prefix == 0));
-			src.val = src.orig_val = *(u8 *) src.ptr;
-			src.bytes = 1;
-		} else {
-			src.ptr = decode_register(modrm_reg, _regs, 0);
-			switch ((src.bytes = op_bytes)) {
-			case 2:
-				src.val = src.orig_val = *(u16 *) src.ptr;
-				break;
-			case 4:
-				src.val = src.orig_val = *(u32 *) src.ptr;
-				break;
-			case 8:
-				src.val = src.orig_val = *(u64 *) src.ptr;
-				break;
-			}
-		}
-		break;
-	case SrcMem16:
-		src.bytes = 2;
-		goto srcmem_common;
-	case SrcMem32:
-		src.bytes = 4;
-		goto srcmem_common;
-	case SrcMem:
-		src.bytes = (d & ByteOp) ? 1 : op_bytes;
-		/* Don't fetch the address for invlpg: it could be unmapped. */
-		if (twobyte && b == 0x01 && modrm_reg == 7)
-			break;
-	      srcmem_common:
-		/*
-		 * For instructions with a ModR/M byte, switch to register
-		 * access if Mod = 3.
-		 */
-		if ((d & ModRM) && modrm_mod == 3) {
-			src.type = OP_REG;
-			break;
-		}
-		src.type = OP_MEM;
-		src.ptr = (unsigned long *)cr2;
-		src.val = 0;
-		if ((rc = ops->read_emulated((unsigned long)src.ptr,
-					     &src.val, src.bytes, ctxt->vcpu)) != 0)
-			goto done;
-		src.orig_val = src.val;
-		break;
-	case SrcImm:
-		src.type = OP_IMM;
-		src.ptr = (unsigned long *)_eip;
-		src.bytes = (d & ByteOp) ? 1 : op_bytes;
-		if (src.bytes == 8)
-			src.bytes = 4;
-		/* NB. Immediates are sign-extended as necessary. */
-		switch (src.bytes) {
-		case 1:
-			src.val = insn_fetch(s8, 1, _eip);
-			break;
-		case 2:
-			src.val = insn_fetch(s16, 2, _eip);
-			break;
-		case 4:
-			src.val = insn_fetch(s32, 4, _eip);
-			break;
-		}
-		break;
-	case SrcImmByte:
-		src.type = OP_IMM;
-		src.ptr = (unsigned long *)_eip;
-		src.bytes = 1;
-		src.val = insn_fetch(s8, 1, _eip);
-		break;
-	}
-
-	/* Decode and fetch the destination operand: register or memory. */
-	switch (d & DstMask) {
-	case ImplicitOps:
-		/* Special instructions do their own operand decoding. */
-		goto special_insn;
-	case DstReg:
-		dst.type = OP_REG;
-		if ((d & ByteOp)
-		    && !(twobyte && (b == 0xb6 || b == 0xb7))) {
-			dst.ptr = decode_register(modrm_reg, _regs,
-						  (rex_prefix == 0));
-			dst.val = *(u8 *) dst.ptr;
-			dst.bytes = 1;
-		} else {
-			dst.ptr = decode_register(modrm_reg, _regs, 0);
-			switch ((dst.bytes = op_bytes)) {
-			case 2:
-				dst.val = *(u16 *)dst.ptr;
-				break;
-			case 4:
-				dst.val = *(u32 *)dst.ptr;
-				break;
-			case 8:
-				dst.val = *(u64 *)dst.ptr;
-				break;
-			}
-		}
-		break;
-	case DstMem:
-		dst.type = OP_MEM;
-		dst.ptr = (unsigned long *)cr2;
-		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-		dst.val = 0;
-		/*
-		 * For instructions with a ModR/M byte, switch to register
-		 * access if Mod = 3.
-		 */
-		if ((d & ModRM) && modrm_mod == 3) {
-			dst.type = OP_REG;
-			break;
-		}
-		if (d & BitOp) {
-			unsigned long mask = ~(dst.bytes * 8 - 1);
-
-			dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
-		}
-		if (!(d & Mov) && /* optimisation - avoid slow emulated read */
-		    ((rc = ops->read_emulated((unsigned long)dst.ptr,
-					      &dst.val, dst.bytes, ctxt->vcpu)) != 0))
-			goto done;
-		break;
-	}
-	dst.orig_val = dst.val;
-
-	if (twobyte)
-		goto twobyte_insn;
-
-	switch (b) {
-	case 0x00 ... 0x05:
-	      add:		/* add */
-		emulate_2op_SrcV("add", src, dst, _eflags);
-		break;
-	case 0x08 ... 0x0d:
-	      or:		/* or */
-		emulate_2op_SrcV("or", src, dst, _eflags);
-		break;
-	case 0x10 ... 0x15:
-	      adc:		/* adc */
-		emulate_2op_SrcV("adc", src, dst, _eflags);
-		break;
-	case 0x18 ... 0x1d:
-	      sbb:		/* sbb */
-		emulate_2op_SrcV("sbb", src, dst, _eflags);
-		break;
-	case 0x20 ... 0x23:
-	      and:		/* and */
-		emulate_2op_SrcV("and", src, dst, _eflags);
-		break;
-	case 0x24:              /* and al imm8 */
-		dst.type = OP_REG;
-		dst.ptr = &_regs[VCPU_REGS_RAX];
-		dst.val = *(u8 *)dst.ptr;
-		dst.bytes = 1;
-		dst.orig_val = dst.val;
-		goto and;
-	case 0x25:              /* and ax imm16, or eax imm32 */
-		dst.type = OP_REG;
-		dst.bytes = op_bytes;
-		dst.ptr = &_regs[VCPU_REGS_RAX];
-		if (op_bytes == 2)
-			dst.val = *(u16 *)dst.ptr;
-		else
-			dst.val = *(u32 *)dst.ptr;
-		dst.orig_val = dst.val;
-		goto and;
-	case 0x28 ... 0x2d:
-	      sub:		/* sub */
-		emulate_2op_SrcV("sub", src, dst, _eflags);
-		break;
-	case 0x30 ... 0x35:
-	      xor:		/* xor */
-		emulate_2op_SrcV("xor", src, dst, _eflags);
-		break;
-	case 0x38 ... 0x3d:
-	      cmp:		/* cmp */
-		emulate_2op_SrcV("cmp", src, dst, _eflags);
-		break;
-	case 0x63:		/* movsxd */
-		if (mode != X86EMUL_MODE_PROT64)
-			goto cannot_emulate;
-		dst.val = (s32) src.val;
-		break;
-	case 0x80 ... 0x83:	/* Grp1 */
-		switch (modrm_reg) {
-		case 0:
-			goto add;
-		case 1:
-			goto or;
-		case 2:
-			goto adc;
-		case 3:
-			goto sbb;
-		case 4:
-			goto and;
-		case 5:
-			goto sub;
-		case 6:
-			goto xor;
-		case 7:
-			goto cmp;
-		}
-		break;
-	case 0x84 ... 0x85:
-	      test:		/* test */
-		emulate_2op_SrcV("test", src, dst, _eflags);
-		break;
-	case 0x86 ... 0x87:	/* xchg */
-		/* Write back the register source. */
-		switch (dst.bytes) {
-		case 1:
-			*(u8 *) src.ptr = (u8) dst.val;
-			break;
-		case 2:
-			*(u16 *) src.ptr = (u16) dst.val;
-			break;
-		case 4:
-			*src.ptr = (u32) dst.val;
-			break;	/* 64b reg: zero-extend */
-		case 8:
-			*src.ptr = dst.val;
-			break;
-		}
-		/*
-		 * Write back the memory destination with implicit LOCK
-		 * prefix.
-		 */
-		dst.val = src.val;
-		lock_prefix = 1;
-		break;
-	case 0x88 ... 0x8b:	/* mov */
-		goto mov;
-	case 0x8d: /* lea r16/r32, m */
-		dst.val = modrm_val;
-		break;
-	case 0x8f:		/* pop (sole member of Grp1a) */
-		/* 64-bit mode: POP always pops a 64-bit operand. */
-		if (mode == X86EMUL_MODE_PROT64)
-			dst.bytes = 8;
-		if ((rc = ops->read_std(register_address(ctxt->ss_base,
-							 _regs[VCPU_REGS_RSP]),
-					&dst.val, dst.bytes, ctxt->vcpu)) != 0)
-			goto done;
-		register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
-		break;
-	case 0xa0 ... 0xa1:	/* mov */
-		dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-		dst.val = src.val;
-		_eip += ad_bytes;	/* skip src displacement */
-		break;
-	case 0xa2 ... 0xa3:	/* mov */
-		dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
-		_eip += ad_bytes;	/* skip dst displacement */
-		break;
-	case 0xc0 ... 0xc1:
-	      grp2:		/* Grp2 */
-		switch (modrm_reg) {
-		case 0:	/* rol */
-			emulate_2op_SrcB("rol", src, dst, _eflags);
-			break;
-		case 1:	/* ror */
-			emulate_2op_SrcB("ror", src, dst, _eflags);
-			break;
-		case 2:	/* rcl */
-			emulate_2op_SrcB("rcl", src, dst, _eflags);
-			break;
-		case 3:	/* rcr */
-			emulate_2op_SrcB("rcr", src, dst, _eflags);
-			break;
-		case 4:	/* sal/shl */
-		case 6:	/* sal/shl */
-			emulate_2op_SrcB("sal", src, dst, _eflags);
-			break;
-		case 5:	/* shr */
-			emulate_2op_SrcB("shr", src, dst, _eflags);
-			break;
-		case 7:	/* sar */
-			emulate_2op_SrcB("sar", src, dst, _eflags);
-			break;
-		}
-		break;
-	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
-	mov:
-		dst.val = src.val;
-		break;
-	case 0xd0 ... 0xd1:	/* Grp2 */
-		src.val = 1;
-		goto grp2;
-	case 0xd2 ... 0xd3:	/* Grp2 */
-		src.val = _regs[VCPU_REGS_RCX];
-		goto grp2;
-	case 0xf6 ... 0xf7:	/* Grp3 */
-		switch (modrm_reg) {
-		case 0 ... 1:	/* test */
-			/*
-			 * Special case in Grp3: test has an immediate
-			 * source operand.
-			 */
-			src.type = OP_IMM;
-			src.ptr = (unsigned long *)_eip;
-			src.bytes = (d & ByteOp) ? 1 : op_bytes;
-			if (src.bytes == 8)
-				src.bytes = 4;
-			switch (src.bytes) {
-			case 1:
-				src.val = insn_fetch(s8, 1, _eip);
-				break;
-			case 2:
-				src.val = insn_fetch(s16, 2, _eip);
-				break;
-			case 4:
-				src.val = insn_fetch(s32, 4, _eip);
-				break;
-			}
-			goto test;
-		case 2:	/* not */
-			dst.val = ~dst.val;
-			break;
-		case 3:	/* neg */
-			emulate_1op("neg", dst, _eflags);
-			break;
-		default:
-			goto cannot_emulate;
-		}
-		break;
-	case 0xfe ... 0xff:	/* Grp4/Grp5 */
-		switch (modrm_reg) {
-		case 0:	/* inc */
-			emulate_1op("inc", dst, _eflags);
-			break;
-		case 1:	/* dec */
-			emulate_1op("dec", dst, _eflags);
-			break;
-		case 4: /* jmp abs */
-			if (b == 0xff)
-				_eip = dst.val;
-			else
-				goto cannot_emulate;
-			break;
-		case 6:	/* push */
-			/* 64-bit mode: PUSH always pushes a 64-bit operand. */
-			if (mode == X86EMUL_MODE_PROT64) {
-				dst.bytes = 8;
-				if ((rc = ops->read_std((unsigned long)dst.ptr,
-							&dst.val, 8,
-							ctxt->vcpu)) != 0)
-					goto done;
-			}
-			register_address_increment(_regs[VCPU_REGS_RSP],
-						   -dst.bytes);
-			if ((rc = ops->write_emulated(
-				     register_address(ctxt->ss_base,
-						      _regs[VCPU_REGS_RSP]),
-				     &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-				goto done;
-			no_wb = 1;
-			break;
-		default:
-			goto cannot_emulate;
-		}
-		break;
-	}
-
-writeback:
-	if (!no_wb) {
-		switch (dst.type) {
-		case OP_REG:
-			/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-			switch (dst.bytes) {
-			case 1:
-				*(u8 *)dst.ptr = (u8)dst.val;
-				break;
-			case 2:
-				*(u16 *)dst.ptr = (u16)dst.val;
-				break;
-			case 4:
-				*dst.ptr = (u32)dst.val;
-				break;	/* 64b: zero-ext */
-			case 8:
-				*dst.ptr = dst.val;
-				break;
-			}
-			break;
-		case OP_MEM:
-			if (lock_prefix)
-				rc = ops->cmpxchg_emulated((unsigned long)dst.
-							   ptr, &dst.orig_val,
-							   &dst.val, dst.bytes,
-							   ctxt->vcpu);
-			else
-				rc = ops->write_emulated((unsigned long)dst.ptr,
-							 &dst.val, dst.bytes,
-							 ctxt->vcpu);
-			if (rc != 0)
-				goto done;
-		default:
-			break;
-		}
-	}
-
-	/* Commit shadow register state. */
-	memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
-	ctxt->eflags = _eflags;
-	ctxt->vcpu->rip = _eip;
-
-done:
-	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-
-special_insn:
-	if (twobyte)
-		goto twobyte_special_insn;
-	switch(b) {
-	case 0x50 ... 0x57:  /* push reg */
-		if (op_bytes == 2)
-			src.val = (u16) _regs[b & 0x7];
-		else
-			src.val = (u32) _regs[b & 0x7];
-		dst.type  = OP_MEM;
-		dst.bytes = op_bytes;
-		dst.val = src.val;
-		register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-		dst.ptr = (void *) register_address(
-			ctxt->ss_base, _regs[VCPU_REGS_RSP]);
-		break;
-	case 0x58 ... 0x5f: /* pop reg */
-		dst.ptr = (unsigned long *)&_regs[b & 0x7];
-	pop_instruction:
-		if ((rc = ops->read_std(register_address(ctxt->ss_base,
-			_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
-			!= 0)
-			goto done;
-
-		register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
-		no_wb = 1; /* Disable writeback. */
-		break;
-	case 0x6a: /* push imm8 */
-		src.val = 0L;
-		src.val = insn_fetch(s8, 1, _eip);
-	push:
-		dst.type  = OP_MEM;
-		dst.bytes = op_bytes;
-		dst.val = src.val;
-		register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-		dst.ptr = (void *) register_address(ctxt->ss_base,
-							_regs[VCPU_REGS_RSP]);
-		break;
-	case 0x6c:		/* insb */
-	case 0x6d:		/* insw/insd */
-		 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-				1, 					/* in */
-				(d & ByteOp) ? 1 : op_bytes, 		/* size */
-				rep_prefix ?
-				address_mask(_regs[VCPU_REGS_RCX]) : 1,	/* count */
-				(_eflags & EFLG_DF),			/* down */
-				register_address(ctxt->es_base,
-						 _regs[VCPU_REGS_RDI]),	/* address */
-				rep_prefix,
-				_regs[VCPU_REGS_RDX]			/* port */
-				) == 0)
-			return -1;
-		return 0;
-	case 0x6e:		/* outsb */
-	case 0x6f:		/* outsw/outsd */
-		if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-				0, 					/* in */
-				(d & ByteOp) ? 1 : op_bytes, 		/* size */
-				rep_prefix ?
-				address_mask(_regs[VCPU_REGS_RCX]) : 1,	/* count */
-				(_eflags & EFLG_DF),			/* down */
-				register_address(override_base ?
-						 *override_base : ctxt->ds_base,
-						 _regs[VCPU_REGS_RSI]),	/* address */
-				rep_prefix,
-				_regs[VCPU_REGS_RDX]			/* port */
-				) == 0)
-			return -1;
-		return 0;
-	case 0x70 ... 0x7f: /* jcc (short) */ {
-		int rel = insn_fetch(s8, 1, _eip);
-
-		if (test_cc(b, _eflags))
-		JMP_REL(rel);
-		break;
-	}
-	case 0x9c: /* pushf */
-		src.val =  (unsigned long) _eflags;
-		goto push;
-	case 0x9d: /* popf */
-		dst.ptr = (unsigned long *) &_eflags;
-		goto pop_instruction;
-	case 0xc3: /* ret */
-		dst.ptr = &_eip;
-		goto pop_instruction;
-	case 0xf4:              /* hlt */
-		ctxt->vcpu->halt_request = 1;
-		goto done;
-	}
-	if (rep_prefix) {
-		if (_regs[VCPU_REGS_RCX] == 0) {
-			ctxt->vcpu->rip = _eip;
-			goto done;
-		}
-		_regs[VCPU_REGS_RCX]--;
-		_eip = ctxt->vcpu->rip;
-	}
-	switch (b) {
-	case 0xa4 ... 0xa5:	/* movs */
-		dst.type = OP_MEM;
-		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-		dst.ptr = (unsigned long *)register_address(ctxt->es_base,
-							_regs[VCPU_REGS_RDI]);
-		if ((rc = ops->read_emulated(register_address(
-		      override_base ? *override_base : ctxt->ds_base,
-		      _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-			goto done;
-		register_address_increment(_regs[VCPU_REGS_RSI],
-			     (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-		register_address_increment(_regs[VCPU_REGS_RDI],
-			     (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-		break;
-	case 0xa6 ... 0xa7:	/* cmps */
-		DPRINTF("Urk! I don't handle CMPS.\n");
-		goto cannot_emulate;
-	case 0xaa ... 0xab:	/* stos */
-		dst.type = OP_MEM;
-		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-		dst.ptr = (unsigned long *)cr2;
-		dst.val = _regs[VCPU_REGS_RAX];
-		register_address_increment(_regs[VCPU_REGS_RDI],
-			     (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-		break;
-	case 0xac ... 0xad:	/* lods */
-		dst.type = OP_REG;
-		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-		dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-		if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
-					     ctxt->vcpu)) != 0)
-			goto done;
-		register_address_increment(_regs[VCPU_REGS_RSI],
-			   (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-		break;
-	case 0xae ... 0xaf:	/* scas */
-		DPRINTF("Urk! I don't handle SCAS.\n");
-		goto cannot_emulate;
-	case 0xe8: /* call (near) */ {
-		long int rel;
-		switch (op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, _eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, _eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, _eip);
-			break;
-		default:
-			DPRINTF("Call: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		src.val = (unsigned long) _eip;
-		JMP_REL(rel);
-		op_bytes = ad_bytes;
-		goto push;
-	}
-	case 0xe9: /* jmp rel */
-	case 0xeb: /* jmp rel short */
-		JMP_REL(src.val);
-		no_wb = 1; /* Disable writeback. */
-		break;
-
-
-	}
-	goto writeback;
-
-twobyte_insn:
-	switch (b) {
-	case 0x01: /* lgdt, lidt, lmsw */
-		/* Disable writeback. */
-		no_wb = 1;
-		switch (modrm_reg) {
-			u16 size;
-			unsigned long address;
-
-		case 2: /* lgdt */
-			rc = read_descriptor(ctxt, ops, src.ptr,
-					     &size, &address, op_bytes);
-			if (rc)
-				goto done;
-			realmode_lgdt(ctxt->vcpu, size, address);
-			break;
-		case 3: /* lidt */
-			rc = read_descriptor(ctxt, ops, src.ptr,
-					     &size, &address, op_bytes);
-			if (rc)
-				goto done;
-			realmode_lidt(ctxt->vcpu, size, address);
-			break;
-		case 4: /* smsw */
-			if (modrm_mod != 3)
-				goto cannot_emulate;
-			*(u16 *)&_regs[modrm_rm]
-				= realmode_get_cr(ctxt->vcpu, 0);
-			break;
-		case 6: /* lmsw */
-			if (modrm_mod != 3)
-				goto cannot_emulate;
-			realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
-			break;
-		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, cr2);
-			break;
-		default:
-			goto cannot_emulate;
-		}
-		break;
-	case 0x21: /* mov from dr to reg */
-		no_wb = 1;
-		if (modrm_mod != 3)
-			goto cannot_emulate;
-		rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
-		break;
-	case 0x23: /* mov from reg to dr */
-		no_wb = 1;
-		if (modrm_mod != 3)
-			goto cannot_emulate;
-		rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
-		break;
-	case 0x40 ... 0x4f:	/* cmov */
-		dst.val = dst.orig_val = src.val;
-		no_wb = 1;
-		/*
-		 * First, assume we're decoding an even cmov opcode
-		 * (lsb == 0).
-		 */
-		switch ((b & 15) >> 1) {
-		case 0:	/* cmovo */
-			no_wb = (_eflags & EFLG_OF) ? 0 : 1;
-			break;
-		case 1:	/* cmovb/cmovc/cmovnae */
-			no_wb = (_eflags & EFLG_CF) ? 0 : 1;
-			break;
-		case 2:	/* cmovz/cmove */
-			no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-			break;
-		case 3:	/* cmovbe/cmovna */
-			no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
-			break;
-		case 4:	/* cmovs */
-			no_wb = (_eflags & EFLG_SF) ? 0 : 1;
-			break;
-		case 5:	/* cmovp/cmovpe */
-			no_wb = (_eflags & EFLG_PF) ? 0 : 1;
-			break;
-		case 7:	/* cmovle/cmovng */
-			no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-			/* fall through */
-		case 6:	/* cmovl/cmovnge */
-			no_wb &= (!(_eflags & EFLG_SF) !=
-			      !(_eflags & EFLG_OF)) ? 0 : 1;
-			break;
-		}
-		/* Odd cmov opcodes (lsb == 1) have inverted sense. */
-		no_wb ^= b & 1;
-		break;
-	case 0xa3:
-	      bt:		/* bt */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
-		break;
-	case 0xab:
-	      bts:		/* bts */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
-		break;
-	case 0xb0 ... 0xb1:	/* cmpxchg */
-		/*
-		 * Save real source value, then compare EAX against
-		 * destination.
-		 */
-		src.orig_val = src.val;
-		src.val = _regs[VCPU_REGS_RAX];
-		emulate_2op_SrcV("cmp", src, dst, _eflags);
-		if (_eflags & EFLG_ZF) {
-			/* Success: write back to memory. */
-			dst.val = src.orig_val;
-		} else {
-			/* Failure: write the value we saw to EAX. */
-			dst.type = OP_REG;
-			dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-		}
-		break;
-	case 0xb3:
-	      btr:		/* btr */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
-		break;
-	case 0xb6 ... 0xb7:	/* movzx */
-		dst.bytes = op_bytes;
-		dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
-		break;
-	case 0xba:		/* Grp8 */
-		switch (modrm_reg & 3) {
-		case 0:
-			goto bt;
-		case 1:
-			goto bts;
-		case 2:
-			goto btr;
-		case 3:
-			goto btc;
-		}
-		break;
-	case 0xbb:
-	      btc:		/* btc */
-		src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-		emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
-		break;
-	case 0xbe ... 0xbf:	/* movsx */
-		dst.bytes = op_bytes;
-		dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
-		break;
-	case 0xc3:		/* movnti */
-		dst.bytes = op_bytes;
-		dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
-		break;
-	}
-	goto writeback;
-
-twobyte_special_insn:
-	/* Disable writeback. */
-	no_wb = 1;
-	switch (b) {
-	case 0x06:
-		emulate_clts(ctxt->vcpu);
-		break;
-	case 0x08:		/* invd */
-		break;
-	case 0x09:		/* wbinvd */
-		break;
-	case 0x0d:		/* GrpP (prefetch) */
-	case 0x18:		/* Grp16 (prefetch/nop) */
-		break;
-	case 0x20: /* mov cr, reg */
-		if (modrm_mod != 3)
-			goto cannot_emulate;
-		_regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
-		break;
-	case 0x22: /* mov reg, cr */
-		if (modrm_mod != 3)
-			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
-		break;
-	case 0x30:
-		/* wrmsr */
-		msr_data = (u32)_regs[VCPU_REGS_RAX]
-			| ((u64)_regs[VCPU_REGS_RDX] << 32);
-		rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
-		if (rc) {
-			kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-			_eip = ctxt->vcpu->rip;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
-	case 0x32:
-		/* rdmsr */
-		rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
-		if (rc) {
-			kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-			_eip = ctxt->vcpu->rip;
-		} else {
-			_regs[VCPU_REGS_RAX] = (u32)msr_data;
-			_regs[VCPU_REGS_RDX] = msr_data >> 32;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
-	case 0x80 ... 0x8f: /* jnz rel, etc*/ {
-		long int rel;
-
-		switch (op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, _eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, _eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, _eip);
-			break;
-		default:
-			DPRINTF("jnz: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		if (test_cc(b, _eflags))
-			JMP_REL(rel);
-		break;
-	}
-	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		{
-			u64 old, new;
-			if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
-									!= 0)
-				goto done;
-			if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
-			    ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
-				_regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-				_regs[VCPU_REGS_RDX] = (u32) (old >> 32);
-				_eflags &= ~EFLG_ZF;
-			} else {
-				new = ((u64)_regs[VCPU_REGS_RCX] << 32)
-					| (u32) _regs[VCPU_REGS_RBX];
-				if ((rc = ops->cmpxchg_emulated(cr2, &old,
-							  &new, 8, ctxt->vcpu)) != 0)
-					goto done;
-				_eflags |= EFLG_ZF;
-			}
-			break;
-		}
-	}
-	goto writeback;
-
-cannot_emulate:
-	DPRINTF("Cannot emulate %02x\n", b);
-	return -1;
-}
-
-#ifdef __XEN__
-
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-
-int
-x86_emulate_read_std(unsigned long addr,
-		     unsigned long *val,
-		     unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-	unsigned int rc;
-
-	*val = 0;
-
-	if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
-		propagate_page_fault(addr + bytes - rc, 0);	/* read fault */
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-
-	return X86EMUL_CONTINUE;
-}
-
-int
-x86_emulate_write_std(unsigned long addr,
-		      unsigned long val,
-		      unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-	unsigned int rc;
-
-	if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
-		propagate_page_fault(addr + bytes - rc, PGERR_write_access);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-
-	return X86EMUL_CONTINUE;
-}
-
-#endif
diff -puN drivers/kvm/x86_emulate.h~git-kvm /dev/null
--- a/drivers/kvm/x86_emulate.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/******************************************************************************
- * x86_emulate.h
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __X86_EMULATE_H__
-#define __X86_EMULATE_H__
-
-struct x86_emulate_ctxt;
-
-/*
- * x86_emulate_ops:
- *
- * These operations represent the instruction emulator's interface to memory.
- * There are two categories of operation: those that act on ordinary memory
- * regions (*_std), and those that act on memory regions known to require
- * special treatment or emulation (*_emulated).
- *
- * The emulator assumes that an instruction accesses only one 'emulated memory'
- * location, that this location is the given linear faulting address (cr2), and
- * that this is one of the instruction's data operands. Instruction fetches and
- * stack operations are assumed never to access emulated memory. The emulator
- * automatically deduces which operand of a string-move operation is accessing
- * emulated memory, and assumes that the other operand accesses normal memory.
- *
- * NOTES:
- *  1. The emulator isn't very smart about emulated vs. standard memory.
- *     'Emulated memory' access addresses should be checked for sanity.
- *     'Normal memory' accesses may fault, and the caller must arrange to
- *     detect and handle reentrancy into the emulator via recursive faults.
- *     Accesses may be unaligned and may cross page boundaries.
- *  2. If the access fails (cannot emulate, or a standard access faults) then
- *     it is up to the memop to propagate the fault to the guest VM via
- *     some out-of-band mechanism, unknown to the emulator. The memop signals
- *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
- *     then immediately bail.
- *  3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
- *     cmpxchg8b_emulated need support 8-byte accesses.
- *  4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
- */
-/* Access completed successfully: continue emulation as normal. */
-#define X86EMUL_CONTINUE        0
-/* Access is unhandleable: bail from emulation and return error to caller. */
-#define X86EMUL_UNHANDLEABLE    1
-/* Terminate emulation but return success to the caller. */
-#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
-struct x86_emulate_ops {
-	/*
-	 * read_std: Read bytes of standard (non-emulated/special) memory.
-	 *           Used for instruction fetch, stack operations, and others.
-	 *  @addr:  [IN ] Linear address from which to read.
-	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
-	 *  @bytes: [IN ] Number of bytes to read from memory.
-	 */
-	int (*read_std)(unsigned long addr, void *val,
-			unsigned int bytes, struct kvm_vcpu *vcpu);
-
-	/*
-	 * write_std: Write bytes of standard (non-emulated/special) memory.
-	 *            Used for stack operations, and others.
-	 *  @addr:  [IN ] Linear address to which to write.
-	 *  @val:   [IN ] Value to write to memory (low-order bytes used as
-	 *                required).
-	 *  @bytes: [IN ] Number of bytes to write to memory.
-	 */
-	int (*write_std)(unsigned long addr, const void *val,
-			 unsigned int bytes, struct kvm_vcpu *vcpu);
-
-	/*
-	 * read_emulated: Read bytes from emulated/special memory area.
-	 *  @addr:  [IN ] Linear address from which to read.
-	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
-	 *  @bytes: [IN ] Number of bytes to read from memory.
-	 */
-	int (*read_emulated) (unsigned long addr,
-			      void *val,
-			      unsigned int bytes,
-			      struct kvm_vcpu *vcpu);
-
-	/*
-	 * write_emulated: Read bytes from emulated/special memory area.
-	 *  @addr:  [IN ] Linear address to which to write.
-	 *  @val:   [IN ] Value to write to memory (low-order bytes used as
-	 *                required).
-	 *  @bytes: [IN ] Number of bytes to write to memory.
-	 */
-	int (*write_emulated) (unsigned long addr,
-			       const void *val,
-			       unsigned int bytes,
-			       struct kvm_vcpu *vcpu);
-
-	/*
-	 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
-	 *                   emulated/special memory area.
-	 *  @addr:  [IN ] Linear address to access.
-	 *  @old:   [IN ] Value expected to be current at @addr.
-	 *  @new:   [IN ] Value to write to @addr.
-	 *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
-	 */
-	int (*cmpxchg_emulated) (unsigned long addr,
-				 const void *old,
-				 const void *new,
-				 unsigned int bytes,
-				 struct kvm_vcpu *vcpu);
-
-};
-
-struct x86_emulate_ctxt {
-	/* Register state before/after emulation. */
-	struct kvm_vcpu *vcpu;
-
-	/* Linear faulting address (if emulating a page-faulting instruction). */
-	unsigned long eflags;
-	unsigned long cr2;
-
-	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
-	int mode;
-
-	unsigned long cs_base;
-	unsigned long ds_base;
-	unsigned long es_base;
-	unsigned long ss_base;
-	unsigned long gs_base;
-	unsigned long fs_base;
-};
-
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL     0	/* Real mode.             */
-#define X86EMUL_MODE_PROT16   2	/* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32   4	/* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64   8	/* 64-bit (long) mode.    */
-
-/* Host execution mode. */
-#if defined(__i386__)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
-#elif defined(CONFIG_X86_64)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
-#endif
-
-/*
- * x86_emulate_memop: Emulate an instruction that faulted attempting to
- *                    read/write a 'special' memory area.
- * Returns -1 on failure, 0 on success.
- */
-int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops);
-
-#endif				/* __X86_EMULATE_H__ */
diff -puN include/asm-x86/Kbuild~git-kvm include/asm-x86/Kbuild
--- a/include/asm-x86/Kbuild~git-kvm
+++ a/include/asm-x86/Kbuild
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
 header-y += boot.h
 header-y += bootparam.h
 header-y += debugreg.h
+header-y += kvm.h
 header-y += ldt.h
 header-y += msr-index.h
 header-y += prctl.h
diff -puN /dev/null include/asm-x86/kvm.h
--- /dev/null
+++ a/include/asm-x86/kvm.h
@@ -0,0 +1,191 @@
+#ifndef __LINUX_KVM_X86_H
+#define __LINUX_KVM_X86_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+	__u32 slot;  /* this has a different namespace than memory slots */
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size;
+	__u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+	__u8 last_irr;	/* edge detection */
+	__u8 irr;		/* interrupt request register */
+	__u8 imr;		/* interrupt mask register */
+	__u8 isr;		/* interrupt service register */
+	__u8 priority_add;	/* highest irq priority */
+	__u8 irq_base;
+	__u8 read_reg_select;
+	__u8 poll;
+	__u8 special_mask;
+	__u8 init_state;
+	__u8 auto_eoi;
+	__u8 rotate_on_auto_eoi;
+	__u8 special_fully_nested_mode;
+	__u8 init4;		/* true if 4 byte init */
+	__u8 elcr;		/* PIIX edge/level trigger selection */
+	__u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS  24
+struct kvm_ioapic_state {
+	__u64 base_address;
+	__u32 ioregsel;
+	__u32 id;
+	__u32 irr;
+	__u32 pad;
+	union {
+		__u64 bits;
+		struct {
+			__u8 vector;
+			__u8 delivery_mode:3;
+			__u8 dest_mode:1;
+			__u8 delivery_status:1;
+			__u8 polarity:1;
+			__u8 remote_irr:1;
+			__u8 trig_mode:1;
+			__u8 mask:1;
+			__u8 reserve:7;
+			__u8 reserved[4];
+			__u8 dest_id;
+		} fields;
+	} redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER   0
+#define KVM_IRQCHIP_PIC_SLAVE    1
+#define KVM_IRQCHIP_IOAPIC       2
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+	__u64 rax, rbx, rcx, rdx;
+	__u64 rsi, rdi, rsp, rbp;
+	__u64 r8,  r9,  r10, r11;
+	__u64 r12, r13, r14, r15;
+	__u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+	__u64 base;
+	__u32 limit;
+	__u16 selector;
+	__u8  type;
+	__u8  present, dpl, db, s, l, g, avl;
+	__u8  unusable;
+	__u8  padding;
+};
+
+struct kvm_dtable {
+	__u64 base;
+	__u16 limit;
+	__u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+	/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+	struct kvm_segment cs, ds, es, fs, gs, ss;
+	struct kvm_segment tr, ldt;
+	struct kvm_dtable gdt, idt;
+	__u64 cr0, cr2, cr3, cr4, cr8;
+	__u64 efer;
+	__u64 apic_base;
+	__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+	__u8  fpr[8][16];
+	__u16 fcw;
+	__u16 fsw;
+	__u8  ftwx;  /* in fxsave format */
+	__u8  pad1;
+	__u16 last_opcode;
+	__u64 last_ip;
+	__u64 last_dp;
+	__u8  xmm[16][16];
+	__u32 mxcsr;
+	__u32 pad2;
+};
+
+struct kvm_msr_entry {
+	__u32 index;
+	__u32 reserved;
+	__u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+	__u32 nmsrs; /* number of msrs in entries */
+	__u32 pad;
+
+	struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+	__u32 nmsrs; /* number of msrs in indices */
+	__u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+	__u32 function;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+#endif
diff -puN /dev/null include/asm-x86/kvm_host.h
--- /dev/null
+++ a/include/asm-x86/kvm_host.h
@@ -0,0 +1,612 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/desc.h>
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
+
+#define KVM_GUEST_CR0_MASK \
+	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
+	 | X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON \
+	(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
+	 | X86_CR0_MP)
+#define KVM_GUEST_CR4_MASK \
+	(X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+#define DE_VECTOR 0
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_ALIAS_SLOTS 4
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
+
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+enum {	/* indices into the regs[NR_VCPU_REGS] array of struct kvm_vcpu_arch */
+	VCPU_REGS_RAX = 0,
+	VCPU_REGS_RCX = 1,
+	VCPU_REGS_RDX = 2,
+	VCPU_REGS_RBX = 3,
+	VCPU_REGS_RSP = 4,
+	VCPU_REGS_RBP = 5,
+	VCPU_REGS_RSI = 6,
+	VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64	/* r8-r15 are only defined for 64-bit builds */
+	VCPU_REGS_R8 = 8,
+	VCPU_REGS_R9 = 9,
+	VCPU_REGS_R10 = 10,
+	VCPU_REGS_R11 = 11,
+	VCPU_REGS_R12 = 12,
+	VCPU_REGS_R13 = 13,
+	VCPU_REGS_R14 = 14,
+	VCPU_REGS_R15 = 15,
+#endif	/* CONFIG_X86_64 */
+	NR_VCPU_REGS	/* number of registers above; sizes regs[] */
+};
+
+enum {
+	VCPU_SREG_CS,
+	VCPU_SREG_DS,
+	VCPU_SREG_ES,
+	VCPU_SREG_FS,
+	VCPU_SREG_GS,
+	VCPU_SREG_SS,
+	VCPU_SREG_TR,
+	VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_x86_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+	int nobjs;
+	void *objects[KVM_NR_MEM_OBJS];
+};
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+	u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+	struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ *   bits 4:7 - page table level for this shadow (1-4)
+ *   bits 8:9 - page table quadrant for 2-level guests
+ *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ *   bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+	unsigned word;
+	struct {
+		unsigned glevels : 4;
+		unsigned level : 4;
+		unsigned quadrant : 2;
+		unsigned pad_for_nice_hex_output : 6;
+		unsigned metaphysical : 1;
+		unsigned access : 3;
+	};
+};
+
+struct kvm_mmu_page {
+	struct list_head link;
+	struct hlist_node hash_link;
+
+	/*
+	 * The following two entries are used to key the shadow page in the
+	 * hash table.
+	 */
+	gfn_t gfn;
+	union kvm_mmu_page_role role;
+
+	u64 *spt;
+	/* hold the gfn of each spte inside spt */
+	gfn_t *gfns;
+	unsigned long slot_bitmap; /* One bit set per slot which has memory
+				    * in this shadow page.
+				    */
+	int multimapped;         /* More than one parent_pte? */
+	int root_count;          /* Currently serving as active root */
+	union {
+		u64 *parent_pte;               /* !multimapped */
+		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+	};
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+	void (*new_cr3)(struct kvm_vcpu *vcpu);
+	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+	void (*free)(struct kvm_vcpu *vcpu);
+	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+	void (*prefetch_page)(struct kvm_vcpu *vcpu,
+			      struct kvm_mmu_page *page);
+	hpa_t root_hpa;
+	int root_level;
+	int shadow_root_level;
+
+	u64 *pae_root;
+};
+
+struct kvm_vcpu_arch {
+	u64 host_tsc;
+	int interrupt_window_open;
+	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
+	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
+	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
+	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
+
+	unsigned long cr0;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;
+	u64 pdptrs[4]; /* pae */
+	u64 shadow_efer;
+	u64 apic_base;
+	struct kvm_lapic *apic;    /* kernel irqchip context */
+#define VCPU_MP_STATE_RUNNABLE          0
+#define VCPU_MP_STATE_UNINITIALIZED     1
+#define VCPU_MP_STATE_INIT_RECEIVED     2
+#define VCPU_MP_STATE_SIPI_RECEIVED     3
+#define VCPU_MP_STATE_HALTED            4
+	int mp_state;
+	int sipi_vector;
+	u64 ia32_misc_enable_msr;
+	bool tpr_access_reporting;
+
+	struct kvm_mmu mmu;
+
+	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+	struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+	struct kvm_mmu_memory_cache mmu_page_cache;
+	struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+	gfn_t last_pt_write_gfn;
+	int   last_pt_write_count;
+	u64  *last_pte_updated;
+
+	struct {
+		gfn_t gfn;          /* presumed gfn during guest pte update */
+		struct page *page;  /* page corresponding to that gfn */
+	} update_pte;
+
+	struct i387_fxsave_struct host_fx_image;
+	struct i387_fxsave_struct guest_fx_image;
+
+	gva_t mmio_fault_cr2;
+	struct kvm_pio_request pio;
+	void *pio_data;
+
+	struct kvm_queued_exception {
+		bool pending;
+		bool has_error_code;
+		u8 nr;
+		u32 error_code;
+	} exception;
+
+	struct {
+		int active;
+		u8 save_iopl;
+		struct kvm_save_segment {
+			u16 selector;
+			unsigned long base;
+			u32 limit;
+			u32 ar;
+		} tr, es, ds, fs, gs;
+	} rmode;
+	int halt_request; /* real mode on Intel only */
+
+	int cpuid_nent;
+	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+	/* emulate context */
+
+	struct x86_emulate_ctxt emulate_ctxt;
+};
+
+struct kvm_mem_alias {
+	gfn_t base_gfn;
+	unsigned long npages;
+	gfn_t target_gfn;
+};
+
+struct kvm_arch{
+	int naliases;
+	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+
+	unsigned int n_free_mmu_pages;
+	unsigned int n_requested_mmu_pages;
+	unsigned int n_alloc_mmu_pages;
+	/*
+	 * Hash table of struct kvm_mmu_page.
+	 */
+	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+	struct list_head active_mmu_pages;
+	struct kvm_pic *vpic;
+	struct kvm_ioapic *vioapic;
+
+	int round_robin_prev_vcpu;
+	unsigned int tss_addr;
+	struct page *apic_access_page;
+};
+
+struct kvm_vm_stat {
+	u32 mmu_shadow_zapped;
+	u32 mmu_pte_write;
+	u32 mmu_pte_updated;
+	u32 mmu_pde_zapped;
+	u32 mmu_flooded;
+	u32 mmu_recycled;
+	u32 mmu_cache_miss;
+	u32 remote_tlb_flush;
+};
+
+struct kvm_vcpu_stat {
+	u32 pf_fixed;
+	u32 pf_guest;
+	u32 tlb_flush;
+	u32 invlpg;
+
+	u32 exits;
+	u32 io_exits;
+	u32 mmio_exits;
+	u32 signal_exits;
+	u32 irq_window_exits;
+	u32 halt_exits;
+	u32 halt_wakeup;
+	u32 request_irq_exits;
+	u32 irq_exits;
+	u32 host_state_reload;
+	u32 efer_reload;
+	u32 fpu_reload;
+	u32 insn_emulation;
+	u32 insn_emulation_fail;
+};
+
+struct descriptor_table {
+	u16 limit;
+	unsigned long base;
+} __attribute__((packed));
+
+struct kvm_x86_ops {
+	int (*cpu_has_kvm_support)(void);          /* __init */
+	int (*disabled_by_bios)(void);             /* __init */
+	void (*hardware_enable)(void *dummy);      /* __init */
+	void (*hardware_disable)(void *dummy);
+	void (*check_processor_compatibility)(void *rtn);
+	int (*hardware_setup)(void);               /* __init */
+	void (*hardware_unsetup)(void);            /* __exit */
+	bool (*cpu_has_accelerated_tpr)(void);
+
+	/* Create, but do not attach this VCPU */
+	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+	void (*vcpu_free)(struct kvm_vcpu *vcpu);
+	int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+	void (*vcpu_put)(struct kvm_vcpu *vcpu);
+	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
+
+	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
+			       struct kvm_debug_guest *dbg);
+	void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+	void (*get_segment)(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
+	void (*set_segment)(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
+	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
+	void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+		       int *exception);
+	void (*cache_regs)(struct kvm_vcpu *vcpu);
+	void (*decache_regs)(struct kvm_vcpu *vcpu);
+	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+	void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+				unsigned char *hypercall_addr);
+	int (*get_irq)(struct kvm_vcpu *vcpu);
+	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code);
+	bool (*exception_injected)(struct kvm_vcpu *vcpu);
+	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
+				       struct kvm_run *run);
+
+	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+
+enum emulation_result {
+	EMULATE_DONE,       /* no further processing */
+	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+	EMULATE_FAIL,         /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE	    (1 << 0)
+#define EMULTYPE_TRAP_UD	    (1 << 1)
+int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+			unsigned long cr2, u16 error_code, int emulation_type);
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+		   unsigned long *rflags);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+		     unsigned long *rflags);
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+
+struct x86_emulate_ctxt;
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+		     int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+			   int size, unsigned long count, int down,
+			    gva_t address, int rep, unsigned port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+		    unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+		    unsigned long value);
+
+void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long get_cr8(struct kvm_vcpu *vcpu);
+void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+			   u32 error_code);
+
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_read_std(unsigned long addr,
+		      void *val,
+		      unsigned int bytes,
+		      struct kvm_vcpu *vcpu);
+int emulator_write_emulated(unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct kvm_vcpu *vcpu);
+
+unsigned long segment_base(u16 selector);
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+		       const u8 *new, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+	return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 read_fs(void)
+{
+	u16 seg;
+	asm("mov %%fs, %0" : "=g"(seg));
+	return seg;
+}
+
+static inline u16 read_gs(void)
+{
+	u16 seg;
+	asm("mov %%gs, %0" : "=g"(seg));
+	return seg;
+}
+
+static inline u16 read_ldt(void)
+{
+	u16 ldt;
+	asm("sldt %0" : "=g"(ldt));
+	return ldt;
+}
+
+static inline void load_fs(u16 sel)
+{
+	asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void load_gs(u16 sel)
+{
+	asm("mov %0, %%gs" : : "rm"(sel));
+}
+
+#ifndef load_ldt
+static inline void load_ldt(u16 sel)
+{
+	asm("lldt %0" : : "rm"(sel));
+}
+#endif
+
+static inline void get_idt(struct descriptor_table *table)
+{
+	asm("sidt %0" : "=m"(*table));
+}
+
+static inline void get_gdt(struct descriptor_table *table)
+{
+	asm("sgdt %0" : "=m"(*table));
+}
+
+static inline unsigned long read_tr_base(void)
+{
+	u16 tr;
+	asm("str %0" : "=g"(tr));
+	return segment_base(tr);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+	u64 value;
+
+	rdmsrl(msr, value);
+	return value;
+}
+#endif
+
+static inline void fx_save(struct i387_fxsave_struct *image)
+{
+	asm("fxsave (%0)" : : "r" (image) : "memory");	/* writes *image */
+}
+
+static inline void fx_restore(struct i387_fxsave_struct *image)
+{
+	asm("fxrstor (%0)" : : "r" (image) : "memory");	/* reads *image */
+}
+
+static inline void fpu_init(void)
+{
+	asm("finit");
+}
+
+static inline u32 get_rdx_init_val(void)
+{
+	return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+
+#define MSR_IA32_TIME_STAMP_COUNTER		0x010
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+#endif
diff -puN /dev/null include/asm-x86/kvm_para.h
--- /dev/null
+++ a/include/asm-x86/kvm_para.h
@@ -0,0 +1,105 @@
+#ifndef __X86_KVM_PARA_H
+#define __X86_KVM_PARA_H
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE	0x40000000
+
+/* This CPUID returns a feature bitmap in eax.  Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES	0x40000001
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+/* This instruction is vmcall.  On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the
+ * vmmcall instruction.  The hypervisor may replace it with something else but
+ * only those instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax.  No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+	long ret;	/* rax: hypercall number in, return value out */
+	asm volatile(KVM_HYPERCALL : "=a"(ret)
+		     : "a"(nr)
+		     : "memory");	/* hypervisor may access memory */
+	return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+	long ret;	/* rax: hypercall number in, return value out */
+	asm volatile(KVM_HYPERCALL : "=a"(ret)
+		     : "a"(nr), "b"(p1)
+		     : "memory");	/* hypervisor may access memory */
+	return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+				  unsigned long p2)
+{
+	long ret;	/* rax: hypercall number in, return value out */
+	asm volatile(KVM_HYPERCALL : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2)
+		     : "memory");	/* hypervisor may access memory */
+	return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3)
+{
+	long ret;	/* rax: hypercall number in, return value out */
+	asm volatile(KVM_HYPERCALL : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+		     : "memory");	/* hypervisor may access memory */
+	return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3,
+				  unsigned long p4)
+{
+	long ret;	/* rax: hypercall number in, return value out */
+	asm volatile(KVM_HYPERCALL : "=a"(ret)
+		     : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+		     : "memory");	/* hypervisor may access memory */
+	return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	char signature[13];	/* 3 registers x 4 bytes, plus a NUL */
+
+	cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+	memcpy(signature + 0, &ebx, 4);
+	memcpy(signature + 4, &ecx, 4);
+	memcpy(signature + 8, &edx, 4);
+	signature[12] = 0;
+
+	if (strcmp(signature, "KVMKVMKVM") == 0)	/* KVM signature found */
+		return 1;
+
+	return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+	return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif
diff -puN /dev/null include/asm-x86/kvm_x86_emulate.h
--- /dev/null
+++ a/include/asm-x86/kvm_x86_emulate.h
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __X86_EMULATE_H__
+#define __X86_EMULATE_H__
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ *  1. The emulator isn't very smart about emulated vs. standard memory.
+ *     'Emulated memory' access addresses should be checked for sanity.
+ *     'Normal memory' accesses may fault, and the caller must arrange to
+ *     detect and handle reentrancy into the emulator via recursive faults.
+ *     Accesses may be unaligned and may cross page boundaries.
+ *  2. If the access fails (cannot emulate, or a standard access faults) then
+ *     it is up to the memop to propagate the fault to the guest VM via
+ *     some out-of-band mechanism, unknown to the emulator. The memop signals
+ *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ *     then immediately bail.
+ *  3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ *     cmpxchg8b_emulated need support 8-byte accesses.
+ *  4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE        0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE    1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+	/*
+	 * read_std: Read bytes of standard (non-emulated/special) memory.
+	 *           Used for instruction fetch, stack operations, and others.
+	 *  @addr:  [IN ] Linear address from which to read.
+	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_std)(unsigned long addr, void *val,
+			unsigned int bytes, struct kvm_vcpu *vcpu);
+
+	/*
+	 * read_emulated: Read bytes from emulated/special memory area.
+	 *  @addr:  [IN ] Linear address from which to read.
+	 *  @val:   [OUT] Value read from memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_emulated) (unsigned long addr,
+			      void *val,
+			      unsigned int bytes,
+			      struct kvm_vcpu *vcpu);
+
+	/*
+	 * write_emulated: Write bytes to emulated/special memory area.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [IN ] Value to write to memory (low-order bytes used as
+	 *                required).
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+	 */
+	int (*write_emulated) (unsigned long addr,
+			       const void *val,
+			       unsigned int bytes,
+			       struct kvm_vcpu *vcpu);
+
+	/*
+	 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+	 *                   emulated/special memory area.
+	 *  @addr:  [IN ] Linear address to access.
+	 *  @old:   [IN ] Value expected to be current at @addr.
+	 *  @new:   [IN ] Value to write to @addr.
+	 *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
+	 */
+	int (*cmpxchg_emulated) (unsigned long addr,
+				 const void *old,
+				 const void *new,
+				 unsigned int bytes,
+				 struct kvm_vcpu *vcpu);
+
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+	enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+	unsigned int bytes;
+	unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+	u8 data[15];
+	unsigned long start;
+	unsigned long end;
+};
+
+struct decode_cache {
+	u8 twobyte;
+	u8 b;
+	u8 lock_prefix;
+	u8 rep_prefix;
+	u8 op_bytes;
+	u8 ad_bytes;
+	u8 rex_prefix;
+	struct operand src;
+	struct operand dst;
+	unsigned long *override_base;
+	unsigned int d;
+	unsigned long regs[NR_VCPU_REGS];
+	unsigned long eip;
+	/* modrm */
+	u8 modrm;
+	u8 modrm_mod;
+	u8 modrm_reg;
+	u8 modrm_rm;
+	u8 use_modrm_ea;
+	unsigned long modrm_ea;
+	unsigned long modrm_val;
+	struct fetch_cache fetch;
+};
+
+struct x86_emulate_ctxt {
+	/* Register state before/after emulation. */
+	struct kvm_vcpu *vcpu;
+
+	/* EFLAGS value for the emulation; presumably the guest's (TODO confirm). */
+	unsigned long eflags;
+
+	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
+	int mode;
+
+	unsigned long cs_base;
+	unsigned long ds_base;
+	unsigned long es_base;
+	unsigned long ss_base;
+	unsigned long gs_base;
+	unsigned long fs_base;
+
+	/* decode cache */
+
+	struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX  1
+#define REPNE_PREFIX    2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL     0	/* Real mode.             */
+#define X86EMUL_MODE_PROT16   2	/* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32   4	/* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64   8	/* 64-bit (long) mode.    */
+
+/* Host execution mode. */
+#if defined(__i386__)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+		    struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+		     struct x86_emulate_ops *ops);
+
+#endif				/* __X86_EMULATE_H__ */
diff -puN include/linux/Kbuild~git-kvm include/linux/Kbuild
--- a/include/linux/Kbuild~git-kvm
+++ a/include/linux/Kbuild
@@ -97,7 +97,6 @@ header-y += iso_fs.h
 header-y += ixjuser.h
 header-y += jffs2.h
 header-y += keyctl.h
-header-y += kvm.h
 header-y += limits.h
 header-y += lock_dlm_plock.h
 header-y += magic.h
@@ -255,6 +254,7 @@ unifdef-y += kd.h
 unifdef-y += kernelcapi.h
 unifdef-y += kernel.h
 unifdef-y += keyboard.h
+unifdef-$(CONFIG_ARCH_SUPPORTS_KVM) += kvm.h
 unifdef-y += llc.h
 unifdef-y += loop.h
 unifdef-y += lp.h
diff -puN include/linux/kvm.h~git-kvm include/linux/kvm.h
--- a/include/linux/kvm.h~git-kvm
+++ a/include/linux/kvm.h
@@ -9,12 +9,10 @@
 
 #include <asm/types.h>
 #include <linux/ioctl.h>
+#include <asm/kvm.h>
 
 #define KVM_API_VERSION 12
 
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
-
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
 	__u32 slot;
@@ -23,17 +21,19 @@ struct kvm_memory_region {
 	__u64 memory_size; /* bytes */
 };
 
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-
-struct kvm_memory_alias {
-	__u32 slot;  /* this has a different namespace than memory slots */
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+	__u32 slot;
 	__u32 flags;
 	__u64 guest_phys_addr;
-	__u64 memory_size;
-	__u64 target_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+
+
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
 	/*
@@ -45,62 +45,18 @@ struct kvm_irq_level {
 	__u32 level;
 };
 
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
-	__u8 last_irr;	/* edge detection */
-	__u8 irr;		/* interrupt request register */
-	__u8 imr;		/* interrupt mask register */
-	__u8 isr;		/* interrupt service register */
-	__u8 priority_add;	/* highest irq priority */
-	__u8 irq_base;
-	__u8 read_reg_select;
-	__u8 poll;
-	__u8 special_mask;
-	__u8 init_state;
-	__u8 auto_eoi;
-	__u8 rotate_on_auto_eoi;
-	__u8 special_fully_nested_mode;
-	__u8 init4;		/* true if 4 byte init */
-	__u8 elcr;		/* PIIX edge/trigger selection */
-	__u8 elcr_mask;
-};
-
-#define KVM_IOAPIC_NUM_PINS  24
-struct kvm_ioapic_state {
-	__u64 base_address;
-	__u32 ioregsel;
-	__u32 id;
-	__u32 irr;
-	__u32 pad;
-	union {
-		__u64 bits;
-		struct {
-			__u8 vector;
-			__u8 delivery_mode:3;
-			__u8 dest_mode:1;
-			__u8 delivery_status:1;
-			__u8 polarity:1;
-			__u8 remote_irr:1;
-			__u8 trig_mode:1;
-			__u8 mask:1;
-			__u8 reserve:7;
-			__u8 reserved[4];
-			__u8 dest_id;
-		} fields;
-	} redirtbl[KVM_IOAPIC_NUM_PINS];
-};
-
-#define KVM_IRQCHIP_PIC_MASTER   0
-#define KVM_IRQCHIP_PIC_SLAVE    1
-#define KVM_IRQCHIP_IOAPIC       2
 
 struct kvm_irqchip {
 	__u32 chip_id;
 	__u32 pad;
         union {
 		char dummy[512];  /* reserving space */
+#ifdef CONFIG_X86
 		struct kvm_pic_state pic;
+#endif
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
 		struct kvm_ioapic_state ioapic;
+#endif
 	} chip;
 };
 
@@ -116,6 +72,7 @@ struct kvm_irqchip {
 #define KVM_EXIT_FAIL_ENTRY       9
 #define KVM_EXIT_INTR             10
 #define KVM_EXIT_SET_TPR          11
+#define KVM_EXIT_TPR_ACCESS       12
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -174,90 +131,17 @@ struct kvm_run {
 			__u32 longmode;
 			__u32 pad;
 		} hypercall;
+		/* KVM_EXIT_TPR_ACCESS */
+		struct {
+			__u64 rip;
+			__u32 is_write;
+			__u32 pad;
+		} tpr_access;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
 };
 
-/* for KVM_GET_REGS and KVM_SET_REGS */
-struct kvm_regs {
-	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
-	__u64 rax, rbx, rcx, rdx;
-	__u64 rsi, rdi, rsp, rbp;
-	__u64 r8,  r9,  r10, r11;
-	__u64 r12, r13, r14, r15;
-	__u64 rip, rflags;
-};
-
-/* for KVM_GET_FPU and KVM_SET_FPU */
-struct kvm_fpu {
-	__u8  fpr[8][16];
-	__u16 fcw;
-	__u16 fsw;
-	__u8  ftwx;  /* in fxsave format */
-	__u8  pad1;
-	__u16 last_opcode;
-	__u64 last_ip;
-	__u64 last_dp;
-	__u8  xmm[16][16];
-	__u32 mxcsr;
-	__u32 pad2;
-};
-
-/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
-#define KVM_APIC_REG_SIZE 0x400
-struct kvm_lapic_state {
-	char regs[KVM_APIC_REG_SIZE];
-};
-
-struct kvm_segment {
-	__u64 base;
-	__u32 limit;
-	__u16 selector;
-	__u8  type;
-	__u8  present, dpl, db, s, l, g, avl;
-	__u8  unusable;
-	__u8  padding;
-};
-
-struct kvm_dtable {
-	__u64 base;
-	__u16 limit;
-	__u16 padding[3];
-};
-
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
-	/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
-	struct kvm_segment cs, ds, es, fs, gs, ss;
-	struct kvm_segment tr, ldt;
-	struct kvm_dtable gdt, idt;
-	__u64 cr0, cr2, cr3, cr4, cr8;
-	__u64 efer;
-	__u64 apic_base;
-	__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
-};
-
-struct kvm_msr_entry {
-	__u32 index;
-	__u32 reserved;
-	__u64 data;
-};
-
-/* for KVM_GET_MSRS and KVM_SET_MSRS */
-struct kvm_msrs {
-	__u32 nmsrs; /* number of msrs in entries */
-	__u32 pad;
-
-	struct kvm_msr_entry entries[0];
-};
-
-/* for KVM_GET_MSR_INDEX_LIST */
-struct kvm_msr_list {
-	__u32 nmsrs; /* number of msrs in entries */
-	__u32 indices[0];
-};
-
 /* for KVM_TRANSLATE */
 struct kvm_translation {
 	/* in */
@@ -302,28 +186,24 @@ struct kvm_dirty_log {
 	};
 };
 
-struct kvm_cpuid_entry {
-	__u32 function;
-	__u32 eax;
-	__u32 ebx;
-	__u32 ecx;
-	__u32 edx;
-	__u32 padding;
-};
-
-/* for KVM_SET_CPUID */
-struct kvm_cpuid {
-	__u32 nent;
-	__u32 padding;
-	struct kvm_cpuid_entry entries[0];
-};
-
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
 	__u32 len;
 	__u8  sigset[0];
 };
 
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+	__u32 enabled;
+	__u32 flags;
+	__u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+	__u64 vapic_addr;
+};
+
 #define KVMIO 0xAE
 
 /*
@@ -347,11 +227,21 @@ struct kvm_signal_mask {
  */
 #define KVM_CAP_IRQCHIP	  0
 #define KVM_CAP_HLT	  1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_EXT_CPUID 5
+#define KVM_CAP_VAPIC 6
 
 /*
  * ioctls for VM fds
  */
 #define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+					struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
@@ -359,6 +249,7 @@ struct kvm_signal_mask {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP	  _IO(KVMIO,  0x60)
 #define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@ struct kvm_signal_mask {
 #define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
 #define KVM_GET_LAPIC             _IOR(KVMIO,  0x8e, struct kvm_lapic_state)
 #define KVM_SET_LAPIC             _IOW(KVMIO,  0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO,  0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
 
 #endif
diff -puN /dev/null include/linux/kvm_host.h
--- /dev/null
+++ a/include/linux/kvm_host.h
@@ -0,0 +1,299 @@
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <asm/signal.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <linux/kvm_types.h>
+
+#include <asm/kvm_host.h>
+
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 8
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_REQ_TLB_FLUSH          0
+#define KVM_REQ_MIGRATE_TIMER      1
+#define KVM_REQ_REPORT_TPR_ACCESS  2
+
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
+
+struct kvm_guest_debug {
+	int enabled;
+	unsigned long bp[4];
+	int singlestep;
+};
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least it's abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+	int                   dev_count;
+#define NR_IOBUS_DEVS 6
+	struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			     struct kvm_io_device *dev);
+
+struct kvm_vcpu {
+	struct kvm *kvm;
+	struct preempt_notifier preempt_notifier;
+	int vcpu_id;
+	struct mutex mutex;
+	int   cpu;
+	struct kvm_run *run;
+	int guest_mode;
+	unsigned long requests;
+	struct kvm_guest_debug guest_debug;
+	int fpu_active;
+	int guest_fpu_loaded;
+	wait_queue_head_t wq;
+	int sigset_active;
+	sigset_t sigset;
+	struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+	int mmio_needed;
+	int mmio_read_completed;
+	int mmio_is_write;
+	int mmio_size;
+	unsigned char mmio_data[8];
+	gpa_t mmio_phys_addr;
+#endif
+
+	struct kvm_vcpu_arch arch;
+};
+
+struct kvm_memory_slot {
+	gfn_t base_gfn;
+	unsigned long npages;
+	unsigned long flags;
+	unsigned long *rmap;
+	unsigned long *dirty_bitmap;
+	unsigned long userspace_addr;
+	int user_alloc;
+};
+
+struct kvm {
+	struct mutex lock; /* protects the vcpus array and APIC accesses */
+	spinlock_t mmu_lock;
+	struct mm_struct *mm; /* userspace tied to this vm */
+	int nmemslots;
+	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+					KVM_PRIVATE_MEM_SLOTS];
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	struct list_head vm_list;
+	struct file *filp;
+	struct kvm_io_bus mmio_bus;
+	struct kvm_io_bus pio_bus;
+	struct kvm_vm_stat stat;
+	struct kvm_arch arch;
+};
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...)					\
+ do {									\
+	if (printk_ratelimit())						\
+		printk(KERN_ERR "kvm: %i: cpu%i " fmt,			\
+		       current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while (0)
+
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+void decache_vcpus_on_cpu(int cpu);
+
+
+int kvm_init(void *opaque, unsigned int vcpu_size,
+		  struct module *module);
+void kvm_exit(void);
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+
+extern struct page *bad_page;
+
+int is_error_page(struct page *page);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+			  struct kvm_userspace_memory_region *mem,
+			  int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem,
+			    int user_alloc);
+int kvm_arch_set_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				struct kvm_memory_slot old,
+				int user_alloc);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+			int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+			 int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+		    unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+			struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+				struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+				   struct
+				   kvm_userspace_memory_region *mem,
+				   int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+				    struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+				    struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+struct  kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+static inline void kvm_guest_enter(void)
+{
+	account_system_vtime(current);
+	current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+	account_system_vtime(current);
+	current->flags &= ~PF_VCPU;
+}
+
+static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	return slot - kvm->memslots;
+}
+
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+{
+	return (gpa_t)gfn << PAGE_SHIFT;
+}
+
+static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
+enum kvm_stat_kind {
+	KVM_STAT_VM,
+	KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+	const char *name;
+	int offset;
+	enum kvm_stat_kind kind;
+	struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+
+#endif
diff -puN include/linux/kvm_para.h~git-kvm include/linux/kvm_para.h
--- a/include/linux/kvm_para.h~git-kvm
+++ a/include/linux/kvm_para.h
@@ -2,72 +2,30 @@
 #define __LINUX_KVM_PARA_H
 
 /*
- * Guest OS interface for KVM paravirtualization
- *
- * Note: this interface is totally experimental, and is certain to change
- *       as we make progress.
+ * This header file provides a method for making a hypercall to the host
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
  */
 
-/*
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- *
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
- */
-struct kvm_vcpu_para_state {
-	/*
-	 * API version information for compatibility. If there's any support
-	 * mismatch (too old host trying to execute too new guest) then
-	 * the host will deny entry into paravirtual mode. Any other
-	 * combination (new host + old guest and new host + new guest)
-	 * is supposed to work - new host versions will support all old
-	 * guest API versions.
-	 */
-	u32 guest_version;
-	u32 host_version;
-	u32 size;
-	u32 ret;
-
-	/*
-	 * The address of the vm exit instruction (VMCALL or VMMCALL),
-	 * which the host will patch according to the CPU model the
-	 * VM runs on:
-	 */
-	u64 hypercall_gpa;
-
-} __attribute__ ((aligned(PAGE_SIZE)));
+/* Return values for hypercalls */
+#define KVM_ENOSYS		1000
 
-#define KVM_PARA_API_VERSION 1
+#define KVM_HC_VAPIC_POLL_IRQ            1
 
 /*
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
- * either.
+ * Hypercalls are implemented by architecture-specific code.
  */
-#define MSR_KVM_API_MAGIC 0x87655678
+#include <asm/kvm_para.h>
 
-#define KVM_EINVAL 1
-
-/*
- * Hypercall calling convention:
- *
- * Each hypercall may have 0-6 parameters.
- *
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- *
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- *
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
- *
- * No registers are clobbered by the hypercall, except that the
- * return value is in RAX.
- */
-#define __NR_hypercalls			0
+#ifdef __KERNEL__
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+	if (kvm_arch_para_features() & (1UL << feature))
+		return 1;
+	return 0;
+}
+#endif /* __KERNEL__ */
+#endif /* __LINUX_KVM_PARA_H */
 
-#endif
diff -puN /dev/null include/linux/kvm_types.h
--- /dev/null
+++ a/include/linux/kvm_types.h
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+#include <asm/types.h>
+
+/*
+ * Address types:
+ *
+ *  gva - guest virtual address
+ *  gpa - guest physical address
+ *  gfn - guest frame number
+ *  hva - host virtual address
+ *  hpa - host physical address
+ *  hfn - host frame number
+ */
+
+typedef unsigned long  gva_t;
+typedef u64            gpa_t;
+typedef unsigned long  gfn_t;
+
+typedef unsigned long  hva_t;
+typedef u64            hpa_t;
+typedef unsigned long  hfn_t;
+
+struct kvm_pio_request {
+	unsigned long count;
+	int cur_count;
+	struct page *guest_pages[2];
+	unsigned guest_page_offset;
+	int in;
+	int port;
+	int size;
+	int string;
+	int down;
+	int rep;
+};
+
+#endif /* __KVM_TYPES_H__ */
diff -puN kernel/fork.c~git-kvm kernel/fork.c
--- a/kernel/fork.c~git-kvm
+++ a/kernel/fork.c
@@ -392,6 +392,7 @@ void fastcall __mmdrop(struct mm_struct 
 	destroy_context(mm);
 	free_mm(mm);
 }
+EXPORT_SYMBOL_GPL(__mmdrop);
 
 /*
  * Decrement the use count and release all resources for an mm.
diff -puN /dev/null virt/kvm/ioapic.c
--- /dev/null
+++ a/virt/kvm/ioapic.c
@@ -0,0 +1,403 @@
+/*
+ *  Copyright (C) 2001  MandrakeSoft S.A.
+ *
+ *    MandrakeSoft S.A.
+ *    43, rue d'Aboukir
+ *    75002 Paris - France
+ *    http://www.linux-mandrake.com/
+ *    http://www.mandrakesoft.com/
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Yunhong Jiang <yunhong.jiang@intel.com>
+ *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *  Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+					  unsigned long addr,
+					  unsigned long length)
+{
+	unsigned long result = 0;
+
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+			  | (IOAPIC_VERSION_ID & 0xff));
+		break;
+	/* The id and arbitration id registers both report the 4-bit id. */
+	case IOAPIC_REG_APIC_ID:
+	case IOAPIC_REG_ARB_ID:
+		result = ((ioapic->id & 0xf) << 24);
+		break;
+	/* Redirection table; ioregsel comes from mmio writes, so bound it. */
+	default:
+		{
+			u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+			u64 redir_content = ~0ULL; /* non-existent register */
+
+			if (redir_index < IOAPIC_NUM_PINS)
+				redir_content =
+					ioapic->redirtbl[redir_index].bits;
+			result = (ioapic->ioregsel & 0x1) ?
+			    (redir_content >> 32) & 0xffffffff :
+			    redir_content & 0xffffffff;
+			break;
+		}
+	}
+
+	return result;
+}
+
+static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+{
+	union ioapic_redir_entry *pent;
+
+	pent = &ioapic->redirtbl[idx];
+
+	if (!pent->fields.mask) {
+		ioapic_deliver(ioapic, idx);
+		if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+			pent->fields.remote_irr = 1;
+	}
+	if (!pent->fields.trig_mode)
+		ioapic->irr &= ~(1 << idx);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+	unsigned index;
+
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		/* Writes are ignored. */
+		break;
+
+	case IOAPIC_REG_APIC_ID:
+		ioapic->id = (val >> 24) & 0xf;
+		break;
+
+	case IOAPIC_REG_ARB_ID:
+		break;
+
+	default:
+		index = (ioapic->ioregsel - 0x10) >> 1;
+
+		ioapic_debug("change redir index %x val %x\n", index, val);
+		if (index >= IOAPIC_NUM_PINS)
+			return;
+		if (ioapic->ioregsel & 1) {
+			ioapic->redirtbl[index].bits &= 0xffffffff;
+			ioapic->redirtbl[index].bits |= (u64) val << 32;
+		} else {
+			ioapic->redirtbl[index].bits &= ~0xffffffffULL;
+			ioapic->redirtbl[index].bits |= (u32) val;
+			ioapic->redirtbl[index].fields.remote_irr = 0;
+		}
+		if (ioapic->irr & (1 << index))
+			ioapic_service(ioapic, index);
+		break;
+	}
+}
+
+static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
+			   struct kvm_vcpu *vcpu,
+			   u8 vector, u8 trig_mode, u8 delivery_mode)
+{
+	ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
+		     delivery_mode);
+
+	ASSERT((delivery_mode == IOAPIC_FIXED) ||
+	       (delivery_mode == IOAPIC_LOWEST_PRIORITY));
+
+	kvm_apic_set_irq(vcpu, vector, trig_mode);
+}
+
+static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				       u8 dest_mode)
+{
+	u32 mask = 0;
+	int i;
+	struct kvm *kvm = ioapic->kvm;
+	struct kvm_vcpu *vcpu;
+
+	ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
+	/* Each bit i set in the returned mask selects kvm->vcpus[i]. */
+	if (dest_mode == 0) {	/* Physical mode. */
+		if (dest == 0xFF) {	/* Broadcast. */
+			for (i = 0; i < KVM_MAX_VCPUS; ++i)
+				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
+					mask |= 1 << i;
+			return mask;
+		}
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			/* Never hand a NULL apic to the matcher. */
+			if (!vcpu || !vcpu->arch.apic)
+				continue;
+			if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
+				mask = 1 << i;
+				break;
+			}
+		}
+	} else if (dest != 0)	/* Logical mode, MDA non-zero. */
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (vcpu->arch.apic &&
+			    kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
+				mask |= 1 << i;	/* array index, as above */
+		}
+	ioapic_debug("mask %x\n", mask);
+	return mask;
+}
+
+static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+	u8 dest = ioapic->redirtbl[irq].fields.dest_id;
+	u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
+	u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
+	u8 vector = ioapic->redirtbl[irq].fields.vector;
+	u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
+	u32 deliver_bitmask;
+	struct kvm_vcpu *vcpu;
+	int vcpu_id;
+
+	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+		     "vector=%x trig_mode=%x\n",
+		     dest, dest_mode, delivery_mode, vector, trig_mode);
+
+	deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
+	if (!deliver_bitmask) {
+		ioapic_debug("no target on destination\n");
+		return;
+	}
+
+	switch (delivery_mode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+				deliver_bitmask);
+		if (vcpu != NULL)
+			ioapic_inj_irq(ioapic, vcpu, vector,
+				       trig_mode, delivery_mode);
+		else
+			ioapic_debug("null lowest prio vcpu: "
+				     "mask=%x vector=%x delivery_mode=%x\n",
+				     deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
+		break;
+	case IOAPIC_FIXED:
+		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+			if (!(deliver_bitmask & (1 << vcpu_id)))
+				continue;
+			deliver_bitmask &= ~(1 << vcpu_id);
+			vcpu = ioapic->kvm->vcpus[vcpu_id];
+			if (vcpu) {
+				ioapic_inj_irq(ioapic, vcpu, vector,
+					       trig_mode, delivery_mode);
+			}
+		}
+		break;
+
+		/* TODO: NMI */
+	default:
+		printk(KERN_WARNING "Unsupported delivery mode %d\n",
+		       delivery_mode);
+		break;
+	}
+}
+
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+{
+	u32 old_irr = ioapic->irr;
+	union ioapic_redir_entry entry;
+
+	/* Out-of-range pins are ignored; only shift by a checked irq. */
+	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+		entry = ioapic->redirtbl[irq];
+		level ^= entry.fields.polarity;
+		if (!level)
+			ioapic->irr &= ~(1 << irq);
+		else {
+			ioapic->irr |= 1 << irq;
+			if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
+			    || !entry.fields.remote_irr)
+				ioapic_service(ioapic, irq);
+		}
+	}
+}
+
+static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+{
+	int i;
+
+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
+		if (ioapic->redirtbl[i].fields.vector == vector)
+			return i;
+	return -1;
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	union ioapic_redir_entry *ent;
+	int gsi;
+
+	gsi = get_eoi_gsi(ioapic, vector);
+	if (gsi == -1) {
+		printk(KERN_WARNING "Can't find redir item for %d EOI\n",
+		       vector);
+		return;
+	}
+
+	ent = &ioapic->redirtbl[gsi];
+	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+
+	ent->fields.remote_irr = 0;
+	if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
+		ioapic_deliver(ioapic, gsi);
+}
+
+static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+
+	return ((addr >= ioapic->base_address &&
+		 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+			     void *val)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+	u32 result;
+
+	ioapic_debug("addr %lx\n", (unsigned long)addr);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+
+	addr &= 0xff;
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		result = ioapic->ioregsel;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		result = ioapic_read_indirect(ioapic, addr, len);
+		break;
+
+	default:
+		result = 0;
+		break;
+	}
+	switch (len) {
+	case 8:
+		*(u64 *) val = result;
+		break;
+	case 1:
+	case 2:
+	case 4:
+		memcpy(val, (char *)&result, len);
+		break;
+	default:
+		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+	}
+}
+
+static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+			      const void *val)
+{
+	struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
+	u32 data;
+
+	ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+		     (void*)addr, len, val);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+	if (len == 4 || len == 8)
+		data = *(u32 *) val;
+	else {
+		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+		return;
+	}
+
+	addr &= 0xff;
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		ioapic->ioregsel = data;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		ioapic_write_indirect(ioapic, data);
+		break;
+#ifdef	CONFIG_IA64
+	case IOAPIC_REG_EOI:
+		kvm_ioapic_update_eoi(ioapic->kvm, data);
+		break;
+#endif
+
+	default:
+		break;
+	}
+}
+
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+	int i;
+
+	for (i = 0; i < IOAPIC_NUM_PINS; i++)
+		ioapic->redirtbl[i].fields.mask = 1;
+	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+	ioapic->ioregsel = 0;
+	ioapic->irr = 0;
+	ioapic->id = 0;
+}
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic;
+
+	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+	if (!ioapic)
+		return -ENOMEM;
+	kvm->arch.vioapic = ioapic;
+	kvm_ioapic_reset(ioapic);
+	ioapic->dev.read = ioapic_mmio_read;
+	ioapic->dev.write = ioapic_mmio_write;
+	ioapic->dev.in_range = ioapic_in_range;
+	ioapic->dev.private = ioapic;
+	ioapic->kvm = kvm;
+	kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
+	return 0;
+}
diff -puN /dev/null virt/kvm/ioapic.h
--- /dev/null
+++ a/virt/kvm/ioapic.h
@@ -0,0 +1,95 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
+#define IOAPIC_EDGE_TRIG  0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
+#define IOAPIC_MEM_LENGTH            0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT  0x00
+#define IOAPIC_REG_WINDOW  0x10
+#define IOAPIC_REG_EOI     0x40	/* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00	/* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID  0x02	/* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define	IOAPIC_FIXED			0x0
+#define	IOAPIC_LOWEST_PRIORITY		0x1
+#define	IOAPIC_PMI			0x2
+#define	IOAPIC_NMI			0x4
+#define	IOAPIC_INIT			0x5
+#define	IOAPIC_EXTINT			0x7
+
+struct kvm_ioapic {
+	u64 base_address;
+	u32 ioregsel;
+	u32 id;
+	u32 irr;
+	u32 pad;
+	union ioapic_redir_entry {
+		u64 bits;
+		struct {
+			u8 vector;
+			u8 delivery_mode:3;
+			u8 dest_mode:1;
+			u8 delivery_status:1;
+			u8 polarity:1;
+			u8 remote_irr:1;
+			u8 trig_mode:1;
+			u8 mask:1;
+			u8 reserve:7;
+			u8 reserved[4];
+			u8 dest_id;
+		} fields;
+	} redirtbl[IOAPIC_NUM_PINS];
+	struct kvm_io_device dev;
+	struct kvm *kvm;
+};
+
+#ifdef DEBUG
+#define ASSERT(x)  							\
+do {									\
+	if (!(x)) {							\
+		printk(KERN_EMERG "assertion failed %s: %d: %s\n",	\
+		       __FILE__, __LINE__, #x);				\
+		BUG();							\
+	}								\
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+	return kvm->arch.vioapic;
+}
+
+#ifdef CONFIG_IA64
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return 1;
+}
+#endif
+
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+
+#endif
diff -puN /dev/null virt/kvm/iodev.h
--- /dev/null
+++ a/virt/kvm/iodev.h
@@ -0,0 +1,63 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+
+struct kvm_io_device {
+	void (*read)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     void *val);
+	void (*write)(struct kvm_io_device *this,
+		      gpa_t addr,
+		      int len,
+		      const void *val);
+	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+	void (*destructor)(struct kvm_io_device *this);
+
+	void             *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+				     gpa_t addr,
+				     int len,
+				     void *val)
+{
+	dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+				      gpa_t addr,
+				      int len,
+				      const void *val)
+{
+	dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+	return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	if (dev->destructor)
+		dev->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
diff -puN /dev/null virt/kvm/kvm_main.c
--- /dev/null
+++ a/virt/kvm/kvm_main.c
@@ -0,0 +1,1400 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+DEFINE_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+
+static cpumask_t cpus_hardware_enabled;
+
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
+static struct dentry *debugfs_dir;
+
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+			   unsigned long arg);
+
+static inline int valid_vcpu(int n)
+{
+	return likely(n >= 0 && n < KVM_MAX_VCPUS);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+void vcpu_load(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+
+	mutex_lock(&vcpu->mutex);
+	cpu = get_cpu();
+	preempt_notifier_register(&vcpu->preempt_notifier);
+	kvm_arch_vcpu_load(vcpu, cpu);
+	put_cpu();
+}
+
+void vcpu_put(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	kvm_arch_vcpu_put(vcpu);
+	preempt_notifier_unregister(&vcpu->preempt_notifier);
+	preempt_enable();
+	mutex_unlock(&vcpu->mutex);
+}
+
+static void ack_flush(void *_completed)
+{
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+	int i, cpu;
+	cpumask_t cpus;
+	struct kvm_vcpu *vcpu;
+
+	cpus_clear(cpus);
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		vcpu = kvm->vcpus[i];
+		if (!vcpu)
+			continue;
+		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+			continue;
+		cpu = vcpu->cpu;
+		if (cpu != -1 && cpu != raw_smp_processor_id())
+			cpu_set(cpu, cpus);
+	}
+	if (cpus_empty(cpus))
+		return;
+	++kvm->stat.remote_tlb_flush;
+	smp_call_function_mask(cpus, ack_flush, NULL, 1);
+}
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
+	struct page *page;
+	int r;
+
+	mutex_init(&vcpu->mutex);
+	vcpu->cpu = -1;
+	vcpu->kvm = kvm;
+	vcpu->vcpu_id = id;
+	init_waitqueue_head(&vcpu->wq);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		r = -ENOMEM;
+		goto fail;
+	}
+	vcpu->run = page_address(page);
+
+	r = kvm_arch_vcpu_init(vcpu);
+	if (r < 0)
+		goto fail_free_run;
+	return 0;
+
+fail_free_run:
+	free_page((unsigned long)vcpu->run);
+fail:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_uninit(vcpu);
+	free_page((unsigned long)vcpu->run);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+static struct kvm *kvm_create_vm(void)
+{
+	struct kvm *kvm = kvm_arch_create_vm();
+
+	if (IS_ERR(kvm))
+		goto out;
+
+	kvm->mm = current->mm;
+	atomic_inc(&kvm->mm->mm_count);
+	spin_lock_init(&kvm->mmu_lock);
+	kvm_io_bus_init(&kvm->pio_bus);
+	mutex_init(&kvm->lock);
+	kvm_io_bus_init(&kvm->mmio_bus);
+	spin_lock(&kvm_lock);
+	list_add(&kvm->vm_list, &vm_list);
+	spin_unlock(&kvm_lock);
+out:
+	return kvm;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+				  struct kvm_memory_slot *dont)
+{
+	if (!dont || free->rmap != dont->rmap)
+		vfree(free->rmap);
+
+	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+		vfree(free->dirty_bitmap);
+
+	free->npages = 0;
+	free->dirty_bitmap = NULL;
+	free->rmap = NULL;
+}
+
+void kvm_free_physmem(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->nmemslots; ++i)
+		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+}
+
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+	struct mm_struct *mm = kvm->mm;
+
+	spin_lock(&kvm_lock);
+	list_del(&kvm->vm_list);
+	spin_unlock(&kvm_lock);
+	kvm_io_bus_destroy(&kvm->pio_bus);
+	kvm_io_bus_destroy(&kvm->mmio_bus);
+	kvm_arch_destroy_vm(kvm);
+	mmdrop(mm);
+}
+
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+{
+	struct kvm *kvm = filp->private_data;
+
+	kvm_destroy_vm(kvm);
+	return 0;
+}
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ *
+ * Must be called holding mmap_sem for write.
+ */
+int __kvm_set_memory_region(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem,
+			    int user_alloc)
+{
+	int r;
+	gfn_t base_gfn;
+	unsigned long npages;
+	unsigned long i;
+	struct kvm_memory_slot *memslot;
+	struct kvm_memory_slot old, new;
+
+	r = -EINVAL;
+	/* General sanity checks */
+	if (mem->memory_size & (PAGE_SIZE - 1))
+		goto out;
+	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+		goto out;
+	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+		goto out;
+	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+		goto out;
+
+	memslot = &kvm->memslots[mem->slot];
+	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+	npages = mem->memory_size >> PAGE_SHIFT;
+
+	if (!npages)
+		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+	new = old = *memslot;
+
+	new.base_gfn = base_gfn;
+	new.npages = npages;
+	new.flags = mem->flags;
+
+	/* Disallow changing a memory slot's size. */
+	r = -EINVAL;
+	if (npages && old.npages && npages != old.npages)
+		goto out_free;
+
+	/* Check for overlaps */
+	r = -EEXIST;
+	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+		struct kvm_memory_slot *s = &kvm->memslots[i];
+
+		if (s == memslot)
+			continue;
+		if (!((base_gfn + npages <= s->base_gfn) ||
+		      (base_gfn >= s->base_gfn + s->npages)))
+			goto out_free;
+	}
+
+	/* Free page dirty bitmap if unneeded */
+	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+		new.dirty_bitmap = NULL;
+
+	r = -ENOMEM;
+
+	/* Allocate if a slot is being created */
+	if (npages && !new.rmap) {
+		new.rmap = vmalloc(npages * sizeof(struct page *));
+
+		if (!new.rmap)
+			goto out_free;
+
+		memset(new.rmap, 0, npages * sizeof(*new.rmap));
+
+		new.user_alloc = user_alloc;
+		new.userspace_addr = mem->userspace_addr;
+	}
+
+	/* Allocate page dirty bitmap if needed */
+	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+
+		new.dirty_bitmap = vmalloc(dirty_bytes);
+		if (!new.dirty_bitmap)
+			goto out_free;
+		memset(new.dirty_bitmap, 0, dirty_bytes);
+	}
+
+	if (mem->slot >= kvm->nmemslots)
+		kvm->nmemslots = mem->slot + 1;
+
+	*memslot = new;
+
+	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
+	if (r) {
+		*memslot = old;
+		goto out_free;
+	}
+
+	kvm_free_physmem_slot(&old, &new);
+	return 0;
+
+out_free:
+	kvm_free_physmem_slot(&new, &old);
+out:
+	return r;
+
+}
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+
+int kvm_set_memory_region(struct kvm *kvm,
+			  struct kvm_userspace_memory_region *mem,
+			  int user_alloc)
+{
+	int ret;
+	/* Run the unlocked variant with mmap_sem held for writing. */
+	down_write(&current->mm->mmap_sem);
+	ret = __kvm_set_memory_region(kvm, mem, user_alloc);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+
+/* ioctl-side entry: slot numbers at or above KVM_MEMORY_SLOTS get -EINVAL. */
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+				   struct kvm_userspace_memory_region *mem,
+				   int user_alloc)
+{
+	if (mem->slot < KVM_MEMORY_SLOTS)
+		return kvm_set_memory_region(kvm, mem, user_alloc);
+	return -EINVAL;
+}
+
+/*
+ * Copy the dirty bitmap of slot log->slot to userspace.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range slot, -ENOENT when the
+ * slot has no dirty bitmap and -EFAULT when the copy to userspace fails.
+ * *is_dirty is set to 1 if any bitmap word is non-zero; it is never cleared.
+ */
+int kvm_get_dirty_log(struct kvm *kvm,
+			struct kvm_dirty_log *log, int *is_dirty)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long any = 0;
+	int i, n;
+
+	if (log->slot >= KVM_MEMORY_SLOTS)
+		return -EINVAL;
+
+	memslot = &kvm->memslots[log->slot];
+	if (!memslot->dirty_bitmap)
+		return -ENOENT;
+
+	/* Bitmap size in bytes, rounded up to a whole number of longs. */
+	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+	/* Stop scanning at the first non-zero word. */
+	for (i = 0; !any && i < n/sizeof(long); ++i)
+		any = memslot->dirty_bitmap[i];
+	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+		return -EFAULT;
+	if (any)
+		*is_dirty = 1;
+	return 0;
+}
+
+int is_error_page(struct page *page)
+{
+	return page == bad_page; /* gfn_to_page() failure marker */
+}
+EXPORT_SYMBOL_GPL(is_error_page);
+
+static inline unsigned long bad_hva(void)
+{
+	return PAGE_OFFSET; /* sentinel tested by kvm_is_error_hva() */
+}
+
+int kvm_is_error_hva(unsigned long addr)
+{
+	return addr == bad_hva(); /* value gfn_to_hva() uses for "no slot" */
+}
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+	int idx;
+	/* Scan slots 0 .. nmemslots-1; the first containing gfn wins. */
+	for (idx = 0; idx < kvm->nmemslots; ++idx) {
+		struct kvm_memory_slot *slot = &kvm->memslots[idx];
+		if (gfn < slot->base_gfn)
+			continue;
+		if (gfn < slot->base_gfn + slot->npages)
+			return slot;
+	}
+	return NULL;
+}
+
+/* Slot lookup for a possibly aliased gfn: unalias it, then search. */
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_memslot(kvm, unalias_gfn(kvm, gfn));
+}
+
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	int idx;
+	/* Only memslots[0 .. KVM_MEMORY_SLOTS-1] are searched here. */
+	gfn = unalias_gfn(kvm, gfn);
+	for (idx = 0; idx < KVM_MEMORY_SLOTS; ++idx) {
+		struct kvm_memory_slot *slot = &kvm->memslots[idx];
+		if (gfn < slot->base_gfn)
+			continue;
+		if (gfn < slot->base_gfn + slot->npages)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *s;
+	/* Page index within the slot, scaled to bytes, added to its hva. */
+	gfn = unalias_gfn(kvm, gfn);
+	s = __gfn_to_memslot(kvm, gfn);
+	if (s)
+		return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
+	return bad_hva();
+}
+
+/*
+ * Return the page backing gfn with a reference held.  On any failure a
+ * reference to bad_page is taken and bad_page is returned instead, so the
+ * result can always be released the same way.
+ *
+ * Requires current->mm->mmap_sem to be held
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct page *pinned;
+	unsigned long hva;
+
+	might_sleep();
+	hva = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(hva))
+		goto error;
+
+	/* Ask for exactly one page; anything else counts as failure. */
+	if (get_user_pages(current, current->mm, hva, 1, 1, 1, &pinned,
+			   NULL) != 1)
+		goto error;
+	return pinned;
+
+error:
+	get_page(bad_page);
+	return bad_page;
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_page);
+
+void kvm_release_page_clean(struct page *page)
+{
+	put_page(page); /* drop the reference without marking the page dirty */
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+
+void kvm_release_page_dirty(struct page *page)
+{
+	if (!PageReserved(page)) /* reserved pages are left unmarked */
+		SetPageDirty(page);
+	put_page(page); /* then drop the caller's reference */
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+/* How many of len bytes fit in the current page when starting at offset. */
+static int next_segment(unsigned long len, int offset)
+{
+	unsigned long room = PAGE_SIZE - offset;
+
+	return len > room ? room : len;
+}
+
+/*
+ * Copy len bytes at offset in guest page gfn to data; 0 or -EFAULT.
+ */
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+			int len)
+{
+	unsigned long hva = gfn_to_hva(kvm, gfn);
+
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+	if (copy_from_user(data, (void __user *)hva + offset, len))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+
+/* Read len bytes of guest memory at gpa, one in-page segment at a time. */
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int seg, ret;
+
+	for (; (seg = next_segment(len, offset)) != 0; ++gfn) {
+		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;	/* only the first segment can start mid-page */
+		len -= seg;
+		data += seg;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+
+/*
+ * Read len bytes at gpa using __copy_from_user_inatomic().  Only the page
+ * containing gpa is translated; the range is not split per page here.
+ */
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len)
+{
+	int offset = offset_in_page(gpa);
+	unsigned long hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+	if (__copy_from_user_inatomic(data, (void __user *)hva + offset, len))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
+/*
+ * Copy len bytes of data to offset in page gfn; dirties it on success.
+ */
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+			 int offset, int len)
+{
+	unsigned long hva = gfn_to_hva(kvm, gfn);
+
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+	if (copy_to_user((void __user *)hva + offset, data, len))
+		return -EFAULT;
+	mark_page_dirty(kvm, gfn);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+
+/* Write len bytes from data to guest memory at gpa, segment by segment. */
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+		    unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int seg, ret;
+
+	for (; (seg = next_segment(len, offset)) != 0; ++gfn) {
+		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;	/* only the first segment can start mid-page */
+		len -= seg;
+		data += seg;
+	}
+
+	return 0;
+}
+
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{	/* Clears the range by writing it from empty_zero_page. */
+	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+
+/* Clear len bytes of guest memory at gpa, one in-page segment at a time. */
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int seg, ret;
+
+	for (; (seg = next_segment(len, offset)) != 0; ++gfn) {
+		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;	/* only the first segment can start mid-page */
+		len -= seg;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long bit;
+
+	gfn = unalias_gfn(kvm, gfn);
+	slot = __gfn_to_memslot(kvm, gfn);
+	if (!slot || !slot->dirty_bitmap)
+		return;
+	bit = gfn - slot->base_gfn;
+	/* avoid RMW: only write when the bit is not already set */
+	if (!test_bit(bit, slot->dirty_bitmap))
+		set_bit(bit, slot->dirty_bitmap);
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&vcpu->wq, &wait);
+	for (;;) {
+		/*
+		 * Set the task state before testing the wake conditions so a
+		 * wakeup arriving in between is not lost across schedule().
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kvm_cpu_has_interrupt(vcpu) || signal_pending(current) ||
+		    kvm_arch_vcpu_runnable(vcpu))
+			break;
+		vcpu_put(vcpu);
+		schedule();
+		vcpu_load(vcpu);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&vcpu->wq, &wait);
+}
+
+/* Call cond_resched() when a reschedule is pending; vcpu is not used. */
+void kvm_resched(struct kvm_vcpu *vcpu)
+{
+	if (need_resched())
+		cond_resched();
+}
+EXPORT_SYMBOL_GPL(kvm_resched);
+
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+	void *backing;
+	/* pgoff 0 maps vcpu->run, KVM_PIO_PAGE_OFFSET maps the pio data. */
+	if (vmf->pgoff == 0)
+		backing = vcpu->run;
+	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+		backing = vcpu->arch.pio_data;
+	else
+		return VM_FAULT_SIGBUS;
+	vmf->page = virt_to_page(backing);
+	get_page(vmf->page);
+	return 0;
+}
+
+static struct vm_operations_struct kvm_vcpu_vm_ops = {
+	.fault = kvm_vcpu_fault, /* the only vm operation provided */
+};
+
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vcpu_vm_ops; /* pages come from kvm_vcpu_fault() */
+	return 0;
+}
+
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	/* Drop the VM file reference taken in create_vcpu_fd(). */
+	fput(vcpu->kvm->filp);
+	return 0;
+}
+
+static struct file_operations kvm_vcpu_fops = {
+	.release        = kvm_vcpu_release,
+	.unlocked_ioctl = kvm_vcpu_ioctl,
+	.compat_ioctl   = kvm_vcpu_ioctl, /* same handler as unlocked_ioctl */
+	.mmap           = kvm_vcpu_mmap,
+};
+
+/*
+ * Returns a new anon-inode fd for the vcpu; it pins the VM's file.
+ */
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+{
+	struct inode *inode;
+	struct file *file;
+	int fd, err;
+
+	err = anon_inode_getfd(&fd, &inode, &file,
+			       "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+	if (err)
+		return err;
+	atomic_inc(&vcpu->kvm->filp->f_count);
+	return fd;
+}
+
+/*
+ * Create vcpu number n for this VM and return a new fd for it, or an error.
+ */
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
+{
+	int r;
+	struct kvm_vcpu *vcpu;
+
+	if (!valid_vcpu(n))
+		return -EINVAL;
+
+	vcpu = kvm_arch_vcpu_create(kvm, n);
+	if (IS_ERR(vcpu))
+		return PTR_ERR(vcpu);
+
+	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
+	r = kvm_arch_vcpu_setup(vcpu);
+	if (r)
+		goto vcpu_destroy;
+	/* Claim slot n under kvm->lock; fail with -EEXIST if it is taken. */
+	mutex_lock(&kvm->lock);
+	if (kvm->vcpus[n]) {
+		r = -EEXIST;
+		mutex_unlock(&kvm->lock);
+		goto vcpu_destroy;
+	}
+	kvm->vcpus[n] = vcpu;
+	mutex_unlock(&kvm->lock);
+
+	/* Now it's all set up, let userspace reach it */
+	r = create_vcpu_fd(vcpu);
+	if (r < 0)
+		goto unlink;
+	return r;
+	/* Unwind: unlink clears the claimed slot, then the vcpu is destroyed. */
+unlink:
+	mutex_lock(&kvm->lock);
+	kvm->vcpus[n] = NULL;
+	mutex_unlock(&kvm->lock);
+vcpu_destroy:
+	kvm_arch_vcpu_destroy(vcpu);
+	return r;
+}
+
+/* A NULL sigset turns the mask off; SIGKILL/SIGSTOP are always removed. */
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+{
+	vcpu->sigset_active = sigset != NULL;
+	if (sigset) {
+		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		vcpu->sigset = *sigset;
+	}
+	return 0;
+}
+
+static long kvm_vcpu_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+	/* Callers whose mm is not the VM's mm are rejected with -EIO. */
+	if (vcpu->kvm->mm != current->mm)
+		return -EIO;
+	switch (ioctl) {
+	case KVM_RUN:
+		r = -EINVAL;
+		if (arg)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		break;
+	case KVM_GET_REGS: {
+		struct kvm_regs kvm_regs;
+
+		memset(&kvm_regs, 0, sizeof kvm_regs);
+		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_REGS: {
+		struct kvm_regs kvm_regs;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_GET_SREGS: {
+		struct kvm_sregs kvm_sregs;
+
+		memset(&kvm_sregs, 0, sizeof kvm_sregs);
+		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_SREGS: {
+		struct kvm_sregs kvm_sregs;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_TRANSLATE: {
+		struct kvm_translation tr;
+
+		r = -EFAULT;
+		if (copy_from_user(&tr, argp, sizeof tr))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &tr, sizeof tr))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_DEBUG_GUEST: {
+		struct kvm_debug_guest dbg;
+
+		r = -EFAULT;
+		if (copy_from_user(&dbg, argp, sizeof dbg))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_SIGNAL_MASK: {
+		struct kvm_signal_mask __user *sigmask_arg = argp;
+		struct kvm_signal_mask kvm_sigmask;
+		sigset_t sigset, *p;
+		/* p stays NULL when no mask is supplied: the mask is disabled */
+		p = NULL;
+		if (argp) {
+			r = -EFAULT;
+			if (copy_from_user(&kvm_sigmask, argp,
+					   sizeof kvm_sigmask))
+				goto out;
+			r = -EINVAL;
+			if (kvm_sigmask.len != sizeof sigset)
+				goto out;
+			r = -EFAULT;
+			if (copy_from_user(&sigset, sigmask_arg->sigset,
+					   sizeof sigset))
+				goto out;
+			p = &sigset;
+		}
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+		break;
+	}
+	case KVM_GET_FPU: {
+		struct kvm_fpu fpu;
+
+		memset(&fpu, 0, sizeof fpu);
+		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &fpu, sizeof fpu))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_FPU: {
+		struct kvm_fpu fpu;
+
+		r = -EFAULT;
+		if (copy_from_user(&fpu, argp, sizeof fpu))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	default:
+		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+	}
+out:
+	return r;
+}
+
+/*
+ * ioctl handler for VM file descriptors.  Callers whose mm differs from
+ * kvm->mm get -EIO; requests not handled here are passed on to
+ * kvm_arch_vm_ioctl().
+ */
+static long kvm_vm_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	if (kvm->mm != current->mm)
+		return -EIO;
+	switch (ioctl) {
+	case KVM_CREATE_VCPU:
+		/* On success r is the new vcpu fd. */
+		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+		break;
+	case KVM_SET_USER_MEMORY_REGION: {
+		struct kvm_userspace_memory_region region;
+
+		if (copy_from_user(&region, argp, sizeof region)) {
+			r = -EFAULT;
+			break;
+		}
+		/* The trailing 1 is the user_alloc argument. */
+		r = kvm_vm_ioctl_set_memory_region(kvm, &region, 1);
+		break;
+	}
+	case KVM_GET_DIRTY_LOG: {
+		struct kvm_dirty_log log;
+
+		if (copy_from_user(&log, argp, sizeof log)) {
+			r = -EFAULT;
+			break;
+		}
+		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+		break;
+	}
+	default:
+		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+	}
+	return r;
+}
+
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	struct page *page;
+	/* The faulting page offset is used as a guest frame number. */
+	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+		return VM_FAULT_SIGBUS;
+	page = gfn_to_page(kvm, vmf->pgoff);
+	if (!is_error_page(page)) {
+		vmf->page = page;
+		return 0;
+	}
+	kvm_release_page_clean(page);
+	return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault, /* the only vm operation provided */
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops; /* pages come from kvm_vm_fault() */
+	return 0;
+}
+
+static struct file_operations kvm_vm_fops = {
+	.release        = kvm_vm_release,
+	.unlocked_ioctl = kvm_vm_ioctl,
+	.compat_ioctl   = kvm_vm_ioctl, /* same handler as unlocked_ioctl */
+	.mmap           = kvm_vm_mmap,
+};
+
+/*
+ * Create a VM plus its fd; the VM is destroyed if no fd can be had.
+ */
+static int kvm_dev_ioctl_create_vm(void)
+{
+	struct kvm *kvm = kvm_create_vm();
+	struct inode *inode;
+	struct file *file;
+	int fd, r;
+
+	if (IS_ERR(kvm))
+		return PTR_ERR(kvm);
+	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+	if (r) {
+		kvm_destroy_vm(kvm);
+		return r;
+	}
+	kvm->filp = file;	/* create_vcpu_fd() takes references on it */
+	return fd;
+}
+
+/*
+ * ioctl handler for the kvm misc device.
+ *
+ * Requests that take no argument fail with -EINVAL when arg is non-zero;
+ * requests not listed here are handed to kvm_arch_dev_ioctl().
+ */
+static long kvm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
+	switch (ioctl) {
+	case KVM_GET_API_VERSION:
+		if (arg)
+			return -EINVAL;
+		return KVM_API_VERSION;
+	case KVM_CREATE_VM:
+		if (arg)
+			return -EINVAL;
+		/* the result is the new VM's fd, or an error code */
+		return kvm_dev_ioctl_create_vm();
+	case KVM_CHECK_EXTENSION:
+		/* here arg is passed on as a plain value */
+		return kvm_dev_ioctl_check_extension((long)argp);
+	case KVM_GET_VCPU_MMAP_SIZE:
+		if (arg)
+			return -EINVAL;
+		/* kvm_vcpu_fault() backs the run page and the pio page */
+		return 2 * PAGE_SIZE;
+	default:
+		/* everything else is left to the architecture code */
+		return kvm_arch_dev_ioctl(filp, ioctl, arg);
+	}
+}
+
+static struct file_operations kvm_chardev_ops = {
+	.unlocked_ioctl = kvm_dev_ioctl, /* .owner is filled in by kvm_init() */
+	.compat_ioctl   = kvm_dev_ioctl,
+};
+
+static struct miscdevice kvm_dev = {
+	.minor = KVM_MINOR,
+	.name = "kvm",
+	.fops = &kvm_chardev_ops,
+};
+
+static void hardware_enable(void *junk)
+{
+	int cpu = raw_smp_processor_id();
+	/* Nothing to do if this CPU is already marked as enabled. */
+	if (!cpu_isset(cpu, cpus_hardware_enabled)) {
+		cpu_set(cpu, cpus_hardware_enabled);
+		kvm_arch_hardware_enable(NULL);
+	}
+}
+
+static void hardware_disable(void *junk)
+{
+	int cpu = raw_smp_processor_id();
+	/* Nothing to do unless this CPU is marked as enabled. */
+	if (cpu_isset(cpu, cpus_hardware_enabled)) {
+		cpu_clear(cpu, cpus_hardware_enabled);
+		decache_vcpus_on_cpu(cpu);
+		kvm_arch_hardware_disable(NULL);
+	}
+}
+
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+			   void *v)
+{
+	int cpu = (long)v;
+
+	/* The CPU_TASKS_FROZEN variants are handled like the plain events. */
+	switch (val & ~CPU_TASKS_FROZEN) {
+	case CPU_DYING:
+		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+		       cpu);
+		hardware_disable(NULL);
+		break;
+	case CPU_UP_CANCELED:
+		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+		       cpu);
+		smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
+		break;
+	case CPU_ONLINE:
+		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+		       cpu);
+		smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+		      void *v)
+{
+	if (val != SYS_RESTART)
+		return NOTIFY_OK;
+	/*
+	 * Some BIOSes hang on reboot if the CPU is still in vmx root
+	 * mode, so virtualization is switched off on every CPU first.
+	 */
+	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+	on_each_cpu(hardware_disable, NULL, 0, 1);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_reboot_notifier = {
+	.notifier_call = kvm_reboot, /* registered in kvm_init() */
+	.priority = 0,
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+	memset(bus, 0, sizeof(*bus)); /* zeroes dev_count and the devs[] array */
+}
+
+/*
+ * Run the destructor of every device registered on the bus.
+ */
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+	int idx;
+
+	for (idx = 0; idx < bus->dev_count; idx++)
+		kvm_iodevice_destructor(bus->devs[idx]);
+}
+
+/* First registered device whose in_range() accepts addr, or NULL. */
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+	struct kvm_io_device *dev;
+	int idx;
+
+	for (idx = 0; idx < bus->dev_count; idx++) {
+		dev = bus->devs[idx];
+		if (dev->in_range(dev, addr))
+			return dev;
+	}
+	return NULL;
+}
+
+/* Append dev to the bus; BUGs once all NR_IOBUS_DEVS entries are used. */
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+	BUG_ON(bus->dev_count >= NR_IOBUS_DEVS);
+	bus->devs[bus->dev_count++] = dev;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+	.notifier_call = kvm_cpu_hotplug, /* registered in kvm_init() */
+	.priority = 20, /* must be > scheduler priority */
+};
+
+static u64 vm_stat_get(void *_offset)
+{
+	unsigned offset = (long)_offset;
+	struct kvm *kvm;
+	u64 sum = 0;
+	/* _offset carries a byte offset into struct kvm, not a pointer. */
+	spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		sum += *(u32 *)((void *)kvm + offset);
+	spin_unlock(&kvm_lock);
+	return sum;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+
+static u64 vcpu_stat_get(void *_offset)
+{
+	unsigned offset = (long)_offset;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm;
+	u64 sum = 0;
+	int idx;
+	/* Add up the u32 at this byte offset in every vcpu of every VM. */
+	spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		for (idx = 0; idx < KVM_MAX_VCPUS; ++idx) {
+			vcpu = kvm->vcpus[idx];
+			if (vcpu)
+				sum += *(u32 *)((void *)vcpu + offset);
+		}
+	spin_unlock(&kvm_lock);
+	return sum;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+
+static struct file_operations *stat_fops[] = { /* indexed by p->kind */
+	[KVM_STAT_VCPU] = &vcpu_stat_fops,
+	[KVM_STAT_VM]   = &vm_stat_fops,
+};
+
+static void kvm_init_debug(void)
+{
+	struct kvm_stats_debugfs_item *it = debugfs_entries;
+	/* One read-only file per entry; the table ends at a NULL name. */
+	debugfs_dir = debugfs_create_dir("kvm", NULL);
+	for (; it->name; ++it)
+		it->dentry = debugfs_create_file(it->name, 0444, debugfs_dir,
+						 (void *)(long)it->offset,
+						 stat_fops[it->kind]);
+}
+
+static void kvm_exit_debug(void)
+{
+	struct kvm_stats_debugfs_item *it = debugfs_entries;
+	/* Remove the per-entry files first, then their directory. */
+	for (; it->name; ++it)
+		debugfs_remove(it->dentry);
+	debugfs_remove(debugfs_dir);
+}
+
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+	hardware_disable(NULL); /* acts on the CPU this runs on */
+	return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+	hardware_enable(NULL); /* acts on the CPU this runs on */
+	return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+	set_kset_name("kvm"),
+	.suspend = kvm_suspend, /* disables virtualization */
+	.resume = kvm_resume, /* enables it again */
+};
+
+static struct sys_device kvm_sysdev = {
+	.id = 0,
+	.cls = &kvm_sysdev_class, /* supplies the suspend/resume hooks */
+};
+
+struct page *bad_page; /* zeroed page from kvm_init(); gfn_to_page() error value */
+
+static inline /* maps an embedded preempt notifier back to its vcpu */
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+	return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+/* sched_in hook of kvm_preempt_ops (set in kvm_init()): calls
+ * kvm_arch_vcpu_load() for the vcpu that embeds pn. */
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+	kvm_arch_vcpu_load(preempt_notifier_to_vcpu(pn), cpu);
+}
+
+/* sched_out hook of kvm_preempt_ops (set in kvm_init()): calls
+ * kvm_arch_vcpu_put() for the vcpu that embeds pn; next is unused. */
+static void kvm_sched_out(struct preempt_notifier *pn,
+			  struct task_struct *next)
+{
+	kvm_arch_vcpu_put(preempt_notifier_to_vcpu(pn));
+}
+
+/*
+ * Common module init; on failure every step already taken is undone.
+ */
+int kvm_init(void *opaque, unsigned int vcpu_size,
+		  struct module *module)
+{
+	int r;
+	int cpu;
+
+	kvm_init_debug();
+	r = kvm_arch_init(opaque);
+	if (r)
+		goto out_fail;
+
+	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (bad_page == NULL) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	r = kvm_arch_hardware_setup();
+	if (r < 0)
+		goto out_free_0;
+
+	for_each_online_cpu(cpu) {
+		smp_call_function_single(cpu,
+				kvm_arch_check_processor_compat,
+				&r, 0, 1);
+		if (r < 0)
+			goto out_free_1;
+	}
+
+	on_each_cpu(hardware_enable, NULL, 0, 1);
+	r = register_cpu_notifier(&kvm_cpu_notifier);
+	if (r)
+		goto out_free_2;
+	register_reboot_notifier(&kvm_reboot_notifier);
+
+	r = sysdev_class_register(&kvm_sysdev_class);
+	if (r)
+		goto out_free_3;
+
+	r = sysdev_register(&kvm_sysdev);
+	if (r)
+		goto out_free_4;
+
+	/* A kmem cache lets us meet the alignment requirements of fx_save. */
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+					   __alignof__(struct kvm_vcpu),
+					   0, NULL);
+	if (!kvm_vcpu_cache) {
+		r = -ENOMEM;
+		goto out_free_5;
+	}
+
+	/* Fill in all hooks before the misc device becomes reachable. */
+	kvm_preempt_ops.sched_in = kvm_sched_in;
+	kvm_preempt_ops.sched_out = kvm_sched_out;
+	kvm_chardev_ops.owner = module;
+	r = misc_register(&kvm_dev);
+	if (r) {
+		printk(KERN_ERR "kvm: misc device register failed\n");
+		goto out_free;
+	}
+	return 0;
+
+out_free:
+	kmem_cache_destroy(kvm_vcpu_cache);
+out_free_5:
+	sysdev_unregister(&kvm_sysdev);
+out_free_4:
+	sysdev_class_unregister(&kvm_sysdev_class);
+out_free_3:
+	unregister_reboot_notifier(&kvm_reboot_notifier);
+	unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_2:
+	on_each_cpu(hardware_disable, NULL, 0, 1);
+out_free_1:
+	kvm_arch_hardware_unsetup();
+out_free_0:
+	__free_page(bad_page);
+out:
+	kvm_arch_exit();
+out_fail:
+	/* The debugfs entries exist even when kvm_arch_init() failed. */
+	kvm_exit_debug();
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init);
+
+void kvm_exit(void)
+{
+	misc_deregister(&kvm_dev); /* same order as kvm_init()'s unwind path */
+	kmem_cache_destroy(kvm_vcpu_cache);
+	sysdev_unregister(&kvm_sysdev);
+	sysdev_class_unregister(&kvm_sysdev_class);
+	unregister_reboot_notifier(&kvm_reboot_notifier);
+	unregister_cpu_notifier(&kvm_cpu_notifier);
+	on_each_cpu(hardware_disable, NULL, 0, 1);
+	kvm_arch_hardware_unsetup();
+	kvm_arch_exit();
+	kvm_exit_debug();
+	__free_page(bad_page); /* freed last here, unlike in kvm_init() */
+}
+EXPORT_SYMBOL_GPL(kvm_exit);
_