GIT e203ad4bcf11981df6fc1677fedbdb29f6fa38e8 git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git#master

commit
Author: Rusty Russell
Date: Tue Jul 31 20:46:12 2007 +1000

KVM: Remove redundant alloc_vmcs_cpu declaration

alloc_vmcs_cpu is already declared (static) above, no need to redeclare.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 8716bbed1f90ec805ca20a0c3264e181278c08cd
Author: Rusty Russell
Date: Tue Jul 31 20:42:42 2007 +1000

KVM: SVM: Make set_msr_interception more reliable

set_msr_interception() is used by svm to set up which MSRs should be intercepted. It can only fail if someone has changed the code to try to intercept an MSR without updating the array of ranges. The return value is ignored anyway: it should just BUG() if it doesn't work. (A build-time failure would be better, but that's tricky).
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit a3573510c9b6a93fffaa118e58494d439c37a17a
Author: Rusty Russell
Date: Tue Jul 31 20:41:14 2007 +1000

KVM: Cleanup mark_page_dirty

For some reason, mark_page_dirty open-codes __gfn_to_memslot().
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 087ba994ef1267032319ff2ec2d8addb8bc5a567
Author: Rusty Russell
Date: Tue Jul 31 20:45:03 2007 +1000

KVM: Don't assign vcpu->cr3 if it's invalid: check first, set last

Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit c53b35b292e58cf234aa7ca08fc679e61d4b291b
Author: Yang, Sheng
Date: Tue Jul 31 14:23:01 2007 +0300

KVM: VMX: Add cpu consistency check

All the physical CPUs on the board should support the same VMX feature set. Add check_processor_compatibility to kvm_arch_ops for the consistency check.
Signed-off-by: Sheng Yang
Signed-off-by: Avi Kivity

commit 2e3bac2a9a2d52b6f349296812c5b752249e3e30
Author: Rusty Russell
Date: Tue Jul 31 19:57:47 2007 +1000

KVM: kvm_vm_ioctl_get_dirty_log restore "nothing dirty" optimization

kvm_vm_ioctl_get_dirty_log scans the bitmap to see if it's all zero, but doesn't use that information. Avi says: Looks like it was used to guard kvm_mmu_slot_remove_write_access(); optimizing the case where the guest just leaves the screen alone (which it usually does, especially in benchmarks). I'd rather reinstate that optimization. See 90cb0529dd230548a7f0d6b315997be854caea1b where the damage was done. It's pretty simple: if the bitmap is all zero, we don't need to do anything to clean it.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 68fa04ca20fb8cf79e171c37bffd74466f12ad2b
Author: Rusty Russell
Date: Mon Jul 30 21:13:43 2007 +1000

KVM: Use alignment properties of vcpu to simplify FPU ops

Now that we use a kmem cache for allocating vcpus, we can get the 16-byte alignment required by the fxsave & fxrstor instructions, and avoid manually aligning the buffer.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 0ce565a6fc253c87f26d51c506cd13554889a598
Author: Rusty Russell
Date: Mon Jul 30 21:12:19 2007 +1000

KVM: Use kmem cache for allocating vcpus

Avi wants the allocations of vcpus centralized again. The easiest way is to add a "size" arg to kvm_init_arch, and expose the thus-prepared cache to the modules.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 29b8a493b293639ae509c44386dc6a8ff79debd0
Author: Laurent Vivier
Date: Mon Jul 30 13:41:19 2007 +0300

KVM: Remove kvm_{read,write}_guest()

... in favor of the more general emulator_{read,write}_*.
Signed-off-by: Laurent Vivier
Signed-off-by: Avi Kivity
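As an aside on the mark_page_dirty cleanup above: the lookup that __gfn_to_memslot() performs is a simple scan of the memory slot array for the slot whose gfn range covers the requested gfn. The sketch below is a minimal, userspace-compilable illustration of that idea only; the structure names, fields and example slot layout are illustrative assumptions, not KVM's actual definitions.

#include <stdio.h>
#include <stddef.h>

typedef unsigned long gfn_t;

struct memslot {
	gfn_t base_gfn;
	unsigned long npages;
};

/* Hypothetical slot layout for the example. */
static struct memslot slots[] = {
	{ .base_gfn = 0x00000, .npages = 0xa0 },   /* low RAM  */
	{ .base_gfn = 0x00100, .npages = 0x3f00 }, /* high RAM */
};

/* Return the slot covering gfn, or NULL if the gfn is unbacked. */
static struct memslot *gfn_to_memslot(gfn_t gfn)
{
	for (size_t i = 0; i < sizeof(slots) / sizeof(slots[0]); i++) {
		struct memslot *s = &slots[i];

		if (gfn >= s->base_gfn && gfn < s->base_gfn + s->npages)
			return s;
	}
	return NULL;
}

int main(void)
{
	gfn_t gfn = 0x1234;
	struct memslot *s = gfn_to_memslot(gfn);

	if (s)
		printf("gfn %#lx -> slot base %#lx, page index %lu\n",
		       gfn, s->base_gfn, gfn - s->base_gfn);
	else
		printf("gfn %#lx is not backed by any slot\n", gfn);
	return 0;
}

The cleanup simply makes mark_page_dirty() call this shared helper instead of open-coding the same scan.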
commit 6d2b86f131a3cbf370b4a65f6a6db63081cb6efb
Author: Laurent Vivier
Date: Mon Jul 30 13:35:24 2007 +0300

KVM: Change the emulator_{read,write,cmpxchg}_* functions to take a vcpu

... instead of an x86_emulate_ctxt, so that other callers can use it easily.
Signed-off-by: Laurent Vivier
Signed-off-by: Avi Kivity

commit 80917728e43e248155c019f743655806b582b099
Author: Avi Kivity
Date: Mon Jul 30 15:56:36 2007 +0300

KVM: x86 emulator: disable writeback for debug register instructions

These are handled internally by the instruction.
Signed-off-by: Avi Kivity

commit 1c23728a5acd3a1fe5d628e23e3e4c27ee77118f
Author: Rusty Russell
Date: Mon Jul 30 20:08:05 2007 +1000

KVM: SVM: internal function name cleanup

Changes some svm.c internal function names:
1) io_adress -> io_address (de-germanify the spelling)
2) kvm_reput_irq -> reput_irq (it's not a generic kvm function)
3) kvm_do_inject_irq -> (it's not a generic kvm function)
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 61736efb5398154eceafcce0337fe0621d7eeeb0
Author: Rusty Russell
Date: Mon Jul 30 20:07:08 2007 +1000

KVM: SVM: de-containization

container_of is wonderful, but not casting at all is better. This patch changes svm.c's internal functions to pass "struct vcpu_svm" instead of passing "struct kvm_vcpu" and using container_of.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit b15c5febefc05f04b5db04552bef18a6902e657c
Author: Rusty Russell
Date: Mon Jul 30 16:41:57 2007 +1000

KVM: Remove three magic numbers

There are several places where hardcoded numbers are used in place of the easily-available constant, which is poor form.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit b21514dab8c88570bf2078249881a8210e50bafa
Author: Rusty Russell
Date: Mon Jul 30 16:31:43 2007 +1000

KVM: VMX: pass vcpu_vmx internally

container_of is wonderful, but not casting at all is better. This patch changes vmx.c's internal functions to pass "struct vcpu_vmx" instead of passing "struct kvm_vcpu" and using container_of.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 7d3fd03221bb8352a263249e6adb1232064e4341
Author: Rusty Russell
Date: Mon Jul 30 16:29:56 2007 +1000

KVM: fx_init() needs preemption disabled while it plays with the FPU state

Now that kvm generally runs with preemption enabled, we need to protect the fpu initialization sequence.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 985bc8087daf3719d89e5ed28fe59eecd58fae71
Author: Shaohua Li
Date: Mon Jul 23 14:51:37 2007 +0800

KVM: Convert vm lock to a mutex

This allows the kvm mmu to perform sleepy operations, such as memory allocation.
Signed-off-by: Shaohua Li
Signed-off-by: Avi Kivity

commit 8928fb48c7a7f9053a55f1d0023cbc533f2b3663
Author: Avi Kivity
Date: Wed Jul 11 18:17:21 2007 +0300

KVM: Use the scheduler preemption notifiers to make kvm preemptible

Current kvm disables preemption while the new virtualization registers are in use. This of course is not very good for latency sensitive workloads (one use of virtualization is to offload user interface and other latency insensitive stuff to a container, so that it is easier to analyze the remaining workload). This patch re-enables preemption for kvm; preemption is now only disabled when switching the registers in and out, and during the switch to guest mode and back. Contains fixes from Shaohua Li.
Signed-off-by: Avi Kivity

commit 510144c386fb650a5530311721ae9d90bf12eaee
Author: Yang, Sheng
Date: Sun Jul 29 11:07:42 2007 +0300

KVM: VMX: Improve the method of writing vmcs control

Put the cpu feature detecting part in hardware_setup, and store the vmcs condition in a global variable for further checks.
Signed-off-by: Sheng Yang
Signed-off-by: Avi Kivity

commit fbc4f2e23aa26a8537f8f147c75a632e498c39c7
Author: Rusty Russell
Date: Fri Jul 27 17:16:56 2007 +1000

KVM: Dynamically allocate vcpus

This patch converts the vcpus array in "struct kvm" to a pointer array, and changes the "vcpu_create" and "vcpu_setup" hooks into one "vcpu_create" call which does the allocation and initialization of the vcpu (calling back into the kvm_vcpu_init core helper).
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 6532f26b4f39a409475918da47844eaff219f50b
Author: Gregory Haskins
Date: Fri Jul 27 08:13:10 2007 -0400

KVM: Remove arch specific components from the general code

struct kvm_vcpu has vmx-specific members; remove them to a private structure.
Signed-off-by: Gregory Haskins
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 35b8e2b29b372ab285819c3b84d6db1d0165998b
Author: Rusty Russell
Date: Wed Jul 25 13:29:51 2007 +1000

KVM: load_pdptrs() cleanups

load_pdptrs can be handed an invalid cr3, and it should not oops. This can happen because we injected #gp in set_cr3() after we set vcpu->cr3 to the invalid value, or from kvm_vcpu_ioctl_set_sregs(), or memory configuration changes after the guest did set_cr3(). We should also copy the pdpte array once, before checking and assigning, otherwise an SMP guest can potentially alter the values between the check and the set. Finally one nitpick: ret = 1 should be done as late as possible: this allows GCC to check for unset "ret" should the function change in future.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 9cb698bd020974a7e950eca6285254b50b0b64d5
Author: Aurelien Jarno
Date: Wed Jul 25 11:41:57 2007 +0200

KVM: Remove dead code in the cmpxchg instruction emulation

The writeback fixes (02c03a326a5df825cc01de426f72e160db2b9538) left some dead code in the cmpxchg instruction emulation. Remove it.
Signed-off-by: Aurelien Jarno
Signed-off-by: Avi Kivity

commit d9cbd1d77543d731f31e8ea5d1738d4aad81694a
Author: Sheng Yang
Date: Wed Jul 25 12:17:06 2007 +0300

KVM: VMX: Import some constants of vmcs from IA32 SDM

This patch mainly imports some constants and renames two existing vmcs constants according to the IA32 SDM. It also adds two constants to indicate the Lock bit and Enable bit in MSR_IA32_FEATURE_CONTROL, and replaces the hardcoded 5 with these two bits.
Signed-off-by: Sheng Yang
Signed-off-by: Avi Kivity

commit bfa6c62f98bd0602025d7b48e267d817082f5d07
Author: Aurelien Jarno
Date: Wed Jul 25 10:19:54 2007 +0200

KVM: disable writeback for 0x0f 0x01 instructions.

0x0f 0x01 instructions (ie lgdt, lidt, smsw, lmsw and invlpg) do not use writeback. This patch sets no_wb=1 when emulating those instructions. This fixes a regression booting the FreeBSD kernel on AMD.
Signed-off-by: Aurelien Jarno
Signed-off-by: Avi Kivity

commit 24beb1e24843f05c3acfd20fc2fbcf4f5ab18ec7
Author: Shaohua Li
Date: Mon Jul 23 14:51:39 2007 +0800

KVM: Move gfn_to_page out of kmap/unmap pairs

gfn_to_page might sleep with swap support. Move it out of the kmap calls.
Signed-off-by: Shaohua Li
Signed-off-by: Avi Kivity

commit 33c5dfed96a8cb19ccc2e08073ef97e5c731dae3
Author: Avi Kivity
Date: Wed Jul 25 09:22:12 2007 +0300

KVM: Fix removal of nx capability from guest cpuid

Testing the wrong bit caused kvm not to disable nx on the guest when it is disabled on the host (an mmu optimization relies on the nx bits being the same in the guest and host). This allows Windows to boot when nx is disabled on the host (e.g. when host pae is disabled).
Signed-off-by: Avi Kivity

commit 8d4faaba7b1ac40b96709dc244e7d81058918a08
Author: Shaohua Li
Date: Mon Jul 23 14:51:32 2007 +0800

KVM: Hoist kvm_mmu_reload() out of the critical section

vmx_cpu_run doesn't handle errors correctly, and kvm_mmu_reload might sleep with the mutex changes, so move it above the critical section.
Signed-off-by: Shaohua Li
Signed-off-by: Avi Kivity

commit b41e5014dd8712e8de2b656617f7a7a158cd992a
Author: Avi Kivity
Date: Mon Jul 23 18:33:14 2007 +0300

Revert "KVM: Avoid useless memory write when possible"

This reverts commit 8a1449563b3e5ede56b28cc977c8da22a17cdf51. While it does save useless updates, it (probably) defeats the fork detector, causing a massive performance loss.
Signed-off-by: Avi Kivity

commit 4d69bc0c78587849583d63ada004c82dc6277829
Author: Rusty Russell
Date: Mon Jul 23 17:11:02 2007 +1000

KVM: Return if the pdptrs are invalid when the guest turns on PAE.

Don't fall through and turn on PAE in this case.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit e8c2eb98b58dd135b14d87e6dd1d621bc630d919
Author: Rusty Russell
Date: Mon Jul 23 17:08:21 2007 +1000

KVM: Fix unlikely kvm_create vs decache_vcpus_on_cpu race

We add the kvm to the vm_list before initializing the vcpu mutexes, which can be mutex_trylock()'ed by decache_vcpus_on_cpu().
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit aae0954ed6ac2a00ee76fd209aa2a39bb2f43a0c
Author: Avi Kivity
Date: Sun Jul 22 18:48:54 2007 +0300

KVM: Correctly handle writes crossing a page boundary

Writes that are contiguous in virtual memory may not be contiguous in physical memory; so split writes that straddle a page boundary. Thanks to Aurelien for reporting the bug, patient testing, and a fix to this very patch.
Signed-off-by: Aurelien Jarno
Signed-off-by: Avi Kivity

commit 76f0301b5e4d2603d8e1ee5295db29faea660b49
Author: Avi Kivity
Date: Sun Jul 22 15:51:58 2007 +0300

KVM: x86 emulator: fix faulty check for two-byte opcode

Right now, the bug is harmless as we never emulate one-byte 0xb6 or 0xb7. But things may change. Noted by the mysterious Gabriel C.
Signed-off-by: Avi Kivity

commit 86ba3093d785da1d2d1c5ecbf060d91edd7a5092
Author: Avi Kivity
Date: Sun Jul 22 12:32:57 2007 +0300

KVM: Require CONFIG_ANON_INODES

Found by Sebastian Siewior and randconfig.
Signed-off-by: Avi Kivity

commit 6da018860ce19321e25b685b72f3836d243c2137
Author: Avi Kivity
Date: Sat Jul 21 09:00:21 2007 +0300

KVM: MMU: Fix cleaning up the shadow page allocation cache

__free_page() wants a struct page, not a virtual address.
Signed-off-by: Avi Kivity

commit 29530eb22ba3b0baf260e2767cb125b61151ed25
Author: Avi Kivity
Date: Fri Jul 20 12:30:58 2007 +0300

KVM: x86 emulator: fix cmov for writeback changes

The writeback fixes (02c03a326a5df825cc01de426f72e160db2b9538) broke cmov emulation. Fix.
Signed-off-by: Avi Kivity

commit 92bd26eb2a199716ceeb5604b8f9f5ed7e69ac3d
Author: Avi Kivity
Date: Fri Jul 20 08:18:27 2007 +0300

KVM: MMU: Fix oopses with SLUB

The kvm mmu uses page->private on shadow page tables; so does slub, and an oops results. Fix by allocating regular pages for shadows instead of using slub.
Signed-off-by: Avi Kivity
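The page-boundary commit above ("Correctly handle writes crossing a page boundary") boils down to chunking a guest write at page boundaries so each chunk can be translated to its own physical page. The following is an illustrative userspace sketch of that chunking loop, not KVM's actual code; the helper write_one_page() stands in for whatever per-page RAM/MMIO write path a hypervisor would use.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Hypothetical per-page write helper standing in for the real path. */
static void write_one_page(unsigned long gva, const void *data, unsigned long len)
{
	(void)data;
	printf("write %lu bytes at %#lx (page %#lx)\n", len, gva, gva & PAGE_MASK);
}

static void emulator_write(unsigned long gva, const void *data, unsigned long len)
{
	const char *p = data;

	while (len) {
		/* Bytes left before the next page boundary. */
		unsigned long chunk = PAGE_SIZE - (gva & (PAGE_SIZE - 1));

		if (chunk > len)
			chunk = len;
		write_one_page(gva, p, chunk);
		gva += chunk;
		p += chunk;
		len -= chunk;
	}
}

int main(void)
{
	char buf[32] = "crosses a page boundary";

	/* A 32-byte write starting 8 bytes before a page boundary: two chunks. */
	emulator_write(2 * PAGE_SIZE - 8, buf, sizeof(buf));
	return 0;
}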
commit 860852357a6590299a273f1141dbf1871df0b491
Author: Rusty Russell
Date: Tue Jul 17 23:37:17 2007 +1000

KVM: Use standard CR8 flags, and fix TPR definition

Intel manual (and KVM definition) say the TPR is 4 bits wide. Also fix CR8_RESEVED_BITS typo.
Signed-off-by: Rusty Russell
Acked-by: H. Peter Anvin
Signed-off-by: Avi Kivity

commit 56282e5368afbc8ec6eebb6413bbb2ec0733d0ed
Author: Jeff Dike
Date: Tue Jul 17 12:26:59 2007 -0400

KVM: Set exit_reason to KVM_EXIT_MMIO where run->mmio is initialized.

Signed-off-by: Jeff Dike
Signed-off-by: Avi Kivity

commit 7e5437f39897a09e79e69bd0c8d4641f13715cc4
Author: Rusty Russell
Date: Wed Jul 18 13:05:58 2007 +1000

KVM: Trivial: Use standard BITMAP macros, open-code userspace-exposed header

Creating one's own BITMAP macro seems suboptimal: if we use manual arithmetic in the one place exposed to userspace, we can use standard macros elsewhere.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 0dfb860def58bfb2daa000af490ed1986373fea5
Author: Rusty Russell
Date: Tue Jul 17 23:34:16 2007 +1000

Use standard CR4 flags, tighten checking

On this machine (Intel), writing to the CR4 bits 0x00000800 and 0x00001000 causes a GPF. The Intel manual is a little unclear, but AFAICT they're reserved, too. Also fix spelling of CR4_RESEVED_BITS.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 2aee2b5274884f40475fe9ad6a7f7a3d608e0ea4
Author: Rusty Russell
Date: Tue Jul 17 23:32:55 2007 +1000

Use standard CR3 flags, tighten checking

The kernel now has asm/cpu-features.h: use those macros instead of inventing our own. Also spell out definition of CR3_RESEVED_BITS, fix spelling and tighten it for the non-PAE case.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 688e14654b3ffb0292a209c052e7579948b17f27
Author: Rusty Russell
Date: Tue Jul 17 23:19:08 2007 +1000

KVM: Trivial: Use standard CR0 flags macros from asm/cpu-features.h

The kernel now has asm/cpu-features.h: use those macros instead of inventing our own. Also spell out definition of CR0_RESEVED_BITS (no code change) and fix typo.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 0da5e37f4dc3df7a941ddba8863b289863e8dd40
Author: Rusty Russell
Date: Tue Jul 17 23:17:55 2007 +1000

KVM: Trivial: Avoid hardware_disable predeclaration

Don't pre-declare hardware_disable: shuffle the reboot hook down.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 24356bfad9c4b8ba70920153aec00e78698ccb9a
Author: Rusty Russell
Date: Tue Jul 17 23:16:56 2007 +1000

KVM: Trivial: Comment spelling may escape grep

Speling error in comment.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 793551cce1b90fac232e0a38269247815fb0d02a
Author: Rusty Russell
Date: Tue Jul 17 23:16:11 2007 +1000

KVM: Trivial: Make decode_register() static

I have shied away from touching x86_emulate.c (it could definitely use some love, but it is forked from the Xen code, and it would be more productive to cross-merge fixes).
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 53df15a3cae92d4528dc8de21132bed3aa929ca1
Author: Rusty Russell
Date: Tue Jul 17 23:15:29 2007 +1000

KVM: Trivial: Remove unused struct cpu_user_regs declaration

Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit a9531af471c86779d28ba973cf5f54f82cfbdb8d
Author: Rusty Russell
Date: Tue Jul 17 23:12:26 2007 +1000

KVM: Trivial: /dev/kvm interface is no longer experimental.

KVM interface is no longer experimental.
Signed-off-by: Rusty Russell
Signed-off-by: Avi Kivity

commit 817d90b391f6c51d07bf9d6a94778a5957d46f65
Author: Avi Kivity
Date: Tue Jul 17 14:20:30 2007 +0300

KVM: x86 emulator: implement rdmsr and wrmsr

Allow real-mode emulation of rdmsr and wrmsr. This allows smp Windows to boot, presumably for its sipi trampoline.
Signed-off-by: Avi Kivity

commit 66d8a4e4d4bd470216028daabb9d887b73259c96
Author: Avi Kivity
Date: Tue Jul 17 13:04:56 2007 +0300

KVM: Fix memory slot management functions for guest smp

The memory slot management functions were oriented against vcpu 0, where they should be kvm-wide. This causes hangs starting X on guest smp. Fix by making the functions (and resultant tail in the mmu) non-vcpu-specific. Unfortunately this reduces the efficiency of the mmu object cache a bit. We may have to revisit this later.
Signed-off-by: Avi Kivity

commit 4dd0d9a876db49da29185c868cbea6c77c09c600
Author: Eddie Dong
Date: Tue Jul 17 11:52:33 2007 +0300

KVM: In-kernel string pio write support

Add string pio write support to support some version of Windows.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit 7bb566d5c8661a179106579978c0c606e7fa8a93
Author: Avi Kivity
Date: Tue Jul 17 11:45:55 2007 +0300

KVM: Future-proof the exit information union ABI

Note that as the size of struct kvm_run is not part of the ABI, we can add things at the end.
Signed-off-by: Avi Kivity

commit f2973ff11f9f8ef4b90413cea9cedd7f20639e3e
Author: Jeff Dike
Date: Mon Jul 16 15:24:47 2007 -0400

KVM - add hypercall nr to kvm_run

Add the hypercall number to kvm_run and initialize it. This changes the ABI, but as this particular ABI was unusable before this no users are affected.
Signed-off-by: Jeff Dike
Signed-off-by: Avi Kivity

commit 973ae594c1a65936fc09acab412be51d97b703b9
Author: Qing He
Date: Thu Jul 12 12:33:56 2007 +0300

KVM: SMP: Add vcpu_id field in struct vcpu

This patch adds a `vcpu_id' field in `struct vcpu', so we can differentiate BSP and APs without pointer comparison or arithmetic.
Signed-off-by: Qing He
Signed-off-by: Avi Kivity

commit 9f5aa99d6256aa14b64683283ba1c4be910bc67e
Author: Nguyen Anh Quynh
Date: Wed Jul 11 14:30:54 2007 +0300

KVM: Fix *nopage() in kvm_main.c

*nopage() in kvm_main.c should only store the type of mmap() fault if the pointers are not NULL. This patch fixes the problem.
Signed-off-by: Nguyen Anh Quynh
Signed-off-by: Avi Kivity

commit 6287464e41b2b520d78d417f3d1b37aca9202a04
Author: Avi Kivity
Date: Tue Jul 10 17:50:55 2007 +0300

KVM: MMU: Store nx bit for large page shadows

We need to distinguish between large page shadows which have the nx bit set and those which don't. The problem shows up when booting a newer smp Linux kernel, where the trampoline page (which is in real mode, which uses the same shadow pages as large pages) is using the same mapping as a kernel data page, which is mapped using nx, causing kvm to spin on that page.
Signed-off-by: Avi Kivity

commit a737ba627a98f2ae66c308148c9c967c73f13f5d
Author: Avi Kivity
Date: Thu May 24 13:11:41 2007 +0300

KVM: Use CPU_DYING for disabling virtualization

Only at the CPU_DYING stage can we be sure that no user process will be scheduled onto the cpu and oops when trying to use virtualization extensions.
Signed-off-by: Avi Kivity

commit 4fba051d7ec9ec1961f477d9a20311d8432738b7
Author: Avi Kivity
Date: Thu May 24 13:09:41 2007 +0300

KVM: Tune hotplug/suspend IPIs

The hotplug IPIs can be called from the cpu on which we are currently running, so use on_cpu().
Similarly, drop on_each_cpu() for the suspend/resume callbacks, as we're in atomic context here and only one cpu is up anyway.
Signed-off-by: Avi Kivity

commit 63e8e638342401a5fd04ec310c5d0695c645e444
Author: Avi Kivity
Date: Thu May 24 13:03:52 2007 +0300

KVM: Keep track of which cpus have virtualization enabled

By keeping track of which cpus have virtualization enabled, we prevent double-enable or double-disable during hotplug, which causes a fatal oops.
Signed-off-by: Avi Kivity

commit 9b6f4dedfeb83190b6196fe201e2f33c97de1c73
Author: Avi Kivity
Date: Thu May 24 12:42:10 2007 +0300

SMP: Implement on_cpu()

This defines on_cpu() which is similar to smp_call_function_single() except that it works if cpu happens to be the current cpu. Can also be seen as a complement to on_each_cpu() (which also doesn't treat the current cpu specially).
Signed-off-by: Avi Kivity

commit 55971a0f3faab6ecdce1e17dafc6d968f3236ade
Author: Avi Kivity
Date: Thu May 24 12:37:34 2007 +0300

HOTPLUG: Adapt thermal throttle to CPU_DYING

CPU_DYING is notified in atomic context, so no taking mutexes here.
Signed-off-by: Avi Kivity

commit 529bd39d193eeae66a7c0fc3b12169ea566dc0e5
Author: Avi Kivity
Date: Thu May 24 12:33:15 2007 +0300

HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING

CPU_DYING is called in atomic context, so don't try to take any locks.
Signed-off-by: Avi Kivity

commit 33e6f5c2bd102cb43a1e9ae5fe210b0d5f9ac69f
Author: Avi Kivity
Date: Thu May 24 12:23:10 2007 +0300

HOTPLUG: Add CPU_DYING notifier

KVM wants a notification when a cpu is about to die, so it can disable hardware extensions, but at a time when user processes cannot be scheduled on the cpu, so it doesn't try to use virtualization extensions after they have been disabled. This adds a CPU_DYING notification. The notification is called in atomic context on the doomed cpu.
Signed-off-by: Avi Kivity

commit 0d9c57e0a7ee426096af3d79114d23e50ed6d42b
Author: Avi Kivity
Date: Sun Jul 8 11:15:32 2007 +0300

KVM: Fix svm availability check miscompile on i386

Signed-off-by: Avi Kivity

commit 222a35d12ad9ef4f4a97da496f0e038e94681d3b
Author: Avi Kivity
Date: Thu Jun 28 14:15:57 2007 -0400

KVM: Clean up #includes

Remove unnecessary ones, and rearrange the remaining in the standard order.
Signed-off-by: Avi Kivity

commit 41ac4b23696b12fec15191969bc18da42359861d
Author: Avi Kivity
Date: Thu Jun 28 08:38:16 2007 -0400

KVM: Remove kvmfs in favor of the anonymous inodes source

kvm uses a pseudo filesystem, kvmfs, to generate inodes, a job that the new anonymous inodes source does much better.
Cc: Davide Libenzi
Signed-off-by: Avi Kivity

commit cfc329b216bc3e54fe1107e8f714c7b3bc133224
Author: Joerg Roedel
Date: Fri Jun 22 12:29:50 2007 +0300

KVM: SVM: Reliably detect if SVM was disabled by BIOS

This patch adds an implementation to the svm is_disabled function to detect reliably if the BIOS disabled the SVM feature in the CPU. This fixes the issues with kernel panics when loading the kvm-amd module on machines where SVM is available but disabled.
Signed-off-by: Joerg Roedel
Signed-off-by: Avi Kivity

commit a2a8a256f8d4ff1595900b810fea90e5e5911b6d
Author: Avi Kivity
Date: Thu Jun 21 11:54:45 2007 +0300

KVM: VMX: Remove unnecessary code in vmx_tlb_flush()

A vmexit implicitly flushes the tlb; the code is bogus. Noted by Shaohua Li.
Signed-off-by: Avi Kivity

commit 37ebbf17fbf71ec261c57c1404ac7c50ade97c13
Author: Shaohua Li
Date: Wed Jun 20 17:13:26 2007 +0800

KVM: MMU: Fix Wrong tlb flush order

Need to flush the tlb after updating a pte, not before.
Signed-off-by: Shaohua Li
Signed-off-by: Avi Kivity

commit 030421334ae91b7f6302a1cfe9c971a8991b4870
Author: Avi Kivity
Date: Wed Jun 20 11:20:04 2007 +0300

KVM: VMX: Reinitialize the real-mode tss when entering real mode

Protected mode code may have corrupted the real-mode tss, so re-initialize it when switching to real mode.
Signed-off-by: Avi Kivity

commit 8a1449563b3e5ede56b28cc977c8da22a17cdf51
Author: Luca Tettamanti
Date: Tue Jun 19 22:41:38 2007 +0200

KVM: Avoid useless memory write when possible

When writing to normal memory and the memory area is unchanged the write can be safely skipped, avoiding the costly kvm_mmu_pte_write.
Signed-Off-By: Luca Tettamanti
Signed-off-by: Avi Kivity

commit ba9c20c048726037664d303362b688759fdf6e9d
Author: Luca Tettamanti
Date: Tue Jun 19 22:41:20 2007 +0200

KVM: Fix x86 emulator writeback

When the old value and new one are the same the emulator skips the write; this is undesirable when the destination is a MMIO area and the write shall be performed regardless of the previous value. This optimization breaks e.g. a Linux guest APIC compiled without X86_GOOD_APIC. Remove the check and perform the writeback stage in the emulation unless it's explicitly disabled (currently push and some two-byte instructions may disable the writeback).
Signed-Off-By: Luca Tettamanti
Signed-off-by: Avi Kivity

commit 8e770bbe8651e8d13e1d09d426657fbed0fe052a
Author: Eddie Dong
Date: Tue Jun 19 18:05:03 2007 +0300

KVM: Add support for in-kernel pio handlers

Useful for the PIC and PIT.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit ecd01fac443e69a574cb064d44e78ff783a1e1a4
Author: Gregory Haskins
Date: Thu May 31 14:08:58 2007 -0400

KVM: VMX: Fix interrupt checking on lightweight exit

With kernel-injected interrupts, we need to check for interrupts on lightweight exits too.
Signed-off-by: Gregory Haskins
Signed-off-by: Avi Kivity

commit af93971fab7729229a45ecd64c72f56421bbcd0f
Author: Gregory Haskins
Date: Thu May 31 14:08:53 2007 -0400

KVM: Adds support for in-kernel mmio handlers

Signed-off-by: Gregory Haskins
Signed-off-by: Avi Kivity

commit e0d1fb847d117124da53145b2d9b7f4d3da8e82c
Author: Nitin A Kamble
Date: Tue Jun 19 11:21:15 2007 +0300

KVM: Implement emulation of instruction "ret" (opcode 0xc3)

Signed-off-by: Nitin A Kamble
Signed-off-by: Avi Kivity

commit 246e9cd14121973b3c653b990d80bcd1c2163dd5
Author: Nitin A Kamble
Date: Tue Jun 19 11:16:04 2007 +0300

KVM: Implement emulation of "pop reg" instruction (opcode 0x58-0x5f)

For use in real mode.
Signed-off-by: Nitin A Kamble
Signed-off-by: Avi Kivity

commit b0c4137315fc6f711fd3a0fc82aedb61a2536ac9
Author: Avi Kivity
Date: Sun Jun 17 12:24:23 2007 +0300

KVM: Bring local tree in line with origin

Signed-off-by: Avi Kivity

commit 6685637b211ad67bdce21bfd9f91bc888b3acb4f
Author: Avi Kivity
Date: Wed Jun 13 19:55:28 2007 +0300

KVM: VMX: Ensure vcpu time stamp counter is monotonous

If the time stamp counter goes backwards, a guest delay loop can become infinite. This can happen if a vcpu is migrated to another cpu, where the counter has a lower value than the first cpu. Since we're doing an IPI to the first cpu anyway, we can use that to pick up the old tsc, and use that to calculate the adjustment we need to make to the tsc offset.
Signed-off-by: Avi Kivity

commit 8aefa5d7ac55d487af62755545ecc02bc53678af
Author: Avi Kivity
Date: Wed Jun 13 19:43:19 2007 +0300

KVM: Initialize the BSP bit in the APIC_BASE msr correctly

Needs to be set on vcpu 0 only.
Signed-off-by: Avi Kivity

commit 218179e7978af0308bcbd08f6c43bd5b3607a909
Author: Avi Kivity
Date: Tue Jun 12 08:58:13 2007 +0300

KVM: Require a cpu which can set 64-bit values atomically

set_64bit() is not available on 80386 and i486. Noticed by Adrian Bunk.
Signed-off-by: Avi Kivity

commit 74a54c5cfe3a1ea3777964a9e8e7bef119ca549b
Author: Shani Moideen
Date: Mon Jun 11 09:31:33 2007 +0530

KVM: VMX: Replace memset(, 0, PAGESIZE) with clear_page()

Signed-off-by: Shani Moideen
Signed-off-by: Avi Kivity

commit ff4d2f93a9459aa820b56a59e9dbd3967aa407ce
Author: Shani Moideen
Date: Mon Jun 11 09:28:26 2007 +0530

KVM: SVM: Replace memset(, 0, PAGESIZE) with clear_page()

Signed-off-by: Shani Moideen
Signed-off-by: Avi Kivity

commit 3105c9a9a2d5f64c9e67745120b8ee5c205847a3
Author: Avi Kivity
Date: Thu Jun 7 19:18:30 2007 +0300

KVM: Flush remote tlbs when reducing shadow pte permissions

When a vcpu causes a shadow tlb entry to have reduced permissions, it must also clear the tlb on remote vcpus. We do that by:
- setting a bit on the vcpu that requests a tlb flush before the next entry
- if the vcpu is currently executing, we send an ipi to make sure it exits before we continue
Signed-off-by: Avi Kivity

commit 2c3ac418d752e7f73ca0d9081a4377278432d565
Author: Avi Kivity
Date: Thu Jun 7 19:11:53 2007 +0300

KVM: Keep an upper bound of initialized vcpus

That way, we don't need to loop for KVM_MAX_VCPUS for a single vcpu vm.
Signed-off-by: Avi Kivity

commit 7ca30c3f2efbf9ab5ab595d9bc3e0bd3b705aba1
Author: Avi Kivity
Date: Tue Jun 5 16:15:51 2007 +0300

KVM: Emulate hlt on real mode for Intel

This has two use cases: the bios can't boot from disk, and guest smp bootstrap.
Signed-off-by: Avi Kivity

commit e7ebb74dbacc100cfd621157ac63b95e63e3292d
Author: Avi Kivity
Date: Tue Jun 5 15:53:05 2007 +0300

KVM: Move duplicate halt handling code into kvm_main.c

Will soon have a third user.
Signed-off-by: Avi Kivity

commit a80408da7a05e0be2ae99ad47dafd4bb4bc847cd
Author: Avi Kivity
Date: Tue Jun 5 14:37:09 2007 +0300

KVM: Enable guest smp

As we don't support guest tlb shootdown yet, this is only reliable for real-mode guests.
Signed-off-by: Avi Kivity

commit 80b70c068ce4333e5e1242f32f538835a4e5d896
Author: Avi Kivity
Date: Tue Jun 5 14:36:10 2007 +0300

KVM: Fix adding an smp virtual machine to the vm list

If we add the vm once per vcpu, we corrupt the list if the guest has multiple vcpus.
Signed-off-by: Avi Kivity

commit 16fb83998b62717831dca3d913455091c855b3cd
Author: Avi Kivity
Date: Tue Jun 5 12:17:03 2007 +0300

KVM: Fix vcpu freeing for guest smp

A vcpu can pin up to four mmu shadow pages, which means the freeing loop will never terminate. Fix by first unpinning shadow pages on all vcpus, then freeing shadow pages.
Signed-off-by: Avi Kivity

commit 55ae364d6a882c94511db17e8023c8976d44cd2d
Author: Nguyen Anh Quynh
Date: Tue Jun 5 10:35:19 2007 +0300

KVM: Remove unnecessary initialization and checks in mark_page_dirty()

Signed-off-by: Avi Kivity

commit 0ae1aebcc9825fba4d115c197e9c099fd9644caf
Author: Robert P. J. Day
Date: Sun Jun 3 13:35:29 2007 -0400

KVM: Replace C code with call to ARRAY_SIZE() macro.

Signed-off-by: Robert P. J. Day
Signed-off-by: Avi Kivity

commit 4b82b37a35a085a07d9ed84efee06c69655fd3d1
Author: Avi Kivity
Date: Mon Jun 4 15:58:30 2007 +0300

KVM: Lazy guest cr3 switching

Switching the guest paging context may require us to allocate memory, which might fail.
Instead of wiring up error paths everywhere, make context switching lazy and actually do the switch before the next guest entry, where we can return an error if allocation fails.
Signed-off-by: Avi Kivity

commit fa8cfb020b0ef0acef94ddc9035b932308840314
Author: Avi Kivity
Date: Mon Jun 4 11:11:23 2007 +0300

KVM: VMX: Fix asm constraint

"g" can select a memory location, in which case size information is lost and gas needs an instruction suffix. Since the suffix is different for i386 and x86_64, we simply change the constraint to "r".
Signed-off-by: Avi Kivity

commit 63275ba244275719d6fd4d77c10d6b15586aa727
Author: Avi Kivity
Date: Thu May 31 18:28:51 2007 +0300

KVM: MMU: Remove unused large page marker

This has not been used for some time, as the same information is available in the page header.
Signed-off-by: Avi Kivity

commit 21e3670e57c34809d4c141ce1dde4fd8b23a4d60
Author: Avi Kivity
Date: Thu May 31 18:24:09 2007 +0300

KVM: MMU: Don't cache guest access bits in the shadow page table

This was once used to avoid accessing the guest pte when upgrading the shadow pte from read-only to read-write. But usually we need to set the guest pte dirty or accessed bits anyway, so this wasn't really exploited.
Signed-off-by: Avi Kivity

commit 319d035ef290b510edb7f848d41098c31ceaace0
Author: Avi Kivity
Date: Thu May 31 18:20:14 2007 +0300

KVM: MMU: Simplify accessed/dirty/present/nx bit handling

Always set the accessed and dirty bit (since having them cleared causes a read-modify-write cycle), always set the present bit, and copy the nx bit from the guest.
Signed-off-by: Avi Kivity

commit 080e7fd753ec60140ea89ebb0ea94625ae541534
Author: Avi Kivity
Date: Thu May 31 17:17:06 2007 +0300

KVM: MMU: Remove cr0.wp tricks

No longer needed as we do everything in one place.
Signed-off-by: Avi Kivity

commit cc9d465c7a9ef3a109814fa866676f876ff42133
Author: Avi Kivity
Date: Thu May 31 15:46:04 2007 +0300

KVM: MMU: Make setting shadow ptes atomic on i386

Signed-off-by: Avi Kivity

commit 823c30e8740ad71bd9556f3cd235231ad00bfa55
Author: Avi Kivity
Date: Thu May 31 15:23:35 2007 +0300

KVM: Make shadow pte updates atomic

With guest smp, a second vcpu might see partial updates when the first vcpu services a page fault. So delay all updates until we have figured out what the pte should look like. Note that on i386, this is still not completely atomic as a 64-bit write will be split into two on a 32-bit machine.
Signed-off-by: Avi Kivity

commit b7bd6888968e797f2deaa4aa9f98466a2371392b
Author: Avi Kivity
Date: Thu May 31 15:14:09 2007 +0300

KVM: Move shadow pte modifications from set_pte/set_pde to set_pde_common()

We want all shadow pte modifications in one place.
Signed-off-by: Avi Kivity

commit b70ccb0b3fd4ac02c0f6cf5153008c736fa27710
Author: Avi Kivity
Date: Thu May 31 15:08:29 2007 +0300

KVM: MMU: Fold fix_write_pf() into set_pte_common()

This prevents some work from being performed twice, and, more importantly, reduces the number of places where we modify shadow ptes.
Signed-off-by: Avi Kivity

commit ad5555224aa01b2ddcc45ab9f0172b5497a7cd5d
Author: Avi Kivity
Date: Thu May 31 11:56:54 2007 +0300

KVM: MMU: Fold fix_read_pf() into set_pte_common()

Signed-off-by: Avi Kivity

commit 3f1380d422cbd5b9231c3e997e4cbec000e3a08f
Author: Avi Kivity
Date: Thu May 31 11:45:18 2007 +0300

KVM: MMU: Pass the guest pde to set_pte_common

We will need the accessed bit (in addition to the dirty bit) and also write access (for setting the dirty bit) in a future patch.
Signed-off-by: Avi Kivity

commit 5fe13ee0e2b404dd34dea17ec0849b4a940a5755
Author: Avi Kivity
Date: Wed May 30 19:31:17 2007 +0300

KVM: MMU: Move set_pte_common() to pte width dependent code

In preparation for some modifications.
Signed-off-by: Avi Kivity

commit 5ada0f87635fa10a40a22b8b249c3d1fedb79840
Author: Avi Kivity
Date: Wed May 30 14:21:51 2007 +0300

KVM: MMU: Simplify fetch() a little bit

Signed-off-by: Avi Kivity

commit 67310badceaed0519cb8efbe6054d790563ea136
Author: Avi Kivity
Date: Wed May 30 12:34:53 2007 +0300

KVM: MMU: Use slab caches for shadow pages and their headers

Use slab caches instead of a simple custom list.
Signed-off-by: Avi Kivity

commit 6d9d80f421f77da043b8b6898e01327763adecd2
Author: Eddie Dong
Date: Tue May 29 15:07:21 2007 +0300

KVM: Use symbolic constants instead of magic numbers

Signed-off-by: Avi Kivity

commit 4eaa906699812e2e28c3237cfedd8c21cbd17c4b
Author: Markus Rechberger
Date: Sun May 27 10:46:52 2007 +0300

KVM: Fix includes

KVM compilation fails for some .configs. This fixes it.
Signed-off-by: Markus Rechberger
Signed-off-by: Avi Kivity

commit d67c455e06a1eaf8ab20b5c4e51f4ae8271b2637
Author: Avi Kivity
Date: Thu May 24 11:17:33 2007 +0300

KVM: x86 emulator: implement wbinvd

Vista seems to trigger it.
Signed-off-by: Avi Kivity

commit fc1193d546ec21c279a8e4e3e9eaf999275b2223
Author: Jan Engelhardt
Date: Wed May 23 14:22:11 2007 -0700

Use menuconfig objects II - KVM/Virt

Make a "menuconfig" out of the Kconfig objects "menu, ..., endmenu", so that the user can disable all the options in that menu at once instead of having to disable each option separately.
Signed-off-by: Jan Engelhardt
Signed-off-by: Andrew Morton
Signed-off-by: Avi Kivity

commit a6935dbdaa7278d5e4a4d7478f29462f2a5db7fe
Author: Avi Kivity
Date: Mon May 21 09:15:47 2007 +0300

KVM: VMX: Remove warnings on i386

Signed-off-by: Avi Kivity

commit 1ab29f3fb765b08e65de563d9053d4d05cc95f52
Author: Eddie Dong
Date: Mon May 21 07:28:09 2007 +0300

KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit

MSR_EFER.LME/LMA bits are automatically saved/restored by VMX hardware; KVM only needs to save the NX/SCE bits at the time of a heavyweight VM exit. But clearing the NX bit in the host environment may cause a system hang if the host page table is using EXB bits, thus we leave the NX bits as they are.
If host NX=1 and guest NX=0, we can do a guest page table EXB bits check before inserting a shadow pte (though no guest is expecting to see this kind of gp fault). If host NX=0, we present no Execute-Disable feature to the guest, thus there is no host NX=0, guest NX=1 combination.
This patch reduces raw vmexit time by ~27%.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit 64ce9a0cf0960f9a029e54d1bffc06123d3b5893
Author: Eddie Dong
Date: Sun May 20 16:28:59 2007 +0300

KVM: VMX: Fix a typo which mixes X86_64 and CONFIG_X86_64

This prevents compilation on 64-bit.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit cc1d717e078464a049cf8364417ec44267cd6143
Author: Eddie Dong
Date: Sun May 20 10:50:08 2007 +0300

KVM: VMX: Cleanup redundant code in MSR set

Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit 8bf50c5c6b2af81355412ec1696a7e2c8ad940f2
Author: Daniel Hecken
Date: Sun May 20 10:32:14 2007 +0300

KVM: VMX: Compile-fix for 32-bit hosts

Signed-off-by: Avi Kivity

commit f552bf62c86b383dd74030c5830c8043bf41e0bd
Author: Eddie Dong
Date: Thu May 17 18:55:15 2007 +0300

KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit

In a lightweight exit (where we exit and reenter the guest without scheduling or exiting to userspace in between), we don't need various msrs on the host, and avoiding shuffling them around reduces raw exit time by 8%.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit 8edb11391b763357734cc5fd293d788d8591e6d7
Author: Nitin A Kamble
Date: Thu May 17 15:50:34 2007 +0300

KVM: VMX: Handle #SS faults from real mode

Instructions with the address size override prefix opcode 0x67 cause the #SS fault with 0 error code in VM86 mode. Forward them to the emulator.
Signed-Off-By: Nitin A Kamble
Signed-off-by: Avi Kivity

commit bdf3f418471ba3c65aa78a1943da179d8320fdf8
Author: Avi Kivity
Date: Mon May 14 20:41:13 2007 +0300

KVM: VMX: Use local labels in inline assembly

This makes oprofile dumps and disassembly easier to read.
Signed-off-by: Avi Kivity

commit ca76d209b88c344fc6a8eac17057c0088a3d6940
Author: Avi Kivity
Date: Sun May 13 20:18:14 2007 +0300

KVM: Remove merge artifact

Signed-off-by: Avi Kivity

commit 52916bb7c142b5cf8a81da225bf51c2ea60c5b49
Author: Avi Kivity
Date: Tue May 8 11:34:07 2007 +0300

KVM: Fix vmx I/O bitmap initialization on highmem systems

kunmap() expects a struct page, not a virtual address. Fixes an oops loading kvm-intel.ko on i386 with CONFIG_HIGHMEM. Thanks to Michael Ivanov for reporting.
Signed-off-by: Avi Kivity

commit facc2faaf471ca539ddd96fdbdf2e147421468a6
Author: Avi Kivity
Date: Mon May 7 10:55:37 2007 +0300

KVM: Avoid corrupting tr in real mode

The real mode tr needs to be set to a specific tss so that I/O instructions can function. Divert the new tr values to the real mode save area from where they will be restored on transition to protected mode. This fixes some crashes on reboot when the bios accesses an I/O instruction.
Signed-off-by: Avi Kivity

commit 05eb943c9b547ecc4de850f04ed4c09356440528
Author: Avi Kivity
Date: Sun May 6 16:10:01 2007 +0300

KVM: VMX: Only reload guest msrs if they are already loaded

If we set an msr via an ioctl() instead of by handling a guest exit, we have the host state loaded, so reloading the msrs would clobber host state instead of guest state. This fixes a host oops (and loss of a cpu) on a guest reboot.
Signed-off-by: Avi Kivity

commit 242b0f9ae76651226fb42d9ec3ecb1a9d8d7b263
Author: Avi Kivity
Date: Sun May 6 15:50:58 2007 +0300

KVM: MMU: Store shadow page tables as kernel virtual addresses, not physical

Simplifies things a bit.
Signed-off-by: Avi Kivity

commit 03aeb06a4440265777ae4ed62e8431955cbea865
Author: Avi Kivity
Date: Sun May 6 15:36:30 2007 +0300

KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit

Signed-off-by: Avi Kivity

commit f66b4a983d460d68ef5cc392285190065b0617e5
Author: Matthew Gregan
Date: Sun May 6 10:59:46 2007 +0300

KVM: Implement IA32_EBL_CR_POWERON msr

Attempting to boot the default 'bsd' kernel of OpenBSD 4.1 i386 in a guest fails early in the kernel init inside p3_get_bus_clock while trying to read the IA32_EBL_CR_POWERON MSR. KVM logs an 'unhandled MSR' message and the guest kernel faults. This patch is sufficient to allow OpenBSD to boot, after which it seems to run fine. I'm not sure if this is the correct solution for dealing with this particular MSR, but it works for me.
Signed-off-by: Matthew Gregan
Signed-off-by: Avi Kivity

commit 7a57011a5e7c4082fdfd204115a8212298ef723f
Author: Avi Kivity
Date: Wed May 2 23:06:22 2007 +0300

KVM: Set cr0.mp for guests

This allows fwait instructions to be trapped when the guest fpu is not loaded.
Signed-off-by: Avi Kivity

commit 90fb720a59dafb11d591a8e53a4a65bfa6fcfea9
Author: Avi Kivity
Date: Wed May 2 22:57:13 2007 +0300

KVM: Ensure host cr0.ts is saved

Otherwise, host fpu state may be corrupted after an exit.
Signed-off-by: Avi Kivity

commit 7616f59b208b088afd85d40aa06ca6d4d4a6ca1a
Author: Avi Kivity
Date: Wed May 2 20:40:00 2007 +0300

KVM: Consolidate guest fpu activation and deactivation

Easier to keep track of where the fpu is this way.
Signed-off-by: Avi Kivity

commit 7ca14868fd7f3c0dc21450e61cca5b77a47daf0d
Author: Avi Kivity
Date: Wed May 2 17:57:40 2007 +0300

KVM: Rationalize exception bitmap usage

Everyone owns a piece of the exception bitmap, but they happily write to the entire thing like there's no tomorrow. Centralize handling in update_exception_bitmap() and have everyone call that.
Signed-off-by: Avi Kivity

commit de32f820227fbe3e159ec42ce8fd55057155edca
Author: Avi Kivity
Date: Wed May 2 17:33:43 2007 +0300

KVM: Move some more msr mangling into vmx_save_host_state()

Signed-off-by: Avi Kivity

commit fa580ecc53536620546659740ae2dfcea763d17c
Author: Avi Kivity
Date: Wed May 2 17:30:48 2007 +0300

KVM: Prevent guest fpu state from leaking into the host

The lazy fpu changes did not take into account that some vmexit handlers can sleep. Move loading the guest state into the inner loop so that it can be reloaded if necessary, and move loading the host state into vmx_vcpu_put() so it can be performed whenever we relinquish the vcpu.
Signed-off-by: Avi Kivity

commit bc8dcc2107de0ba8f25fc910c4559ebe3df33045
Author: Avi Kivity
Date: Wed May 2 16:54:03 2007 +0300

KVM: Fix potential guest state leak into host

The lightweight vmexit path avoids saving and reloading certain host state. However in certain cases lightweight vmexit handling can schedule(), which requires reloading the host state. So we store the host state in the vcpu structure, and reload it if we relinquish the vcpu.
Signed-off-by: Avi Kivity

commit 11bdaf6e26c0cbabd9b6c8f2e9de60190815d348
Author: Avi Kivity
Date: Tue May 1 18:24:38 2007 +0300

KVM: Increase mmu shadow cache to 1024 pages

This improves kbuild times by about 10%, bringing it within a respectable 25% of native.
Signed-off-by: Avi Kivity

commit d6540cdffea466f1ee17a52ef530d40577b476b2
Author: Avi Kivity
Date: Tue May 1 16:53:31 2007 +0300

KVM: Update shadow pte on write to guest pte

A typical demand page/copy on write pattern is:
- page fault on vaddr
- kvm propagates fault to guest
- guest handles fault, updates pte
- kvm traps write, clears shadow pte, resumes guest
- guest returns to userspace, re-faults on same vaddr
- kvm installs shadow pte, resumes guest
- guest continues
So, three vmexits for a single guest page fault. But if instead of clearing the page table entry, we update it to correspond to the value that the guest has just written, we eliminate the third vmexit. This patch does exactly that, reducing kbuild time by about 10%.
Signed-off-by: Avi Kivity

commit 807762acc40f7cc16aefcfaef8a596a4af988b20
Author: Avi Kivity
Date: Tue May 1 16:44:05 2007 +0300

KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes

When a guest writes to a page that has an mmu shadow, we have to clear the shadow pte corresponding to the memory location touched by the guest. Now, in nonpae mode, a single guest page may have two or four shadow pages (because a nonpae page maps 4MB or 4GB, whereas the pae shadow maps 2MB or 1GB), so when we look up the page we find up to three additional aliases for the page. Since we _clear_ the shadow pte, it doesn't matter except for a slight performance penalty, but if we want to _update_ the shadow pte instead of clearing it, it is vital that we don't modify the aliases. Fortunately, exactly which page is needed (the "quadrant") is easily computed, and is accessible in the shadow page header. All we need is to ignore shadow pages from the wrong quadrants.
Signed-off-by: Avi Kivity

commit 4a5c1655c9f6df8c668428d3c5d2ad4f67dce08d
Author: Avi Kivity
Date: Tue May 1 14:16:52 2007 +0300

KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write()

Instead of calling two functions and repeating expensive checks, call one function and provide it with before/after information.
Signed-off-by: Avi Kivity

commit ff31cf26ff8e17c2f7164c39dc03fe309ed36506
Author: Avi Kivity
Date: Tue May 1 11:32:28 2007 +0300

KVM: Be more careful restoring fs on lightweight vmexit

i386 wants fs for accessing the pda even on a lightweight exit, so ensure we can always restore it. This fixes a regression on i386 introduced by the lightweight vmexit patch.
Signed-off-by: Avi Kivity

commit e6d2f6292194c931b2fa11373a66d640245e1b14
Author: Avi Kivity
Date: Mon Apr 30 17:05:38 2007 +0300

KVM: Reduce misfirings of the fork detector

The kvm mmu tries to detect forks by looking for repeated writes to a page table. If it sees a fork, it unshadows the page table so the page table copying can proceed at native speed instead of being emulated. However, the detector also triggered on simple demand paging access patterns: a linear walk of memory would of course cause repeated writes to the same pagetable page, causing it to unshadow prematurely. Fix by resetting the fork detector if we detect a demand fault.
Signed-off-by: Avi Kivity

commit f908e27039ab637013ad17c64e4ef77c4c0a24b8
Author: Avi Kivity
Date: Mon Apr 30 16:15:58 2007 +0300

KVM: Unindent some code

Signed-off-by: Avi Kivity

commit 5cf48c367dec74ba8553c53ed332cd075fa38b88
Author: Avi Kivity
Date: Mon Apr 30 16:07:54 2007 +0300

KVM: Avoid saving and restoring some host CPU state on lightweight vmexit

Many msrs and the like will only be used by the host if we schedule() or return to userspace.
Therefore, we avoid saving them if we handle the exit within the kernel, and if a reschedule is not requested. Based on a patch from Eddie Dong with a couple of fixes by me.
Signed-off-by: Yaozu (Eddie) Dong
Signed-off-by: Avi Kivity

commit 2d8d6944a2249f642420bbc70b199182c70ebc9a
Author: Avi Kivity
Date: Mon Apr 30 14:47:02 2007 +0300

KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages

This allows us to remove write protection earlier than otherwise. Should some mad OS choose to use byte writes to update pagetables, it will suffer a performance hit, but still work correctly.
Signed-off-by: Avi Kivity

commit 7d0e7eed6200c54462e884abc8dd6681df2f5e7d
Author: Avi Kivity
Date: Mon Apr 30 12:42:43 2007 +0300

KVM: Fix RMW mmio handling

Commit 9bf671a47ed6af3164524a31dbef9360f1b66fb5 optimized the mmio read path by returning to the emulator directly after an mmio read request. But we may also need to return back to userspace in case the instruction was a read-modify-write instruction, which means we need to issue a write after completion of the read instead of returning to the guest.
Signed-off-by: Avi Kivity

commit f05f41f9bb1cf72a13caf61c2931dbbf4bff51eb
Author: Anthony Liguori
Date: Mon Apr 30 09:48:11 2007 +0300

KVM: SVM: Allow direct guest access to PC debug port

The PC debug port is used for IO delay and does not require emulation.
Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity

commit 99c7b51d71c0b0062b752c5f0a4b3498d3d165db
Author: He, Qing
Date: Mon Apr 30 09:45:24 2007 +0300

KVM: VMX: Enable io bitmaps to avoid IO port 0x80 VMEXITs

This patch enables IO bitmaps control on vmx and unmasks the 0x80 port to avoid VMEXITs caused by accessing port 0x80. 0x80 is used for delays (see include/asm/io.h), and handling VMEXITs on its access is unnecessary but slows things down. This patch improves the kernel build test by around 3%~5%. Because every VM uses the same io bitmap, it is shared between all VMs rather than being a per-VM data structure.
Signed-off-by: Qing He
Signed-off-by: Avi Kivity

commit c06d7c14c006c5e2dcd2a7d84603b51e9e60d7a7
Author: Avi Kivity
Date: Sun Apr 29 16:25:49 2007 +0300

KVM: Remove unused 'instruction_length'

As we no longer emulate in userspace, this is meaningless. We don't compute it on SVM anyway.
Signed-off-by: Avi Kivity

commit 20426d1309353b3e2771f9c7f534e01ce7a019f2
Author: Avi Kivity
Date: Sun Apr 29 15:02:17 2007 +0300

KVM: Don't require explicit indication of completion of mmio or pio

It is illegal to return from a pio or mmio request without completing it, as mmio or pio is an atomic operation. Therefore, we can simplify the userspace interface by avoiding the completion indication.
Signed-off-by: Avi Kivity

commit 9bf671a47ed6af3164524a31dbef9360f1b66fb5
Author: Avi Kivity
Date: Wed Mar 14 15:54:54 2007 +0200

KVM: Remove extraneous guest entry on mmio read

When emulating an mmio read, we actually emulate twice: once to determine the physical address of the mmio, and, after we've exited to userspace to get the mmio value, we emulate again to place the value in the result register and update any flags. But we don't really need to enter the guest again for that, only to take an immediate vmexit. So, if we detect that we're doing an mmio read, emulate a single instruction before entering the guest again.
Signed-off-by: Avi Kivity

commit 8dfdb0d81fb9e858c14e03fd5e007b20167cd065
Author: Avi Kivity
Date: Sun Apr 29 13:01:34 2007 +0300

KVM: Remove trailing whitespace

Signed-off-by: Avi Kivity

commit 1628bcc25417eae4c83ca87e0899c7e02961d975
Author: Anthony Liguori
Date: Sun Apr 29 11:56:06 2007 +0300

KVM: SVM: Only save/restore MSRs when needed

We only have to save/restore MSR_GS_BASE on every VMEXIT. The rest can be saved/restored when we leave the VCPU. Since we don't emulate the DEBUGCTL MSRs and the guest cannot write to them, we don't have to worry about saving/restoring them at all. This shaves a whopping 40% off raw vmexit costs on AMD.
Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity

commit 68ba823bbe6d546e3ceb63d006c62a84e92837db
Author: Adrian Bunk
Date: Sat Apr 28 21:20:48 2007 +0200

KVM: fix an if() condition

It might have worked in this case since PT_PRESENT_MASK is 1, but let's express this correctly.
Signed-off-by: Adrian Bunk
Signed-off-by: Avi Kivity

commit fe7dc1f2c0c3d0c21abf9dfa4387f0b748080688
Author: Anthony Liguori
Date: Fri Apr 27 09:29:49 2007 +0300

KVM: VMX: Add lazy FPU support for VT

Only save/restore the FPU host state when the guest is actually using the FPU.
Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity

commit 4a579478e5259df8828a8b9e5b3ddac2a946ce88
Author: Anthony Liguori
Date: Fri Apr 27 09:29:21 2007 +0300

KVM: VMX: Properly shadow the CR0 register in the vcpu struct

Set all of the host mask bits for CR0 so that we can maintain a proper shadow of CR0. This exposes CR0.TS, paving the way for lazy fpu handling.
Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity

commit aad1187a6c0201701026cdb2f7f6eeb49b2af4a2
Author: Avi Kivity
Date: Wed Apr 25 16:57:46 2007 +0300

KVM: Move need_resched() check to common code

Pointed out by Anthony Liguori.
Signed-off-by: Avi Kivity

commit b08487bd204708241c9b71ebfc555e334a4e4711
Author: Eddie Dong
Date: Wed Apr 25 16:49:19 2007 +0300

KVM: VMX: Avoid unnecessary vcpu_load()/vcpu_put() cycles

By checking if a reschedule is needed, we avoid dropping the vcpu.
Signed-off-by: Avi Kivity

commit 25900fd20d141145348178ffe91948e47c83e2ab
Author: Avi Kivity
Date: Wed Apr 25 11:51:06 2007 +0300

KVM: Avoid unused function warning due to assertion removal

Signed-off-by: Avi Kivity

commit 2bd9b992631841b1be5883a5c27b9c58ae9bb96a
Author: Avi Kivity
Date: Wed Apr 25 11:48:45 2007 +0300

KVM: We want asserts on debug builds, not release

Noticed by Michael Riepe.
Signed-off-by: Avi Kivity

commit c3efc3ab86aa651106f6302592e25c7ab8285c35
Author: Avi Kivity
Date: Thu Apr 12 13:03:01 2007 +0300

KVM: Initialize cr0 to indicate an fpu is present

Solaris panics if it sees a cpu with no fpu, and it seems to rely on this bit. Closes sourceforge bug 1698920.
Signed-off-by: Avi Kivity

commit 28b183145d34a8ad1bc462df565165a88bcb5220
Author: Yaozu Dong
Date: Wed Apr 25 14:17:25 2007 +0800

KVM: MMU: Avoid heavy ASSERT at non debug mode.

Signed-off-by: Avi Kivity

commit 418987aef13b475140b76f9f780046d63eb16f86
Author: Avi Kivity
Date: Wed Apr 25 11:01:28 2007 +0300

KVM: Document MSR_K6_STAR's special place in the msr index array

Signed-off-by: Avi Kivity

commit 90ca9e3d54c8b0ac2023c624d1c7260bb8926beb
Author: Avi Kivity
Date: Wed Apr 25 10:59:52 2007 +0300

KVM: Don't complain about cpu erratum AA15

It slows down Windows x64 horribly.
Signed-off-by: Avi Kivity

commit 6f19cb49965e1316b285a443c9392031b1634f2e
Author: Avi Kivity
Date: Tue Apr 24 14:13:01 2007 +0300

KVM: Fix msr-avoidance regression on Core processors

Core processors don't have the STAR msr, so the attempt not to save it caused an underflow in the number of msrs. Fix by only avoiding the STAR msr if it is actually present.
Signed-off-by: Avi Kivity

commit ccf9e2f22e5caf6274b5e9aafd9814a32ef049d5
Author: Anthony Liguori
Date: Mon Apr 23 09:17:21 2007 -0500

KVM: Lazy FPU support for SVM

Avoid saving and restoring the guest fpu state on every exit. This shaves ~100 cycles off the guest/host switch.
Signed-off-by: Anthony Liguori
Signed-off-by: Avi Kivity

commit d558e0b49319cfc9aa92e9b7215580f265a2ead7
Author: Avi Kivity
Date: Sun Apr 22 15:28:19 2007 +0300

KVM: Allow passing 64-bit values to the emulated read/write API

This simplifies the API somewhat (by eliminating the special-case cmpxchg8b on i386).
Signed-off-by: Avi Kivity

commit 551284356a39f20de70cd5556e85ae92080aec8c
Author: Avi Kivity
Date: Fri Apr 20 13:41:09 2007 +0300

KVM: Silence compile warning on i386

Signed-off-by: Avi Kivity

commit 459377fe9ba4a307144ead3ad86993cdee9f8fe8
Author: Avi Kivity
Date: Thu Apr 19 17:27:43 2007 +0300

KVM: Per-vcpu statistics

Make the exit statistics per-vcpu instead of global. This gives a 3.5% boost when running one virtual machine per core on my two socket dual core (4 cores total) machine.
Signed-off-by: Avi Kivity

commit 5c828f83928f186320d74627089122ebc9ea98ce
Author: Avi Kivity
Date: Thu Apr 19 14:28:44 2007 +0300

KVM: VMX: Only save/restore MSR_K6_STAR if necessary

Intel hosts only support syscall/sysret in long mode (and only if efer.sce is enabled), so only reload the related MSR_K6_STAR if the guest will actually be able to use it. This reduces vmexit cost by about 500 cycles (6400 -> 5870) on my setup.
Signed-off-by: Avi Kivity

commit 37d6247b3636cbf47014694483d2d25c3806e8f2
Author: Avi Kivity
Date: Thu Apr 19 13:26:39 2007 +0300

KVM: Fold drivers/kvm/kvm_vmx.h into drivers/kvm/vmx.c

No meat in that file.
Signed-off-by: Avi Kivity

commit ba9c2fc1015a2b2f1f930274d465662ed8b860e6
Author: Avi Kivity
Date: Thu Apr 19 13:22:48 2007 +0300

KVM: VMX: Don't switch 64-bit msrs for 32-bit guests

Some msrs are only used by x86_64 instructions, and are therefore not needed when the guest is in legacy mode. By not bothering to switch them, we reduce vmexit latency by 2400 cycles (from about 8800) when running a 32-bit guest on a 64-bit host.
Signed-off-by: Avi Kivity

commit 8d6c8a0d891f8c37889f28f368c2621f85e50035
Author: Avi Kivity
Date: Wed Apr 18 11:18:18 2007 +0300

KVM: Fix off-by-one when writing to a nonpae guest pde

Nonpae guest pdes are shadowed by two pae ptes, so we double the offset twice: once to account for the pte size difference, and once because we need to shadow pdes for a single guest pde. But when writing to the upper guest pde we also need to truncate the lower bits, otherwise the multiply shifts these bits into the pde index and causes an access to the wrong shadow pde. If we're at the end of the page (accessing the very last guest pde) we can even overflow into the next host page and oops.
Signed-off-by: Avi Kivity

commit f0b9c908fa1451147a07f2f4e4a9409fb7b14160
Author: Avi Kivity
Date: Tue Apr 17 15:30:24 2007 +0300

KVM: VMX: Reduce unnecessary saving of host msrs

The automatically switched msrs are never changed on the host (with the exception of MSR_KERNEL_GS_BASE) and thus there is no need to save them on every vm entry.
This reduces vmexit latency by ~400 cycles on i386 and by ~900 cycles (10%) on x86_64. Signed-off-by: Avi Kivity commit 7368e6550cdf72b0ad1b68dbe923f85e37ef4d08 Author: Avi Kivity Date: Tue Apr 17 10:53:22 2007 +0300 KVM: Handle guest page faults when emulating mmio Usually, guest page faults are detected by the kvm page fault handler, which detects if they are shadow faults, mmio faults, pagetable faults, or normal guest page faults. However, in certain circumstances, we can detect a page fault much later. One of these events is the following combination: - A two memory operand instruction (e.g. movsb) is executed. - The first operand is in mmio space (which is the fault reported to kvm) - The second operand is in an unmapped address (e.g. a guest page fault) The Windows 2000 installer does such an access, and promptly hangs. Fix by adding the missing page fault injection on that path. Signed-off-by: Avi Kivity commit 894f5a5efc0c48482eb10ad48891054a659e5941 Author: Avi Kivity Date: Mon Apr 16 14:28:40 2007 +0300 KVM: SVM: Report hardware exit reason to userspace instead of dmesg Signed-off-by: Avi Kivity commit 94d806a6efd4401ce43358af6a9e8df5a63151ae Author: Avi Kivity Date: Mon Apr 16 13:36:10 2007 +0300 KVM: Fix pio completion Check cur_count instead of count to avoid false completions. Signed-off-by: Avi Kivity commit d3344ae6f6293913d6e4f230ebee0b370f2e3f98 Author: Avi Kivity Date: Mon Apr 16 11:53:17 2007 +0300 KVM: Retry sleeping allocation if atomic allocation fails This avoids -ENOMEM under memory pressure. Signed-off-by: Avi Kivity commit 327585c3b4c1d6b04bb752f70f350d98ca855080 Author: Avi Kivity Date: Sun Apr 15 16:31:09 2007 +0300 KVM: Use slab caches to allocate mmu data structures Better leak detection, statistics, memory use, speed -- goodness all around. Signed-off-by: Avi Kivity commit 3079541923d2cdf702490eff7081610b7320e37f Author: Avi Kivity Date: Sun Apr 15 15:48:11 2007 +0300 KVM: Fix string pio when count == 0 Surprisingly, VT traps when executing a string pio instruction with zero count. Perhaps more surprisingly, the Windows ne2000 driver issues such instructions. Since we aren't prepared to handle completions of these instructions, avoid the entire mess by continuing execution without escaping to userspace. This fixes the networking problems reported by Leslie Mann with recent versions of kvm. Signed-off-by: Avi Kivity commit 3ef1110c81993e01343e1b473f5d7d1a23e6a8a3 Author: Avi Kivity Date: Thu Apr 12 17:35:58 2007 +0300 KVM: Handle partial pae pdptr Some guests (Solaris) do not set up all four pdptrs, but leave some invalid. kvm incorrectly treated these as valid page directories, pinning the wrong pages and causing general confusion. Fix by checking the valid bit of a pae pdpte. This closes sourceforge bug 1698922. Signed-off-by: Avi Kivity commit 4e9d9d330d9c9e66c449be10950562e407366a73 Author: Avi Kivity Date: Wed Apr 11 19:04:39 2007 +0300 KVM: Fix memory leak on pio completion We get_page() the pages participating in pio before we return to userspace, yet we neglect to free them. One can leak all guest memory in a few seconds by doing a hdparm -d 0 /dev/hda; dd < /dev/hda > /dev/null on the guest. Signed-off-by: Avi Kivity commit b630b9c6819844e29cddcfeaee901f6ada5d571b Author: Eric Sesterhenn / Snakebyte Date: Mon Apr 9 16:15:05 2007 +0200 KVM: Fix overflow bug in overflow detection code The expression sp - 6 < sp where sp is a u16 is undefined in C since 'sp - 6' is promoted to int, and signed overflow is undefined in C. gcc 4.2 actually warns about it. 
Replace with a simpler test. Signed-off-by: Eric Sesterhenn Signed-off-by: Avi Kivity commit c338c271f150ab2ded369ef4c1882f85b28af709 Author: Avi Kivity Date: Mon Apr 2 13:05:50 2007 +0300 KVM: Use kernel-standard types Noted by Joerg Roedel. Signed-off-by: Avi Kivity commit 0ea6eecef44923d66409a49d71e4fa87fa0f5bed Author: Avi Kivity Date: Sun Apr 1 16:34:31 2007 +0300 KVM: Add fpu get/set operations These are really helpful when migrating a floating point app to another machine. Signed-off-by: Avi Kivity commit 05671a064c73b8cb8966ddd037ece2d6ae2cb75b Author: Avi Kivity Date: Fri Mar 30 16:54:30 2007 +0300 KVM: Add physical memory aliasing feature With this, we can specify that accesses to one physical memory range will be remapped to another. This is useful for the vga window at 0xa0000 which is used as a movable window into the (much larger) framebuffer. Signed-off-by: Avi Kivity commit 8e08039818b6a5b8c81b905f863adaa18d774171 Author: Avi Kivity Date: Fri Mar 30 14:02:32 2007 +0300 KVM: Simplify gfn_to_page() Mapping a guest page to a host page is a common operation. Currently, one has first to find the memory slot where the page belongs (gfn_to_memslot), then locate the page itself (gfn_to_page()). This is clumsy, and also won't work well with memory aliases. So simplify gfn_to_page() not to require memory slot translation first, and instead do it internally. Signed-off-by: Avi Kivity commit 66a9932c55ff7240955d57b7d1e62178a9e80868 Author: Dor Laor Date: Fri Mar 30 13:06:33 2007 +0300 Add mmu cache clear function Functions that play around with the physical memory map need a way to clear mappings to possibly nonexistent or invalid memory. Both the mmu cache and the processor tlb are cleared. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 6095d7b8291fc3e05f3b8790a9bc86b54af281a2 Author: Joerg Roedel Date: Fri Mar 30 17:02:14 2007 +0300 KVM: SVM: enable LBRV virtualization if available This patch enables the virtualization of the last branch record MSRs on SVM if this feature is available in hardware. It also introduces a small and simple check feature for specific SVM extensions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 8f1469e8477bea483d5a6348a30a534449048c8d Author: Avi Kivity Date: Wed Mar 28 20:04:16 2007 +0200 KVM: x86 emulator: fix bit string operations operand size On x86, bit operations operate on a string of bits that can reside in multiple words. For example, 'btsl %eax, (blah)' will touch the word at blah+4 if %eax is between 32 and 63. The x86 emulator compensates for that by advancing the operand address by (bit offset / BITS_PER_LONG) and truncating the bit offset to the range (0..BITS_PER_LONG-1). This has a side effect of forcing the operand size to 8 bytes on 64-bit hosts. Now, a 32-bit guest goes and fork()s a process. It write protects a stack page at 0xbffff000 using the 'btr' instruction, at offset 0xffc in the page table, with bit offset 1 (for the write permission bit). The emulator now forces the operand size to 8 bytes as previously described, and an innocent page table update turns into a cross-page-boundary write, which is assumed by the mmu code not to be a page table, so it doesn't actually clear the corresponding shadow page table entry. The guest and host permissions are out of sync and guest memory is corrupted soon afterwards, leading to guest failure. Fix by not using BITS_PER_LONG as the word size; instead use the actual operand size, so we get a 32-bit write in that case. 
Note we still have to teach the mmu to handle cross-page-boundary writes to guest page table; but for now this allows Damn Small Linux 0.4 (2.4.20) to boot. Signed-off-by: Avi Kivity commit e3a065c4e99bb8282d72a2c3c75234d7d7408be6 Author: Avi Kivity Date: Tue Mar 27 17:50:20 2007 +0200 KVM: Remove debug message No longer interesting. Signed-off-by: Avi Kivity commit 19cd40d605bb99fc9058973a69ef208c8b5b1e42 Author: Avi Kivity Date: Tue Mar 27 16:12:41 2007 +0200 Revert "added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot" This reverts commit ade11a015f83d270d1201c440199146f852fe5e4. As the balloon path will be through qemu, it will have direct knowledge of released gfns, so this API is not directly needed. If it becomes useful in the future, it will be un-reverted. Signed-off-by: Avi Kivity commit 932bf20c0c2075f958bb86b481d8f359197b4d6a Author: Avi Kivity Date: Mon Mar 26 19:31:52 2007 +0200 KVM: Use list_move() Use list_move() where possible. Noticed by Dor Laor. Signed-off-by: Avi Kivity commit 31e82571e8a77d5feb1093627ef0b31f28649590 Author: Michal Piotrowski Date: Sun Mar 25 17:59:32 2007 +0200 KVM: Remove unused function Remove unused function CC drivers/kvm/svm.o drivers/kvm/svm.c:207: warning: ‘inject_db’ defined but not used Signed-off-by: Michal Piotrowski Signed-off-by: Avi Kivity commit 9207113c121519986a114ee5c498184e618ffd68 Author: Avi Kivity Date: Sun Mar 25 12:07:27 2007 +0200 KVM: SVM: Ensure timestamp counter monotonicity When a vcpu is migrated from one cpu to another, its timestamp counter may lose its monotonic property if the host has unsynced timestamp counters. This can confuse the guest, sometimes to the point of refusing to boot. As the rdtsc instruction is rather fast on AMD processors (7-10 cycles), we can simply record the last host tsc when we drop the cpu, and adjust the vcpu tsc offset when we detect that we've migrated to a different cpu. Signed-off-by: Avi Kivity commit b40faf227eb371a52aa21d08f8e9c33fc06602b4 Author: Avi Kivity Date: Fri Mar 23 09:55:25 2007 +0200 KVM: MMU: Fix hugepage pdes mapping same physical address with different access The kvm mmu keeps a shadow page for hugepage pdes; if several such pdes map the same physical address, they share the same shadow page. This is a fairly common case (kernel mappings on i386 nonpae Linux, for example). However, if the two pdes map the same memory but with different permissions, kvm will happily use the cached shadow page. If the access through the more permissive pde will occur after the access to the strict pde, an endless pagefault loop will be generated and the guest will make no progress. Fix by making the access permissions part of the cache lookup key. The fix allows Xen pae to boot on kvm and run guest domains. Thanks to Jeremy Fitzhardinge for reporting the bug and testing the fix. Signed-off-by: Avi Kivity commit 061bba1190514205594d2046f5dc31a01a135163 Author: Avi Kivity Date: Thu Mar 22 15:10:32 2007 +0200 Revert "KVM: Remove extraneous guest entry on mmio read" This reverts commit b0092d187cfa19dfcada3b85d728af5ae27989dc. While the optimization is sound, it regresses booting the Fedora Core 6 32 bit kernel. Signed-off-by: Avi Kivity commit 4cec1674d1436157c7dcc2b5b6f625b08b2b96e8 Author: Joerg Roedel Date: Wed Mar 21 19:47:00 2007 +0100 KVM: SVM: forbid guest to execute monitor/mwait This patch forbids the guest to execute monitor/mwait instructions on SVM. 
This is necessary because the guest can execute these instructions if they are available even if the kvm cpuid doesn't report its existence. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 7921ad9e303f3f03dd81b552e3b0cd87ef355219 Author: Sergey Kiselev Date: Thu Mar 22 14:06:18 2007 +0200 KVM: Handle writes to MCG_STATUS msr Some older (~2.6.7) kernels write the MCG_STATUS register during kernel boot (mce_clear_all() function, called from mce_init()). It's not currently handled by kvm and will cause it to inject a GPF. The following patch adds a "nop" handler for this. Signed-off-by: Sergey Kiselev Signed-off-by: Avi Kivity commit 36809e1326c13887d324025d4592958ead8758d5 Author: Avi Kivity Date: Wed Mar 21 18:14:42 2007 +0200 KVM: Remove unused and write-only variables Trivial cleanup. Signed-off-by: Avi Kivity commit 262e17b818054dad314a062a439681d79a336d48 Author: Avi Kivity Date: Wed Mar 21 18:11:36 2007 +0200 KVM: Don't allow the guest to turn off the cpu cache The cpu cache is a host resource; the guest should not be able to turn it off (even for itself). Signed-off-by: Avi Kivity commit 8c37a70d93ba3e4286ad7524f7915a32ed39cac9 Author: Avi Kivity Date: Wed Mar 21 17:58:32 2007 +0200 KVM: Hack real-mode segments on vmx from KVM_SET_SREGS As usual, we need to mangle segment registers when emulating real mode as vm86 has specific constraints. We special case the reset segment base, and set the "access rights" (or descriptor flags) to vm86 compatible values. This fixes reboot on vmx. Signed-off-by: Avi Kivity commit 0bf8d346418255335dc9062d96b9f8814b471690 Author: Avi Kivity Date: Wed Mar 21 13:44:58 2007 +0200 KVM: Modify guest segments after potentially switching modes The SET_SREGS ioctl modifies both cr0.pe (real mode/protected mode) and guest segment registers. Since segment handling is modified by the mode on Intel processors, update the segment registers after the mode switch has taken place. Signed-off-by: Avi Kivity commit f97af70b3aa8a92ddeabb7d42477e7d13dd0a192 Author: Avi Kivity Date: Tue Mar 20 18:44:51 2007 +0200 KVM: Remove set_cr0_no_modeswitch() arch op set_cr0_no_modeswitch() was a hack to avoid corrupting segment registers. As we now cache the protected mode values on entry to real mode, this isn't an issue anymore, and it interferes with reboot (which usually _is_ a modeswitch). Signed-off-by: Avi Kivity commit e314dde30e3851e8effc017c6fffced11d90183a Author: Avi Kivity Date: Tue Mar 20 18:40:40 2007 +0200 KVM: Workaround vmx inability to virtualize the reset state The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000, which aren't compatible with vm86 mode, which is used for real mode virtualization. When we create a vcpu, we set cs.base to 0xf0000, but if we get there by way of a reset, the values are inconsistent and vmx refuses to enter guest mode. Workaround by detecting the state and munging it appropriately. Signed-off-by: Avi Kivity commit 88aea7ddfae755633b0a80ccfa56244b3c79c7b0 Author: Avi Kivity Date: Tue Mar 20 14:34:28 2007 +0200 KVM: MMU: Remove global pte tracking The initial, noncaching, version of the kvm mmu flushed all nonglobal shadow page table translations (much like a native tlb flush). The new implementation flushes translations only when they change, rendering global pte tracking superfluous. This removes the unused tracking mechanism and storage space. 
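An illustrative sketch of the tsc-offset bookkeeping described in the "KVM: SVM: Ensure timestamp counter monotonicity" entry above: remember the host tsc when the vcpu is descheduled, and compensate when it is next loaded on a different cpu. The struct and helper names below are invented for illustration and are not the ones used in svm.c.

	/* Sketch only -- not the svm.c implementation. */
	typedef unsigned long long u64;

	struct vcpu_tsc {
		u64 tsc_offset;	/* added to the host tsc to form the guest tsc */
		u64 host_tsc;	/* host tsc sampled when the vcpu was put */
		int cpu;	/* cpu the vcpu last ran on */
	};

	static inline u64 rdtsc(void)
	{
		unsigned int lo, hi;
		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
		return ((u64)hi << 32) | lo;
	}

	static void vcpu_tsc_put(struct vcpu_tsc *t)
	{
		t->host_tsc = rdtsc();	/* where the old cpu's counter stood */
	}

	static void vcpu_tsc_load(struct vcpu_tsc *t, int cpu)
	{
		if (cpu != t->cpu) {
			/* guest tsc = host tsc + offset; keep it from jumping backwards */
			t->tsc_offset += t->host_tsc - rdtsc();
			t->cpu = cpu;
		}
	}

With unsynced host counters this adjustment keeps the guest tsc monotonic rather than accurate, which matches what the commit above claims.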
Signed-off-by: Avi Kivity commit 66e5d5c81b5b89e39aa86e3bf9864d228f468b0d Author: Avi Kivity Date: Tue Mar 20 14:29:06 2007 +0200 KVM: MMU: Remove unnecessary check for pdptr access We already special case the pdptr access, so no need to check it again. Signed-off-by: Avi Kivity commit c01571ed56754dfea458cc37d553c360082411a1 Author: Avi Kivity Date: Tue Mar 20 12:46:50 2007 +0200 KVM: Avoid guest virtual addresses in string pio userspace interface The current string pio interface communicates using guest virtual addresses, relying on userspace to translate addresses and to check permissions. This interface cannot fully support guest smp, as the check needs to take into account two pages at once in case an unaligned string transfer straddles a page boundary. Change the interface not to communicate guest addresses at all; instead use a buffer page (mmaped by userspace) and do transfers there. The kernel manages the virtual to physical translation and can perform the checks atomically by taking the appropriate locks. Signed-off-by: Avi Kivity commit 74c24de6e7848a45d6109d987d4fd2ccd83e432e Author: Avi Kivity Date: Wed Mar 7 13:11:17 2007 +0200 KVM: Future-proof argument-less ioctls Some ioctls ignore their arguments. By requiring them to be zero now, we allow a nonzero value to have some special meaning in the future. Signed-off-by: Avi Kivity commit 29e686a1dc9631b7898d087a0ab1c4716672e209 Author: Avi Kivity Date: Wed Mar 7 13:05:38 2007 +0200 KVM: Allow kernel to select size of mmap() buffer This allows us to store offsets in the kernel/user kvm_run area, and be sure that userspace has them mapped. As offsets can be outside the kvm_run struct, userspace has no way of knowing how much to mmap. Signed-off-by: Avi Kivity commit cce3a1062817218c67163732339e2ea25e9f023b Author: Avi Kivity Date: Mon Mar 5 19:46:05 2007 +0200 KVM: Add guest mode signal mask Allow a special signal mask to be used while executing in guest mode. This allows signals to be used to interrupt a vcpu without requiring signal delivery to a userspace handler, which is quite expensive. Userspace still receives -EINTR and can get the signal via sigwait(). Signed-off-by: Avi Kivity commit cd3aaa2392baec9674792d71d304ec41e540b517 Author: Avi Kivity Date: Mon Mar 5 17:45:40 2007 +0200 KVM: Initialize the apic_base msr on svm too Older userspace didn't care, but newer userspace (with the cpuid changes) does. Signed-off-by: Avi Kivity commit c303c0efc5b2ff8c0f77c9079fa66f62801da93d Author: Avi Kivity Date: Sun Mar 4 14:24:03 2007 +0200 KVM: Add a special exit reason when exiting due to an interrupt This is redundant, as we also return -EINTR from the ioctl, but it allows us to examine the exit_reason field on resume without seeing old data. Signed-off-by: Avi Kivity commit 62919332e00e3226dd1f728ff83107d06a6d9a81 Author: Avi Kivity Date: Sun Mar 4 14:17:08 2007 +0200 KVM: Fold kvm_run::exit_type into kvm_run::exit_reason Currently, userspace is told about the nature of the last exit from the guest using two fields, exit_type and exit_reason, where exit_type has just two enumerations (and no need for more). So fold exit_type into exit_reason, reducing the complexity of determining what really happened. Signed-off-by: Avi Kivity commit 9e16898f4f5d6cdc35030bb272631611b71548fe Author: Avi Kivity Date: Sun Mar 4 13:59:30 2007 +0200 KVM: Allow userspace to process hypercalls which have no kernel handler This is useful for paravirtualized graphics devices, for example. 
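A rough userspace-side illustration of the run-loop behaviour described in the signal-mask and exit_reason entries above: the vcpu ioctl returns -EINTR when a signal interrupts guest mode, and the shared kvm_run structure records why the guest stopped. Constant names such as KVM_EXIT_INTR are assumptions about this API revision (they match later kvm.h headers), not text taken from these patches.

	/* Hedged sketch of a userspace run loop; error handling trimmed. */
	#include <linux/kvm.h>
	#include <sys/ioctl.h>
	#include <errno.h>

	static int run_vcpu(int vcpu_fd, struct kvm_run *run)
	{
		for (;;) {
			if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
				if (errno == EINTR) {
					/* a signal kicked us out of guest mode; run->exit_reason
					 * should also read as the interrupt reason (assumed to be
					 * KVM_EXIT_INTR), per "Add a special exit reason" above */
					continue;
				}
				return -1;
			}
			switch (run->exit_reason) {
			case KVM_EXIT_IO:
			case KVM_EXIT_MMIO:
				/* service the request via the mmap()ed kvm_run page */
				break;
			default:
				return 0;
			}
		}
	}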
Signed-off-by: Avi Kivity commit 440fd9098bceb2ca0856d962ff62db9af4d1094a Author: Avi Kivity Date: Thu Mar 1 17:56:20 2007 +0200 KVM: Add method to check for backwards-compatible API extensions Signed-off-by: Avi Kivity commit 0b37dedb178bcb3b0a28f65e6ae835bf58184301 Author: Avi Kivity Date: Thu Mar 1 17:20:13 2007 +0200 KVM: Renumber ioctls The recent changes have left the ioctl numbers in complete disarray. Signed-off-by: Avi Kivity commit 95cab16b18e1c1a786a9fc5ea6fcd68b29ae3481 Author: Avi Kivity Date: Thu Mar 1 16:47:06 2007 +0200 KVM: Remove minor wart from KVM_CREATE_VCPU ioctl That ioctl does not transfer any data, so it should be an _IO rather than an _IOW. Signed-off-by: Avi Kivity commit ba5cb15b027b76ba7b4d247914eb6d20065c0767 Author: Avi Kivity Date: Thu Mar 1 16:20:40 2007 +0200 KVM: Remove the 'emulated' field from the userspace interface We no longer emulate single instructions in userspace. Instead, we service mmio or pio requests. Signed-off-by: Avi Kivity commit 706e8fe655be36aa686f1fbb398d3a4470d4939b Author: Avi Kivity Date: Wed Feb 28 20:46:53 2007 +0200 KVM: Handle cpuid in the kernel instead of punting to userspace KVM used to handle cpuid by letting userspace decide what values to return to the guest. We now handle cpuid completely in the kernel. We still let userspace decide which values the guest will see by having userspace set up the value table beforehand (this is necessary to allow management software to set the cpu features to the least common denominator, so that live migration can work). The motivation for the change is that kvm kernel code can be impacted by cpuid features, for example the x86 emulator. Signed-off-by: Avi Kivity commit aad2f6e0faf4b03e087bbe6751acdacd72e911b6 Author: Avi Kivity Date: Thu Feb 22 19:48:43 2007 +0200 KVM: Initialize PIO I/O count This allows userspace to ignore the io.rep field. Not a big deal, but friendly. Signed-off-by: Avi Kivity commit e668cf946ee8654c7f5afe3feeed686a3566c22a Author: Avi Kivity Date: Thu Feb 22 19:39:30 2007 +0200 KVM: Do not communicate to userspace through cpu registers during PIO Currently when passing a PIO emulation request to userspace, we rely on userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx (on string instructions). This (a) requires two extra ioctls for getting and setting the registers and (b) is unfriendly to non-x86 archs, when they get kvm ports. So fix by doing the register fixups in the kernel and passing to userspace only an abstract description of the PIO to be done. Signed-off-by: Avi Kivity commit 3de857cd1335bd2e02b60d3a50b7da93ccbabf1d Author: Avi Kivity Date: Thu Feb 22 12:58:31 2007 +0200 KVM: Use a shared page for kernel/user communication when running a vcpu Instead of passing a 'struct kvm_run' back and forth between the kernel and userspace, allocate a page and allow the user to mmap() it. This reduces needless copying and makes the interface expandable by providing lots of free space. Signed-off-by: Avi Kivity commit 128e159e11e999496ec44a549fcac91de3802389 Author: Avi Kivity Date: Mon Mar 19 13:18:10 2007 +0200 KVM: Prevent system selectors leaking into guest on real->protected mode transition on vmx Intel virtualization extensions do not support virtualizing real mode. So kvm uses virtualized vm86 mode to run real mode code. Unfortunately, this virtualized vm86 mode does not support the so called "big real" mode, where the segment selector and base do not agree with each other according to the real mode rules (base == selector << 4). 
To work around this, kvm checks whether a selector/base pair violates the virtualized vm86 rules, and if so, forces it into conformance. On a transition back to protected mode, if we see that the guest did not touch a forced segment, we restore it back to the original protected mode value. This pile of hacks breaks down if the gdt has changed in real mode, as it can cause a segment selector to point to a system descriptor instead of a normal data segment. In fact, this happens with the Windows bootloader and the qemu acpi bios, where a protected mode memcpy routine issues an innocent 'pop %es' and traps on an attempt to load a system descriptor. "Fix" by checking if the to-be-restored selector points at a system segment, and if so, coercing it into a normal data segment. The long term solution, of course, is to abandon vm86 mode and use emulation for big real mode. Signed-off-by: Avi Kivity commit ade11a015f83d270d1201c440199146f852fe5e4 Author: Uri Lublin Date: Wed Mar 14 19:21:06 2007 +0200 added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot To be used when there may be "holes" in the memory. Specifically to not break VM migration when a ballooning mechanism exists Signed-off-by: Uri Lublin commit b0092d187cfa19dfcada3b85d728af5ae27989dc Author: Avi Kivity Date: Wed Mar 14 15:54:54 2007 +0200 KVM: Remove extraneous guest entry on mmio read When emulating an mmio read, we actually emulate twice: once to determine the physical address of the mmio, and, after we've exited to userspace to get the mmio value, we emulate again to place the value in the result register and update any flags. But we don't really need to enter the guest again for that, only to take an immediate vmexit. So, if we detect that we're doing an mmio read, emulate a single instruction before entering the guest again. Signed-off-by: Avi Kivity commit 470db88b8b3491199e8d55b771d66e74b2fd53cd Author: Ingo Molnar Date: Sun Mar 11 13:52:33 2007 +0100 KVM: always reload segment selectors failed VM entry on VMX might still change %fs or %gs, thus make sure that KVM always reloads the segment selectors. This is crucial on both x86 and x86_64: x86 has __KERNEL_PDA in %fs on which things like 'current' depends and x86_64 has 0 there and needs MSR_GS_BASE to work. Signed-off-by: Ingo Molnar commit f7edc6a39584a3f95687a5320675fadb23bccbe5 Author: Ingo Molnar Date: Sat Mar 10 11:22:51 2007 +0100 KVM: trivial whitespace fixes trivial whitespace fixes. Signed-off-by: Ingo Molnar commit f3a33bfeaa5cade1a9ac1facb5cb904a483b1e5c Author: Avi Kivity Date: Fri Mar 9 13:04:31 2007 +0200 KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram PAGE_MASK is an unsigned long, so using it to mask physical addresses on i386 (which are 64-bit wide) leads to truncation. This can result in page->private of unrelated memory pages being modified, with disastrous results. Fix by not using PAGE_MASK for physical addresses; instead calculate the correct value directly from PAGE_SIZE. Also fix a similar BUG_ON(). Signed-off-by: Avi Kivity commit 6ee9853b015f8807f497ffad39b142ddc1403aa9 Author: Avi Kivity Date: Thu Mar 8 17:13:32 2007 +0200 KVM: MMU: Fix guest writes to nonpae pde KVM shadow page tables are always in pae mode, regardless of the guest setting. This means that a guest pde (mapping 4MB of memory) is mapped to two shadow pdes (mapping 2MB each). When the guest writes to a pte or pde, we intercept the write and emulate it. We also remove any shadowed mappings corresponding to the write. 
Since the mmu did not account for the doubling in the number of pdes, it removed the wrong entry, resulting in a mismatch between shadow page tables and guest page tables, followed shortly by guest memory corruption. This patch fixes the problem by detecting the special case of writing to a non-pae pde and adjusting the address and number of shadow pdes zapped accordingly. Signed-off-by: Avi Kivity commit 374c1509c7d04a4e351b1812c2f0b9dac3ea0c0a Author: Avi Kivity Date: Thu Mar 8 11:48:09 2007 +0200 KVM: Fix bogus sign extension in mmu mapping audit When auditing a 32-bit guest on a 64-bit host, sign extension of the page table directory pointer table index caused bogus addresses to be shown on audit errors. Fix by declaring the index unsigned. Signed-off-by: Avi Kivity commit fac539542cbf923a39238b10557c88f99fd45b59 Author: Avi Kivity Date: Wed Mar 7 09:29:48 2007 +0200 KVM: Export <linux/kvm.h> This allows users to actually build programs that use kvm without the entire source tree. Signed-off-by: Avi Kivity commit c14a46343cc9f04f15ebc67573031fe8bbe1555a Author: Avi Kivity Date: Tue Mar 6 12:05:53 2007 +0200 KVM: Fix guest sysenter on vmx The vmx code currently treats the guest's sysenter support msrs as 32-bit values, which breaks 32-bit compat mode userspace on 64-bit guests. Fix by using the native word width of the machine. Signed-off-by: Avi Kivity commit ea135e7671189ffb7e67843bf98740dac0c6ccfa Author: Avi Kivity Date: Sun Mar 4 13:27:36 2007 +0200 KVM: Use own minor number Use the minor number (232) allocated to kvm by lanana. Signed-off-by: Avi Kivity commit 21af17507f37658414191b1cf1337efbaf7dd530 Author: Dor Laor Date: Mon Feb 19 18:25:43 2007 +0200 KVM: Use the generic skip_emulated_instruction() in hypercall code Instead of twiddling the rip registers directly, use the skip_emulated_instruction() function to do that for us. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 57d78025d84fb607aa335d015a79b257517aa209 Author: Dor Laor Date: Mon Feb 19 16:44:49 2007 +0200 KVM: Fix guest register corruption on paravirt hypercall The hypercall code mixes up the ->cache_regs() and ->decache_regs() callbacks, resulting in guest register corruption. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 28e9803c9134683a884efe05abdb3f814c1ca7e7 Author: Avi Kivity Date: Thu Mar 1 19:21:03 2007 +0200 KVM: Unset kvm_arch_ops if arch module loading failed Otherwise, the core module thinks the arch module is loaded, and won't let you reload it after you've fixed the bug. Signed-off-by: Avi Kivity commit 426bc2fd1462706ec92d0e9efdb0cf3643f4eb67 Author: Avi Kivity Date: Thu Mar 1 11:28:13 2007 +0200 KVM: Move kvmfs magic number to <linux/magic.h> From: Andrew Morton Use the standard magic.h for kvmfs. Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity commit c1a8557e1da6e7d8bf8f77cb1b47c077f5c2a67d Author: Avi Kivity Date: Mon Feb 26 16:29:43 2007 +0200 KVM: Fix bogus failure in kvm.ko module initialization A bogus 'return r' can cause an otherwise successful module load to fail. This both denies users the use of kvm, and it also denies them the use of their machine, as it leaves a filesystem registered with its callbacks pointing into now-freed module memory. Fix by returning a zero like a good module. Thanks to Richard Lucassen (?) for reporting the problem and for providing access to a machine which exhibited it. 
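The PAGE_MASK truncation behind the "KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram" entry above is easy to reproduce in isolation. The standalone snippet below (illustrative constants, not kernel code) shows a 32-bit-wide mask silently dropping the high bits of a 64-bit PAE physical address, and the fix of deriving the mask from PAGE_SIZE at the full width:

	/* Standalone demonstration; PAGE_SIZE here is a local stand-in. */
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		uint32_t mask32 = ~(PAGE_SIZE - 1);	/* what a 32-bit unsigned long PAGE_MASK holds */
		uint64_t phys = 0x12345678123ULL;	/* a PAE physical address above 4GB */

		/* Broken: mask32 zero-extends to 0x00000000fffff000, truncating bits 32 and up */
		printf("truncated: %#llx\n", (unsigned long long)(phys & mask32));

		/* Fixed: compute the mask from PAGE_SIZE at the physical-address width */
		printf("correct:   %#llx\n", (unsigned long long)(phys & ~((uint64_t)PAGE_SIZE - 1)));
		return 0;
	}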
Signed-off-by: Avi Kivity commit 7703ff91ee2ed171f2175d030e7f063c4efab2f5 Author: Uri Lublin Date: Thu Feb 22 17:37:32 2007 +0200 KVM: Remove write access permissions when dirty-page-logging is enabled Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl. If the memory region already exists, we need to remove write accesses, so writes will be caught, and dirty pages will be logged. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit b77fd1f62576463434fc434cbdcd808847e169a1 Author: Uri Lublin Date: Thu Feb 22 17:15:33 2007 +0200 kvm: move do_remove_write_access() up To be called from kvm_vm_ioctl_set_memory_region() Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 62e287e7210d6ff142b3b05233fa1f5df686b794 Author: Uri Lublin Date: Thu Feb 22 16:43:09 2007 +0200 KVM: Fix dirty page log bitmap size/access calculation Since dirty_bitmap is an unsigned long array, the alignment and size need to take that into account. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 871574eb14e959c19d94fdee7c3e2b88ae06770f Author: Uri Lublin Date: Wed Feb 21 18:25:21 2007 +0200 KVM: Add missing calls to mark_page_dirty() A few places where we modify guest memory fail to call mark_page_dirty(), causing live migration to fail. This adds the missing calls. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 42017e8bf8eb7b6f65b95bca1368ee274fc5ef50 Author: Uri Lublin Date: Thu Feb 22 17:37:32 2007 +0200 kvm: dirty page logging: remove write access permissions when dirty-page-logging is enabled Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl. If the memory region already exists, there is a need to remove write accesses, so writes will be caught, and dirty pages will be logged. commit a9fd29cfcb643b97cd76c7d836be4d0ed80f69e0 Author: Uri Lublin Date: Thu Feb 22 17:15:33 2007 +0200 kvm: move do_remove_write_access() up To be called from kvm_vm_ioctl_set_memory_region() commit fba4ba9c513ad2cd328f5f16980aa7b90d40cec0 Author: Uri Lublin Date: Thu Feb 22 16:43:09 2007 +0200 kvm: dirty pages log: fix bitmap size/access calculation Since dirty_bitmap is an unsigned long array (pointer) commit ae160d732685ab33d5a3a495663aa2b54c4d4734 Author: Uri Lublin Date: Thu Feb 22 15:47:42 2007 +0200 .gitignore: ignore emacs backup files (*~) commit 8267c1cd9a8a038e91c94e0cabc571a3614dc3e5 Author: Avi Kivity Date: Wed Feb 21 19:47:40 2007 +0200 KVM: Bump API version Signed-off-by: Avi Kivity commit c65237e78c19b8173338a49933c611dece13c1c6 Author: Avi Kivity Date: Wed Feb 21 18:04:26 2007 +0200 KVM: Per-vcpu inodes Allocate a distinct inode for every vcpu in a VM. This has the following benefits: - the filp cachelines are no longer bounced when f_count is incremented on every ioctl() - the API and internal code are distinctly clearer; for example, on the KVM_GET_REGS ioctl, there is no need to copy the vcpu number from userspace and then copy the registers back; the vcpu identity is derived from the fd used to make the call Right now the performance benefits are completely theoretical since (a) we don't support more than one vcpu per VM and (b) virtualization hardware inefficiencies completely overwhelm any cacheline bouncing effects. But both of these will change, and we need to prepare the API today. Signed-off-by: Avi Kivity commit 11c1297fadc533d1f66252088b4f4775018bafbb Author: Avi Kivity Date: Tue Feb 20 18:41:05 2007 +0200 KVM: Move kvm_vm_ioctl_create_vcpu() around In preparation of some hacking. 
Signed-off-by: Avi Kivity commit f3ad84386727171d8308338a2c5dee1deac2e50d Author: Avi Kivity Date: Tue Feb 20 18:27:58 2007 +0200 KVM: Rename some kvm_dev_ioctl_*() functions to kvm_vm_ioctl_*() This reflects the changed scope, from device-wide to single vm (previously every device open created a virtual machine). Signed-off-by: Avi Kivity commit 733e3f74f1c51bbc2e7a99df8b51767504b58de2 Author: Avi Kivity Date: Wed Feb 21 19:28:04 2007 +0200 KVM: Create an inode per virtual machine This avoids having filp->f_op and the corresponding inode->i_fop different, which is a little unorthodox. The ioctl list is split into two: global kvm ioctls and per-vm ioctls. A new ioctl, KVM_CREATE_VM, is used to create VMs and return the VM fd. Signed-off-by: Avi Kivity commit 52a96114380f8ab615626e4cec57b7015895bd0f Author: Avi Kivity Date: Tue Feb 20 14:07:37 2007 +0200 KVM: Add internal filesystem for generating inodes The kvmfs inodes will represent virtual machines and vcpus, as necessary, reducing cacheline bouncing due to inodes and filps being shared. Signed-off-by: Avi Kivity commit b00bc8b10197715f5b842f1f9a60e67a3484b10f Author: Uri Lublin Date: Wed Feb 21 18:25:21 2007 +0200 kvm, dirty pages log: adding some calls to mark_page_dirty() commit 58a214eba321d92f833221c26777e2119e34a19d Author: Avi Kivity Date: Mon Feb 19 14:37:48 2007 +0200 KVM: More 0 -> NULL conversions Signed-off-by: Avi Kivity commit f73199bb57b4c8feb7d8f60c6f1a25107de18dab Author: Joerg Roedel Date: Mon Feb 19 14:37:47 2007 +0200 KVM: SVM: intercept SMI to handle it at host level This patch changes the SVM code to intercept SMIs and handle it outside the guest. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit fa2742c78f10fad8682e3af17df3e9fc2eece9e4 Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: svm: init cr0 with the wp bit set Signed-off-by: Avi Kivity commit 8da588a919dc0bef76e384d16fd13ea2189aa82d Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Wire up hypercall handlers to a central arch-independent location Signed-off-by: Avi Kivity commit 68f16784f188d280c75b39e2367ebc1adbc66d9d Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Add hypercall host support for svm Signed-off-by: Avi Kivity commit 7c8bd4d6fc0e2bfb35cd4c0e8ff39c4f8972d951 Author: Ingo Molnar Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Add host hypercall support for vmx Signed-off-by: Avi Kivity commit f846fa34a14ec37dc0194c6f47ea4374c140e6f1 Author: Ingo Molnar Date: Mon Feb 19 14:37:47 2007 +0200 KVM: add MSR based hypercall API This adds a special MSR based hypercall API to KVM. This is to be used by paravirtual kernels and virtual drivers. Signed-off-by: Ingo Molnar Signed-off-by: Avi Kivity commit 8aa04bb13cf90d68c26d6bea1e4c720f1f027be0 Author: Markus Rechberger Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Use page_private()/set_page_private() apis Besides using an established api, this allows using kvm in older kernels. Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity commit 4d5a7e81cc63d28e94373cdeb74dc44045edaa10 Author: Ahmed S. Darwish Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Use ARRAY_SIZE macro instead of manual calculation. Signed-off-by: Ahmed S. Darwish Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 0fe9875fb3f9946a6c1cef6f1b9a286edc8ee2b9 Author: Markus Rechberger Date: Mon Feb 19 14:37:46 2007 +0200 KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch From: Joerg Roedel The whole thing is rotten, but this allows vmx to boot with the guest reboot fix. 
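The per-VM and per-vcpu file descriptor split described in the "Create an inode per virtual machine" and "Per-vcpu inodes" entries above gives userspace a three-level fd hierarchy. The fragment below is a hedged usage sketch; the ioctl names are the generic KVM ones and are assumed to apply to this API revision, with error handling trimmed.

	/* Sketch of the fd hierarchy: /dev/kvm -> VM fd -> vcpu fd. */
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm_fd = open("/dev/kvm", O_RDWR);		/* system-wide ioctls */
		int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);	/* memory slots, vcpu creation */
		int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);	/* registers, KVM_RUN */

		/* The vcpu identity is carried by vcpu_fd itself, so ioctls no longer
		 * pass a vcpu number in and out -- the point of the per-vcpu inodes. */
		return (kvm_fd < 0 || vm_fd < 0 || vcpu_fd < 0) ? 1 : 0;
	}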
Signed-off-by: Markus Rechberger Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 7e6e2bbad7f5dbccb389ee6d79be661972b18b15 Author: Avi Kivity Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Cosmetics Signed-off-by: Avi Kivity commit cc66daca849ca8c2900ba8cc7640de664296d36a Author: Jeremy Katz Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Move virtualization deactivation from CPU_DEAD state to CPU_DOWN_PREPARE This gives it more chances of surviving suspend. Signed-off-by: Jeremy Katz Signed-off-by: Avi Kivity commit 2959cd13ecc1fbe1b2339937481844ff963f1e7f Author: Avi Kivity Date: Mon Feb 19 14:37:46 2007 +0200 KVM: mmu: add missing dirty page tracking cases We fail to mark a page dirty in three cases: - setting the accessed bit in a pte - setting the dirty bit in a pte - emulating a write into a pagetable This fix adds the missing cases. Signed-off-by: Avi Kivity drivers/kvm/Kconfig | 1 drivers/kvm/kvm.h | 130 ++---- drivers/kvm/kvm_main.c | 703 ++++++++++++++++---------------- drivers/kvm/kvm_svm.h | 3 drivers/kvm/mmu.c | 14 - drivers/kvm/paging_tmpl.h | 82 ++-- drivers/kvm/svm.c | 782 +++++++++++++++++++----------------- drivers/kvm/vmx.c | 648 ++++++++++++++++++------------ drivers/kvm/vmx.h | 69 ++- drivers/kvm/x86_emulate.c | 74 ++- drivers/kvm/x86_emulate.h | 20 - include/asm-i386/processor-flags.h | 2 include/linux/kvm.h | 16 - 13 files changed, 1367 insertions(+), 1177 deletions(-) diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index 6cecc39..445c6e4 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -11,6 +11,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on X86 && EXPERIMENTAL + select PREEMPT_NOTIFIERS select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 3ac9cbc..e4f11b6 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -13,44 +13,26 @@ #include #include #include #include +#include #include -#include "vmx.h" #include #include -#define CR0_PE_MASK (1ULL << 0) -#define CR0_MP_MASK (1ULL << 1) -#define CR0_TS_MASK (1ULL << 3) -#define CR0_NE_MASK (1ULL << 5) -#define CR0_WP_MASK (1ULL << 16) -#define CR0_NW_MASK (1ULL << 29) -#define CR0_CD_MASK (1ULL << 30) -#define CR0_PG_MASK (1ULL << 31) - -#define CR3_WPT_MASK (1ULL << 3) -#define CR3_PCD_MASK (1ULL << 4) - -#define CR3_RESEVED_BITS 0x07ULL -#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL) -#define CR3_FLAGS_MASK ((1ULL << 5) - 1) - -#define CR4_VME_MASK (1ULL << 0) -#define CR4_PSE_MASK (1ULL << 4) -#define CR4_PAE_MASK (1ULL << 5) -#define CR4_PGE_MASK (1ULL << 7) -#define CR4_VMXE_MASK (1ULL << 13) +#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) +#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL) #define KVM_GUEST_CR0_MASK \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ - | CR0_NW_MASK | CR0_CD_MASK) + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ + | X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ - | CR0_MP_MASK) + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ + | X86_CR0_MP) #define KVM_GUEST_CR4_MASK \ - (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) -#define KVM_RMODE_VM_CR4_ALWAYS_ON 
(CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) @@ -63,10 +45,6 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 -#define FX_IMAGE_SIZE 512 -#define FX_IMAGE_ALIGN 16 -#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) - #define DE_VECTOR 0 #define NM_VECTOR 7 #define DF_VECTOR 8 @@ -158,15 +136,8 @@ struct kvm_mmu_page { }; }; -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -#define vmx_msr_entry kvm_msr_entry - struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level @@ -328,21 +299,17 @@ void kvm_io_bus_register_dev(struct kvm_ struct kvm_vcpu { struct kvm *kvm; - union { - struct vmcs *vmcs; - struct vcpu_svm *svm; - }; + struct preempt_notifier preempt_notifier; + int vcpu_id; struct mutex mutex; int cpu; - int launched; u64 host_tsc; struct kvm_run *run; int interrupt_window_open; int guest_mode; unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ -#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) - unsigned long irq_pending[NR_IRQ_WORDS]; + DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ @@ -358,14 +325,6 @@ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE u64 shadow_efer; u64 apic_base; u64 ia32_misc_enable_msr; - int nmsrs; - int save_nmsrs; - int msr_offset_efer; -#ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; -#endif - struct vmx_msr_entry *guest_msrs; - struct vmx_msr_entry *host_msrs; struct kvm_mmu mmu; @@ -379,16 +338,10 @@ #endif struct kvm_guest_debug guest_debug; - char fx_buf[FX_BUF_SIZE]; - char *host_fx_image; - char *guest_fx_image; + struct i387_fxsave_struct host_fx_image; + struct i387_fxsave_struct guest_fx_image; int fpu_active; int guest_fpu_loaded; - struct vmx_host_state { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; - } vmx_host_state; int mmio_needed; int mmio_read_completed; @@ -436,7 +389,7 @@ struct kvm_memory_slot { }; struct kvm { - spinlock_t lock; /* protects everything except vcpus */ + struct mutex lock; /* protects everything except vcpus */ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; @@ -447,8 +400,7 @@ struct kvm { struct list_head active_mmu_pages; int n_free_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; - int nvcpus; - struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; int memory_config_version; int busy; unsigned long rmap_overflow; @@ -468,13 +420,15 @@ struct kvm_arch_ops { int (*disabled_by_bios)(void); /* __init */ void (*hardware_enable)(void *dummy); /* __init */ void (*hardware_disable)(void *dummy); + void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ - int (*vcpu_create)(struct kvm_vcpu *vcpu); + /* Create, but do not attach this VCPU */ + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void 
(*vcpu_put)(struct kvm_vcpu *vcpu); void (*vcpu_decache)(struct kvm_vcpu *vcpu); @@ -513,7 +467,6 @@ struct kvm_arch_ops { void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*vcpu_setup)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); @@ -524,7 +477,11 @@ extern struct kvm_arch_ops *kvm_arch_ops #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) -int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +int kvm_init_arch(struct kvm_arch_ops *ops, unsigned int vcpu_size, + struct module *module); void kvm_exit_arch(void); int kvm_mmu_module_init(void); @@ -597,27 +554,24 @@ int kvm_set_msr_common(struct kvm_vcpu * void fx_init(struct kvm_vcpu *vcpu); -void load_msrs(struct vmx_msr_entry *e, int n); -void save_msrs(struct vmx_msr_entry *e, int n); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); -int kvm_read_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *dest); - -int kvm_write_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *data); +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); unsigned long segment_base(u16 selector); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *old, const u8 *new, int bytes); + const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); @@ -652,17 +606,17 @@ #endif static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->cr4 & CR4_PAE_MASK; + return vcpu->cr4 & X86_CR4_PAE; } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->cr4 & CR4_PSE_MASK; + return vcpu->cr4 & X86_CR4_PSE; } static inline int is_paging(struct kvm_vcpu *vcpu) { - return vcpu->cr0 & CR0_PG_MASK; + return vcpu->cr0 & X86_CR0_PG; } static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) @@ -742,12 +696,12 @@ static inline unsigned long read_msr(uns } #endif -static inline void fx_save(void *image) +static inline void fx_save(struct i387_fxsave_struct *image) { asm ("fxsave (%0)":: "r" (image)); } -static inline void fx_restore(void *image) +static inline void fx_restore(struct i387_fxsave_struct *image) { asm ("fxrstor (%0)":: "r" (image)); } diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 9685609..a201a51 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -53,8 +53,10 @@ static LIST_HEAD(vm_list); static cpumask_t cpus_hardware_enabled; struct kvm_arch_ops *kvm_arch_ops; +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); -static void hardware_disable(void *ignored); +static __read_mostly struct preempt_ops kvm_preempt_ops; #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) @@ -84,10 +86,17 @@ static struct dentry *debugfs_dir; #define MAX_IO_MSRS 256 -#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL -#define LMSW_GUEST_MASK 
0x0eULL -#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) -#define CR8_RESEVED_BITS (~0x0fULL) +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define EFER_RESERVED_BITS 0xfffffffffffff2fe #ifdef CONFIG_X86_64 @@ -139,82 +148,14 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, - void *dest) -{ - unsigned char *host_buf = dest; - unsigned long req_size = size; - - while (size) { - hpa_t paddr; - unsigned now; - unsigned offset; - hva_t guest_buf; - - paddr = gva_to_hpa(vcpu, addr); - - if (is_error_hpa(paddr)) - break; - - guest_buf = (hva_t)kmap_atomic( - pfn_to_page(paddr >> PAGE_SHIFT), - KM_USER0); - offset = addr & ~PAGE_MASK; - guest_buf |= offset; - now = min(size, PAGE_SIZE - offset); - memcpy(host_buf, (void*)guest_buf, now); - host_buf += now; - addr += now; - size -= now; - kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); - } - return req_size - size; -} -EXPORT_SYMBOL_GPL(kvm_read_guest); - -int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, - void *data) -{ - unsigned char *host_buf = data; - unsigned long req_size = size; - - while (size) { - hpa_t paddr; - unsigned now; - unsigned offset; - hva_t guest_buf; - gfn_t gfn; - - paddr = gva_to_hpa(vcpu, addr); - - if (is_error_hpa(paddr)) - break; - - gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT; - mark_page_dirty(vcpu->kvm, gfn); - guest_buf = (hva_t)kmap_atomic( - pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); - offset = addr & ~PAGE_MASK; - guest_buf |= offset; - now = min(size, PAGE_SIZE - offset); - memcpy((void*)guest_buf, host_buf, now); - host_buf += now; - addr += now; - size -= now; - kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); - } - return req_size - size; -} -EXPORT_SYMBOL_GPL(kvm_write_guest); - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); } EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); @@ -224,8 +165,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu * return; vcpu->guest_fpu_loaded = 0; - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); @@ -234,13 +175,21 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); */ static void vcpu_load(struct kvm_vcpu *vcpu) { + int cpu; + mutex_lock(&vcpu->mutex); - kvm_arch_ops->vcpu_load(vcpu); + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_arch_ops->vcpu_load(vcpu, cpu); + put_cpu(); } static void vcpu_put(struct kvm_vcpu *vcpu) { + preempt_disable(); kvm_arch_ops->vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); mutex_unlock(&vcpu->mutex); } @@ -261,8 +210,10 @@ void kvm_flush_remote_tlbs(struct kvm *k atomic_set(&completed, 0); cpus_clear(cpus); needed = 0; - for (i = 0; i < 
kvm->nvcpus; ++i) { - vcpu = &kvm->vcpus[i]; + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) continue; cpu = vcpu->cpu; @@ -286,26 +237,65 @@ void kvm_flush_remote_tlbs(struct kvm *k } } +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->mmu.root_hpa = INVALID_PAGE; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail_free_run; + } + vcpu->pio_data = page_address(page); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + return 0; + +fail_free_pio_data: + free_page((unsigned long)vcpu->pio_data); +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mmu_destroy(vcpu); + free_page((unsigned long)vcpu->pio_data); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - int i; if (!kvm) return ERR_PTR(-ENOMEM); kvm_io_bus_init(&kvm->pio_bus); - spin_lock_init(&kvm->lock); + mutex_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); kvm_io_bus_init(&kvm->mmio_bus); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - struct kvm_vcpu *vcpu = &kvm->vcpus[i]; - - mutex_init(&vcpu->mutex); - vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->mmu.root_hpa = INVALID_PAGE; - } spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); @@ -353,7 +343,7 @@ static void free_pio_guest_pages(struct { int i; - for (i = 0; i < 2; ++i) + for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) if (vcpu->pio.guest_pages[i]) { __free_page(vcpu->pio.guest_pages[i]); vcpu->pio.guest_pages[i] = NULL; @@ -362,30 +352,11 @@ static void free_pio_guest_pages(struct static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - if (!vcpu->vmcs) - return; - vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } -static void kvm_free_vcpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->vmcs) - return; - - vcpu_load(vcpu); - kvm_mmu_destroy(vcpu); - vcpu_put(vcpu); - kvm_arch_ops->vcpu_free(vcpu); - free_page((unsigned long)vcpu->run); - vcpu->run = NULL; - free_page((unsigned long)vcpu->pio_data); - vcpu->pio_data = NULL; - free_pio_guest_pages(vcpu); -} - static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; @@ -394,9 +365,15 @@ static void kvm_free_vcpus(struct kvm *k * Unpin any mmu pages first. 
*/ for (i = 0; i < KVM_MAX_VCPUS; ++i) - kvm_unload_vcpu_mmu(&kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) - kvm_free_vcpu(&kvm->vcpus[i]); + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_arch_ops->vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } + } static int kvm_dev_release(struct inode *inode, struct file *filp) @@ -437,58 +414,60 @@ static int load_pdptrs(struct kvm_vcpu * gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; - u64 pdpte; u64 *pdpt; int ret; struct page *page; + u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); page = gfn_to_page(vcpu->kvm, pdpt_gfn); - /* FIXME: !page - emulate? 0xff? */ + if (!page) { + ret = 0; + goto out; + } + pdpt = kmap_atomic(page, KM_USER0); + memcpy(pdpte, pdpt+offset, sizeof(pdpte)); + kunmap_atomic(pdpt, KM_USER0); - ret = 1; - for (i = 0; i < 4; ++i) { - pdpte = pdpt[offset + i]; - if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) { + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; goto out; } } + ret = 1; - for (i = 0; i < 4; ++i) - vcpu->pdptrs[i] = pdpt[offset + i]; - + memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); out: - kunmap_atomic(pdpt, KM_USER0); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return ret; } void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESEVED_BITS) { + if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, vcpu->cr0); inject_gp(vcpu); return; } - if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); inject_gp(vcpu); return; } - if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); inject_gp(vcpu); return; } - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->shadow_efer & EFER_LME)) { int cs_db, cs_l; @@ -521,9 +500,9 @@ #endif kvm_arch_ops->set_cr0(vcpu, cr0); vcpu->cr0 = cr0; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); @@ -536,62 +515,71 @@ EXPORT_SYMBOL_GPL(lmsw); void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & CR4_RESEVED_BITS) { + if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); inject_gp(vcpu); return; } if (is_long_mode(vcpu)) { - if (!(cr4 & CR4_PAE_MASK)) { + if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); inject_gp(vcpu); return; } - } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); inject_gp(vcpu); + return; } - if (cr4 & CR4_VMXE_MASK) { + if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); inject_gp(vcpu); return; } kvm_arch_ops->set_cr4(vcpu, cr4); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - spin_unlock(&vcpu->kvm->lock); + 
mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESEVED_BITS) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; } } else { - if (cr3 & CR3_RESEVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - inject_gp(vcpu); - return; - } - if (is_paging(vcpu) && is_pae(vcpu) && - !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); - inject_gp(vcpu); - return; + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_NONPAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } } } - vcpu->cr3 = cr3; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -603,15 +591,17 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) inject_gp(vcpu); - else + else { + vcpu->cr3 = cr3; vcpu->mmu.new_cr3(vcpu); - spin_unlock(&vcpu->kvm->lock); + } + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if ( cr8 & CR8_RESEVED_BITS) { + if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); inject_gp(vcpu); return; @@ -622,27 +612,20 @@ EXPORT_SYMBOL_GPL(set_cr8); void fx_init(struct kvm_vcpu *vcpu) { - struct __attribute__ ((__packed__)) fx_image_s { - u16 control; //fcw - u16 status; //fsw - u16 tag; // ftw - u16 opcode; //fop - u64 ip; // fpu ip - u64 operand;// fpu dp - u32 mxcsr; - u32 mxcsr_mask; + unsigned after_mxcsr_mask; - } *fx_image; - - fx_save(vcpu->host_fx_image); + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->host_fx_image); fpu_init(); - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); + preempt_enable(); - fx_image = (struct fx_image_s *)vcpu->guest_fx_image; - fx_image->mxcsr = 0x1f80; - memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), - 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); @@ -682,7 +665,7 @@ static int kvm_vm_ioctl_set_memory_regio mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; raced: - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); memory_config_version = kvm->memory_config_version; new = old = *memslot; @@ -711,7 +694,7 @@ raced: * Do memory allocations outside lock. memory_config_version will * detect any races. 
*/ - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); /* Deallocate if slot is being removed */ if (!npages) @@ -750,10 +733,10 @@ raced: memset(new.dirty_bitmap, 0, dirty_bytes); } - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); if (memory_config_version != kvm->memory_config_version) { - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); kvm_free_physmem_slot(&new, &old); goto raced; } @@ -771,13 +754,13 @@ raced: kvm_mmu_slot_remove_write_access(kvm, mem->slot); kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); kvm_free_physmem_slot(&old, &new); return 0; out_unlock: - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); out_free: kvm_free_physmem_slot(&new, &old); out: @@ -795,14 +778,14 @@ static int kvm_vm_ioctl_get_dirty_log(st int n; unsigned long any = 0; - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); /* * Prevent changes to guest memory configuration even while the lock * is not taken. */ ++kvm->busy; - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); r = -EINVAL; if (log->slot >= KVM_MEMORY_SLOTS) goto out; @@ -821,18 +804,21 @@ static int kvm_vm_ioctl_get_dirty_log(st if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) goto out; - spin_lock(&kvm->lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memset(memslot->dirty_bitmap, 0, n); - spin_unlock(&kvm->lock); + /* If nothing is dirty, don't bother messing with page tables. */ + if (any) { + mutex_lock(&kvm->lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memset(memslot->dirty_bitmap, 0, n); + mutex_unlock(&kvm->lock); + } r = 0; out: - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); --kvm->busy; - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); return r; } @@ -862,7 +848,7 @@ static int kvm_vm_ioctl_set_memory_alias < alias->target_phys_addr) goto out; - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); p = &kvm->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -876,7 +862,7 @@ static int kvm_vm_ioctl_set_memory_alias kvm_mmu_zap_all(kvm); - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); return 0; @@ -930,37 +916,26 @@ struct page *gfn_to_page(struct kvm *kvm } EXPORT_SYMBOL_GPL(gfn_to_page); +/* WARNING: Does not work on aliased pages. 
*/ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - int i; struct kvm_memory_slot *memslot; - unsigned long rel_gfn; - - for (i = 0; i < kvm->nmemslots; ++i) { - memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) { - - if (!memslot->dirty_bitmap) - return; - rel_gfn = gfn - memslot->base_gfn; + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; - /* avoid RMW */ - if (!test_bit(rel_gfn, memslot->dirty_bitmap)) - set_bit(rel_gfn, memslot->dirty_bitmap); - return; - } + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); } } -static int emulator_read_std(unsigned long addr, +int emulator_read_std(unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; void *data = val; while (bytes) { @@ -990,11 +965,12 @@ static int emulator_read_std(unsigned lo return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(emulator_read_std); static int emulator_write_std(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", addr, bytes); @@ -1021,9 +997,8 @@ static struct kvm_io_device *vcpu_find_p static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_io_device *mmio_dev; gpa_t gpa; @@ -1031,7 +1006,7 @@ static int emulator_read_emulated(unsign memcpy(val, vcpu->mmio_data, bytes); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; - } else if (emulator_read_std(addr, val, bytes, ctxt) + } else if (emulator_read_std(addr, val, bytes, vcpu) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -1061,7 +1036,6 @@ static int emulator_write_phys(struct kv { struct page *page; void *virt; - unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; @@ -1070,7 +1044,7 @@ static int emulator_write_phys(struct kv return 0; mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); return 1; @@ -1079,9 +1053,8 @@ static int emulator_write_phys(struct kv static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_io_device *mmio_dev; gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); @@ -1111,31 +1084,32 @@ static int emulator_write_emulated_onepa return X86EMUL_CONTINUE; } -static int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? 
*/ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, ctxt); + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, ctxt); + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } +EXPORT_SYMBOL_GPL(emulator_write_emulated); static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { static int reported; @@ -1143,7 +1117,7 @@ static int emulator_cmpxchg_emulated(uns reported = 1; printk(KERN_WARNING "kvm: emulating exchange as write\n"); } - return emulator_write_emulated(addr, new, bytes, ctxt); + return emulator_write_emulated(addr, new, bytes, vcpu); } static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) @@ -1160,7 +1134,7 @@ int emulate_clts(struct kvm_vcpu *vcpu) { unsigned long cr0; - cr0 = vcpu->cr0 & ~CR0_TS_MASK; + cr0 = vcpu->cr0 & ~X86_CR0_TS; kvm_arch_ops->set_cr0(vcpu, cr0); return X86EMUL_CONTINUE; } @@ -1205,7 +1179,7 @@ static void report_emulation_failure(str if (reported) return; - emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); + emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt->vcpu); printk(KERN_ERR "emulation failed but !mmio_needed?" " rip %lx %02x %02x %02x %02x\n", @@ -1262,6 +1236,7 @@ int emulate_instruction(struct kvm_vcpu r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = vcpu->mmio_phys_addr; memcpy(run->mmio.data, vcpu->mmio_data, 8); run->mmio.len = vcpu->mmio_size; @@ -1329,6 +1304,7 @@ #endif } switch (nr) { default: + run->hypercall.nr = nr; run->hypercall.args[0] = a0; run->hypercall.args[1] = a1; run->hypercall.args[2] = a2; @@ -1439,7 +1415,7 @@ static int vcpu_register_para(struct kvm mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); - para_state = kmap_atomic(para_state_page, KM_USER0); + para_state = kmap(para_state_page); printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); printk(KERN_DEBUG ".... size: %d\n", para_state->size); @@ -1475,7 +1451,7 @@ static int vcpu_register_para(struct kvm para_state->ret = 0; err_kunmap_skip: - kunmap_atomic(para_state, KM_USER0); + kunmap(para_state_page); return 0; err_gp: return 1; @@ -1622,30 +1598,10 @@ void kvm_resched(struct kvm_vcpu *vcpu) { if (!need_resched()) return; - vcpu_put(vcpu); cond_resched(); - vcpu_load(vcpu); } EXPORT_SYMBOL_GPL(kvm_resched); -void load_msrs(struct vmx_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} -EXPORT_SYMBOL_GPL(load_msrs); - -void save_msrs(struct vmx_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} -EXPORT_SYMBOL_GPL(save_msrs); - void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { int i; @@ -1690,11 +1646,9 @@ static int pio_copy_data(struct kvm_vcpu unsigned bytes; int nr_pages = vcpu->pio.guest_pages[1] ? 
2 : 1; - kvm_arch_ops->vcpu_put(vcpu); q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL); if (!q) { - kvm_arch_ops->vcpu_load(vcpu); free_pio_guest_pages(vcpu); return -ENOMEM; } @@ -1706,7 +1660,6 @@ static int pio_copy_data(struct kvm_vcpu memcpy(p, q, bytes); q -= vcpu->pio.guest_page_offset; vunmap(q); - kvm_arch_ops->vcpu_load(vcpu); free_pio_guest_pages(vcpu); return 0; } @@ -1760,18 +1713,35 @@ static int complete_pio(struct kvm_vcpu return 0; } -void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) +static void kernel_pio(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu, + void *pd) { /* TODO: String I/O for in kernel device */ if (vcpu->pio.in) kvm_iodevice_read(pio_dev, vcpu->pio.port, vcpu->pio.size, - vcpu->pio_data); + pd); else kvm_iodevice_write(pio_dev, vcpu->pio.port, vcpu->pio.size, - vcpu->pio_data); + pd); +} + +static void pio_string_write(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->pio; + void *pd = vcpu->pio_data; + int i; + + for (i = 0; i < io->cur_count; i++) { + kvm_iodevice_write(pio_dev, io->port, + io->size, + pd); + pd += io->size; + } } int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, @@ -1779,7 +1749,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i; + int i, ret = 0; int nr_pages = 1; struct page *page; struct kvm_io_device *pio_dev; @@ -1806,15 +1776,12 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); kvm_arch_ops->decache_regs(vcpu); if (pio_dev) { - kernel_pio(pio_dev, vcpu); + kernel_pio(pio_dev, vcpu, vcpu->pio_data); complete_pio(vcpu); return 1; } return 0; } - /* TODO: String I/O for in kernel device */ - if (pio_dev) - printk(KERN_ERR "kvm_setup_pio: no string io support\n"); if (!count) { kvm_arch_ops->skip_emulated_instruction(vcpu); @@ -1849,12 +1816,12 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, vcpu->pio.cur_count = now; for (i = 0; i < nr_pages; ++i) { - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); if (page) get_page(page); vcpu->pio.guest_pages[i] = page; - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); if (!page) { inject_gp(vcpu); free_pio_guest_pages(vcpu); @@ -1862,9 +1829,21 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, } } - if (!vcpu->pio.in) - return pio_copy_data(vcpu); - return 0; + if (!vcpu->pio.in) { + /* string PIO write */ + ret = pio_copy_data(vcpu); + if (ret >= 0 && pio_dev) { + pio_string_write(pio_dev, vcpu); + complete_pio(vcpu); + if (vcpu->pio.count == 0) + ret = 1; + } + } else if (pio_dev) + printk(KERN_ERR "no string pio read support yet, " + "port %x size %d count %ld\n", + port, size, count); + + return ret; } EXPORT_SYMBOL_GPL(kvm_setup_pio); @@ -1897,7 +1876,6 @@ static int kvm_vcpu_ioctl_run(struct kvm /* * Read-modify-write. Back to userspace. 
*/ - kvm_run->exit_reason = KVM_EXIT_MMIO; r = 0; goto out; } @@ -2090,7 +2068,7 @@ #endif memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, sizeof vcpu->irq_pending); vcpu->irq_summary = 0; - for (i = 0; i < NR_IRQ_WORDS; ++i) + for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) if (vcpu->irq_pending[i]) __set_bit(i, &vcpu->irq_summary); @@ -2236,13 +2214,13 @@ static int kvm_vcpu_ioctl_translate(stru gpa_t gpa; vcpu_load(vcpu); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); vcpu_put(vcpu); return 0; @@ -2285,7 +2263,6 @@ static struct page *kvm_vcpu_nopage(stru unsigned long pgoff; struct page *page; - *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (pgoff == 0) page = virt_to_page(vcpu->run); @@ -2294,6 +2271,9 @@ static struct page *kvm_vcpu_nopage(stru else return NOPAGE_SIGBUS; get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + return page; } @@ -2346,74 +2326,52 @@ static int kvm_vm_ioctl_create_vcpu(stru { int r; struct kvm_vcpu *vcpu; - struct page *page; - r = -EINVAL; if (!valid_vcpu(n)) - goto out; - - vcpu = &kvm->vcpus[n]; - - mutex_lock(&vcpu->mutex); - - if (vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); - return -EEXIST; - } - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - r = -ENOMEM; - if (!page) - goto out_unlock; - vcpu->run = page_address(page); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - r = -ENOMEM; - if (!page) - goto out_free_run; - vcpu->pio_data = page_address(page); + return -EINVAL; - vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, - FX_IMAGE_ALIGN); - vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; - vcpu->cr0 = 0x10; + vcpu = kvm_arch_ops->vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); - r = kvm_arch_ops->vcpu_create(vcpu); - if (r < 0) - goto out_free_vcpus; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_mmu_create(vcpu); - if (r < 0) - goto out_free_vcpus; + /* We do fxsave: this must be aligned. 
*/ + BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); - kvm_arch_ops->vcpu_load(vcpu); + vcpu_load(vcpu); r = kvm_mmu_setup(vcpu); - if (r >= 0) - r = kvm_arch_ops->vcpu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto out_free_vcpus; + goto free_vcpu; + + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + mutex_unlock(&kvm->lock); + goto mmu_unload; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + /* Now it's all set up, let userspace reach it */ r = create_vcpu_fd(vcpu); if (r < 0) - goto out_free_vcpus; + goto unlink; + return r; - spin_lock(&kvm_lock); - if (n >= kvm->nvcpus) - kvm->nvcpus = n + 1; - spin_unlock(&kvm_lock); +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; + mutex_unlock(&kvm->lock); - return r; +mmu_unload: + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); -out_free_vcpus: - kvm_free_vcpu(vcpu); -out_free_run: - free_page((unsigned long)vcpu->run); - vcpu->run = NULL; -out_unlock: - mutex_unlock(&vcpu->mutex); -out: +free_vcpu: + kvm_arch_ops->vcpu_free(vcpu); return r; } @@ -2493,7 +2451,7 @@ #endif static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; vcpu_load(vcpu); @@ -2513,7 +2471,7 @@ static int kvm_vcpu_ioctl_get_fpu(struct static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; vcpu_load(vcpu); @@ -2768,12 +2726,14 @@ static struct page *kvm_vm_nopage(struct unsigned long pgoff; struct page *page; - *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = gfn_to_page(kvm, pgoff); if (!page) return NOPAGE_SIGBUS; get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + return page; } @@ -2893,25 +2853,6 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) -{ - if (val == SYS_RESTART) { - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - */ - printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); - } - return NOTIFY_OK; -} - -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; - /* * Make sure that a cpu that is being hot-unplugged does not have any vcpus * cached on it. @@ -2925,7 +2866,9 @@ static void decache_vcpus_on_cpu(int cpu spin_lock(&kvm_lock); list_for_each_entry(vm, &vm_list, vm_list) for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = &vm->vcpus[i]; + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; /* * If the vcpu is locked, then it is running on some * other cpu and therefore it is not cached on the @@ -2990,6 +2933,25 @@ static int kvm_cpu_hotplug(struct notifi return NOTIFY_OK; } +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. 
+ */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(hardware_disable, NULL, 0, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + void kvm_io_bus_init(struct kvm_io_bus *bus) { memset(bus, 0, sizeof(*bus)); @@ -3043,8 +3005,9 @@ static u64 stat_get(void *_offset) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = &kvm->vcpus[i]; - total += *(u32 *)((void *)vcpu + offset); + vcpu = kvm->vcpus[i]; + if (vcpu) + total += *(u32 *)((void *)vcpu + offset); } spin_unlock(&kvm_lock); return total; @@ -3101,9 +3064,32 @@ static struct sys_device kvm_sysdev = { hpa_t bad_page_address; -int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_ops->vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_ops->vcpu_put(vcpu); +} + +int kvm_init_arch(struct kvm_arch_ops *ops, unsigned int vcpu_size, + struct module *module) { int r; + int cpu; if (kvm_arch_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -3125,6 +3111,14 @@ int kvm_init_arch(struct kvm_arch_ops *o if (r < 0) goto out; + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_ops->check_processor_compatibility, + &r, 0, 1); + if (r < 0) + goto out_free_0; + } + on_each_cpu(hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) @@ -3139,6 +3133,14 @@ int kvm_init_arch(struct kvm_arch_ops *o if (r) goto out_free_3; + /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), 0, 0); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_4; + } + kvm_chardev_ops.owner = module; r = misc_register(&kvm_dev); @@ -3147,9 +3149,14 @@ int kvm_init_arch(struct kvm_arch_ops *o goto out_free; } + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + return r; out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_4: sysdev_unregister(&kvm_sysdev); out_free_3: sysdev_class_unregister(&kvm_sysdev_class); @@ -3158,6 +3165,7 @@ out_free_2: unregister_cpu_notifier(&kvm_cpu_notifier); out_free_1: on_each_cpu(hardware_disable, NULL, 0, 1); +out_free_0: kvm_arch_ops->hardware_unsetup(); out: kvm_arch_ops = NULL; @@ -3167,6 +3175,7 @@ out: void kvm_exit_arch(void) { misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index a869983..a0e415d 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -20,7 +20,10 @@ #endif #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) #define NUM_DB_REGS 4 +struct kvm_vcpu; + struct vcpu_svm { + struct kvm_vcpu vcpu; struct vmcb *vmcb; unsigned long vmcb_pa; struct svm_cpu_data *svm_data; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 1a87ba9..bfe16d5 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -158,7 +158,7 @@ static struct kmem_cache *mmu_page_heade static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->cr0 & CR0_WP_MASK; + return vcpu->cr0 & X86_CR0_WP; } static int is_cpuid_PSE36(void) @@ -274,11 +274,9 @@ static int mmu_topup_memory_caches(struc r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT); if (r < 0) { - spin_unlock(&vcpu->kvm->lock); - kvm_arch_ops->vcpu_put(vcpu); + mutex_unlock(&vcpu->kvm->lock); r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL); - kvm_arch_ops->vcpu_load(vcpu); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); } return r; } @@ -1069,7 +1067,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); if (r) goto out; @@ -1077,7 +1075,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_load); @@ -1122,7 +1120,7 @@ static void mmu_pte_write_new_pte(struct } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *old, const u8 *new, int bytes) + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 4b5391c..660243b 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -58,7 +58,10 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t *table; + pt_element_t pte; pt_element_t *ptep; + struct page *page; + int index; pt_element_t inherited_ar; gfn_t gfn; u32 error_code; @@ -80,11 +83,14 @@ static int FNAME(walk_addr)(struct guest pgprintk("%s: addr %lx\n", __FUNCTION__, addr); walker->level = vcpu->mmu.root_level; walker->table = NULL; + walker->page = NULL; + walker->ptep = NULL; root = vcpu->cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; root = 
*walker->ptep; + walker->pte = root; if (!(root & PT_PRESENT_MASK)) goto not_present; --walker->level; @@ -96,10 +102,11 @@ #endif walker->level - 1, table_gfn); slot = gfn_to_memslot(vcpu->kvm, table_gfn); hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); - walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); + walker->page = pfn_to_page(hpa >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; @@ -108,6 +115,7 @@ #endif hpa_t paddr; ptep = &walker->table[index]; + walker->index = index; ASSERT(((unsigned long)walker->table & PAGE_MASK) == ((unsigned long)ptep & PAGE_MASK)); @@ -148,16 +156,20 @@ #endif walker->inherited_ar &= walker->table[index]; table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; - paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); kunmap_atomic(walker->table, KM_USER0); - walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), - KM_USER0); + paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); + walker->page = pfn_to_page(paddr >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); --walker->level; walker->table_gfn[walker->level - 1 ] = table_gfn; pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, walker->level - 1, table_gfn); } - walker->ptep = ptep; + walker->pte = *ptep; + if (walker->page) + walker->ptep = NULL; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); return 1; @@ -175,13 +187,9 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; - return 0; -} - -static void FNAME(release_walker)(struct guest_walker *walker) -{ if (walker->table) kunmap_atomic(walker->table, KM_USER0); + return 0; } static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, @@ -193,7 +201,7 @@ static void FNAME(mark_pagetable_dirty)( static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, - pt_element_t *gpte, + pt_element_t gpte, u64 access_bits, int user_fault, int write_fault, @@ -202,23 +210,34 @@ static void FNAME(set_pte_common)(struct gfn_t gfn) { hpa_t paddr; - int dirty = *gpte & PT_DIRTY_MASK; + int dirty = gpte & PT_DIRTY_MASK; u64 spte = *shadow_pte; int was_rmapped = is_rmap_pte(spte); pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" " user_fault %d gfn %lx\n", - __FUNCTION__, spte, (u64)*gpte, access_bits, + __FUNCTION__, spte, (u64)gpte, access_bits, write_fault, user_fault, gfn); if (write_fault && !dirty) { - *gpte |= PT_DIRTY_MASK; + pt_element_t *guest_ent, *tmp = NULL; + + if (walker->ptep) + guest_ent = walker->ptep; + else { + tmp = kmap_atomic(walker->page, KM_USER0); + guest_ent = &tmp[walker->index]; + } + + *guest_ent |= PT_DIRTY_MASK; + if (!walker->ptep) + kunmap_atomic(tmp, KM_USER0); dirty = 1; FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; - spte |= *gpte & PT64_NX_MASK; + spte |= gpte & PT64_NX_MASK; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; @@ -273,13 +292,13 @@ unshadowed: rmap_add(vcpu, shadow_pte); } -static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, u64 *shadow_pte, u64 access_bits, int user_fault, int write_fault, int *ptwrite, struct guest_walker *walker, 
gfn_t gfn) { - access_bits &= *gpte; - FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, + access_bits &= gpte; + FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, gpte, access_bits, user_fault, write_fault, ptwrite, walker, gfn); } @@ -295,22 +314,22 @@ static void FNAME(update_pte)(struct kvm if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, 0, NULL, NULL, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } -static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, u64 *shadow_pte, u64 access_bits, int user_fault, int write_fault, int *ptwrite, struct guest_walker *walker, gfn_t gfn) { gpa_t gaddr; - access_bits &= *gpde; + access_bits &= gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) - gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << + gaddr |= (gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, gpde, access_bits, user_fault, write_fault, @@ -328,9 +347,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu int level; u64 *shadow_ent; u64 *prev_shadow_ent = NULL; - pt_element_t *guest_ent = walker->ptep; - if (!is_present_pte(*guest_ent)) + if (!is_present_pte(walker->pte)) return NULL; shadow_addr = vcpu->mmu.root_hpa; @@ -364,12 +382,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; - hugepage_access = *guest_ent; + hugepage_access = walker->pte; hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; - if (*guest_ent & PT64_NX_MASK) + if (walker->pte & PT64_NX_MASK) hugepage_access |= (1 << 2); hugepage_access >>= PT_WRITABLE_SHIFT; - table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) + table_gfn = (walker->pte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; } else { metaphysical = 0; @@ -386,12 +404,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu } if (walker->level == PT_DIRECTORY_LEVEL) { - FNAME(set_pde)(vcpu, guest_ent, shadow_ent, + FNAME(set_pde)(vcpu, walker->pte, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, guest_ent, shadow_ent, + FNAME(set_pte)(vcpu, walker->pte, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); } @@ -442,7 +460,6 @@ static int FNAME(page_fault)(struct kvm_ if (!r) { pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); - FNAME(release_walker)(&walker); vcpu->last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -452,8 +469,6 @@ static int FNAME(page_fault)(struct kvm_ pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, shadow_pte, *shadow_pte, write_pt); - FNAME(release_walker)(&walker); - if (!write_pt) vcpu->last_pt_write_count = 0; /* reset fork detector */ @@ -482,7 +497,6 @@ static gpa_t FNAME(gva_to_gpa)(struct kv gpa |= vaddr & ~PAGE_MASK; } - FNAME(release_walker)(&walker); return gpa; } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index bc818cc..7beaff1 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -38,7 +38,6 @@ #define GP_VECTOR 13 #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) -#define CR4_DE_MASK (1UL << 3) #define 
SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -50,6 +49,11 @@ #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_svm, vcpu); +} + unsigned long iopm_base; unsigned long msrpm_base; @@ -94,12 +98,12 @@ static inline u32 svm_has(u32 feat) return svm_features & feat; } -static unsigned get_addr_size(struct kvm_vcpu *vcpu) +static unsigned get_addr_size(struct vcpu_svm *svm) { - struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; + struct vmcb_save_area *sa = &svm->vmcb->save; u16 cs_attrib; - if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM)) + if (!(sa->cr0 & X86_CR0_PE) || (sa->rflags & X86_EFLAGS_VM)) return 2; cs_attrib = sa->cs.attrib; @@ -182,7 +186,7 @@ static inline void write_dr7(unsigned lo static inline void force_new_asid(struct kvm_vcpu *vcpu) { - vcpu->svm->asid_generation--; + to_svm(vcpu)->asid_generation--; } static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) @@ -195,22 +199,24 @@ static void svm_set_efer(struct kvm_vcpu if (!(efer & KVM_EFER_LMA)) efer &= ~KVM_EFER_LME; - vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; vcpu->shadow_efer = efer; } static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_TYPE_EXEPT | GP_VECTOR; - vcpu->svm->vmcb->control.event_inj_err = error_code; + svm->vmcb->control.event_inj_err = error_code; } static void inject_ud(struct kvm_vcpu *vcpu) { - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | UD_VECTOR; } @@ -229,19 +235,21 @@ static int is_external_interrupt(u32 inf static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { - if (!vcpu->svm->next_rip) { + struct vcpu_svm *svm = to_svm(vcpu); + + if (!svm->next_rip) { printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); return; } - if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", __FUNCTION__, - vcpu->svm->vmcb->save.rip, - vcpu->svm->next_rip); + svm->vmcb->save.rip, + svm->next_rip); } - vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; - vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + vcpu->rip = svm->vmcb->save.rip = svm->next_rip; + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->interrupt_window_open = 1; } @@ -351,8 +359,8 @@ err_1: } -static int set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) +static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) { int i; @@ -367,11 +375,10 @@ static int set_msr_interception(u32 *msr u32 mask = ((write) ? 0 : 2) | ((read) ? 
0 : 1); *base = (*base & ~(0x3 << msr_shift)) | (mask << msr_shift); - return 1; + return; } } - printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); - return 0; + BUG(); } static __init int svm_hardware_setup(void) @@ -458,11 +465,6 @@ static void init_sys_seg(struct vmcb_seg seg->base = 0; } -static int svm_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void init_vmcb(struct vmcb *vmcb) { struct vmcb_control_area *control = &vmcb->control; @@ -563,59 +565,70 @@ static void init_vmcb(struct vmcb *vmcb) * cr0 val on cpu init should be 0x60000010, we enable cpu * cache by default. the orderly way is to enable cache in bios. */ - save->cr0 = 0x00000010 | CR0_PG_MASK | CR0_WP_MASK; - save->cr4 = CR4_PAE_MASK; + save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ } -static int svm_create_vcpu(struct kvm_vcpu *vcpu) +static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { + struct vcpu_svm *svm; struct page *page; - int r; + int err; + + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!svm) { + err = -ENOMEM; + goto out; + } + + err = kvm_vcpu_init(&svm->vcpu, kvm, id); + if (err) + goto free_svm; - r = -ENOMEM; - vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL); - if (!vcpu->svm) - goto out1; page = alloc_page(GFP_KERNEL); - if (!page) - goto out2; - - vcpu->svm->vmcb = page_address(page); - clear_page(vcpu->svm->vmcb); - vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - vcpu->svm->asid_generation = 0; - memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); - init_vmcb(vcpu->svm->vmcb); - - fx_init(vcpu); - vcpu->fpu_active = 1; - vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vcpu == &vcpu->kvm->vcpus[0]) - vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + if (!page) { + err = -ENOMEM; + goto uninit; + } - return 0; + svm->vmcb = page_address(page); + clear_page(svm->vmcb); + svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->asid_generation = 0; + memset(svm->db_regs, 0, sizeof(svm->db_regs)); + init_vmcb(svm->vmcb); -out2: - kfree(vcpu->svm); -out1: - return r; + fx_init(&svm->vcpu); + svm->vcpu.fpu_active = 1; + svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (svm->vcpu.vcpu_id == 0) + svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; + + return &svm->vcpu; + +uninit: + kvm_vcpu_uninit(&svm->vcpu); +free_svm: + kfree(svm); +out: + return ERR_PTR(err); } static void svm_free_vcpu(struct kvm_vcpu *vcpu) { - if (!vcpu->svm) - return; - if (vcpu->svm->vmcb) - __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); - kfree(vcpu->svm); + struct vcpu_svm *svm = to_svm(vcpu); + + __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + kvm_vcpu_uninit(vcpu); + kfree(svm); } -static void svm_vcpu_load(struct kvm_vcpu *vcpu) +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - int cpu, i; + struct vcpu_svm *svm = to_svm(vcpu); + int i; - cpu = get_cpu(); if (unlikely(cpu != vcpu->cpu)) { u64 tsc_this, delta; @@ -625,23 +638,23 @@ static void svm_vcpu_load(struct kvm_vcp */ rdtscll(tsc_this); delta = vcpu->host_tsc - tsc_this; - vcpu->svm->vmcb->control.tsc_offset += delta; + svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); + rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int i; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - 
wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); rdtscll(vcpu->host_tsc); - put_cpu(); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -650,31 +663,34 @@ static void svm_vcpu_decache(struct kvm_ static void svm_cache_regs(struct kvm_vcpu *vcpu) { - vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; - vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; - vcpu->rip = vcpu->svm->vmcb->save.rip; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->rip = svm->vmcb->save.rip; } static void svm_decache_regs(struct kvm_vcpu *vcpu) { - vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; - vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; - vcpu->svm->vmcb->save.rip = vcpu->rip; + struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->rip; } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return vcpu->svm->vmcb->save.rflags; + return to_svm(vcpu)->vmcb->save.rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - vcpu->svm->vmcb->save.rflags = rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { - struct vmcb_save_area *save = &vcpu->svm->vmcb->save; + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; switch (seg) { case VCPU_SREG_CS: return &save->cs; @@ -726,26 +742,34 @@ static void svm_get_cs_db_l_bits(struct static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->limit = vcpu->svm->vmcb->save.idtr.limit; - dt->base = vcpu->svm->vmcb->save.idtr.base; + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.idtr.limit; + dt->base = svm->vmcb->save.idtr.base; } static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vcpu->svm->vmcb->save.idtr.limit = dt->limit; - vcpu->svm->vmcb->save.idtr.base = dt->base ; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.idtr.limit = dt->limit; + svm->vmcb->save.idtr.base = dt->base ; } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->limit = vcpu->svm->vmcb->save.gdtr.limit; - dt->base = vcpu->svm->vmcb->save.gdtr.base; + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.gdtr.limit; + dt->base = svm->vmcb->save.gdtr.base; } static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vcpu->svm->vmcb->save.gdtr.limit = dt->limit; - vcpu->svm->vmcb->save.gdtr.base = dt->base ; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.gdtr.limit = dt->limit; + svm->vmcb->save.gdtr.base = dt->base ; } static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) @@ -754,39 +778,42 @@ static void svm_decache_cr4_guest_bits(s static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + struct vcpu_svm *svm = to_svm(vcpu); + #ifdef CONFIG_X86_64 if (vcpu->shadow_efer & KVM_EFER_LME) { - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->shadow_efer |= KVM_EFER_LMA; - vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; + svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; } - if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { vcpu->shadow_efer &= ~KVM_EFER_LMA; - vcpu->svm->vmcb->save.efer &= 
~(KVM_EFER_LMA | KVM_EFER_LME); + svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); } } #endif - if ((vcpu->cr0 & CR0_TS_MASK) && !(cr0 & CR0_TS_MASK)) { - vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } vcpu->cr0 = cr0; - cr0 |= CR0_PG_MASK | CR0_WP_MASK; - cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); - vcpu->svm->vmcb->save.cr0 = cr0; + cr0 |= X86_CR0_PG | X86_CR0_WP; + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vcpu->cr4 = cr4; - vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; + to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; } static void svm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_seg *s = svm_seg(vcpu, seg); s->base = var->base; @@ -805,16 +832,16 @@ static void svm_set_segment(struct kvm_v s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } if (seg == VCPU_SREG_CS) - vcpu->svm->vmcb->save.cpl - = (vcpu->svm->vmcb->save.cs.attrib + svm->vmcb->save.cpl + = (svm->vmcb->save.cs.attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; } /* FIXME: - vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; - vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); + svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; + svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); */ @@ -826,58 +853,60 @@ static int svm_guest_debug(struct kvm_vc static void load_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); + wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); #endif } static void save_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); #endif } -static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) { if (svm_data->next_asid > svm_data->max_asid) { ++svm_data->asid_generation; svm_data->next_asid = 1; - vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - vcpu->cpu = svm_data->cpu; - vcpu->svm->asid_generation = svm_data->asid_generation; - vcpu->svm->vmcb->control.asid = svm_data->next_asid++; + svm->vcpu.cpu = svm_data->cpu; + svm->asid_generation = svm_data->asid_generation; + svm->vmcb->control.asid = svm_data->next_asid++; } static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address) { - invlpga(address, vcpu->svm->vmcb->control.asid); // is needed? + invlpga(address, to_svm(vcpu)->vmcb->control.asid); // is needed? } static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - return vcpu->svm->db_regs[dr]; + return to_svm(vcpu)->db_regs[dr]; } static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, int *exception) { + struct vcpu_svm *svm = to_svm(vcpu); + *exception = 0; - if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { - vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; + if (svm->vmcb->save.dr7 & DR7_GD_MASK) { + svm->vmcb->save.dr7 &= ~DR7_GD_MASK; + svm->vmcb->save.dr6 |= DR6_BD_MASK; *exception = DB_VECTOR; return; } switch (dr) { case 0 ... 3: - vcpu->svm->db_regs[dr] = value; + svm->db_regs[dr] = value; return; case 4 ... 
5: - if (vcpu->cr4 & CR4_DE_MASK) { + if (vcpu->cr4 & X86_CR4_DE) { *exception = UD_VECTOR; return; } @@ -886,7 +915,7 @@ static void svm_set_dr(struct kvm_vcpu * *exception = GP_VECTOR; return; } - vcpu->svm->vmcb->save.dr7 = value; + svm->vmcb->save.dr7 = value; return; } default: @@ -897,42 +926,43 @@ static void svm_set_dr(struct kvm_vcpu * } } -static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + u32 exit_int_info = svm->vmcb->control.exit_int_info; + struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; enum emulation_result er; int r; if (is_external_interrupt(exit_int_info)) - push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&kvm->lock); - fault_address = vcpu->svm->vmcb->control.exit_info_2; - error_code = vcpu->svm->vmcb->control.exit_info_1; - r = kvm_mmu_page_fault(vcpu, fault_address, error_code); + fault_address = svm->vmcb->control.exit_info_2; + error_code = svm->vmcb->control.exit_info_1; + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); if (r < 0) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&kvm->lock); return r; } if (!r) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&kvm->lock); return 1; } - er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); - spin_unlock(&vcpu->kvm->lock); + er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, + error_code); + mutex_unlock(&kvm->lock); switch (er) { case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - kvm_run->exit_reason = KVM_EXIT_MMIO; + ++svm->vcpu.stat.mmio_exits; return 0; case EMULATE_FAIL: - vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + vcpu_printf(&svm->vcpu, "%s: emulate fail\n", __FUNCTION__); break; default: BUG(); @@ -942,30 +972,30 @@ static int pf_interception(struct kvm_vc return 0; } -static int nm_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(vcpu->cr0 & CR0_TS_MASK)) - vcpu->svm->vmcb->save.cr0 &= ~CR0_TS_MASK; - vcpu->fpu_active = 1; + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if (!(svm->vcpu.cr0 & X86_CR0_TS)) + svm->vmcb->save.cr0 &= ~X86_CR0_TS; + svm->vcpu.fpu_active = 1; - return 1; + return 1; } -static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. 
*/ - clear_page(vcpu->svm->vmcb); - init_vmcb(vcpu->svm->vmcb); + clear_page(svm->vmcb); + init_vmcb(svm->vmcb); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int io_get_override(struct kvm_vcpu *vcpu, +static int io_get_override(struct vcpu_svm *svm, struct vmcb_seg **seg, int *addr_override) { @@ -974,21 +1004,22 @@ static int io_get_override(struct kvm_vc gva_t rip; int i; - rip = vcpu->svm->vmcb->save.rip; - ins_length = vcpu->svm->next_rip - rip; - rip += vcpu->svm->vmcb->save.cs.base; + rip = svm->vmcb->save.rip; + ins_length = svm->next_rip - rip; + rip += svm->vmcb->save.cs.base; if (ins_length > MAX_INST_SIZE) printk(KERN_DEBUG "%s: inst length err, cs base 0x%llx rip 0x%llx " "next rip 0x%llx ins_length %u\n", __FUNCTION__, - vcpu->svm->vmcb->save.cs.base, - vcpu->svm->vmcb->save.rip, - vcpu->svm->vmcb->control.exit_info_2, + svm->vmcb->save.cs.base, + svm->vmcb->save.rip, + svm->vmcb->control.exit_info_2, ins_length); - if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length) + if (emulator_read_std(rip, inst, ins_length, &svm->vcpu) + != X86EMUL_CONTINUE) /* #PF */ return 0; @@ -1005,22 +1036,22 @@ static int io_get_override(struct kvm_vc *addr_override = 1; continue; case 0x2e: - *seg = &vcpu->svm->vmcb->save.cs; + *seg = &svm->vmcb->save.cs; continue; case 0x36: - *seg = &vcpu->svm->vmcb->save.ss; + *seg = &svm->vmcb->save.ss; continue; case 0x3e: - *seg = &vcpu->svm->vmcb->save.ds; + *seg = &svm->vmcb->save.ds; continue; case 0x26: - *seg = &vcpu->svm->vmcb->save.es; + *seg = &svm->vmcb->save.es; continue; case 0x64: - *seg = &vcpu->svm->vmcb->save.fs; + *seg = &svm->vmcb->save.fs; continue; case 0x65: - *seg = &vcpu->svm->vmcb->save.gs; + *seg = &svm->vmcb->save.gs; continue; default: return 1; @@ -1029,40 +1060,40 @@ static int io_get_override(struct kvm_vc return 0; } -static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address) +static unsigned long io_address(struct vcpu_svm *svm, int ins, gva_t *address) { unsigned long addr_mask; unsigned long *reg; struct vmcb_seg *seg; int addr_override; - struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save; + struct vmcb_save_area *save_area = &svm->vmcb->save; u16 cs_attrib = save_area->cs.attrib; - unsigned addr_size = get_addr_size(vcpu); + unsigned addr_size = get_addr_size(svm); - if (!io_get_override(vcpu, &seg, &addr_override)) + if (!io_get_override(svm, &seg, &addr_override)) return 0; if (addr_override) addr_size = (addr_size == 2) ? 4: (addr_size >> 1); if (ins) { - reg = &vcpu->regs[VCPU_REGS_RDI]; - seg = &vcpu->svm->vmcb->save.es; + reg = &svm->vcpu.regs[VCPU_REGS_RDI]; + seg = &svm->vmcb->save.es; } else { - reg = &vcpu->regs[VCPU_REGS_RSI]; - seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds; + reg = &svm->vcpu.regs[VCPU_REGS_RSI]; + seg = (seg) ? seg : &svm->vmcb->save.ds; } addr_mask = ~0ULL >> (64 - (addr_size * 8)); if ((cs_attrib & SVM_SELECTOR_L_MASK) && - !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) { + !(svm->vmcb->save.rflags & X86_EFLAGS_VM)) { *address = (*reg & addr_mask); return addr_mask; } if (!(seg->attrib & SVM_SELECTOR_P_SHIFT)) { - svm_inject_gp(vcpu, 0); + svm_inject_gp(&svm->vcpu, 0); return 0; } @@ -1070,17 +1101,17 @@ static unsigned long io_adress(struct kv return addr_mask; } -static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? 
+ u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? int size, down, in, string, rep; unsigned port; unsigned long count; gva_t address = 0; - ++vcpu->stat.io_exits; + ++svm->vcpu.stat.io_exits; - vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; + svm->next_rip = svm->vmcb->control.exit_info_2; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; @@ -1088,12 +1119,12 @@ static int io_interception(struct kvm_vc string = (io_info & SVM_IOIO_STR_MASK) != 0; rep = (io_info & SVM_IOIO_REP_MASK) != 0; count = 1; - down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; if (string) { unsigned addr_mask; - addr_mask = io_adress(vcpu, in, &address); + addr_mask = io_address(svm, in, &address); if (!addr_mask) { printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); @@ -1101,93 +1132,98 @@ static int io_interception(struct kvm_vc } if (rep) - count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + count = svm->vcpu.regs[VCPU_REGS_RCX] & addr_mask; } - return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, - address, rep, port); + return kvm_setup_pio(&svm->vcpu, kvm_run, in, size, count, string, + down, address, rep, port); } -static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { return 1; } -static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; - skip_emulated_instruction(vcpu); - return kvm_emulate_halt(vcpu); + svm->next_rip = svm->vmcb->save.rip + 1; + skip_emulated_instruction(&svm->vcpu); + return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; - skip_emulated_instruction(vcpu); - return kvm_hypercall(vcpu, kvm_run); + svm->next_rip = svm->vmcb->save.rip + 3; + skip_emulated_instruction(&svm->vcpu); + return kvm_hypercall(&svm->vcpu, kvm_run); } -static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int invalid_op_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { - inject_ud(vcpu); + inject_ud(&svm->vcpu); return 1; } -static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int task_switch_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { printk(KERN_DEBUG "%s: task swiche is unsupported\n", __FUNCTION__); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; return 0; } -static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - kvm_emulate_cpuid(vcpu); + svm->next_rip = svm->vmcb->save.rip + 2; + kvm_emulate_cpuid(&svm->vcpu); return 1; } -static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int emulate_on_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { - if (emulate_instruction(vcpu, NULL, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) printk(KERN_ERR "%s: failed\n", __FUNCTION__); return 1; } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { + struct vcpu_svm *svm = 
to_svm(vcpu); + switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: { u64 tsc; rdtscll(tsc); - *data = vcpu->svm->vmcb->control.tsc_offset + tsc; + *data = svm->vmcb->control.tsc_offset + tsc; break; } case MSR_K6_STAR: - *data = vcpu->svm->vmcb->save.star; + *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = vcpu->svm->vmcb->save.lstar; + *data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = vcpu->svm->vmcb->save.cstar; + *data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = vcpu->svm->vmcb->save.kernel_gs_base; + *data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = vcpu->svm->vmcb->save.sfmask; + *data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = vcpu->svm->vmcb->save.sysenter_cs; + *data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = vcpu->svm->vmcb->save.sysenter_eip; + *data = svm->vmcb->save.sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = vcpu->svm->vmcb->save.sysenter_esp; + *data = svm->vmcb->save.sysenter_esp; break; default: return kvm_get_msr_common(vcpu, ecx, data); @@ -1195,57 +1231,59 @@ #endif return 0; } -static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(vcpu, ecx, &data)) - svm_inject_gp(vcpu, 0); + if (svm_get_msr(&svm->vcpu, ecx, &data)) + svm_inject_gp(&svm->vcpu, 0); else { - vcpu->svm->vmcb->save.rax = data & 0xffffffff; - vcpu->regs[VCPU_REGS_RDX] = data >> 32; - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - skip_emulated_instruction(vcpu); + svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; + svm->next_rip = svm->vmcb->save.rip + 2; + skip_emulated_instruction(&svm->vcpu); } return 1; } static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) { + struct vcpu_svm *svm = to_svm(vcpu); + switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: { u64 tsc; rdtscll(tsc); - vcpu->svm->vmcb->control.tsc_offset = data - tsc; + svm->vmcb->control.tsc_offset = data - tsc; break; } case MSR_K6_STAR: - vcpu->svm->vmcb->save.star = data; + svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - vcpu->svm->vmcb->save.lstar = data; + svm->vmcb->save.lstar = data; break; case MSR_CSTAR: - vcpu->svm->vmcb->save.cstar = data; + svm->vmcb->save.cstar = data; break; case MSR_KERNEL_GS_BASE: - vcpu->svm->vmcb->save.kernel_gs_base = data; + svm->vmcb->save.kernel_gs_base = data; break; case MSR_SYSCALL_MASK: - vcpu->svm->vmcb->save.sfmask = data; + svm->vmcb->save.sfmask = data; break; #endif case MSR_IA32_SYSENTER_CS: - vcpu->svm->vmcb->save.sysenter_cs = data; + svm->vmcb->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: - vcpu->svm->vmcb->save.sysenter_eip = data; + svm->vmcb->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: - vcpu->svm->vmcb->save.sysenter_esp = data; + svm->vmcb->save.sysenter_esp = data; break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -1253,28 +1291,28 @@ #endif return 0; } -static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; - u64 data = (vcpu->svm->vmcb->save.rax & -1u) - | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); - vcpu->svm->next_rip = 
vcpu->svm->vmcb->save.rip + 2; - if (svm_set_msr(vcpu, ecx, data)) - svm_inject_gp(vcpu, 0); + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; + u64 data = (svm->vmcb->save.rax & -1u) + | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); + svm->next_rip = svm->vmcb->save.rip + 2; + if (svm_set_msr(&svm->vcpu, ecx, data)) + svm_inject_gp(&svm->vcpu, 0); else - skip_emulated_instruction(vcpu); + skip_emulated_instruction(&svm->vcpu); return 1; } -static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - if (vcpu->svm->vmcb->control.exit_info_1) - return wrmsr_interception(vcpu, kvm_run); + if (svm->vmcb->control.exit_info_1) + return wrmsr_interception(svm, kvm_run); else - return rdmsr_interception(vcpu, kvm_run); + return rdmsr_interception(svm, kvm_run); } -static int interrupt_window_interception(struct kvm_vcpu *vcpu, +static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { /* @@ -1282,8 +1320,8 @@ static int interrupt_window_interception * possible */ if (kvm_run->request_interrupt_window && - !vcpu->irq_summary) { - ++vcpu->stat.irq_window_exits; + !svm->vcpu.irq_summary) { + ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -1291,7 +1329,7 @@ static int interrupt_window_interception return 1; } -static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, +static int (*svm_exit_handlers[])(struct vcpu_svm *svm, struct kvm_run *kvm_run) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, @@ -1338,15 +1376,15 @@ static int (*svm_exit_handlers[])(struct }; -static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exit(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_code = vcpu->svm->vmcb->control.exit_code; + u32 exit_code = svm->vmcb->control.exit_code; - if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && + if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", - __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, + __FUNCTION__, svm->vmcb->control.exit_int_info, exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) @@ -1356,7 +1394,7 @@ static int handle_exit(struct kvm_vcpu * return 0; } - return svm_exit_handlers[exit_code](vcpu, kvm_run); + return svm_exit_handlers[exit_code](svm, kvm_run); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -1368,76 +1406,77 @@ static void reload_tss(struct kvm_vcpu * load_TR_desc(); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (vcpu->cpu != cpu || - vcpu->svm->asid_generation != svm_data->asid_generation) - new_asid(vcpu, svm_data); + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (svm->vcpu.cpu != cpu || + svm->asid_generation != svm_data->asid_generation) + new_asid(svm, svm_data); } -static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) +static inline void inject_irq(struct vcpu_svm *svm) { struct vmcb_control_area *control; - control = &vcpu->svm->vmcb->control; - control->int_vector = pop_irq(vcpu); + control = &svm->vmcb->control; + control->int_vector = pop_irq(&svm->vcpu); control->int_ctl &= ~V_INTR_PRIO_MASK; 
control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void kvm_reput_irq(struct kvm_vcpu *vcpu) +static void reput_irq(struct vcpu_svm *svm) { - struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + struct vmcb_control_area *control = &svm->vmcb->control; if (control->int_ctl & V_IRQ_MASK) { control->int_ctl &= ~V_IRQ_MASK; - push_irq(vcpu, control->int_vector); + push_irq(&svm->vcpu, control->int_vector); } - vcpu->interrupt_window_open = + svm->vcpu.interrupt_window_open = !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, +static void do_interrupt_requests(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + struct vmcb_control_area *control = &svm->vmcb->control; - vcpu->interrupt_window_open = + svm->vcpu.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); + (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (vcpu->interrupt_window_open && vcpu->irq_summary) + if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ - kvm_do_inject_irq(vcpu); + inject_irq(svm); /* * Interrupts blocked. Wait for unblock. */ - if (!vcpu->interrupt_window_open && - (vcpu->irq_summary || kvm_run->request_interrupt_window)) { + if (!svm->vcpu.interrupt_window_open && + (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { control->intercept |= 1ULL << INTERCEPT_VINTR; } else control->intercept &= ~(1ULL << INTERCEPT_VINTR); } -static void post_kvm_run_save(struct kvm_vcpu *vcpu, +static void post_kvm_run_save(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); - kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = vcpu->cr8; - kvm_run->apic_base = vcpu->apic_base; + kvm_run->ready_for_interrupt_injection + = (svm->vcpu.interrupt_window_open && + svm->vcpu.irq_summary == 0); + kvm_run->if_flag = (svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = svm->vcpu.cr8; + kvm_run->apic_base = svm->vcpu.apic_base; } /* @@ -1446,13 +1485,13 @@ static void post_kvm_run_save(struct kvm * * No need to exit to userspace if we already have an interrupt queued. 
*/ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, +static int dm_request_for_irq_injection(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - return (!vcpu->irq_summary && + return (!svm->vcpu.irq_summary && kvm_run->request_interrupt_window && - vcpu->interrupt_window_open && - (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); + svm->vcpu.interrupt_window_open && + (svm->vmcb->save.rflags & X86_EFLAGS_IF)); } static void save_db_regs(unsigned long *db_regs) @@ -1478,6 +1517,7 @@ static void svm_flush_tlb(struct kvm_vcp static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_svm *svm = to_svm(vcpu); u16 fs_selector; u16 gs_selector; u16 ldt_selector; @@ -1489,7 +1529,7 @@ again: return r; if (!vcpu->mmio_read_completed) - do_interrupt_requests(vcpu, kvm_run); + do_interrupt_requests(svm, kvm_run); clgi(); @@ -1498,26 +1538,26 @@ again: if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) svm_flush_tlb(vcpu); - pre_svm_run(vcpu); + pre_svm_run(svm); save_host_msrs(vcpu); fs_selector = read_fs(); gs_selector = read_gs(); ldt_selector = read_ldt(); - vcpu->svm->host_cr2 = kvm_read_cr2(); - vcpu->svm->host_dr6 = read_dr6(); - vcpu->svm->host_dr7 = read_dr7(); - vcpu->svm->vmcb->save.cr2 = vcpu->cr2; + svm->host_cr2 = kvm_read_cr2(); + svm->host_dr6 = read_dr6(); + svm->host_dr7 = read_dr7(); + svm->vmcb->save.cr2 = vcpu->cr2; - if (vcpu->svm->vmcb->save.dr7 & 0xff) { + if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); - save_db_regs(vcpu->svm->host_db_regs); - load_db_regs(vcpu->svm->db_regs); + save_db_regs(svm->host_db_regs); + load_db_regs(svm->db_regs); } if (vcpu->fpu_active) { - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); } asm volatile ( @@ -1532,34 +1572,33 @@ #else #endif #ifdef CONFIG_X86_64 - "mov %c[rbx](%[vcpu]), %%rbx \n\t" - "mov %c[rcx](%[vcpu]), %%rcx \n\t" - "mov %c[rdx](%[vcpu]), %%rdx \n\t" - "mov %c[rsi](%[vcpu]), %%rsi \n\t" - "mov %c[rdi](%[vcpu]), %%rdi \n\t" - "mov %c[rbp](%[vcpu]), %%rbp \n\t" - "mov %c[r8](%[vcpu]), %%r8 \n\t" - "mov %c[r9](%[vcpu]), %%r9 \n\t" - "mov %c[r10](%[vcpu]), %%r10 \n\t" - "mov %c[r11](%[vcpu]), %%r11 \n\t" - "mov %c[r12](%[vcpu]), %%r12 \n\t" - "mov %c[r13](%[vcpu]), %%r13 \n\t" - "mov %c[r14](%[vcpu]), %%r14 \n\t" - "mov %c[r15](%[vcpu]), %%r15 \n\t" + "mov %c[rbx](%[svm]), %%rbx \n\t" + "mov %c[rcx](%[svm]), %%rcx \n\t" + "mov %c[rdx](%[svm]), %%rdx \n\t" + "mov %c[rsi](%[svm]), %%rsi \n\t" + "mov %c[rdi](%[svm]), %%rdi \n\t" + "mov %c[rbp](%[svm]), %%rbp \n\t" + "mov %c[r8](%[svm]), %%r8 \n\t" + "mov %c[r9](%[svm]), %%r9 \n\t" + "mov %c[r10](%[svm]), %%r10 \n\t" + "mov %c[r11](%[svm]), %%r11 \n\t" + "mov %c[r12](%[svm]), %%r12 \n\t" + "mov %c[r13](%[svm]), %%r13 \n\t" + "mov %c[r14](%[svm]), %%r14 \n\t" + "mov %c[r15](%[svm]), %%r15 \n\t" #else - "mov %c[rbx](%[vcpu]), %%ebx \n\t" - "mov %c[rcx](%[vcpu]), %%ecx \n\t" - "mov %c[rdx](%[vcpu]), %%edx \n\t" - "mov %c[rsi](%[vcpu]), %%esi \n\t" - "mov %c[rdi](%[vcpu]), %%edi \n\t" - "mov %c[rbp](%[vcpu]), %%ebp \n\t" + "mov %c[rbx](%[svm]), %%ebx \n\t" + "mov %c[rcx](%[svm]), %%ecx \n\t" + "mov %c[rdx](%[svm]), %%edx \n\t" + "mov %c[rsi](%[svm]), %%esi \n\t" + "mov %c[rdi](%[svm]), %%edi \n\t" + "mov %c[rbp](%[svm]), %%ebp \n\t" #endif #ifdef CONFIG_X86_64 /* Enter guest mode */ "push %%rax \n\t" - "mov %c[svm](%[vcpu]), %%rax \n\t" - "mov %c[vmcb](%%rax), %%rax \n\t" + "mov %c[vmcb](%[svm]), %%rax \n\t" SVM_VMLOAD "\n\t" SVM_VMRUN "\n\t" SVM_VMSAVE 
"\n\t" @@ -1567,8 +1606,7 @@ #ifdef CONFIG_X86_64 #else /* Enter guest mode */ "push %%eax \n\t" - "mov %c[svm](%[vcpu]), %%eax \n\t" - "mov %c[vmcb](%%eax), %%eax \n\t" + "mov %c[vmcb](%[svm]), %%eax \n\t" SVM_VMLOAD "\n\t" SVM_VMRUN "\n\t" SVM_VMSAVE "\n\t" @@ -1577,73 +1615,72 @@ #endif /* Save guest registers, load host registers */ #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[vcpu]) \n\t" - "mov %%rcx, %c[rcx](%[vcpu]) \n\t" - "mov %%rdx, %c[rdx](%[vcpu]) \n\t" - "mov %%rsi, %c[rsi](%[vcpu]) \n\t" - "mov %%rdi, %c[rdi](%[vcpu]) \n\t" - "mov %%rbp, %c[rbp](%[vcpu]) \n\t" - "mov %%r8, %c[r8](%[vcpu]) \n\t" - "mov %%r9, %c[r9](%[vcpu]) \n\t" - "mov %%r10, %c[r10](%[vcpu]) \n\t" - "mov %%r11, %c[r11](%[vcpu]) \n\t" - "mov %%r12, %c[r12](%[vcpu]) \n\t" - "mov %%r13, %c[r13](%[vcpu]) \n\t" - "mov %%r14, %c[r14](%[vcpu]) \n\t" - "mov %%r15, %c[r15](%[vcpu]) \n\t" + "mov %%rbx, %c[rbx](%[svm]) \n\t" + "mov %%rcx, %c[rcx](%[svm]) \n\t" + "mov %%rdx, %c[rdx](%[svm]) \n\t" + "mov %%rsi, %c[rsi](%[svm]) \n\t" + "mov %%rdi, %c[rdi](%[svm]) \n\t" + "mov %%rbp, %c[rbp](%[svm]) \n\t" + "mov %%r8, %c[r8](%[svm]) \n\t" + "mov %%r9, %c[r9](%[svm]) \n\t" + "mov %%r10, %c[r10](%[svm]) \n\t" + "mov %%r11, %c[r11](%[svm]) \n\t" + "mov %%r12, %c[r12](%[svm]) \n\t" + "mov %%r13, %c[r13](%[svm]) \n\t" + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" "pop %%rbp; pop %%rdi; pop %%rsi;" "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" #else - "mov %%ebx, %c[rbx](%[vcpu]) \n\t" - "mov %%ecx, %c[rcx](%[vcpu]) \n\t" - "mov %%edx, %c[rdx](%[vcpu]) \n\t" - "mov %%esi, %c[rsi](%[vcpu]) \n\t" - "mov %%edi, %c[rdi](%[vcpu]) \n\t" - "mov %%ebp, %c[rbp](%[vcpu]) \n\t" + "mov %%ebx, %c[rbx](%[svm]) \n\t" + "mov %%ecx, %c[rcx](%[svm]) \n\t" + "mov %%edx, %c[rdx](%[svm]) \n\t" + "mov %%esi, %c[rsi](%[svm]) \n\t" + "mov %%edi, %c[rdi](%[svm]) \n\t" + "mov %%ebp, %c[rbp](%[svm]) \n\t" "pop %%ebp; pop %%edi; pop %%esi;" "pop %%edx; pop %%ecx; pop %%ebx; \n\t" #endif : - : [vcpu]"a"(vcpu), - [svm]"i"(offsetof(struct kvm_vcpu, svm)), + : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) + [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) #ifdef CONFIG_X86_64 - ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), - [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), - [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) + ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), + [r9 ]"i"(offsetof(struct 
vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) #endif : "cc", "memory" ); vcpu->guest_mode = 0; if (vcpu->fpu_active) { - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); } - if ((vcpu->svm->vmcb->save.dr7 & 0xff)) - load_db_regs(vcpu->svm->host_db_regs); + if ((svm->vmcb->save.dr7 & 0xff)) + load_db_regs(svm->host_db_regs); - vcpu->cr2 = vcpu->svm->vmcb->save.cr2; + vcpu->cr2 = svm->vmcb->save.cr2; - write_dr6(vcpu->svm->host_dr6); - write_dr7(vcpu->svm->host_dr7); - kvm_write_cr2(vcpu->svm->host_cr2); + write_dr6(svm->host_dr6); + write_dr7(svm->host_dr7); + kvm_write_cr2(svm->host_cr2); load_fs(fs_selector); load_gs(gs_selector); @@ -1657,52 +1694,54 @@ #endif */ if (unlikely(prof_on == KVM_PROFILING)) profile_hit(KVM_PROFILING, - (void *)(unsigned long)vcpu->svm->vmcb->save.rip); + (void *)(unsigned long)svm->vmcb->save.rip); stgi(); - kvm_reput_irq(vcpu); + reput_irq(svm); - vcpu->svm->next_rip = 0; + svm->next_rip = 0; - if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason - = vcpu->svm->vmcb->control.exit_code; - post_kvm_run_save(vcpu, kvm_run); + = svm->vmcb->control.exit_code; + post_kvm_run_save(svm, kvm_run); return 0; } - r = handle_exit(vcpu, kvm_run); + r = handle_exit(svm, kvm_run); if (r > 0) { if (signal_pending(current)) { ++vcpu->stat.signal_exits; - post_kvm_run_save(vcpu, kvm_run); + post_kvm_run_save(svm, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } - if (dm_request_for_irq_injection(vcpu, kvm_run)) { + if (dm_request_for_irq_injection(svm, kvm_run)) { ++vcpu->stat.request_irq_exits; - post_kvm_run_save(vcpu, kvm_run); + post_kvm_run_save(svm, kvm_run); kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } kvm_resched(vcpu); goto again; } - post_kvm_run_save(vcpu, kvm_run); + post_kvm_run_save(svm, kvm_run); return r; } static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { - vcpu->svm->vmcb->save.cr3 = root; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.cr3 = root; force_new_asid(vcpu); if (vcpu->fpu_active) { - vcpu->svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - vcpu->svm->vmcb->save.cr0 |= CR0_TS_MASK; + svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + svm->vmcb->save.cr0 |= X86_CR0_TS; vcpu->fpu_active = 0; } } @@ -1711,26 +1750,27 @@ static void svm_inject_page_fault(struct unsigned long addr, uint32_t err_code) { - uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + struct vcpu_svm *svm = to_svm(vcpu); + uint32_t exit_int_info = svm->vmcb->control.exit_int_info; ++vcpu->stat.pf_guest; if (is_page_fault(exit_int_info)) { - vcpu->svm->vmcb->control.event_inj_err = 0; - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - DF_VECTOR; + svm->vmcb->control.event_inj_err = 0; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; return; } vcpu->cr2 = addr; - 
vcpu->svm->vmcb->save.cr2 = addr; - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - PF_VECTOR; - vcpu->svm->vmcb->control.event_inj_err = err_code; + svm->vmcb->save.cr2 = addr; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + svm->vmcb->control.event_inj_err = err_code; } @@ -1757,11 +1797,17 @@ svm_patch_hypercall(struct kvm_vcpu *vcp hypercall[3] = 0xc3; } +static void svm_check_processor_compat(void *rtn) +{ + *(int *)rtn = 0; +} + static struct kvm_arch_ops svm_arch_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, .hardware_unsetup = svm_hardware_unsetup, + .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, @@ -1803,13 +1849,13 @@ static struct kvm_arch_ops svm_arch_ops .run = svm_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, - .vcpu_setup = svm_vcpu_setup, .patch_hypercall = svm_patch_hypercall, }; static int __init svm_init(void) { - return kvm_init_arch(&svm_arch_ops, THIS_MODULE); + return kvm_init_arch(&svm_arch_ops, sizeof(struct vcpu_svm), + THIS_MODULE); } static void __exit svm_exit(void) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 80628f6..49635c4 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -16,6 +16,7 @@ */ #include "kvm.h" +#include "x86_emulate.h" #include "vmx.h" #include "segment_descriptor.h" @@ -32,6 +33,37 @@ #include MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + int launched; + struct kvm_msr_entry *guest_msrs; + struct kvm_msr_entry *host_msrs; + int nmsrs; + int save_nmsrs; + int msr_offset_efer; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif + struct vmcs *vmcs; + struct { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + }host_state; + +}; + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + static int init_rmode_tss(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -40,18 +72,17 @@ static DEFINE_PER_CPU(struct vmcs *, cur static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; -#ifdef CONFIG_X86_64 -#define HOST_IS_64 1 -#else -#define HOST_IS_64 0 -#endif #define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) -static struct vmcs_descriptor { +static struct vmcs_config { int size; int order; u32 revision_id; -} vmcs_descriptor; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; +} vmcs_config; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -89,16 +120,32 @@ #endif }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr) +static void load_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} + +static void save_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} + +static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) { return (u64)msr.data & EFER_SAVE_RESTORE_BITS; } -static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu) +static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) { - int efer_offset = 
vcpu->msr_offset_efer; - return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) != - msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]); + int efer_offset = vmx->msr_offset_efer; + return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); } static inline int is_page_fault(u32 intr_info) @@ -121,23 +168,23 @@ static inline int is_external_interrupt( == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vcpu->nmsrs; ++i) - if (vcpu->guest_msrs[i].index == msr) + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx->guest_msrs[i].index == msr) return i; return -1; } -static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vcpu, msr); + i = __find_msr_index(vmx, msr); if (i >= 0) - return &vcpu->guest_msrs[i]; + return &vmx->guest_msrs[i]; return NULL; } @@ -156,23 +203,24 @@ static void vmcs_clear(struct vmcs *vmcs static void __vcpu_clear(void *arg) { - struct kvm_vcpu *vcpu = arg; + struct vcpu_vmx *vmx = arg; int cpu = raw_smp_processor_id(); - if (vcpu->cpu == cpu) - vmcs_clear(vcpu->vmcs); - if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) + if (vmx->vcpu.cpu == cpu) + vmcs_clear(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vcpu->host_tsc); + rdtscll(vmx->vcpu.host_tsc); } -static void vcpu_clear(struct kvm_vcpu *vcpu) +static void vcpu_clear(struct vcpu_vmx *vmx) { - if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1) - smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1); + if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, + vmx, 0, 1); else - __vcpu_clear(vcpu); - vcpu->launched = 0; + __vcpu_clear(vmx); + vmx->launched = 0; } static unsigned long vmcs_readl(unsigned long field) @@ -282,121 +330,117 @@ #ifndef CONFIG_X86_64 #endif } -static void load_transition_efer(struct kvm_vcpu *vcpu) +static void load_transition_efer(struct vcpu_vmx *vmx) { u64 trans_efer; - int efer_offset = vcpu->msr_offset_efer; + int efer_offset = vmx->msr_offset_efer; - trans_efer = vcpu->host_msrs[efer_offset].data; + trans_efer = vmx->host_msrs[efer_offset].data; trans_efer &= ~EFER_SAVE_RESTORE_BITS; - trans_efer |= msr_efer_save_restore_bits( - vcpu->guest_msrs[efer_offset]); + trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); wrmsrl(MSR_EFER, trans_efer); - vcpu->stat.efer_reload++; + vmx->vcpu.stat.efer_reload++; } -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_save_host_state(struct vcpu_vmx *vmx) { - struct vmx_host_state *hs = &vcpu->vmx_host_state; - - if (hs->loaded) + if (vmx->host_state.loaded) return; - hs->loaded = 1; + vmx->host_state.loaded = 1; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. 
*/ - hs->ldt_sel = read_ldt(); - hs->fs_gs_ldt_reload_needed = hs->ldt_sel; - hs->fs_sel = read_fs(); - if (!(hs->fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); + vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.fs_gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = read_fs(); + if (!(vmx->host_state.fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); else { vmcs_write16(HOST_FS_SELECTOR, 0); - hs->fs_gs_ldt_reload_needed = 1; + vmx->host_state.fs_gs_ldt_reload_needed = 1; } - hs->gs_sel = read_gs(); - if (!(hs->gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); + vmx->host_state.gs_sel = read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { vmcs_write16(HOST_GS_SELECTOR, 0); - hs->fs_gs_ldt_reload_needed = 1; + vmx->host_state.fs_gs_ldt_reload_needed = 1; } #ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else - vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); + if (is_long_mode(&vmx->vcpu)) { + save_msrs(vmx->host_msrs + + vmx->msr_offset_kernel_gs_base, 1); } #endif - load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); - if (msr_efer_need_save_restore(vcpu)) - load_transition_efer(vcpu); + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + if (msr_efer_need_save_restore(vmx)) + load_transition_efer(vmx); } -static void vmx_load_host_state(struct kvm_vcpu *vcpu) +static void vmx_load_host_state(struct vcpu_vmx *vmx) { - struct vmx_host_state *hs = &vcpu->vmx_host_state; + unsigned long flags; - if (!hs->loaded) + if (!vmx->host_state.loaded) return; - hs->loaded = 0; - if (hs->fs_gs_ldt_reload_needed) { - load_ldt(hs->ldt_sel); - load_fs(hs->fs_sel); + vmx->host_state.loaded = 0; + if (vmx->host_state.fs_gs_ldt_reload_needed) { + load_ldt(vmx->host_state.ldt_sel); + load_fs(vmx->host_state.fs_sel); /* * If we have to reload gs, we must take care to * preserve our gs base. */ - local_irq_disable(); - load_gs(hs->gs_sel); + local_irq_save(flags); + load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif - local_irq_enable(); + local_irq_restore(flags); reload_tss(); } - save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); - load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); - if (msr_efer_need_save_restore(vcpu)) - load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1); + save_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_msrs(vmx->host_msrs, vmx->save_nmsrs); + if (msr_efer_need_save_restore(vmx)) + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); } /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. 
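For reference, every conversion in this series leans on the same container_of() accessor idiom that to_vmx() above (and to_svm() on the SVM side) rely on: the generic struct kvm_vcpu is embedded by value in the arch-specific wrapper, so the wrapper can be recovered from a plain vcpu pointer without any cast. A minimal, self-contained sketch of the idiom, using illustrative "demo_" names that are not part of the patch, looks like this:

        /*
         * Illustrative sketch only -- mirrors to_vmx()/to_svm().
         * The "demo_" names are stand-ins, not kernel symbols.
         */
        #include <stddef.h>

        #define demo_container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct demo_vcpu {                      /* stands in for struct kvm_vcpu */
                int cpu;
        };

        struct demo_vcpu_vmx {                  /* stands in for struct vcpu_vmx */
                struct demo_vcpu vcpu;          /* embedded by value, not a pointer */
                int launched;
        };

        static inline struct demo_vcpu_vmx *to_demo_vmx(struct demo_vcpu *vcpu)
        {
                return demo_container_of(vcpu, struct demo_vcpu_vmx, vcpu);
        }

Because the embedding is by value, the arch code can pass the wrapper around directly and still reach the generic state as vmx->vcpu (or svm->vcpu), which is exactly what the hunks above and below do.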
*/ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu) +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 phys_addr = __pa(vcpu->vmcs); - int cpu; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta; - cpu = get_cpu(); - if (vcpu->cpu != cpu) - vcpu_clear(vcpu); + vcpu_clear(vmx); - if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { + if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { u8 error; - per_cpu(current_vmcs, cpu) = vcpu->vmcs; + per_cpu(current_vmcs, cpu) = vmx->vmcs; asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc"); if (error) printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vcpu->vmcs, phys_addr); + vmx->vmcs, phys_addr); } if (vcpu->cpu != cpu) { @@ -426,9 +470,8 @@ static void vmx_vcpu_load(struct kvm_vcp static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(vcpu); + vmx_load_host_state(to_vmx(vcpu)); kvm_put_guest_fpu(vcpu); - put_cpu(); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -436,9 +479,9 @@ static void vmx_fpu_activate(struct kvm_ if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); - if (vcpu->cr0 & CR0_TS_MASK) - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); + if (vcpu->cr0 & X86_CR0_TS) + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -447,13 +490,13 @@ static void vmx_fpu_deactivate(struct kv if (!vcpu->fpu_active) return; vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) { - vcpu_clear(vcpu); + vcpu_clear(to_vmx(vcpu)); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -501,15 +544,16 @@ static void vmx_inject_gp(struct kvm_vcp /* * Swap MSR entry in host/guest MSR entry array. */ -void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct vmx_msr_entry tmp; - tmp = vcpu->guest_msrs[to]; - vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; - vcpu->guest_msrs[from] = tmp; - tmp = vcpu->host_msrs[to]; - vcpu->host_msrs[to] = vcpu->host_msrs[from]; - vcpu->host_msrs[from] = tmp; + struct kvm_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; + tmp = vmx->host_msrs[to]; + vmx->host_msrs[to] = vmx->host_msrs[from]; + vmx->host_msrs[from] = tmp; } /* @@ -517,43 +561,43 @@ void move_msr_up(struct kvm_vcpu *vcpu, * msrs. Don't touch the 64-bit msrs if the guest is in legacy * mode, as fiddling with msrs is very expensive. 
*/ -static void setup_msrs(struct kvm_vcpu *vcpu) +static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; save_nmsrs = 0; #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { + if (is_long_mode(&vmx->vcpu)) { int index; - index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_LSTAR); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_CSTAR); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vcpu, MSR_K6_STAR); - if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) - move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); } #endif - vcpu->save_nmsrs = save_nmsrs; + vmx->save_nmsrs = save_nmsrs; #ifdef CONFIG_X86_64 - vcpu->msr_offset_kernel_gs_base = - __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + vmx->msr_offset_kernel_gs_base = + __find_msr_index(vmx, MSR_KERNEL_GS_BASE); #endif - vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER); + vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); } /* @@ -589,7 +633,7 @@ static void guest_write_tsc(u64 guest_ts static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct vmx_msr_entry *msr; + struct kvm_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -620,7 +664,7 @@ #endif data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: - msr = find_msr_entry(vcpu, msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; break; @@ -639,15 +683,16 @@ #endif */ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - struct vmx_msr_entry *msr; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr; int ret = 0; switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vcpu->vmx_host_state.loaded) - load_transition_efer(vcpu); + if (vmx->host_state.loaded) + load_transition_efer(vmx); break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -669,11 +714,11 @@ #endif guest_write_tsc(data); break; default: - msr = find_msr_entry(vcpu, msr_index); + msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vcpu->vmx_host_state.loaded) - load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + if (vmx->host_state.loaded) + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); @@ -751,7 +796,10 @@ static __init int vmx_disabled_by_bios(v u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & 5) == 1; /* locked but not enabled */ + return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + == MSR_IA32_FEATURE_CONTROL_LOCKED; + /* locked but not enabled */ } static void hardware_enable(void *garbage) @@ -761,10 +809,15 @@ 
static void hardware_enable(void *garbag u64 old; rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & 5) != 5) + if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + != (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); - write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | + MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); } @@ -774,14 +827,91 @@ static void hardware_disable(void *garba asm volatile (ASM_VMX_VMXOFF : : : "cc"); } -static __init void setup_vmcs_descriptor(void) +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32* result) { u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 min, opt; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + min = CPU_BASED_HLT_EXITING | + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING; + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; + + min = 0; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - vmcs_descriptor.size = vmx_msr_high & 0x1fff; - vmcs_descriptor.order = get_order(vmcs_descriptor.size); - vmcs_descriptor.revision_id = vmx_msr_low; + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. 
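For reference, the capability-MSR clamping performed by adjust_vmx_controls() above reduces to the standalone helper below (hypothetical name; the MSR halves are passed as plain arguments instead of being read with rdmsr), which may make the min/opt split easier to follow:

        /*
         * Illustrative sketch only -- same logic as adjust_vmx_controls().
         *   msr_low:  bits that must be 1 in the final control word
         *   msr_high: bits that are allowed to be 1
         * ctl_min are required controls, ctl_opt merely desired ones.
         */
        #include <errno.h>

        static int clamp_vmx_controls(unsigned int ctl_min, unsigned int ctl_opt,
                                      unsigned int msr_low, unsigned int msr_high,
                                      unsigned int *result)
        {
                unsigned int ctl = ctl_min | ctl_opt;

                ctl &= msr_high;        /* drop bits the CPU cannot set to 1 */
                ctl |= msr_low;         /* force bits the CPU requires to be 1 */

                if (ctl_min & ~ctl)
                        return -EIO;    /* a required control is unsupported */

                *result = ctl;
                return 0;
        }

In the patch itself the clamped values are cached in vmcs_config, and vmx_check_processor_compat() later recomputes them and memcmp()s the result against the boot-time copy, flagging any CPU whose feature set differs.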
*/ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + return 0; } static struct vmcs *alloc_vmcs_cpu(int cpu) @@ -790,12 +920,12 @@ static struct vmcs *alloc_vmcs_cpu(int c struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_descriptor.size); - vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ + memset(vmcs, 0, vmcs_config.size); + vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ return vmcs; } @@ -806,7 +936,7 @@ static struct vmcs *alloc_vmcs(void) static void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_descriptor.order); + free_pages((unsigned long)vmcs, vmcs_config.order); } static void free_kvm_area(void) @@ -817,8 +947,6 @@ static void free_kvm_area(void) free_vmcs(per_cpu(vmxarea, cpu)); } -extern struct vmcs *alloc_vmcs_cpu(int cpu); - static __init int alloc_kvm_area(void) { int cpu; @@ -839,7 +967,8 @@ static __init int alloc_kvm_area(void) static __init int hardware_setup(void) { - setup_vmcs_descriptor(); + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; return alloc_kvm_area(); } @@ -879,8 +1008,8 @@ static void enter_pmode(struct kvm_vcpu flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | - (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); update_exception_bitmap(vcpu); @@ -937,7 +1066,7 @@ static void enter_rmode(struct kvm_vcpu flags |= IOPL_MASK | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); @@ -975,7 +1104,7 @@ static void enter_lmode(struct kvm_vcpu vcpu->shadow_efer |= EFER_LMA; - find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; + find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_CONTROLS_IA32E_MASK); @@ -1002,17 +1131,17 @@ static void vmx_set_cr0(struct kvm_vcpu { vmx_fpu_deactivate(vcpu); - if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) + if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 if (vcpu->shadow_efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif @@ -1022,14 +1151,14 @@ #endif (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); vcpu->cr0 = cr0; - if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) + if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) 
vmx_fpu_activate(vcpu); } static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - if (vcpu->cr0 & CR0_PE_MASK) + if (vcpu->cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } @@ -1045,7 +1174,8 @@ #ifdef CONFIG_X86_64 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->shadow_efer = efer; if (efer & EFER_LMA) { @@ -1061,7 +1191,7 @@ static void vmx_set_efer(struct kvm_vcpu msr->data = efer & ~EFER_LME; } - setup_msrs(vcpu); + setup_msrs(vmx); } #endif @@ -1210,17 +1340,6 @@ static int init_rmode_tss(struct kvm* kv return 1; } -static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) -{ - u32 msr_high, msr_low; - - rdmsr(msr, msr_low, msr_high); - - val &= msr_high; - val |= msr_low; - vmcs_write32(vmcs_field, val); -} - static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1234,7 +1353,7 @@ static void seg_setup(int seg) /* * Sets up the vmcs for emulated real mode. */ -static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) +static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs; u32 junk; @@ -1244,19 +1363,18 @@ static int vmx_vcpu_setup(struct kvm_vcp int ret = 0; unsigned long kvm_vmx_return; - if (!init_rmode_tss(vcpu->kvm)) { + if (!init_rmode_tss(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } - memset(vcpu->regs, 0, sizeof(vcpu->regs)); - vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); - vcpu->cr8 = 0; - vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vcpu == &vcpu->kvm->vcpus[0]) - vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + vmx->vcpu.cr8 = 0; + vmx->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + vmx->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; - fx_init(vcpu); + fx_init(&vmx->vcpu); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode @@ -1316,20 +1434,10 @@ static int vmx_vcpu_setup(struct kvm_vcp vmcs_write64(GUEST_IA32_DEBUGCTL, 0); /* Control */ - vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS, - PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ - | PIN_BASED_NMI_EXITING /* 20.6.1 */ - ); - vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS, - CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_HLT_EXITING /* 20.6.2 */ - | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ - | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ - | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ - | CPU_BASED_MOV_DR_EXITING - | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ - ); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_config.cpu_based_exec_ctrl); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); @@ -1377,28 +1485,27 @@ #endif u32 index = vmx_msr_index[i]; u32 data_low, data_high; u64 data; - int j = vcpu->nmsrs; + int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vcpu->host_msrs[j].index = index; - vcpu->host_msrs[j].reserved = 0; - vcpu->host_msrs[j].data = data; - vcpu->guest_msrs[j] = vcpu->host_msrs[j]; - ++vcpu->nmsrs; + vmx->host_msrs[j].index = index; + vmx->host_msrs[j].reserved = 0; + vmx->host_msrs[j].data = data; + vmx->guest_msrs[j] = vmx->host_msrs[j]; + 
++vmx->nmsrs; } - setup_msrs(vcpu); + setup_msrs(vmx); - vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, - (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_CONTROLS, 0); + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ #ifdef CONFIG_X86_64 @@ -1409,14 +1516,14 @@ #endif vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vcpu->cr0 = 0x60000010; - vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode - vmx_set_cr4(vcpu, 0); + vmx->vcpu.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode + vmx_set_cr4(&vmx->vcpu, 0); #ifdef CONFIG_X86_64 - vmx_set_efer(vcpu, 0); + vmx_set_efer(&vmx->vcpu, 0); #endif - vmx_fpu_activate(vcpu); - update_exception_bitmap(vcpu); + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); return 0; @@ -1443,8 +1550,8 @@ static void inject_rmode_irq(struct kvm_ return; } - if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != - sizeof(ent)) { + if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != + X86EMUL_CONTINUE) { vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); return; } @@ -1454,9 +1561,9 @@ static void inject_rmode_irq(struct kvm_ ip = vmcs_readl(GUEST_RIP); - if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || - kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || - kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { + if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); return; } @@ -1591,26 +1698,25 @@ static int handle_exception(struct kvm_v if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); r = kvm_mmu_page_fault(vcpu, cr2, error_code); if (r < 0) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return r; } if (!r) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return 1; } er = emulate_instruction(vcpu, kvm_run, cr2, error_code); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); switch (er) { case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; case EMULATE_FAIL: vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); @@ -1658,7 +1764,7 @@ static int get_io_count(struct kvm_vcpu u64 inst; gva_t rip; int countr_size; - int i, n; + int i; if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { countr_size = 2; @@ -1673,9 +1779,11 @@ static int get_io_count(struct kvm_vcpu if (countr_size != 8) rip += vmcs_readl(GUEST_CS_BASE); - n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); + if (emulator_read_std(rip, &inst, sizeof(inst), vcpu) != + X86EMUL_CONTINUE) + return 0; - for (i = 0; i < n; i++) { + for (i = 0; i < sizeof(inst); i++) { switch (((u8*)&inst)[i]) { case 0xf0: case 0xf2: @@ -1778,7 +1886,7 @@ static int handle_cr(struct kvm_vcpu *vc case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); - vcpu->cr0 &= ~CR0_TS_MASK; + vcpu->cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); @@ 
-1992,6 +2100,7 @@ static void vmx_flush_tlb(struct kvm_vcp static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u8 fail; int r; @@ -2000,16 +2109,18 @@ preempted: kvm_guest_debug_pre(vcpu); again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + preempt_disable(); + if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); - vmx_save_host_state(vcpu); + vmx_save_host_state(vmx); kvm_load_guest_fpu(vcpu); - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - /* * Loading guest fpu may have cleared host cr0.ts */ @@ -2116,7 +2227,7 @@ #else #endif "setbe %0 \n\t" : "=q" (fail) - : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), + : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), "c"(vcpu), [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), @@ -2146,6 +2257,9 @@ #endif vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + vmx->launched = 1; + + preempt_enable(); if (unlikely(fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2160,7 +2274,6 @@ #endif if (unlikely(prof_on == KVM_PROFILING)) profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - vcpu->launched = 1; r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { /* Give scheduler a change to reschedule. */ @@ -2225,49 +2338,90 @@ static void vmx_inject_page_fault(struct static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { - if (vcpu->vmcs) { - on_each_cpu(__vcpu_clear, vcpu, 0, 1); - free_vmcs(vcpu->vmcs); - vcpu->vmcs = NULL; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->vmcs) { + on_each_cpu(__vcpu_clear, vmx, 0, 1); + free_vmcs(vmx->vmcs); + vmx->vmcs = NULL; } } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx_free_vmcs(vcpu); + kfree(vmx->host_msrs); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kfree(vmx); } -static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { - struct vmcs *vmcs; - - vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->guest_msrs) - return -ENOMEM; + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + int cpu; - vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->host_msrs) - goto out_free_guest_msrs; + if (!vmx) + return ERR_PTR(-ENOMEM); - vmcs = alloc_vmcs(); - if (!vmcs) - goto out_free_msrs; + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; - vmcs_clear(vmcs); - vcpu->vmcs = vmcs; - vcpu->launched = 0; + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->guest_msrs) { + err = -ENOMEM; + goto uninit_vcpu; + } - return 0; + vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->host_msrs) + goto free_guest_msrs; -out_free_msrs: - kfree(vcpu->host_msrs); - vcpu->host_msrs = NULL; + vmx->vmcs = alloc_vmcs(); + if (!vmx->vmcs) + goto free_msrs; -out_free_guest_msrs: - kfree(vcpu->guest_msrs); - vcpu->guest_msrs = NULL; + vmcs_clear(vmx->vmcs); - return -ENOMEM; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + err = vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (err) + goto free_vmcs; + + return &vmx->vcpu; + +free_vmcs: + free_vmcs(vmx->vmcs); +free_msrs: + kfree(vmx->host_msrs); +free_guest_msrs: + kfree(vmx->guest_msrs); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + kfree(vmx); + return ERR_PTR(err); 
+} + +static void __init vmx_check_processor_compat(void *rtn) +{ + struct vmcs_config vmcs_conf; + + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf) < 0) + *(int *)rtn = -EIO; + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; + } } static struct kvm_arch_ops vmx_arch_ops = { @@ -2275,6 +2429,7 @@ static struct kvm_arch_ops vmx_arch_ops .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, .hardware_unsetup = hardware_unsetup, + .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, @@ -2315,7 +2470,6 @@ #endif .run = vmx_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, - .vcpu_setup = vmx_vcpu_setup, .patch_hypercall = vmx_patch_hypercall, }; @@ -2347,7 +2501,7 @@ static int __init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); - r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + r = kvm_init_arch(&vmx_arch_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h index d0dc93d..7e4dc12 100644 --- a/drivers/kvm/vmx.h +++ b/drivers/kvm/vmx.h @@ -25,29 +25,36 @@ #define VMX_H * */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 -#define CPU_BASED_HLT_EXITING 0x00000080 -#define CPU_BASED_INVDPG_EXITING 0x00000200 -#define CPU_BASED_MWAIT_EXITING 0x00000400 -#define CPU_BASED_RDPMC_EXITING 0x00000800 -#define CPU_BASED_RDTSC_EXITING 0x00001000 -#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 -#define CPU_BASED_CR8_STORE_EXITING 0x00100000 -#define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_MOV_DR_EXITING 0x00800000 -#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 -#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 -#define CPU_BASED_MSR_BITMAPS 0x10000000 -#define CPU_BASED_MONITOR_EXITING 0x20000000 -#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 -#define PIN_BASED_EXT_INTR_MASK 0x1 -#define PIN_BASED_NMI_EXITING 0x8 +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 -#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 -#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 + +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 /* VMCS Encodings */ enum vmcs_field { @@ -285,13 +292,21 @@ #define AR_DPL(ar) (((ar) >> AR_DPL_SHIF #define AR_RESERVD_MASK 0xfffe0f00 -#define 
CR4_VMXE 0x2000 +#define MSR_IA32_VMX_BASIC 0x480 +#define MSR_IA32_VMX_PINBASED_CTLS 0x481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 +#define MSR_IA32_VMX_EXIT_CTLS 0x483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_VMX_MISC 0x485 +#define MSR_IA32_VMX_CR0_FIXED0 0x486 +#define MSR_IA32_VMX_CR0_FIXED1 0x487 +#define MSR_IA32_VMX_CR4_FIXED0 0x488 +#define MSR_IA32_VMX_CR4_FIXED1 0x489 +#define MSR_IA32_VMX_VMCS_ENUM 0x48a +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b -#define MSR_IA32_VMX_BASIC 0x480 -#define MSR_IA32_FEATURE_CONTROL 0x03a -#define MSR_IA32_VMX_PINBASED_CTLS 0x481 -#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 -#define MSR_IA32_VMX_EXIT_CTLS 0x483 -#define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_FEATURE_CONTROL 0x3a +#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 +#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 #endif diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 1f979cb..44eb28d 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -6,7 +6,7 @@ * Copyright (c) 2005 Keir Fraser * * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privieged instructions: + * privileged instructions: * * Copyright (C) 2006 Qumranet * @@ -420,7 +420,7 @@ #endif /* __i386__ */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ - (_size), ctxt); \ + (_size), ctxt->vcpu); \ if ( rc != 0 ) \ goto done; \ (_eip) += (_size); \ @@ -443,8 +443,13 @@ #define register_address_increment(reg, (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ } while (0) -void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
+ */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) { void *p; @@ -464,10 +469,12 @@ static int read_descriptor(struct x86_em if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); if (rc) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); return rc; } @@ -775,7 +782,7 @@ done_prefixes: src.type = OP_MEM; src.ptr = (unsigned long *)cr2; if ((rc = ops->read_emulated((unsigned long)src.ptr, - &src.val, src.bytes, ctxt)) != 0) + &src.val, src.bytes, ctxt->vcpu)) != 0) goto done; src.orig_val = src.val; break; @@ -814,7 +821,7 @@ done_prefixes: case DstReg: dst.type = OP_REG; if ((d & ByteOp) - && !(twobyte_table && (b == 0xb6 || b == 0xb7))) { + && !(twobyte && (b == 0xb6 || b == 0xb7))) { dst.ptr = decode_register(modrm_reg, _regs, (rex_prefix == 0)); dst.val = *(u8 *) dst.ptr; @@ -845,7 +852,7 @@ done_prefixes: } if (!(d & Mov) && /* optimisation - avoid slow emulated read */ ((rc = ops->read_emulated((unsigned long)dst.ptr, - &dst.val, dst.bytes, ctxt)) != 0)) + &dst.val, dst.bytes, ctxt->vcpu)) != 0)) goto done; break; } @@ -958,7 +965,7 @@ done_prefixes: dst.bytes = 8; if ((rc = ops->read_std(register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt)) != 0) + &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); break; @@ -1043,7 +1050,7 @@ done_prefixes: dst.bytes = 8; if ((rc = ops->read_std((unsigned long)dst.ptr, &dst.val, 8, - ctxt)) != 0) + ctxt->vcpu)) != 0) goto done; } register_address_increment(_regs[VCPU_REGS_RSP], @@ -1051,7 +1058,7 @@ done_prefixes: if ((rc = ops->write_std( register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt)) != 0) + &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; no_wb = 1; break; @@ -1086,11 +1093,11 @@ writeback: rc = ops->cmpxchg_emulated((unsigned long)dst. ptr, &dst.orig_val, &dst.val, dst.bytes, - ctxt); + ctxt->vcpu); else rc = ops->write_emulated((unsigned long)dst.ptr, &dst.val, dst.bytes, - ctxt); + ctxt->vcpu); if (rc != 0) goto done; default: @@ -1125,7 +1132,7 @@ special_insn: _regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated(register_address( override_base ? *override_base : ctxt->ds_base, - _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSI], (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); @@ -1147,7 +1154,8 @@ special_insn: dst.type = OP_REG; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, + ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSI], (_eflags & EFLG_DF) ? 
-dst.bytes : dst.bytes); @@ -1166,7 +1174,8 @@ special_insn: pop_instruction: if ((rc = ops->read_std(register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0) + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) + != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); @@ -1217,51 +1226,53 @@ twobyte_insn: } break; case 0x21: /* mov from dr to reg */ + no_wb = 1; if (modrm_mod != 3) goto cannot_emulate; rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); break; case 0x23: /* mov from reg to dr */ + no_wb = 1; if (modrm_mod != 3) goto cannot_emulate; rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); break; case 0x40 ... 0x4f: /* cmov */ dst.val = dst.orig_val = src.val; - d &= ~Mov; /* default to no move */ + no_wb = 1; /* * First, assume we're decoding an even cmov opcode * (lsb == 0). */ switch ((b & 15) >> 1) { case 0: /* cmovo */ - d |= (_eflags & EFLG_OF) ? Mov : 0; + no_wb = (_eflags & EFLG_OF) ? 0 : 1; break; case 1: /* cmovb/cmovc/cmovnae */ - d |= (_eflags & EFLG_CF) ? Mov : 0; + no_wb = (_eflags & EFLG_CF) ? 0 : 1; break; case 2: /* cmovz/cmove */ - d |= (_eflags & EFLG_ZF) ? Mov : 0; + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; break; case 3: /* cmovbe/cmovna */ - d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; + no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; break; case 4: /* cmovs */ - d |= (_eflags & EFLG_SF) ? Mov : 0; + no_wb = (_eflags & EFLG_SF) ? 0 : 1; break; case 5: /* cmovp/cmovpe */ - d |= (_eflags & EFLG_PF) ? Mov : 0; + no_wb = (_eflags & EFLG_PF) ? 0 : 1; break; case 7: /* cmovle/cmovng */ - d |= (_eflags & EFLG_ZF) ? Mov : 0; + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; /* fall through */ case 6: /* cmovl/cmovnge */ - d |= (!(_eflags & EFLG_SF) != - !(_eflags & EFLG_OF)) ? Mov : 0; + no_wb &= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? 0 : 1; break; } /* Odd cmov opcodes (lsb == 1) have inverted sense. */ - d ^= (b & 1) ? Mov : 0; + no_wb ^= b & 1; break; case 0xb0 ... 0xb1: /* cmpxchg */ /* @@ -1271,8 +1282,6 @@ twobyte_insn: src.orig_val = src.val; src.val = _regs[VCPU_REGS_RAX]; emulate_2op_SrcV("cmp", src, dst, _eflags); - /* Always write back. The question is: where to? */ - d |= Mov; if (_eflags & EFLG_ZF) { /* Success: write back to memory. */ dst.val = src.orig_val; @@ -1373,7 +1382,8 @@ twobyte_special_insn: case 0xc7: /* Grp9 (cmpxchg8b) */ { u64 old, new; - if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) + != 0) goto done; if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { @@ -1384,7 +1394,7 @@ twobyte_special_insn: new = ((u64)_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; if ((rc = ops->cmpxchg_emulated(cr2, &old, - &new, 8, ctxt)) != 0) + &new, 8, ctxt->vcpu)) != 0) goto done; _eflags |= EFLG_ZF; } diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h index ea3407d..92c73aa 100644 --- a/drivers/kvm/x86_emulate.h +++ b/drivers/kvm/x86_emulate.h @@ -60,7 +60,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct x86_emulate_ctxt * ctxt); + unsigned int bytes, struct kvm_vcpu *vcpu); /* * write_std: Write bytes of standard (non-emulated/special) memory. @@ -71,7 +71,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to write to memory. 
*/ int (*write_std)(unsigned long addr, const void *val, - unsigned int bytes, struct x86_emulate_ctxt * ctxt); + unsigned int bytes, struct kvm_vcpu *vcpu); /* * read_emulated: Read bytes from emulated/special memory area. @@ -82,7 +82,7 @@ struct x86_emulate_ops { int (*read_emulated) (unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); /* * write_emulated: Read bytes from emulated/special memory area. @@ -94,7 +94,7 @@ struct x86_emulate_ops { int (*write_emulated) (unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -108,12 +108,10 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); }; -struct cpu_user_regs; - struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -154,12 +152,4 @@ #endif int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops); -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. - */ -void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs); - #endif /* __X86_EMULATE_H__ */ diff --git a/include/asm-i386/processor-flags.h b/include/asm-i386/processor-flags.h index 5404e90..199cab1 100644 --- a/include/asm-i386/processor-flags.h +++ b/include/asm-i386/processor-flags.h @@ -63,7 +63,7 @@ #define X86_CR4_VMXE 0x00002000 /* enabl /* * x86-64 Task Priority Register, CR8 */ -#define X86_CR8_TPR 0x00000007 /* task priority register */ +#define X86_CR8_TPR 0x0000000F /* task priority register */ /* * AMD and Transmeta use MSRs for configuration; see diff --git a/include/linux/kvm.h b/include/linux/kvm.h index e6edca8..91a446f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -4,8 +4,7 @@ #define __LINUX_KVM_H /* * Userspace interface for /dev/kvm - kernel based virtual machine * - * Note: this interface is considered experimental and may change without - * notice. + * Note: you must update KVM_API_VERSION if you change this interface. */ #include @@ -13,14 +12,8 @@ #include #define KVM_API_VERSION 12 -/* - * Architectural interrupt line count, and the size of the bitmap needed - * to hold them. - */ +/* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8) -#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type)) - /* for KVM_CREATE_MEMORY_REGION */ struct kvm_memory_region { @@ -106,11 +99,14 @@ #define KVM_EXIT_IO_OUT 1 } mmio; /* KVM_EXIT_HYPERCALL */ struct { + __u64 nr; __u64 args[6]; __u64 ret; __u32 longmode; __u32 pad; } hypercall; + /* Fix the size of the union. */ + char padding[256]; }; }; @@ -164,7 +160,7 @@ struct kvm_sregs { __u64 cr0, cr2, cr3, cr4, cr8; __u64 efer; __u64 apic_base; - __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)]; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; struct kvm_msr_entry {