GIT f97af70b3aa8a92ddeabb7d42477e7d13dd0a192 git://kvm.qumranet.com/home/avi/kvm.git

commit f97af70b3aa8a92ddeabb7d42477e7d13dd0a192
Author: Avi Kivity
Date:   Tue Mar 20 18:44:51 2007 +0200

    KVM: Remove set_cr0_no_modeswitch() arch op

    set_cr0_no_modeswitch() was a hack to avoid corrupting segment
    registers.  As we now cache the protected mode values on entry to
    real mode, this isn't an issue anymore, and it interferes with
    reboot (which usually _is_ a modeswitch).

    Signed-off-by: Avi Kivity

commit e314dde30e3851e8effc017c6fffced11d90183a
Author: Avi Kivity
Date:   Tue Mar 20 18:40:40 2007 +0200

    KVM: Workaround vmx inability to virtualize the reset state

    The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000,
    which aren't compatible with vm86 mode, which is used for real mode
    virtualization.  When we create a vcpu, we set cs.base to 0xf0000,
    but if we get there by way of a reset, the values are inconsistent
    and vmx refuses to enter guest mode.  Work around this by detecting
    the state and munging it appropriately.

    Signed-off-by: Avi Kivity

commit 88aea7ddfae755633b0a80ccfa56244b3c79c7b0
Author: Avi Kivity
Date:   Tue Mar 20 14:34:28 2007 +0200

    KVM: MMU: Remove global pte tracking

    The initial, noncaching, version of the kvm mmu flushed all
    nonglobal shadow page table translations (much like a native tlb
    flush).  The new implementation flushes translations only when they
    change, rendering global pte tracking superfluous.

    This removes the unused tracking mechanism and storage space.

    Signed-off-by: Avi Kivity

commit 66e5d5c81b5b89e39aa86e3bf9864d228f468b0d
Author: Avi Kivity
Date:   Tue Mar 20 14:29:06 2007 +0200

    KVM: MMU: Remove unnecessary check for pdptr access

    We already special-case pdptr access, so there is no need to check
    it again.

    Signed-off-by: Avi Kivity

commit c01571ed56754dfea458cc37d553c360082411a1
Author: Avi Kivity
Date:   Tue Mar 20 12:46:50 2007 +0200

    KVM: Avoid guest virtual addresses in string pio userspace interface

    The current string pio interface communicates using guest virtual
    addresses, relying on userspace to translate addresses and to check
    permissions.  This interface cannot fully support guest smp, as the
    check needs to take into account two pages at once in case an
    unaligned string transfer straddles a page boundary.

    Change the interface not to communicate guest addresses at all;
    instead use a buffer page (mmaped by userspace) and do transfers
    there.  The kernel manages the virtual to physical translation and
    can perform the checks atomically by taking the appropriate locks.

    Signed-off-by: Avi Kivity

commit 74c24de6e7848a45d6109d987d4fd2ccd83e432e
Author: Avi Kivity
Date:   Wed Mar 7 13:11:17 2007 +0200

    KVM: Future-proof argument-less ioctls

    Some ioctls ignore their arguments.  By requiring them to be zero
    now, we allow a nonzero value to have some special meaning in the
    future.

    Signed-off-by: Avi Kivity

commit 29e686a1dc9631b7898d087a0ab1c4716672e209
Author: Avi Kivity
Date:   Wed Mar 7 13:05:38 2007 +0200

    KVM: Allow kernel to select size of mmap() buffer

    This allows us to store offsets in the kernel/user kvm_run area, and
    be sure that userspace has them mapped.  As offsets can be outside
    the kvm_run struct, userspace has no way of knowing how much to
    mmap.

    Signed-off-by: Avi Kivity

commit cce3a1062817218c67163732339e2ea25e9f023b
Author: Avi Kivity
Date:   Mon Mar 5 19:46:05 2007 +0200

    KVM: Add guest mode signal mask

    Allow a special signal mask to be used while executing in guest
    mode.  This allows signals to be used to interrupt a vcpu without
    requiring signal delivery to a userspace handler, which is quite
    expensive.  Userspace still receives -EINTR and can get the signal
    via sigwait().

    Signed-off-by: Avi Kivity
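As an illustration, userspace usage of the guest mode signal mask would look
roughly like the sketch below (a hypothetical helper, not tested; it assumes
the struct kvm_signal_mask layout and KVM_SET_SIGNAL_MASK ioctl from the
patch at the end of this mail, and an 8-byte kernel-side sigset_t as on x86):

#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Keep SIGUSR1 blocked during normal operation, but let it punch the
 * vcpu out of KVM_RUN; the thread then collects it with sigwait()
 * instead of running a signal handler. */
static int setup_guest_sigmask(int vcpu_fd)
{
        sigset_t blocked, in_guest;
        struct kvm_signal_mask *kmask;
        int r;

        sigemptyset(&blocked);
        sigaddset(&blocked, SIGUSR1);
        pthread_sigmask(SIG_BLOCK, &blocked, NULL);

        sigemptyset(&in_guest);         /* unblock everything in guest mode */
        kmask = malloc(sizeof(*kmask) + 8);
        kmask->len = 8;                 /* assumed kernel sigset_t size */
        memcpy(kmask->sigset, &in_guest, 8);
        r = ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
        free(kmask);
        return r;
}

When KVM_RUN returns -EINTR, the original mask has been restored, so the
signal is still pending and blocked, and sigwait() picks it up synchronously.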
commit cd3aaa2392baec9674792d71d304ec41e540b517
Author: Avi Kivity
Date:   Mon Mar 5 17:45:40 2007 +0200

    KVM: Initialize the apic_base msr on svm too

    Older userspace didn't care, but newer userspace (with the cpuid
    changes) does.

    Signed-off-by: Avi Kivity

commit c303c0efc5b2ff8c0f77c9079fa66f62801da93d
Author: Avi Kivity
Date:   Sun Mar 4 14:24:03 2007 +0200

    KVM: Add a special exit reason when exiting due to an interrupt

    This is redundant, as we also return -EINTR from the ioctl, but it
    allows us to examine the exit_reason field on resume without seeing
    old data.

    Signed-off-by: Avi Kivity

commit 62919332e00e3226dd1f728ff83107d06a6d9a81
Author: Avi Kivity
Date:   Sun Mar 4 14:17:08 2007 +0200

    KVM: Fold kvm_run::exit_type into kvm_run::exit_reason

    Currently, userspace is told about the nature of the last exit from
    the guest using two fields, exit_type and exit_reason, where
    exit_type has just two enumerations (and no need for more).  So fold
    exit_type into exit_reason, reducing the complexity of determining
    what really happened.

    Signed-off-by: Avi Kivity

commit 9e16898f4f5d6cdc35030bb272631611b71548fe
Author: Avi Kivity
Date:   Sun Mar 4 13:59:30 2007 +0200

    KVM: Allow userspace to process hypercalls which have no kernel handler

    This is useful for paravirtualized graphics devices, for example.

    Signed-off-by: Avi Kivity

commit 440fd9098bceb2ca0856d962ff62db9af4d1094a
Author: Avi Kivity
Date:   Thu Mar 1 17:56:20 2007 +0200

    KVM: Add method to check for backwards-compatible API extensions

    Signed-off-by: Avi Kivity

commit 0b37dedb178bcb3b0a28f65e6ae835bf58184301
Author: Avi Kivity
Date:   Thu Mar 1 17:20:13 2007 +0200

    KVM: Renumber ioctls

    The recent changes have left the ioctl numbers in complete disarray.

    Signed-off-by: Avi Kivity

commit 95cab16b18e1c1a786a9fc5ea6fcd68b29ae3481
Author: Avi Kivity
Date:   Thu Mar 1 16:47:06 2007 +0200

    KVM: Remove minor wart from KVM_CREATE_VCPU ioctl

    That ioctl does not transfer any data, so it should be an _IO rather
    than an _IOW.

    Signed-off-by: Avi Kivity

commit ba5cb15b027b76ba7b4d247914eb6d20065c0767
Author: Avi Kivity
Date:   Thu Mar 1 16:20:40 2007 +0200

    KVM: Remove the 'emulated' field from the userspace interface

    We no longer emulate single instructions in userspace.  Instead, we
    service mmio or pio requests.

    Signed-off-by: Avi Kivity

commit 706e8fe655be36aa686f1fbb398d3a4470d4939b
Author: Avi Kivity
Date:   Wed Feb 28 20:46:53 2007 +0200

    KVM: Handle cpuid in the kernel instead of punting to userspace

    KVM used to handle cpuid by letting userspace decide what values to
    return to the guest.  We now handle cpuid completely in the kernel.
    We still let userspace decide which values the guest will see by
    having userspace set up the value table beforehand (this is
    necessary to allow management software to set the cpu features to
    the least common denominator, so that live migration can work).

    The motivation for the change is that kvm kernel code can be
    impacted by cpuid features, for example the x86 emulator.

    Signed-off-by: Avi Kivity

commit aad2f6e0faf4b03e087bbe6751acdacd72e911b6
Author: Avi Kivity
Date:   Thu Feb 22 19:48:43 2007 +0200

    KVM: Initialize PIO I/O count

    This allows userspace to ignore the io.rep field.  Not a big deal,
    but friendly.

    Signed-off-by: Avi Kivity
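Pulling the interface changes above together, a userspace vcpu run loop would
look roughly like the sketch below (a sketch only: handle_pio() and
handle_hypercall() are hypothetical helpers, mmio handling and error checking
are omitted; the kvm_run fields match the patch at the end of this mail):

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

void handle_pio(struct kvm_run *run, uint8_t *data);    /* hypothetical */
__u64 handle_hypercall(__u64 args[6]);                  /* hypothetical */

static void run_vcpu(int kvm_fd, int vcpu_fd)
{
        /* The kernel chooses the mapping size; offset 0 holds struct
         * kvm_run, and string PIO data lives in the same mapping at
         * run->io.data_offset (the second page). */
        long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
                                   MAP_SHARED, vcpu_fd, 0);

        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                        if (errno == EINTR)     /* see KVM_EXIT_INTR above */
                                continue;
                        return;
                }
                switch (run->exit_reason) {
                case KVM_EXIT_IO:
                        /* data sits in the shared buffer, not in registers */
                        handle_pio(run, (uint8_t *)run + run->io.data_offset);
                        run->io_completed = 1;
                        break;
                case KVM_EXIT_HYPERCALL:
                        /* hypercall with no kernel handler: userspace
                         * supplies the return value for the guest */
                        run->hypercall.ret = handle_hypercall(run->hypercall.args);
                        break;
                default:
                        return;
                }
        }
}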
commit e668cf946ee8654c7f5afe3feeed686a3566c22a
Author: Avi Kivity
Date:   Thu Feb 22 19:39:30 2007 +0200

    KVM: Do not communicate to userspace through cpu registers during PIO

    Currently when passing a PIO emulation request to userspace, we rely
    on userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx
    (on string instructions).  This (a) requires two extra ioctls for
    getting and setting the registers and (b) is unfriendly to non-x86
    archs, when they get kvm ports.

    So fix by doing the register fixups in the kernel and passing to
    userspace only an abstract description of the PIO to be done.

    Signed-off-by: Avi Kivity

commit 3de857cd1335bd2e02b60d3a50b7da93ccbabf1d
Author: Avi Kivity
Date:   Thu Feb 22 12:58:31 2007 +0200

    KVM: Use a shared page for kernel/user communication when running a vcpu

    Instead of passing a 'struct kvm_run' back and forth between the
    kernel and userspace, allocate a page and allow the user to mmap()
    it.  This reduces needless copying and makes the interface
    expandable by providing lots of free space.

    Signed-off-by: Avi Kivity

commit 128e159e11e999496ec44a549fcac91de3802389
Author: Avi Kivity
Date:   Mon Mar 19 13:18:10 2007 +0200

    KVM: Prevent system selectors leaking into guest on real->protected mode transition on vmx

    Intel virtualization extensions do not support virtualizing real
    mode.  So kvm uses virtualized vm86 mode to run real mode code.
    Unfortunately, this virtualized vm86 mode does not support the
    so-called "big real" mode, where the segment selector and base do
    not agree with each other according to the real mode rules (base ==
    selector << 4).

    To work around this, kvm checks whether a selector/base pair
    violates the virtualized vm86 rules, and if so, forces it into
    conformance.  On a transition back to protected mode, if we see that
    the guest did not touch a forced segment, we restore it back to the
    original protected mode value.

    This pile of hacks breaks down if the gdt has changed in real mode,
    as it can cause a segment selector to point to a system descriptor
    instead of a normal data segment.  In fact, this happens with the
    Windows bootloader and the qemu acpi bios, where a protected mode
    memcpy routine issues an innocent 'pop %es' and traps on an attempt
    to load a system descriptor.

    "Fix" by checking if the to-be-restored selector points at a system
    segment, and if so, coercing it into a normal data segment.  The
    long term solution, of course, is to abandon vm86 mode and use
    emulation for big real mode.

    Signed-off-by: Avi Kivity

commit ade11a015f83d270d1201c440199146f852fe5e4
Author: Uri Lublin
Date:   Wed Mar 14 19:21:06 2007 +0200

    added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot

    To be used when there may be "holes" in the memory.  Specifically,
    to avoid breaking VM migration when a ballooning mechanism exists.

    Signed-off-by: Uri Lublin

commit b0092d187cfa19dfcada3b85d728af5ae27989dc
Author: Avi Kivity
Date:   Wed Mar 14 15:54:54 2007 +0200

    KVM: Remove extraneous guest entry on mmio read

    When emulating an mmio read, we actually emulate twice: once to
    determine the physical address of the mmio, and, after we've exited
    to userspace to get the mmio value, we emulate again to place the
    value in the result register and update any flags.

    But we don't really need to enter the guest again for that; it would
    only take an immediate vmexit.  So, if we detect that we're doing an
    mmio read, emulate a single instruction before entering the guest
    again.

    Signed-off-by: Avi Kivity
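The KVM_GET_MEM_MAP ioctl above reuses struct kvm_dirty_log; a sketch of a
caller (npages is the slot size the caller already knows from its own
KVM_SET_MEMORY_REGION call, assumed here to be a multiple of 8):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns a bitmap with bit i set iff page i of the slot is backed by
 * memory, or NULL on error.  Caller frees. */
static unsigned char *get_mem_map(int vm_fd, int slot, unsigned long npages)
{
        struct kvm_dirty_log log = { .slot = slot };
        unsigned char *bitmap = calloc(1, npages / 8);  /* one bit per page */

        if (!bitmap)
                return NULL;
        log.dirty_bitmap = bitmap;
        if (ioctl(vm_fd, KVM_GET_MEM_MAP, &log) < 0) {
                free(bitmap);
                return NULL;
        }
        return bitmap;
}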
commit 470db88b8b3491199e8d55b771d66e74b2fd53cd
Author: Ingo Molnar
Date:   Sun Mar 11 13:52:33 2007 +0100

    KVM: always reload segment selectors

    A failed VM entry on VMX might still change %fs or %gs, so make sure
    that KVM always reloads the segment selectors.  This is crucial on
    both x86 and x86_64: x86 has __KERNEL_PDA in %fs, on which things
    like 'current' depend, and x86_64 has 0 there and needs MSR_GS_BASE
    to work.

    Signed-off-by: Ingo Molnar

commit f7edc6a39584a3f95687a5320675fadb23bccbe5
Author: Ingo Molnar
Date:   Sat Mar 10 11:22:51 2007 +0100

    KVM: trivial whitespace fixes

    Trivial whitespace fixes.

    Signed-off-by: Ingo Molnar

commit f3a33bfeaa5cade1a9ac1facb5cb904a483b1e5c
Author: Avi Kivity
Date:   Fri Mar 9 13:04:31 2007 +0200

    KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram

    PAGE_MASK is an unsigned long, so using it to mask physical
    addresses on i386 (which are 64-bit wide) leads to truncation.  This
    can result in page->private of unrelated memory pages being
    modified, with disastrous results.

    Fix by not using PAGE_MASK for physical addresses; instead calculate
    the correct value directly from PAGE_SIZE.  Also fix a similar
    BUG_ON().

    Signed-off-by: Avi Kivity

commit 6ee9853b015f8807f497ffad39b142ddc1403aa9
Author: Avi Kivity
Date:   Thu Mar 8 17:13:32 2007 +0200

    KVM: MMU: Fix guest writes to nonpae pde

    KVM shadow page tables are always in pae mode, regardless of the
    guest setting.  This means that a guest pde (mapping 4MB of memory)
    is mapped to two shadow pdes (mapping 2MB each).

    When the guest writes to a pte or pde, we intercept the write and
    emulate it.  We also remove any shadowed mappings corresponding to
    the write.  Since the mmu did not account for the doubling in the
    number of pdes, it removed the wrong entry, resulting in a mismatch
    between shadow page tables and guest page tables, followed shortly
    by guest memory corruption.

    This patch fixes the problem by detecting the special case of
    writing to a non-pae pde and adjusting the address and number of
    shadow pdes zapped accordingly.

    Signed-off-by: Avi Kivity

commit 374c1509c7d04a4e351b1812c2f0b9dac3ea0c0a
Author: Avi Kivity
Date:   Thu Mar 8 11:48:09 2007 +0200

    KVM: Fix bogus sign extension in mmu mapping audit

    When auditing a 32-bit guest on a 64-bit host, sign extension of the
    page directory pointer table index caused bogus addresses to be
    shown on audit errors.

    Fix by declaring the index unsigned.

    Signed-off-by: Avi Kivity

commit fac539542cbf923a39238b10557c88f99fd45b59
Author: Avi Kivity
Date:   Wed Mar 7 09:29:48 2007 +0200

    KVM: Export <linux/kvm.h>

    This allows users to actually build programs that use kvm without
    the entire source tree.

    Signed-off-by: Avi Kivity

commit c14a46343cc9f04f15ebc67573031fe8bbe1555a
Author: Avi Kivity
Date:   Tue Mar 6 12:05:53 2007 +0200

    KVM: Fix guest sysenter on vmx

    The vmx code currently treats the guest's sysenter support msrs as
    32-bit values, which breaks 32-bit compat mode userspace on 64-bit
    guests.  Fix by using the native word width of the machine.

    Signed-off-by: Avi Kivity

commit ea135e7671189ffb7e67843bf98740dac0c6ccfa
Author: Avi Kivity
Date:   Sun Mar 4 13:27:36 2007 +0200

    KVM: Use own minor number

    Use the minor number (232) allocated to kvm by LANANA.

    Signed-off-by: Avi Kivity

commit 21af17507f37658414191b1cf1337efbaf7dd530
Author: Dor Laor
Date:   Mon Feb 19 18:25:43 2007 +0200

    KVM: Use the generic skip_emulated_instruction() in hypercall code

    Instead of twiddling the rip register directly, use the
    skip_emulated_instruction() function to do that for us.

    Signed-off-by: Dor Laor
    Signed-off-by: Avi Kivity
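The PAGE_MASK truncation fixed above is easy to see in isolation
(illustrative fragment, not kvm code):

typedef unsigned long long hpa_t;       /* 64-bit physical address */

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))    /* unsigned long: 0xfffff000 on i386 */

hpa_t mask_buggy(hpa_t paddr)
{
        /* PAGE_MASK zero-extends to 0x00000000fffff000, so an address
         * above 4GB, e.g. 0x123456789, comes out as 0x23456000. */
        return paddr & PAGE_MASK;
}

hpa_t mask_fixed(hpa_t paddr)
{
        /* widen the mask to the physical address type first */
        return paddr & ~((hpa_t)PAGE_SIZE - 1);
}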
commit 57d78025d84fb607aa335d015a79b257517aa209
Author: Dor Laor
Date:   Mon Feb 19 16:44:49 2007 +0200

    KVM: Fix guest register corruption on paravirt hypercall

    The hypercall code mixes up the ->cache_regs() and ->decache_regs()
    callbacks, resulting in guest register corruption.

    Signed-off-by: Dor Laor
    Signed-off-by: Avi Kivity

commit 28e9803c9134683a884efe05abdb3f814c1ca7e7
Author: Avi Kivity
Date:   Thu Mar 1 19:21:03 2007 +0200

    KVM: Unset kvm_arch_ops if arch module loading failed

    Otherwise, the core module thinks the arch module is loaded, and
    won't let you reload it after you've fixed the bug.

    Signed-off-by: Avi Kivity

commit 426bc2fd1462706ec92d0e9efdb0cf3643f4eb67
Author: Avi Kivity
Date:   Thu Mar 1 11:28:13 2007 +0200

    KVM: Move kvmfs magic number to <linux/magic.h>

    From: Andrew Morton

    Use the standard magic.h for kvmfs.

    Cc: Avi Kivity
    Signed-off-by: Andrew Morton
    Signed-off-by: Avi Kivity

commit c1a8557e1da6e7d8bf8f77cb1b47c077f5c2a67d
Author: Avi Kivity
Date:   Mon Feb 26 16:29:43 2007 +0200

    KVM: Fix bogus failure in kvm.ko module initialization

    A bogus 'return r' can cause an otherwise successful module load to
    fail.  This both denies users the use of kvm, and it also denies
    them the use of their machine, as it leaves a filesystem registered
    with its callbacks pointing into now-freed module memory.

    Fix by returning a zero like a good module.

    Thanks to Richard Lucassen (?) for reporting the problem and for
    providing access to a machine which exhibited it.

    Signed-off-by: Avi Kivity

commit 7703ff91ee2ed171f2175d030e7f063c4efab2f5
Author: Uri Lublin
Date:   Thu Feb 22 17:37:32 2007 +0200

    KVM: Remove write access permissions when dirty-page-logging is enabled

    Enabling dirty page logging is done using the KVM_SET_MEMORY_REGION
    ioctl.  If the memory region already exists, we need to remove write
    accesses, so writes will be caught, and dirty pages will be logged.

    Signed-off-by: Uri Lublin
    Signed-off-by: Avi Kivity

commit b77fd1f62576463434fc434cbdcd808847e169a1
Author: Uri Lublin
Date:   Thu Feb 22 17:15:33 2007 +0200

    kvm: move do_remove_write_access() up

    To be called from kvm_vm_ioctl_set_memory_region().

    Signed-off-by: Uri Lublin
    Signed-off-by: Avi Kivity

commit 62e287e7210d6ff142b3b05233fa1f5df686b794
Author: Uri Lublin
Date:   Thu Feb 22 16:43:09 2007 +0200

    KVM: Fix dirty page log bitmap size/access calculation

    Since dirty_bitmap is an unsigned long array, the alignment and size
    need to take that into account.

    Signed-off-by: Uri Lublin
    Signed-off-by: Avi Kivity

commit 871574eb14e959c19d94fdee7c3e2b88ae06770f
Author: Uri Lublin
Date:   Wed Feb 21 18:25:21 2007 +0200

    KVM: Add missing calls to mark_page_dirty()

    A few places where we modify guest memory fail to call
    mark_page_dirty(), causing live migration to fail.  This adds the
    missing calls.

    Signed-off-by: Uri Lublin
    Signed-off-by: Avi Kivity

commit 42017e8bf8eb7b6f65b95bca1368ee274fc5ef50
Author: Uri Lublin
Date:   Thu Feb 22 17:37:32 2007 +0200

    kvm: dirty page logging: remove write access permissions when dirty-page-logging is enabled

    Enabling dirty page logging is done using the KVM_SET_MEMORY_REGION
    ioctl.  If the memory region already exists, there is a need to
    remove write accesses, so writes will be caught, and dirty pages
    will be logged.
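The sizing issue in the dirty log fix above boils down to allocating and
clearing a whole number of unsigned longs rather than a raw npages/8 bytes; a
sketch of the idea (ALIGN mirrors the kernel macro):

#define BITS_PER_LONG   (8 * sizeof(unsigned long))
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

/* e.g. a 100-page slot on a 64-bit host needs 16 bytes (two longs),
 * not the 12 bytes that truncating 100/8 yields. */
static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
        return ALIGN(npages, BITS_PER_LONG) / 8;
}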
commit a9fd29cfcb643b97cd76c7d836be4d0ed80f69e0
Author: Uri Lublin
Date:   Thu Feb 22 17:15:33 2007 +0200

    kvm: move do_remove_write_access() up

    To be called from kvm_vm_ioctl_set_memory_region().

commit fba4ba9c513ad2cd328f5f16980aa7b90d40cec0
Author: Uri Lublin
Date:   Thu Feb 22 16:43:09 2007 +0200

    kvm: dirty pages log: fix bitmap size/access calculation

    Since dirty_bitmap is an unsigned long array (pointer).

commit ae160d732685ab33d5a3a495663aa2b54c4d4734
Author: Uri Lublin
Date:   Thu Feb 22 15:47:42 2007 +0200

    .gitignore: ignore emacs backup files (*~)

commit 8267c1cd9a8a038e91c94e0cabc571a3614dc3e5
Author: Avi Kivity
Date:   Wed Feb 21 19:47:40 2007 +0200

    KVM: Bump API version

    Signed-off-by: Avi Kivity

commit c65237e78c19b8173338a49933c611dece13c1c6
Author: Avi Kivity
Date:   Wed Feb 21 18:04:26 2007 +0200

    KVM: Per-vcpu inodes

    Allocate a distinct inode for every vcpu in a VM.  This has the
    following benefits:

    - the filp cachelines are no longer bounced when f_count is
      incremented on every ioctl()
    - the API and internal code are distinctly clearer; for example, on
      the KVM_GET_REGS ioctl, there is no need to copy the vcpu number
      from userspace and then copy the registers back; the vcpu identity
      is derived from the fd used to make the call

    Right now the performance benefits are completely theoretical since
    (a) we don't support more than one vcpu per VM and (b)
    virtualization hardware inefficiencies completely overwhelm any
    cacheline bouncing effects.  But both of these will change, and we
    need to prepare the API today.

    Signed-off-by: Avi Kivity

commit 11c1297fadc533d1f66252088b4f4775018bafbb
Author: Avi Kivity
Date:   Tue Feb 20 18:41:05 2007 +0200

    KVM: Move kvm_vm_ioctl_create_vcpu() around

    In preparation for some hacking.

    Signed-off-by: Avi Kivity

commit f3ad84386727171d8308338a2c5dee1deac2e50d
Author: Avi Kivity
Date:   Tue Feb 20 18:27:58 2007 +0200

    KVM: Rename some kvm_dev_ioctl_*() functions to kvm_vm_ioctl_*()

    This reflects the changed scope, from device-wide to single vm
    (previously every device open created a virtual machine).

    Signed-off-by: Avi Kivity

commit 733e3f74f1c51bbc2e7a99df8b51767504b58de2
Author: Avi Kivity
Date:   Wed Feb 21 19:28:04 2007 +0200

    KVM: Create an inode per virtual machine

    This avoids having filp->f_op and the corresponding inode->i_fop
    different, which is a little unorthodox.

    The ioctl list is split into two: global kvm ioctls and per-vm
    ioctls.  A new ioctl, KVM_CREATE_VM, is used to create VMs and
    return the VM fd.

    Signed-off-by: Avi Kivity

commit 52a96114380f8ab615626e4cec57b7015895bd0f
Author: Avi Kivity
Date:   Tue Feb 20 14:07:37 2007 +0200

    KVM: Add internal filesystem for generating inodes

    The kvmfs inodes will represent virtual machines and vcpus, as
    necessary, reducing cacheline bouncing due to inodes and filps being
    shared.

    Signed-off-by: Avi Kivity

commit b00bc8b10197715f5b842f1f9a60e67a3484b10f
Author: Uri Lublin
Date:   Wed Feb 21 18:25:21 2007 +0200

    kvm, dirty pages log: adding some calls to mark_page_dirty()

commit 58a214eba321d92f833221c26777e2119e34a19d
Author: Avi Kivity
Date:   Mon Feb 19 14:37:48 2007 +0200

    KVM: More 0 -> NULL conversions

    Signed-off-by: Avi Kivity

commit f73199bb57b4c8feb7d8f60c6f1a25107de18dab
Author: Joerg Roedel
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: SVM: intercept SMI to handle it at host level

    This patch changes the SVM code to intercept SMIs and handle them
    outside the guest.

    Signed-off-by: Joerg Roedel
    Signed-off-by: Avi Kivity
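The per-VM and per-vcpu inode commits above establish a three-level fd
hierarchy; a sketch (error handling omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int make_vcpu(int *vm_fd_out)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);          /* global ioctls */
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);    /* per-vm ioctls */
        int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); /* vcpu slot 0 */

        *vm_fd_out = vm_fd;
        return vcpu_fd;         /* KVM_RUN, KVM_GET_REGS etc. go here */
}

The vcpu identity is carried by the fd itself, so per-vcpu ioctls no longer
pass a vcpu number in their argument structs.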
commit fa2742c78f10fad8682e3af17df3e9fc2eece9e4
Author: Avi Kivity
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: svm: init cr0 with the wp bit set

    Signed-off-by: Avi Kivity

commit 8da588a919dc0bef76e384d16fd13ea2189aa82d
Author: Avi Kivity
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Wire up hypercall handlers to a central arch-independent location

    Signed-off-by: Avi Kivity

commit 68f16784f188d280c75b39e2367ebc1adbc66d9d
Author: Avi Kivity
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Add hypercall host support for svm

    Signed-off-by: Avi Kivity

commit 7c8bd4d6fc0e2bfb35cd4c0e8ff39c4f8972d951
Author: Ingo Molnar
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Add host hypercall support for vmx

    Signed-off-by: Avi Kivity

commit f846fa34a14ec37dc0194c6f47ea4374c140e6f1
Author: Ingo Molnar
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: add MSR based hypercall API

    This adds a special MSR based hypercall API to KVM.  This is to be
    used by paravirtual kernels and virtual drivers.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Avi Kivity

commit 8aa04bb13cf90d68c26d6bea1e4c720f1f027be0
Author: Markus Rechberger
Date:   Mon Feb 19 14:37:47 2007 +0200

    KVM: Use page_private()/set_page_private() apis

    Besides using an established api, this allows using kvm in older
    kernels.

    Signed-off-by: Markus Rechberger
    Signed-off-by: Avi Kivity

commit 4d5a7e81cc63d28e94373cdeb74dc44045edaa10
Author: Ahmed S. Darwish
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Use ARRAY_SIZE macro instead of manual calculation.

    Signed-off-by: Ahmed S. Darwish
    Signed-off-by: Dor Laor
    Signed-off-by: Avi Kivity

commit 0fe9875fb3f9946a6c1cef6f1b9a286edc8ee2b9
Author: Markus Rechberger
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch

    From: Joerg Roedel

    The whole thing is rotten, but this allows vmx to boot with the
    guest reboot fix.

    Signed-off-by: Markus Rechberger
    Signed-off-by: Joerg Roedel
    Signed-off-by: Avi Kivity

commit 7e6e2bbad7f5dbccb389ee6d79be661972b18b15
Author: Avi Kivity
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Cosmetics

    Signed-off-by: Avi Kivity

commit cc66daca849ca8c2900ba8cc7640de664296d36a
Author: Jeremy Katz
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: Move virtualization deactivation from CPU_DEAD state to CPU_DOWN_PREPARE

    This gives it more chances of surviving suspend.

    Signed-off-by: Jeremy Katz
    Signed-off-by: Avi Kivity

commit 2959cd13ecc1fbe1b2339937481844ff963f1e7f
Author: Avi Kivity
Date:   Mon Feb 19 14:37:46 2007 +0200

    KVM: mmu: add missing dirty page tracking cases

    We fail to mark a page dirty in three cases:

    - setting the accessed bit in a pte
    - setting the dirty bit in a pte
    - emulating a write into a pagetable

    This fix adds the missing cases.
Signed-off-by: Avi Kivity .gitignore | 3 drivers/kvm/kvm.h | 33 +++ drivers/kvm/kvm_main.c | 476 ++++++++++++++++++++++++++++++++++++++++++-- drivers/kvm/mmu.c | 20 +- drivers/kvm/paging_tmpl.h | 3 drivers/kvm/svm.c | 58 +++-- drivers/kvm/vmx.c | 111 +++++----- include/linux/Kbuild | 1 include/linux/kvm.h | 113 +++++++--- include/linux/miscdevice.h | 1 10 files changed, 652 insertions(+), 167 deletions(-) diff --git a/.gitignore b/.gitignore index 060a71d..343c716 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,6 @@ series # cscope files cscope.* + +# emacs backup files +*~ diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 0d122bf..8917df8 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -55,6 +55,7 @@ #define KVM_MEMORY_SLOTS 4 #define KVM_NUM_MMU_PAGES 256 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 #define FX_IMAGE_SIZE 512 #define FX_IMAGE_ALIGN 16 @@ -73,6 +74,8 @@ #define SELECTOR_RPL_MASK 0x03 #define IOPL_SHIFT 12 +#define KVM_PIO_PAGE_OFFSET 1 + /* * Address types: * @@ -133,7 +136,6 @@ struct kvm_mmu_page { unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ - int global; /* Set if all ptes in this page are global */ int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ union { @@ -219,6 +221,18 @@ enum { VCPU_SREG_LDTR, }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int size; + int string; + int down; + int rep; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -228,6 +242,7 @@ struct kvm_vcpu { struct mutex mutex; int cpu; int launched; + struct kvm_run *run; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) @@ -273,6 +288,12 @@ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; + gva_t mmio_fault_cr2; + struct kvm_pio_request pio; + void *pio_data; + + int sigset_active; + sigset_t sigset; struct { int active; @@ -284,6 +305,9 @@ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE u32 ar; } tr, es, ds, fs, gs; } rmode; + + int cpuid_nent; + struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; }; struct kvm_memory_slot { @@ -360,8 +384,6 @@ struct kvm_arch_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, - unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); @@ -412,6 +434,7 @@ #define HPA_MSB ((sizeof(hpa_t) * 8) - 1 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); void kvm_emulator_want_group7_invlpg(void); @@ -444,6 +467,10 @@ void realmode_set_cr(struct kvm_vcpu *vc struct x86_emulate_ctxt; +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int 
emulate_clts(struct kvm_vcpu *vcpu); int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index dc7a8c7..77c4176 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -346,6 +346,17 @@ static void kvm_free_physmem(struct kvm kvm_free_physmem_slot(&kvm->memslots[i], NULL); } +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < 2; ++i) + if (vcpu->pio.guest_pages[i]) { + __free_page(vcpu->pio.guest_pages[i]); + vcpu->pio.guest_pages[i] = NULL; + } +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -355,6 +366,11 @@ static void kvm_free_vcpu(struct kvm_vcp kvm_mmu_destroy(vcpu); vcpu_put(vcpu); kvm_arch_ops->vcpu_free(vcpu); + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; + free_page((unsigned long)vcpu->pio_data); + vcpu->pio_data = NULL; + free_pio_guest_pages(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -830,6 +846,61 @@ out: return r; } +/* + * Get the memory map for a memory slot. + * A bit is on iff the page exists. + * To be used when there may be "holes" in the memory. + */ +static int kvm_vm_ioctl_get_memory_map(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + unsigned char *mem_bitmap = NULL; + int r, n; + unsigned long i; + + /* mark busy */ + spin_lock(&kvm->lock); + ++kvm->busy; + spin_unlock(&kvm->lock); + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->phys_mem) + goto out; + + /* allocate a temporary bitmap */ + n = memslot->npages / 8; /* one bit per page */ + r = -ENOMEM; + mem_bitmap = vmalloc(n); + if (!mem_bitmap) + goto out; + memset(mem_bitmap, 0, n); + + /* fill the bitmap */ + for (i = 0; i < memslot->npages; ++i) + if (memslot->phys_mem[i]) + __set_bit(i, mem_bitmap); + + /* copy bitmap to user */ + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, mem_bitmap, n)) + goto out; + + r = 0; +out: + if (mem_bitmap) + vfree(mem_bitmap); + spin_lock(&kvm->lock); + --kvm->busy; + spin_unlock(&kvm->lock); + return r; +} + struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; @@ -1116,6 +1187,7 @@ int emulate_instruction(struct kvm_vcpu int r; int cs_db, cs_l; + vcpu->mmio_fault_cr2 = cr2; kvm_arch_ops->cache_regs(vcpu); kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -1177,7 +1249,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->cache_regs(vcpu); ret = -KVM_EINVAL; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1201,10 +1273,19 @@ #endif } switch (nr) { default: - ; + run->hypercall.args[0] = a0; + run->hypercall.args[1] = a1; + run->hypercall.args[2] = a2; + run->hypercall.args[3] = a3; + run->hypercall.args[4] = a4; + run->hypercall.args[5] = a5; + run->hypercall.ret = ret; + run->hypercall.longmode = is_long_mode(vcpu); + kvm_arch_ops->decache_regs(vcpu); + return 0; } vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_arch_ops->cache_regs(vcpu); + kvm_arch_ops->decache_regs(vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_hypercall); @@ -1502,29 +1583,236 @@ void save_msrs(struct vmx_msr_entry *e, } EXPORT_SYMBOL_GPL(save_msrs); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; + + kvm_arch_ops->cache_regs(vcpu); + function = vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] = 0; + vcpu->regs[VCPU_REGS_RBX] = 0; 
+ vcpu->regs[VCPU_REGS_RCX] = 0; + vcpu->regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == function) { + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + if (best) { + vcpu->regs[VCPU_REGS_RAX] = best->eax; + vcpu->regs[VCPU_REGS_RBX] = best->ebx; + vcpu->regs[VCPU_REGS_RCX] = best->ecx; + vcpu->regs[VCPU_REGS_RDX] = best->edx; + } + kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + +static int pio_copy_data(struct kvm_vcpu *vcpu) +{ + void *p = vcpu->pio_data; + void *q; + unsigned bytes; + int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; + + kvm_arch_ops->vcpu_put(vcpu); + q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + PAGE_KERNEL); + if (!q) { + kvm_arch_ops->vcpu_load(vcpu); + return -ENOMEM; + } + q += vcpu->pio.guest_page_offset; + bytes = vcpu->pio.size * vcpu->pio.cur_count; + if (vcpu->pio.in) + memcpy(q, p, bytes); + else + memcpy(p, q, bytes); + q -= vcpu->pio.guest_page_offset; + vunmap(q); + kvm_arch_ops->vcpu_load(vcpu); + return 0; +} + +static int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->pio; + long delta; + int r; + + kvm_arch_ops->cache_regs(vcpu); + + io->count -= io->cur_count; + if (!io->string) { + if (io->in) + memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, + io->size); + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) { + kvm_arch_ops->cache_regs(vcpu); + return r; + } + } + + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. + */ + vcpu->regs[VCPU_REGS_RCX] -= delta; + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) + vcpu->regs[VCPU_REGS_RDI] += delta; + else + vcpu->regs[VCPU_REGS_RSI] += delta; + } + + vcpu->run->io_completed = 0; + + kvm_arch_ops->decache_regs(vcpu); + + if (!io->count) + kvm_arch_ops->skip_emulated_instruction(vcpu); + return 0; +} + +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port) +{ + unsigned now, in_page; + int i; + int nr_pages = 1; + struct page *page; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + vcpu->pio.count = count; + vcpu->pio.cur_count = count; + vcpu->pio.size = size; + vcpu->pio.in = in; + vcpu->pio.string = string; + vcpu->pio.down = down; + vcpu->pio.guest_page_offset = offset_in_page(address); + vcpu->pio.rep = rep; + + if (!string) { + kvm_arch_ops->cache_regs(vcpu); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); + kvm_arch_ops->decache_regs(vcpu); + return 0; + } + + now = min(count, PAGE_SIZE / size); + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) { + /* + * String I/O straddles page boundary. Pin two guest pages + * so that we satisfy atomicity constraints. Do just one + * transaction to avoid complexity. + */ + nr_pages = 2; + now = 1; + } + if (down) { + /* + * String I/O in reverse. 
Yuck. Kill the guest, fix later. + */ + printk(KERN_ERR "kvm: guest string pio down\n"); + inject_gp(vcpu); + return 1; + } + vcpu->run->io.count = now; + vcpu->pio.cur_count = now; + + for (i = 0; i < nr_pages; ++i) { + spin_lock(&vcpu->kvm->lock); + page = gva_to_page(vcpu, address + i * PAGE_SIZE); + if (page) + get_page(page); + vcpu->pio.guest_pages[i] = page; + spin_unlock(&vcpu->kvm->lock); + if (!page) { + inject_gp(vcpu); + free_pio_guest_pages(vcpu); + return 1; + } + } + + if (!vcpu->pio.in) + return pio_copy_data(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_setup_pio); + static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + sigset_t sigsaved; vcpu_load(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; - if (kvm_run->emulated) { - kvm_arch_ops->skip_emulated_instruction(vcpu); - kvm_run->emulated = 0; - } - - if (kvm_run->mmio_completed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; + if (kvm_run->io_completed) { + if (vcpu->pio.count) { + r = complete_pio(vcpu); + if (r) + goto out; + } else { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + emulate_instruction(vcpu, kvm_run, + vcpu->mmio_fault_cr2, 0); + } } vcpu->mmio_needed = 0; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { + kvm_arch_ops->cache_regs(vcpu); + vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + kvm_arch_ops->decache_regs(vcpu); + } + r = kvm_arch_ops->run(vcpu, kvm_run); +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); return r; } @@ -1697,7 +1985,7 @@ #endif kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; - kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); + kvm_arch_ops->set_cr0(vcpu, sregs->cr0); mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_arch_ops->set_cr4(vcpu, sregs->cr4); @@ -1887,6 +2175,36 @@ static int kvm_vcpu_ioctl_debug_guest(st return r; } +static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (pgoff == 0) + page = virt_to_page(vcpu->run); + else if (pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->pio_data); + else + return NOPAGE_SIGBUS; + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .nopage = kvm_vcpu_nopage, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; @@ -1899,6 +2217,7 @@ static struct file_operations kvm_vcpu_f .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, }; /* @@ -1947,6 +2266,7 @@ static int kvm_vm_ioctl_create_vcpu(stru { int r; struct kvm_vcpu *vcpu; + struct page *page; r = -EINVAL; if (!valid_vcpu(n)) @@ -1961,6 +2281,18 @@ static int kvm_vm_ioctl_create_vcpu(stru return -EEXIST; } + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_unlock; + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_free_run; + 
vcpu->pio_data = page_address(page); + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, FX_IMAGE_ALIGN); vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; @@ -1990,11 +2322,46 @@ static int kvm_vm_ioctl_create_vcpu(stru out_free_vcpus: kvm_free_vcpu(vcpu); +out_free_run: + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; +out_unlock: mutex_unlock(&vcpu->mutex); out: return r; } +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + vcpu->cpuid_nent = cpuid->nent; + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2003,21 +2370,12 @@ static long kvm_vcpu_ioctl(struct file * int r = -EINVAL; switch (ioctl) { - case KVM_RUN: { - struct kvm_run kvm_run; - - r = -EFAULT; - if (copy_from_user(&kvm_run, argp, sizeof kvm_run)) - goto out; - r = kvm_vcpu_ioctl_run(vcpu, &kvm_run); - if (r < 0 && r != -EINTR) - goto out; - if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) { - r = -EFAULT; + case KVM_RUN: + r = -EINVAL; + if (arg) goto out; - } + r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); break; - } case KVM_GET_REGS: { struct kvm_regs kvm_regs; @@ -2113,6 +2471,41 @@ static long kvm_vcpu_ioctl(struct file * case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); break; + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } default: ; } @@ -2155,6 +2548,17 @@ static long kvm_vm_ioctl(struct file *fi goto out; break; } + case KVM_GET_MEM_MAP: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_memory_map(kvm, &log); + if (r) + goto out; + break; + } default: ; } @@ -2248,13 +2652,19 @@ static long kvm_dev_ioctl(struct file *f unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; - int r = -EINVAL; + long r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; r = KVM_API_VERSION; break; case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; r = kvm_dev_ioctl_create_vm(); break; case KVM_GET_MSR_INDEX_LIST: { @@ -2284,6 +2694,18 @@ static long kvm_dev_ioctl(struct file *f r = 0; break; } + case KVM_CHECK_EXTENSION: + /* + * No extensions defined at present. 
+ */ + r = 0; + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = 2 * PAGE_SIZE; + break; default: ; } @@ -2299,7 +2721,7 @@ static struct file_operations kvm_charde }; static struct miscdevice kvm_dev = { - MISC_DYNAMIC_MINOR, + KVM_MINOR, "kvm", &kvm_chardev_ops, }; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e85b4c7..b181106 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -461,7 +461,6 @@ static struct kvm_mmu_page *kvm_mmu_allo list_add(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->page_hpa)); page->slot_bitmap = 0; - page->global = 1; page->multimapped = 0; page->parent_pte = parent_pte; --vcpu->kvm->n_free_mmu_pages; @@ -735,6 +734,15 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, return gpa_to_hpa(vcpu, gpa); } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); +} + static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } @@ -918,11 +926,6 @@ static void paging_new_cr3(struct kvm_vc kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } -static void mark_pagetable_nonglobal(void *shadow_pte) -{ - page_header(__pa(shadow_pte))->global = 0; -} - static inline void set_pte_common(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, @@ -940,9 +943,6 @@ static inline void set_pte_common(struct *shadow_pte |= access_bits; - if (!(*shadow_pte & PT_GLOBAL_MASK)) - mark_pagetable_nonglobal(shadow_pte); - if (is_error_hpa(paddr)) { *shadow_pte |= gaddr; *shadow_pte |= PT_SHADOW_IO_MARK; @@ -1359,7 +1359,7 @@ static void audit_mappings_page(struct k static void audit_mappings(struct kvm_vcpu *vcpu) { - int i; + unsigned i; if (vcpu->mmu.root_level == 4) audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index f3bcee9..17bd440 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -148,8 +148,7 @@ #endif break; } - if (walker->level != 3 || is_long_mode(vcpu)) - walker->inherited_ar &= walker->table[index]; + walker->inherited_ar &= walker->table[index]; table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); kunmap_atomic(walker->table, KM_USER0); diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 3d8ea7a..d3cc115 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -582,6 +582,9 @@ static int svm_create_vcpu(struct kvm_vc init_vmcb(vcpu->svm->vmcb); fx_init(vcpu); + vcpu->apic_base = 0xfee00000 | + /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | + MSR_IA32_APICBASE_ENABLE; return 0; @@ -981,7 +984,7 @@ static int io_get_override(struct kvm_vc return 0; } -static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) +static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address) { unsigned long addr_mask; unsigned long *reg; @@ -1025,38 +1028,38 @@ static unsigned long io_adress(struct kv static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? - int _in = io_info & SVM_IOIO_TYPE_MASK; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address = 0; ++kvm_stat.io_exits; vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; - kvm_run->exit_reason = KVM_EXIT_IO; - kvm_run->io.port = io_info >> 16; - kvm_run->io.direction = (_in) ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); - kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; - kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + port = io_info >> 16; + size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + string = (io_info & SVM_IOIO_STR_MASK) != 0; + rep = (io_info & SVM_IOIO_REP_MASK) != 0; + count = 1; + down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; - if (kvm_run->io.string) { + if (string) { unsigned addr_mask; - addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); + addr_mask = io_adress(vcpu, in, &address); if (!addr_mask) { printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); return 1; } - if (kvm_run->io.rep) { - kvm_run->io.count - = vcpu->regs[VCPU_REGS_RCX] & addr_mask; - kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags - & X86_EFLAGS_DF) != 0; - } - } else - kvm_run->io.value = vcpu->svm->vmcb->save.rax; - return 0; + if (rep) + count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1078,7 +1081,8 @@ static int halt_interception(struct kvm_ static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vcpu->svm->vmcb->save.rip += 3; + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } @@ -1098,8 +1102,8 @@ static int task_switch_interception(stru static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1295,8 +1299,6 @@ static int handle_exit(struct kvm_vcpu * { u32 exit_code = vcpu->svm->vmcb->control.exit_code; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; - if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " @@ -1606,8 +1608,9 @@ #endif vcpu->svm->next_rip = 0; if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vcpu->svm->vmcb->control.exit_code; post_kvm_run_save(vcpu, kvm_run); return 0; } @@ -1617,12 +1620,14 @@ #endif if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } kvm_resched(vcpu); @@ -1711,7 +1716,6 @@ static struct kvm_arch_ops svm_arch_ops .get_cs_db_l_bits = svm_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, .set_cr0 = svm_set_cr0, - .set_cr0_no_modeswitch = svm_set_cr0, .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index bfa0ce4..027a962 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -618,7 +618,7 @@ static void fix_pmode_dataseg(int seg, s { 
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base) { + if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { vmcs_write16(sf->selector, save->selector); vmcs_writel(sf->base, save->base); vmcs_write32(sf->limit, save->limit); @@ -712,6 +712,8 @@ static void enter_rmode(struct kvm_vcpu vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); @@ -786,22 +788,6 @@ #endif vcpu->cr0 = cr0; } -/* - * Used when restoring the VM to avoid corrupting segment registers - */ -static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) - enter_rmode(vcpu); - - vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); - update_exception_bitmap(vcpu); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->cr0 = cr0; -} - static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); @@ -1394,7 +1380,7 @@ static int handle_triple_fault(struct kv return 0; } -static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) { u64 inst; gva_t rip; @@ -1439,33 +1425,35 @@ static int get_io_count(struct kvm_vcpu done: countr_size *= 8; *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); return 1; } static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address; ++kvm_stat.io_exits; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - kvm_run->exit_reason = KVM_EXIT_IO; - if (exit_qualification & 8) - kvm_run->io.direction = KVM_EXIT_IO_IN; - else - kvm_run->io.direction = KVM_EXIT_IO_OUT; - kvm_run->io.size = (exit_qualification & 7) + 1; - kvm_run->io.string = (exit_qualification & 16) != 0; - kvm_run->io.string_down - = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - kvm_run->io.rep = (exit_qualification & 32) != 0; - kvm_run->io.port = exit_qualification >> 16; - if (kvm_run->io.string) { - if (!get_io_count(vcpu, &kvm_run->io.count)) + in = (exit_qualification & 8) != 0; + size = (exit_qualification & 7) + 1; + string = (exit_qualification & 16) != 0; + down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + count = 1; + rep = (exit_qualification & 32) != 0; + port = exit_qualification >> 16; + address = 0; + if (string) { + if (rep && !get_io_count(vcpu, &count)) return 1; - kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); - } else - kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ - return 0; + address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static void @@ -1583,8 +1571,8 @@ static int handle_dr(struct kvm_vcpu *vc static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1658,7 +1646,7 @@ static int handle_halt(struct kvm_vcpu * static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - 
vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } @@ -1888,6 +1876,27 @@ #endif [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + /* + * Reload segment selectors ASAP. (it's needed for a functional + * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 + * relies on having 0 in %gs for the CPU PDA to work.) + */ + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } ++kvm_stat.exits; save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); @@ -1899,28 +1908,12 @@ #endif asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - kvm_run->exit_type = 0; if (fail) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } /* * Profile KVM exit RIPs: */ @@ -1928,19 +1921,20 @@ #endif profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); vcpu->launched = 1; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { /* Give scheduler a change to reschedule. 
*/ if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } @@ -2059,7 +2053,6 @@ static struct kvm_arch_ops vmx_arch_ops .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, .set_cr0 = vmx_set_cr0, - .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, #ifdef CONFIG_X86_64 diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e81e301..b35b593 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -99,6 +99,7 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h +header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 275354f..e7e2fa8 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #define __LINUX_KVM_H #include #include -#define KVM_API_VERSION 4 +#define KVM_API_VERSION 9 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -34,36 +34,33 @@ struct kvm_memory_region { #define KVM_MEM_LOG_DIRTY_PAGES 1UL -#define KVM_EXIT_TYPE_FAIL_ENTRY 1 -#define KVM_EXIT_TYPE_VM_EXIT 2 - enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, KVM_EXIT_EXCEPTION = 1, KVM_EXIT_IO = 2, - KVM_EXIT_CPUID = 3, + KVM_EXIT_HYPERCALL = 3, KVM_EXIT_DEBUG = 4, KVM_EXIT_HLT = 5, KVM_EXIT_MMIO = 6, KVM_EXIT_IRQ_WINDOW_OPEN = 7, KVM_EXIT_SHUTDOWN = 8, + KVM_EXIT_FAIL_ENTRY = 9, + KVM_EXIT_INTR = 10, }; -/* for KVM_RUN */ +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ - __u32 emulated; /* skip current instruction */ - __u32 mmio_completed; /* mmio request completed */ + __u32 io_completed; /* mmio/pio request completed */ __u8 request_interrupt_window; - __u8 padding1[7]; + __u8 padding1[3]; /* out */ - __u32 exit_type; __u32 exit_reason; __u32 instruction_length; __u8 ready_for_interrupt_injection; __u8 if_flag; - __u16 padding2; + __u8 padding2[6]; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8; @@ -72,29 +69,26 @@ struct kvm_run { union { /* KVM_EXIT_UNKNOWN */ struct { - __u32 hardware_exit_reason; + __u64 hardware_exit_reason; } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { __u32 exception; __u32 error_code; } ex; /* KVM_EXIT_IO */ - struct { + struct kvm_io { #define KVM_EXIT_IO_IN 0 #define KVM_EXIT_IO_OUT 1 __u8 direction; __u8 size; /* bytes */ - __u8 string; - __u8 string_down; - __u8 rep; - __u8 pad; __u16 port; - __u64 count; - union { - __u64 address; - __u32 value; - }; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ } io; struct { } debug; @@ -105,6 +99,13 @@ #define KVM_EXIT_IO_OUT 1 __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; }; }; @@ -200,7 +201,7 @@ struct kvm_debug_guest { __u32 singlestep; }; -/* for KVM_GET_DIRTY_LOG */ +/* for KVM_GET_DIRTY_LOG and KVM_GET_MEM_MAP */ struct kvm_dirty_log { __u32 slot; __u32 padding; @@ -210,38 +211,72 @@ struct kvm_dirty_log { }; }; +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ 
+struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + #define KVMIO 0xAE /* * ioctls for /dev/kvm fds: */ -#define KVM_GET_API_VERSION _IO(KVMIO, 1) -#define KVM_CREATE_VM _IO(KVMIO, 2) /* returns a VM fd */ -#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ /* * ioctls for VM fds */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. */ -#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int) -#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +#define KVM_GET_MEM_MAP _IOW(KVMIO, 0x43, struct kvm_dirty_log) /* * ioctls for vcpu fds */ -#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) -#define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) -#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) -#define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) -#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) -#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) -#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) -#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) -#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) -#define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) #endif diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 326da7d..dff9ea3 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -29,6 +29,7 @@ #define MISC_DYNAMIC_MINOR 255 #define TUN_MINOR 200 #define HPET_MINOR 228 +#define KVM_MINOR 232 struct device;
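Tying the header above back to the 'Handle cpuid in the kernel' commit:
userspace is expected to load a cpuid table with KVM_SET_CPUID before the
first KVM_RUN.  A sketch with a single illustrative entry (the vendor string
values are examples only):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_one_cpuid_entry(int vcpu_fd)
{
        struct kvm_cpuid *c;
        int r;

        c = calloc(1, sizeof(*c) + sizeof(struct kvm_cpuid_entry));
        c->nent = 1;
        c->entries[0].function = 0;     /* leaf 0: max basic leaf + vendor id */
        c->entries[0].eax = 1;          /* advertise one basic leaf */
        memcpy(&c->entries[0].ebx, "Genu", 4);  /* "GenuineIntel" pieces */
        memcpy(&c->entries[0].edx, "ineI", 4);
        memcpy(&c->entries[0].ecx, "ntel", 4);
        r = ioctl(vcpu_fd, KVM_SET_CPUID, c);
        free(c);
        return r;
}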