GIT 0ea6eecef44923d66409a49d71e4fa87fa0f5bed git://kvm.qumranet.com/home/avi/kvm.git commit Author: Avi Kivity Date: Sun Apr 1 16:34:31 2007 +0300 KVM: Add fpu get/set operations These are really helpful when migrating an floating point app to another machine. Signed-off-by: Avi Kivity commit 05671a064c73b8cb8966ddd037ece2d6ae2cb75b Author: Avi Kivity Date: Fri Mar 30 16:54:30 2007 +0300 KVM: Add physical memory aliasing feature With this, we can specify that accesses to one physical memory range will be remapped to another. This is useful for the vga window at 0xa0000 which is used as a movable window into the (much larger) framebuffer. Signed-off-by: Avi Kivity commit 8e08039818b6a5b8c81b905f863adaa18d774171 Author: Avi Kivity Date: Fri Mar 30 14:02:32 2007 +0300 KVM: Simply gfn_to_page() Mapping a guest page to a host page is a common operation. Currently, one has first to find the memory slot where the page belongs (gfn_to_memslot), then locate the page itself (gfn_to_page()). This is clumsy, and also won't work well with memory aliases. So simplify gfn_to_page() not to require memory slot translation first, and instead do it internally. Signed-off-by: Avi Kivity commit 66a9932c55ff7240955d57b7d1e62178a9e80868 Author: Dor Laor Date: Fri Mar 30 13:06:33 2007 +0300 Add mmu cache clear function Functions that play around with the physical memory map need a way to clear mappings to possibly nonexistent or invalid memory. Both the mmu cache and the processor tlb are cleared. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 6095d7b8291fc3e05f3b8790a9bc86b54af281a2 Author: Joerg Roedel Date: Fri Mar 30 17:02:14 2007 +0300 KVM: SVM: enable LBRV virtualization if available This patch enables the virtualization of the last branch record MSRs on SVM if this feature is available in hardware. It also introduces a small and simple check feature for specific SVM extensions. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 8f1469e8477bea483d5a6348a30a534449048c8d Author: Avi Kivity Date: Wed Mar 28 20:04:16 2007 +0200 KVM: x86 emulator: fix bit string operations operand size On x86, bit operations operate on a string of bits that can reside in multiple words. For example, 'btsl %eax, (blah)' will touch the word at blah+4 if %eax is between 32 and 63. The x86 emulator compensates for that by advancing the operand address by (bit offset / BITS_PER_LONG) and truncating the bit offset to the range (0..BITS_PER_LONG-1). This has a side effect of forcing the operand size to 8 bytes on 64-bit hosts. Now, a 32-bit guest goes and fork()s a process. It write protects a stack page at 0xbffff000 using the 'btr' instruction, at offset 0xffc in the page table, with bit offset 1 (for the write permission bit). The emulator now forces the operand size to 8 bytes as previously described, and an innocent page table update turns into a cross-page-boundary write, which is assumed by the mmu code not to be a page table, so it doesn't actually clear the corresponding shadow page table entry. The guest and host permissions are out of sync and guest memory is corrupted soon afterwards, leading to guest failure. Fix by not using BITS_PER_LONG as the word size; instead use the actual operand size, so we get a 32-bit write in that case. Note we still have to teach the mmu to handle cross-page-boundary writes to guest page table; but for now this allows Damn Small Linux 0.4 (2.4.20) to boot. Signed-off-by: Avi Kivity commit e3a065c4e99bb8282d72a2c3c75234d7d7408be6 Author: Avi Kivity Date: Tue Mar 27 17:50:20 2007 +0200 KVM: Remove debug message No longer interesting. Signed-off-by: Avi Kivity commit 19cd40d605bb99fc9058973a69ef208c8b5b1e42 Author: Avi Kivity Date: Tue Mar 27 16:12:41 2007 +0200 Revert "added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot" This reverts commit ade11a015f83d270d1201c440199146f852fe5e4. As the balloon path will be through qemu, it will have direct knowledge of released gfns, so this API is not directly needed. If it becomes useful in the future, it will be un-reverted. Signed-off-by: Avi Kivity commit 932bf20c0c2075f958bb86b481d8f359197b4d6a Author: Avi Kivity Date: Mon Mar 26 19:31:52 2007 +0200 KVM: Use list_move() Use list_move() where possible. Noticed by Dor Laor. Signed-off-by: Avi Kivity commit 31e82571e8a77d5feb1093627ef0b31f28649590 Author: Michal Piotrowski Date: Sun Mar 25 17:59:32 2007 +0200 KVM: Remove unused function Remove unused function CC drivers/kvm/svm.o drivers/kvm/svm.c:207: warning: ‘inject_db’ defined but not used Signed-off-by: Michal Piotrowski Signed-off-by: Avi Kivity commit 9207113c121519986a114ee5c498184e618ffd68 Author: Avi Kivity Date: Sun Mar 25 12:07:27 2007 +0200 KVM: SVM: Ensure timestamp counter monotonicity When a vcpu is migrated from one cpu to another, its timestamp counter may lose its monotonic property if the host has unsynced timestamp counters. This can confuse the guest, sometimes to the point of refusing to boot. As the rdtsc instruction is rather fast on AMD processors (7-10 cycles), we can simply record the last host tsc when we drop the cpu, and adjust the vcpu tsc offset when we detect that we've migrated to a different cpu. Signed-off-by: Avi Kivity commit b40faf227eb371a52aa21d08f8e9c33fc06602b4 Author: Avi Kivity Date: Fri Mar 23 09:55:25 2007 +0200 KVM: MMU: Fix hugepage pdes mapping same physical address with different access The kvm mmu keeps a shadow page for hugepage pdes; if several such pdes map the same physical address, they share the same shadow page. This is a fairly common case (kernel mappings on i386 nonpae Linux, for example). However, if the two pdes map the same memory but with different permissions, kvm will happily use the cached shadow page. If the access through the more permissive pde will occur after the access to the strict pde, an endless pagefault loop will be generated and the guest will make no progress. Fix by making the access permissions part of the cache lookup key. The fix allows Xen pae to boot on kvm and run guest domains. Thanks to Jeremy Fitzhardinge for reporting the bug and testing the fix. Signed-off-by: Avi Kivity commit 061bba1190514205594d2046f5dc31a01a135163 Author: Avi Kivity Date: Thu Mar 22 15:10:32 2007 +0200 Revert "KVM: Remove extraneous guest entry on mmio read" This reverts commit b0092d187cfa19dfcada3b85d728af5ae27989dc. While the optimization is sound, it regresses booting the Fedora Core 6 32 bit kernel. Signed-off-by: Avi Kivity commit 4cec1674d1436157c7dcc2b5b6f625b08b2b96e8 Author: Joerg Roedel Date: Wed Mar 21 19:47:00 2007 +0100 KVM: SVM: forbid guest to execute monitor/mwait This patch forbids the guest to execute monitor/mwait instructions on SVM. This is necessary because the guest can execute these instructions if they are available even if the kvm cpuid doesn't report its existence. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 7921ad9e303f3f03dd81b552e3b0cd87ef355219 Author: Sergey Kiselev Date: Thu Mar 22 14:06:18 2007 +0200 KVM: Handle writes to MCG_STATUS msr Some older (~2.6.7) kernels write MCG_STATUS register during kernel boot (mce_clear_all() function, called from mce_init()). It's not currently handled by kvm and will cause it to inject a GPF. Following patch adds a "nop" handler for this. Signed-off-by: Sergey Kiselev Signed-off-by: Avi Kivity commit 36809e1326c13887d324025d4592958ead8758d5 Author: Avi Kivity Date: Wed Mar 21 18:14:42 2007 +0200 KVM: Remove unused and write-only variables Trivial cleanup. Signed-off-by: Avi Kivity commit 262e17b818054dad314a062a439681d79a336d48 Author: Avi Kivity Date: Wed Mar 21 18:11:36 2007 +0200 KVM: Don't allow the guest to turn off the cpu cache The cpu cache is a host resource; the guest should not be able to turn it off (even for itself). Signed-off-by: Avi Kivity commit 8c37a70d93ba3e4286ad7524f7915a32ed39cac9 Author: Avi Kivity Date: Wed Mar 21 17:58:32 2007 +0200 KVM: Hack real-mode segments on vmx from KVM_SET_SREGS As usual, we need to mangle segment registers when emulating real mode as vm86 has specific constraints. We special case the reset segment base, and set the "access rights" (or descriptor flags) to vm86 comaptible values. This fixes reboot on vmx. Signed-off-by: Avi Kivity commit 0bf8d346418255335dc9062d96b9f8814b471690 Author: Avi Kivity Date: Wed Mar 21 13:44:58 2007 +0200 KVM: Modify guest segments after potentially switching modes The SET_SREGS ioctl modifies both cr0.pe (real mode/protected mode) and guest segment registers. Since segment handling is modified by the mode on Intel procesors, update the segment registers after the mode switch has taken place. Signed-off-by: Avi Kivity commit f97af70b3aa8a92ddeabb7d42477e7d13dd0a192 Author: Avi Kivity Date: Tue Mar 20 18:44:51 2007 +0200 KVM: Remove set_cr0_no_modeswitch() arch op set_cr0_no_modeswitch() was a hack to avoid corrupting segment registers. As we now cache the protected mode values on entry to real mode, this isn't an issue anymore, and it interferes with reboot (which usually _is_ a modeswitch). Signed-off-by: Avi Kivity commit e314dde30e3851e8effc017c6fffced11d90183a Author: Avi Kivity Date: Tue Mar 20 18:40:40 2007 +0200 KVM: Workaround vmx inability to virtualize the reset state The reset state has cs.selector == 0xf000 and cs.base == 0xffff0000, which aren't compatible with vm86 mode, which is used for real mode virtualization. When we create a vcpu, we set cs.base to 0xf0000, but if we get there by way of a reset, the values are inconsistent and vmx refuses to enter guest mode. Workaround by detecting the state and munging it appropriately. Signed-off-by: Avi Kivity commit 88aea7ddfae755633b0a80ccfa56244b3c79c7b0 Author: Avi Kivity Date: Tue Mar 20 14:34:28 2007 +0200 KVM: MMU: Remove global pte tracking The initial, noncaching, version of the kvm mmu flushed the all nonglobal shadow page table translations (much like a native tlb flush). The new implementation flushes translations only when they change, rendering global pte tracking superfluous. This removes the unused tracking mechanism and storage space. Signed-off-by: Avi Kivity commit 66e5d5c81b5b89e39aa86e3bf9864d228f468b0d Author: Avi Kivity Date: Tue Mar 20 14:29:06 2007 +0200 KVM: MMU: Remove unnecessary check for pdptr access We already special case the pdptr access, so no need to check it again. Signed-off-by: Avi Kivity commit c01571ed56754dfea458cc37d553c360082411a1 Author: Avi Kivity Date: Tue Mar 20 12:46:50 2007 +0200 KVM: Avoid guest virtual addresses in string pio userspace interface The current string pio interface communicates using guest virtual addresses, relying on userspace to translate addresses and to check permissions. This interface cannot fully support guest smp, as the check needs to take into account two pages at one in case an unaligned string transfer straddles a page boundary. Change the interface not to communicate guest addresses at all; instead use a buffer page (mmaped by userspace) and do transfers there. The kernel manages the virtual to physical translation and can perform the checks atomically by taking the appropriate locks. Signed-off-by: Avi Kivity commit 74c24de6e7848a45d6109d987d4fd2ccd83e432e Author: Avi Kivity Date: Wed Mar 7 13:11:17 2007 +0200 KVM: Future-proof argument-less ioctls Some ioctls ignore their arguments. By requiring them to be zero now, we allow a nonzero value to have some special meaning in the future. Signed-off-by: Avi Kivity commit 29e686a1dc9631b7898d087a0ab1c4716672e209 Author: Avi Kivity Date: Wed Mar 7 13:05:38 2007 +0200 KVM: Allow kernel to select size of mmap() buffer This allows us to store offsets in the kernel/user kvm_run area, and be sure that userspace has them mapped. As offsets can be outside the kvm_run struct, userspace has no way of knowing how much to mmap. Signed-off-by: Avi Kivity commit cce3a1062817218c67163732339e2ea25e9f023b Author: Avi Kivity Date: Mon Mar 5 19:46:05 2007 +0200 KVM: Add guest mode signal mask Allow a special signal mask to be used while executing in guest mode. This allows signals to be used to interrupt a vcpu without requiring signal delivery to a userspace handler, which is quite expensive. Userspace still receives -EINTR and can get the signal via sigwait(). Signed-off-by: Avi Kivity commit cd3aaa2392baec9674792d71d304ec41e540b517 Author: Avi Kivity Date: Mon Mar 5 17:45:40 2007 +0200 KVM: Initialize the apic_base msr on svm too Older userspace didn't care, but newer userspace (with the cpuid changes) does. Signed-off-by: Avi Kivity commit c303c0efc5b2ff8c0f77c9079fa66f62801da93d Author: Avi Kivity Date: Sun Mar 4 14:24:03 2007 +0200 KVM: Add a special exit reason when exiting due to an interrupt This is redundant, as we also return -EINTR from the ioctl, but it allows us to examine the exit_reason field on resume without seeing old data. Signed-off-by: Avi Kivity commit 62919332e00e3226dd1f728ff83107d06a6d9a81 Author: Avi Kivity Date: Sun Mar 4 14:17:08 2007 +0200 KVM: Fold kvm_run::exit_type into kvm_run::exit_reason Currently, userspace is told about the nature of the last exit from the guest using two fields, exit_type and exit_reason, where exit_type has just two enumerations (and no need for more). So fold exit_type into exit_reason, reducing the complexity of determining what really happened. Signed-off-by: Avi Kivity commit 9e16898f4f5d6cdc35030bb272631611b71548fe Author: Avi Kivity Date: Sun Mar 4 13:59:30 2007 +0200 KVM: Allow userspace to process hypercalls which have no kernel handler This is useful for paravirtualized graphics devices, for example. Signed-off-by: Avi Kivity commit 440fd9098bceb2ca0856d962ff62db9af4d1094a Author: Avi Kivity Date: Thu Mar 1 17:56:20 2007 +0200 KVM: Add method to check for backwards-compatible API extensions Signed-off-by: Avi Kivity commit 0b37dedb178bcb3b0a28f65e6ae835bf58184301 Author: Avi Kivity Date: Thu Mar 1 17:20:13 2007 +0200 KVM: Renumber ioctls The recent changes have left the ioctl numbers in complete disarray. Signed-off-by: Avi Kivity commit 95cab16b18e1c1a786a9fc5ea6fcd68b29ae3481 Author: Avi Kivity Date: Thu Mar 1 16:47:06 2007 +0200 KVM: Remove minor wart from KVM_CREATE_VCPU ioctl That ioctl does not transfer any data, so it should be an _IO rather than an _IOW. Signed-off-by: Avi Kivity commit ba5cb15b027b76ba7b4d247914eb6d20065c0767 Author: Avi Kivity Date: Thu Mar 1 16:20:40 2007 +0200 KVM: Remove the 'emulated' field from the userspace interface We no longer emulate single instructions in userspace. Instead, we service mmio or pio requests. Signed-off-by: Avi Kivity commit 706e8fe655be36aa686f1fbb398d3a4470d4939b Author: Avi Kivity Date: Wed Feb 28 20:46:53 2007 +0200 KVM: Handle cpuid in the kernel instead of punting to userspace KVM used to handle cpuid by letting userspace decide what values to return to the guest. We now handle cpuid completely in the kernel. We still let userspace decide which values the guest will see by having userspace set up the value table beforehand (this is necessary to allow management software to set the cpu features to the least common denominator, so that live migration can work). The motivation for the change is that kvm kernel code can be impacted by cpuid features, for example the x86 emulator. Signed-off-by: Avi Kivity commit aad2f6e0faf4b03e087bbe6751acdacd72e911b6 Author: Avi Kivity Date: Thu Feb 22 19:48:43 2007 +0200 KVM: Initialize PIO I/O count This allows userspace to ignore the io.rep field. No a big deal, but friendly. Signed-off-by: Avi Kivity commit e668cf946ee8654c7f5afe3feeed686a3566c22a Author: Avi Kivity Date: Thu Feb 22 19:39:30 2007 +0200 KVM: Do not communicate to userspace through cpu registers during PIO Currently when passing the a PIO emulation request to userspace, we rely on userspace updating %rax (on 'in' instructions) and %rsi/%rdi/%rcx (on string instructions). This (a) requires two extra ioctls for getting and setting the registers and (b) is unfriendly to non-x86 archs, when they get kvm ports. So fix by doing the register fixups in the kernel and passing to userspace only an abstract description of the PIO to be done. Signed-off-by: Avi Kivity commit 3de857cd1335bd2e02b60d3a50b7da93ccbabf1d Author: Avi Kivity Date: Thu Feb 22 12:58:31 2007 +0200 KVM: Use a shared page for kernel/user communication when runing a vcpu Instead of passing a 'struct kvm_run' back and forth between the kernel and userspace, allocate a page and allow the user to mmap() it. This reduces needless copying and makes the interface expandable by providing lots of free space. Signed-off-by: Avi Kivity commit 128e159e11e999496ec44a549fcac91de3802389 Author: Avi Kivity Date: Mon Mar 19 13:18:10 2007 +0200 KVM: Prevent system selectors leaking into guest on real->protected mode transition on vmx Intel virtualization extensions do not support virtualizing real mode. So kvm uses virtualized vm86 mode to run real mode code. Unfortunately, this virtualized vm86 mode does not support the so called "big real" mode, where the segment selector and base do not agree with each other according to the real mode rules (base == selector << 4). To work around this, kvm checks whether a selector/base pair violates the virtualized vm86 rules, and if so, forces it into conformance. On a transition back to protected mode, if we see that the guest did not touch a forced segment, we restore it back to the original protected mode value. This pile of hacks breaks down if the gdt has changed in real mode, as it can cause a segment selector to point to a system descriptor instead of a normal data segment. In fact, this happens with the Windows bootloader and the qemu acpi bios, where a protected mode memcpy routine issues an innocent 'pop %es' and traps on an attempt to load a system descriptor. "Fix" by checking if the to-be-restored selector points at a system segment, and if so, coercing it into a normal data segment. The long term solution, of course, is to abandon vm86 mode and use emulation for big real mode. Signed-off-by: Avi Kivity commit ade11a015f83d270d1201c440199146f852fe5e4 Author: Uri Lublin Date: Wed Mar 14 19:21:06 2007 +0200 added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot To be used when there may be "holes" in the memory. Specifically to not break VM migration when ballooning mechanism exists Signed-off-by: Uri Lublin commit b0092d187cfa19dfcada3b85d728af5ae27989dc Author: Avi Kivity Date: Wed Mar 14 15:54:54 2007 +0200 KVM: Remove extraneous guest entry on mmio read When emulating an mmio read, we actually emulate twice: once to determine the physical address of the mmio, and, after we've exited to userspace to get the mmio value, we emulate again to place the value in the result register and update any flags. But we don't really need to enter the guest again for that, only to take an immediate vmexit. So, if we detect that we're doing an mmio read, emulate a single instruction before entering the guest again. Signed-off-by: Avi Kivity commit 470db88b8b3491199e8d55b771d66e74b2fd53cd Author: Ingo Molnar Date: Sun Mar 11 13:52:33 2007 +0100 KVM: always reload segment selectors failed VM entry on VMX might still change %fs or %gs, thus make sure that KVM always reloads the segment selectors. This is crutial on both x86 and x86_64: x86 has __KERNEL_PDA in %fs on which things like 'current' depends and x86_64 has 0 there and needs MSR_GS_BASE to work. Signed-off-by: Ingo Molnar commit f7edc6a39584a3f95687a5320675fadb23bccbe5 Author: Ingo Molnar Date: Sat Mar 10 11:22:51 2007 +0100 KVM: trivial whitespace fixes trivial whitespace fixes. Signed-off-by: Ingo Molnar commit f3a33bfeaa5cade1a9ac1facb5cb904a483b1e5c Author: Avi Kivity Date: Fri Mar 9 13:04:31 2007 +0200 KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram PAGE_MASK is an unsigned long, so using it to mask physical addresses on i386 (which are 64-bit wide) leads to truncation. This can result in page->private of unrelated memory pages being modified, with disasterous results. Fix by not using PAGE_MASK for physical addresses; instead calculate the correct value directly from PAGE_SIZE. Also fix a similar BUG_ON(). Signed-off-by: Avi Kivity commit 6ee9853b015f8807f497ffad39b142ddc1403aa9 Author: Avi Kivity Date: Thu Mar 8 17:13:32 2007 +0200 KVM: MMU: Fix guest writes to nonpae pde KVM shadow page tables are always in pae mode, regardless of the guest setting. This means that a guest pde (mapping 4MB of memory) is mapped to two shadow pdes (mapping 2MB each). When the guest writes to a pte or pde, we intercept the write and emulate it. We also remove any shadowed mappings corresponding to the write. Since the mmu did not account for the doubling in the number of pdes, it removed the wrong entry, resulting in a mismatch between shadow page tables and guest page tables, followed shortly by guest memory corruption. This patch fixes the problem by detecting the special case of writing to a non-pae pde and adjusting the address and number of shadow pdes zapped accordingly. Signed-off-by: Avi Kivity commit 374c1509c7d04a4e351b1812c2f0b9dac3ea0c0a Author: Avi Kivity Date: Thu Mar 8 11:48:09 2007 +0200 KVM: Fix bogus sign extension in mmu mapping audit When auditing a 32-bit guest on a 64-bit host, sign extension of the page table directory pointer table index caused bogus addresses to be shown on audit errors. Fix by declaring the index unsigned. Signed-off-by: Avi Kivity commit fac539542cbf923a39238b10557c88f99fd45b59 Author: Avi Kivity Date: Wed Mar 7 09:29:48 2007 +0200 KVM: Export This allows users to actually build prgrams that use kvm without the entire source tree. Signed-off-by: Avi Kivity commit c14a46343cc9f04f15ebc67573031fe8bbe1555a Author: Avi Kivity Date: Tue Mar 6 12:05:53 2007 +0200 KVM: Fix guest sysenter on vmx The vmx code currently treats the guest's sysenter support msrs as 32-bit values, which breaks 32-bit compat mode userspace on 64-bit guests. Fix by using the native word width of the machine. Signed-off-by: Avi Kivity commit ea135e7671189ffb7e67843bf98740dac0c6ccfa Author: Avi Kivity Date: Sun Mar 4 13:27:36 2007 +0200 KVM: Use own minor number Use the minor number (232) allocated to kvm by lanana. Signed-off-by: Avi Kivity commit 21af17507f37658414191b1cf1337efbaf7dd530 Author: Dor Laor Date: Mon Feb 19 18:25:43 2007 +0200 KVM: Use the generic skip_emulated_instruction() in hypercall code Instead of twiddling the rip registers directly, use the skip_emulated_instruction() function to do that for us. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 57d78025d84fb607aa335d015a79b257517aa209 Author: Dor Laor Date: Mon Feb 19 16:44:49 2007 +0200 KVM: Fix guest register corruption on paravirt hypercall The hypercall code mixes up the ->cache_regs() and ->decache_regs() callbacks, resulting in guest register corruption. Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 28e9803c9134683a884efe05abdb3f814c1ca7e7 Author: Avi Kivity Date: Thu Mar 1 19:21:03 2007 +0200 KVM: Unset kvm_arch_ops if arch module loading failed Otherwise, the core module thinks the arch module is loaded, and won't let you reload it after you've fixed the bug. Signed-off-by: Avi Kivity commit 426bc2fd1462706ec92d0e9efdb0cf3643f4eb67 Author: Avi Kivity Date: Thu Mar 1 11:28:13 2007 +0200 KVM: Move kvmfs magic number to From: Andrew Morton Use the standard magic.h for kvmfs. Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity commit c1a8557e1da6e7d8bf8f77cb1b47c077f5c2a67d Author: Avi Kivity Date: Mon Feb 26 16:29:43 2007 +0200 KVM: Fix bogus failure in kvm.ko module initialization A bogus 'return r' can cause an otherwise successful module load to fail. This both denies users the use of kvm, and it also denies them the use of their machine, as it leaves a filesystem registered with its callbacks pointing into now-freed module memory. Fix by returning a zero like a good module. Thanks to Richard Lucassen (?) for reporting the problem and for providing access to a machine which exhibited it. Signed-off-by: Avi Kivity commit 7703ff91ee2ed171f2175d030e7f063c4efab2f5 Author: Uri Lublin Date: Thu Feb 22 17:37:32 2007 +0200 KVM: Remove write access permissions when dirty-page-logging is enabled Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl. If the memory region already exists, we need to remove write accesses, so writes will be caught, and dirty pages will be logged. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit b77fd1f62576463434fc434cbdcd808847e169a1 Author: Uri Lublin Date: Thu Feb 22 17:15:33 2007 +0200 kvm: move do_remove_write_access() up To be called from kvm_vm_ioctl_set_memory_region() Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 62e287e7210d6ff142b3b05233fa1f5df686b794 Author: Uri Lublin Date: Thu Feb 22 16:43:09 2007 +0200 KVM: Fix dirty page log bitmap size/access calculation Since dirty_bitmap is an unsigned long array, the alignment and size need to take that into account. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 871574eb14e959c19d94fdee7c3e2b88ae06770f Author: Uri Lublin Date: Wed Feb 21 18:25:21 2007 +0200 KVM: Add missing calls to mark_page_dirty() A few places where we modify guest memory fail to call mark_page_dirty(), causing live migration to fail. This adds the missing calls. Signed-off-by: Uri Lublin Signed-off-by: Avi Kivity commit 42017e8bf8eb7b6f65b95bca1368ee274fc5ef50 Author: Uri Lublin Date: Thu Feb 22 17:37:32 2007 +0200 kvm: dirty page logging: remove write access permissions when dirty-page-logging is enabled Enabling dirty page logging is done using KVM_SET_MEMORY_REGION ioctl. If the memory region already exists, there is a need to remove write accesses, so writes will be caught, and dirty pages will be logged. commit a9fd29cfcb643b97cd76c7d836be4d0ed80f69e0 Author: Uri Lublin Date: Thu Feb 22 17:15:33 2007 +0200 kvm: move do_remove_write_access() up To be called from kvm_vm_ioctl_set_memory_region() commit fba4ba9c513ad2cd328f5f16980aa7b90d40cec0 Author: Uri Lublin Date: Thu Feb 22 16:43:09 2007 +0200 kvm: dirty pages log: fix bitmap size/access calculation Since dirty_bitmap is an unsigned long array (pointer) commit ae160d732685ab33d5a3a495663aa2b54c4d4734 Author: Uri Lublin Date: Thu Feb 22 15:47:42 2007 +0200 .gitignore: ignore emacs backup files (*~) commit 8267c1cd9a8a038e91c94e0cabc571a3614dc3e5 Author: Avi Kivity Date: Wed Feb 21 19:47:40 2007 +0200 KVM: Bump API version Signed-off-by: Avi Kivity commit c65237e78c19b8173338a49933c611dece13c1c6 Author: Avi Kivity Date: Wed Feb 21 18:04:26 2007 +0200 KVM: Per-vcpu inodes Allocate a distinct inode for every vcpu in a VM. This has the following benefits: - the filp cachelines are no longer bounced when f_count is incremented on every ioctl() - the API and internal code are distinctly clearer; for example, on the KVM_GET_REGS ioctl, there is no need to copy the vcpu number from userspace and then copy the registers back; the vcpu identity is derived from the fd used to make the call Right now the performance benefits are completely theoretical since (a) we don't support more than one vcpu per VM and (b) virtualization hardware inefficiencies completely everwhelm any cacheline bouncing effects. But both of these will change, and we need to prepare the API today. Signed-off-by: Avi Kivity commit 11c1297fadc533d1f66252088b4f4775018bafbb Author: Avi Kivity Date: Tue Feb 20 18:41:05 2007 +0200 KVM: Move kvm_vm_ioctl_create_vcpu() around In preparation of some hacking. Signed-off-by: Avi Kivity commit f3ad84386727171d8308338a2c5dee1deac2e50d Author: Avi Kivity Date: Tue Feb 20 18:27:58 2007 +0200 KVM: Rename some kvm_dev_ioctl_*() functions to kvm_vm_ioctl_*() This reflects the changed scope, from device-wide to single vm (previously every device open created a virtual machine). Signed-off-by: Avi Kivity commit 733e3f74f1c51bbc2e7a99df8b51767504b58de2 Author: Avi Kivity Date: Wed Feb 21 19:28:04 2007 +0200 KVM: Create an inode per virtual machine This avoids having filp->f_op and the corresponding inode->i_fop different, which is a little unorthodox. The ioctl list is split into two: global kvm ioctls and per-vm ioctls. A new ioctl, KVM_CREATE_VM, is used to create VMs and return the VM fd. Signed-off-by: Avi Kivity commit 52a96114380f8ab615626e4cec57b7015895bd0f Author: Avi Kivity Date: Tue Feb 20 14:07:37 2007 +0200 KVM: Add internal filesystem for generating inodes The kvmfs inodes will represent virtual machines and vcpus, as necessary, reducing cacheline bouncing due to inodes and filps being shared. Signed-off-by: Avi Kivity commit b00bc8b10197715f5b842f1f9a60e67a3484b10f Author: Uri Lublin Date: Wed Feb 21 18:25:21 2007 +0200 kvm, dirty pages log: adding some calls to mark_page_dirty() commit 58a214eba321d92f833221c26777e2119e34a19d Author: Avi Kivity Date: Mon Feb 19 14:37:48 2007 +0200 KVM: More 0 -> NULL conversions Signed-off-by: Avi Kivity commit f73199bb57b4c8feb7d8f60c6f1a25107de18dab Author: Joerg Roedel Date: Mon Feb 19 14:37:47 2007 +0200 KVM: SVM: intercept SMI to handle it at host level This patch changes the SVM code to intercept SMIs and handle it outside the guest. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit fa2742c78f10fad8682e3af17df3e9fc2eece9e4 Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: svm: init cr0 with the wp bit set Signed-off-by: Avi Kivity commit 8da588a919dc0bef76e384d16fd13ea2189aa82d Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Wire up hypercall handlers to a central arch-independent location Signed-off-by: Avi Kivity commit 68f16784f188d280c75b39e2367ebc1adbc66d9d Author: Avi Kivity Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Add hypercall host support for svm Signed-off-by: Avi Kivity commit 7c8bd4d6fc0e2bfb35cd4c0e8ff39c4f8972d951 Author: Ingo Molnar Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Add host hypercall support for vmx Signed-off-by: Avi Kivity commit f846fa34a14ec37dc0194c6f47ea4374c140e6f1 Author: Ingo Molnar Date: Mon Feb 19 14:37:47 2007 +0200 KVM: add MSR based hypercall API This adds a special MSR based hypercall API to KVM. This is to be used by paravirtual kernels and virtual drivers. Signed-off-by: Ingo Molnar Signed-off-by: Avi Kivity commit 8aa04bb13cf90d68c26d6bea1e4c720f1f027be0 Author: Markus Rechberger Date: Mon Feb 19 14:37:47 2007 +0200 KVM: Use page_private()/set_page_private() apis Besides using an established api, this allows using kvm in older kernels. Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity commit 4d5a7e81cc63d28e94373cdeb74dc44045edaa10 Author: Ahmed S. Darwish Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Use ARRAY_SIZE macro instead of manual calculation. Signed-off-by: Ahmed S. Darwish Signed-off-by: Dor Laor Signed-off-by: Avi Kivity commit 0fe9875fb3f9946a6c1cef6f1b9a286edc8ee2b9 Author: Markus Rechberger Date: Mon Feb 19 14:37:46 2007 +0200 KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch From: Joerg Roedel The whole thing is rotten, but this allows vmx to boot with the guest reboot fix. Signed-off-by: Markus Rechberger Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity commit 7e6e2bbad7f5dbccb389ee6d79be661972b18b15 Author: Avi Kivity Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Cosmetics Signed-off-by: Avi Kivity commit cc66daca849ca8c2900ba8cc7640de664296d36a Author: Jeremy Katz Date: Mon Feb 19 14:37:46 2007 +0200 KVM: Move virtualization deactivation from CPU_DEAD state to CPU_DOWN_PREPARE This gives it more chances of surviving suspend. Signed-off-by: Jeremy Katz Signed-off-by: Avi Kivity commit 2959cd13ecc1fbe1b2339937481844ff963f1e7f Author: Avi Kivity Date: Mon Feb 19 14:37:46 2007 +0200 KVM: mmu: add missing dirty page tracking cases We fail to mark a page dirty in three cases: - setting the accessed bit in a pte - setting the dirty bit in a pte - emulating a write into a pagetable This fix adds the missing cases. Signed-off-by: Avi Kivity .gitignore | 3 drivers/kvm/kvm.h | 57 +++- drivers/kvm/kvm_main.c | 649 ++++++++++++++++++++++++++++++++++++++++---- drivers/kvm/kvm_svm.h | 2 drivers/kvm/mmu.c | 69 +++-- drivers/kvm/paging_tmpl.h | 10 - drivers/kvm/svm.c | 111 +++++--- drivers/kvm/svm.h | 6 drivers/kvm/vmx.c | 88 +++--- drivers/kvm/x86_emulate.c | 5 include/linux/Kbuild | 1 include/linux/kvm.h | 135 +++++++-- include/linux/miscdevice.h | 1 13 files changed, 901 insertions(+), 236 deletions(-) diff --git a/.gitignore b/.gitignore index 060a71d..343c716 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,6 @@ series # cscope files cscope.* + +# emacs backup files +*~ diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 0d122bf..fceeb84 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -51,10 +51,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) #define KVM_MAX_VCPUS 1 +#define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 #define KVM_NUM_MMU_PAGES 256 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 #define FX_IMAGE_SIZE 512 #define FX_IMAGE_ALIGN 16 @@ -73,6 +75,8 @@ #define SELECTOR_RPL_MASK 0x03 #define IOPL_SHIFT 12 +#define KVM_PIO_PAGE_OFFSET 1 + /* * Address types: * @@ -106,6 +110,7 @@ struct kvm_pte_chain { * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + * bits 17:18 - "access" - the user and writable bits of a huge page pde */ union kvm_mmu_page_role { unsigned word; @@ -115,6 +120,7 @@ union kvm_mmu_page_role { unsigned quadrant : 2; unsigned pad_for_nice_hex_output : 6; unsigned metaphysical : 1; + unsigned hugepage_access : 2; }; }; @@ -133,7 +139,6 @@ struct kvm_mmu_page { unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ - int global; /* Set if all ptes in this page are global */ int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ union { @@ -219,6 +224,18 @@ enum { VCPU_SREG_LDTR, }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int size; + int string; + int down; + int rep; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -228,6 +245,8 @@ struct kvm_vcpu { struct mutex mutex; int cpu; int launched; + u64 host_tsc; + struct kvm_run *run; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) @@ -273,6 +292,11 @@ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE int mmio_size; unsigned char mmio_data[8]; gpa_t mmio_phys_addr; + struct kvm_pio_request pio; + void *pio_data; + + int sigset_active; + sigset_t sigset; struct { int active; @@ -284,6 +308,15 @@ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE u32 ar; } tr, es, ds, fs, gs; } rmode; + + int cpuid_nent; + struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; +}; + +struct kvm_mem_alias { + gfn_t base_gfn; + unsigned long npages; + gfn_t target_gfn; }; struct kvm_memory_slot { @@ -296,6 +329,8 @@ struct kvm_memory_slot { struct kvm { spinlock_t lock; /* protects everything except vcpus */ + int naliases; + struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; /* @@ -360,8 +395,6 @@ struct kvm_arch_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, - unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); @@ -406,22 +439,20 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); +void kvm_mmu_zap_all(struct kvm_vcpu *vcpu); hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); void kvm_emulator_want_group7_invlpg(void); extern hpa_t bad_page_address; -static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return slot->phys_mem[gfn - slot->base_gfn]; -} - +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); @@ -444,6 +475,10 @@ void realmode_set_cr(struct kvm_vcpu *vc struct x86_emulate_ctxt; +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, @@ -493,12 +528,6 @@ static inline int kvm_mmu_page_fault(str return vcpu->mmu.page_fault(vcpu, gva, error_code); } -static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL; -} - static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index dc7a8c7..4473174 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -346,6 +346,17 @@ static void kvm_free_physmem(struct kvm kvm_free_physmem_slot(&kvm->memslots[i], NULL); } +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < 2; ++i) + if (vcpu->pio.guest_pages[i]) { + __free_page(vcpu->pio.guest_pages[i]); + vcpu->pio.guest_pages[i] = NULL; + } +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -355,6 +366,11 @@ static void kvm_free_vcpu(struct kvm_vcp kvm_mmu_destroy(vcpu); vcpu_put(vcpu); kvm_arch_ops->vcpu_free(vcpu); + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; + free_page((unsigned long)vcpu->pio_data); + vcpu->pio_data = NULL; + free_pio_guest_pages(vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -404,12 +420,12 @@ static int load_pdptrs(struct kvm_vcpu * u64 pdpte; u64 *pdpt; int ret; - struct kvm_memory_slot *memslot; + struct page *page; spin_lock(&vcpu->kvm->lock); - memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); - /* FIXME: !memslot - emulate? 0xff? */ - pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); + page = gfn_to_page(vcpu->kvm, pdpt_gfn); + /* FIXME: !page - emulate? 0xff? */ + pdpt = kmap_atomic(page, KM_USER0); ret = 1; for (i = 0; i < 4; ++i) { @@ -830,7 +846,73 @@ out: return r; } -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +/* + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. + */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + spin_lock(&kvm->lock); + + p = &kvm->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->aliases[n - 1].npages) + break; + kvm->naliases = n; + + spin_unlock(&kvm->lock); + + vcpu_load(&kvm->vcpus[0]); + spin_lock(&kvm->lock); + kvm_mmu_zap_all(&kvm->vcpus[0]); + spin_unlock(&kvm->lock); + vcpu_put(&kvm->vcpus[0]); + + return 0; + +out: + return r; +} + +static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i = 0; i < kvm->naliases; ++i) { + alias = &kvm->aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; @@ -843,7 +925,24 @@ struct kvm_memory_slot *gfn_to_memslot(s } return NULL; } -EXPORT_SYMBOL_GPL(gfn_to_memslot); + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); +} + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); + if (!slot) + return NULL; + return slot->phys_mem[gfn - slot->base_gfn]; +} +EXPORT_SYMBOL_GPL(gfn_to_page); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { @@ -883,20 +982,20 @@ static int emulator_read_std(unsigned lo unsigned offset = addr & (PAGE_SIZE-1); unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); unsigned long pfn; - struct kvm_memory_slot *memslot; - void *page; + struct page *page; + void *page_virt; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; pfn = gpa >> PAGE_SHIFT; - memslot = gfn_to_memslot(vcpu->kvm, pfn); - if (!memslot) + page = gfn_to_page(vcpu->kvm, pfn); + if (!page) return X86EMUL_UNHANDLEABLE; - page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); + page_virt = kmap_atomic(page, KM_USER0); - memcpy(data, page + offset, tocopy); + memcpy(data, page_virt + offset, tocopy); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(page_virt, KM_USER0); bytes -= tocopy; data += tocopy; @@ -947,16 +1046,14 @@ static int emulator_read_emulated(unsign static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long val, int bytes) { - struct kvm_memory_slot *m; struct page *page; void *virt; if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; - m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!m) + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) return 0; - page = gfn_to_page(m, gpa >> PAGE_SHIFT); kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); @@ -1177,7 +1274,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->cache_regs(vcpu); ret = -KVM_EINVAL; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1201,10 +1298,19 @@ #endif } switch (nr) { default: - ; + run->hypercall.args[0] = a0; + run->hypercall.args[1] = a1; + run->hypercall.args[2] = a2; + run->hypercall.args[3] = a3; + run->hypercall.args[4] = a4; + run->hypercall.args[5] = a5; + run->hypercall.ret = ret; + run->hypercall.longmode = is_long_mode(vcpu); + kvm_arch_ops->decache_regs(vcpu); + return 0; } vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_arch_ops->cache_regs(vcpu); + kvm_arch_ops->decache_regs(vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_hypercall); @@ -1442,6 +1548,10 @@ #endif printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data); break; + case MSR_IA32_MCG_STATUS: + printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case 0x200 ... 0x2ff: /* MTRRs */ @@ -1502,29 +1612,234 @@ void save_msrs(struct vmx_msr_entry *e, } EXPORT_SYMBOL_GPL(save_msrs); +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; + + kvm_arch_ops->cache_regs(vcpu); + function = vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] = 0; + vcpu->regs[VCPU_REGS_RBX] = 0; + vcpu->regs[VCPU_REGS_RCX] = 0; + vcpu->regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == function) { + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + if (best) { + vcpu->regs[VCPU_REGS_RAX] = best->eax; + vcpu->regs[VCPU_REGS_RBX] = best->ebx; + vcpu->regs[VCPU_REGS_RCX] = best->ecx; + vcpu->regs[VCPU_REGS_RDX] = best->edx; + } + kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + +static int pio_copy_data(struct kvm_vcpu *vcpu) +{ + void *p = vcpu->pio_data; + void *q; + unsigned bytes; + int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; + + kvm_arch_ops->vcpu_put(vcpu); + q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + PAGE_KERNEL); + if (!q) { + kvm_arch_ops->vcpu_load(vcpu); + return -ENOMEM; + } + q += vcpu->pio.guest_page_offset; + bytes = vcpu->pio.size * vcpu->pio.cur_count; + if (vcpu->pio.in) + memcpy(q, p, bytes); + else + memcpy(p, q, bytes); + q -= vcpu->pio.guest_page_offset; + vunmap(q); + kvm_arch_ops->vcpu_load(vcpu); + return 0; +} + +static int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->pio; + long delta; + int r; + + kvm_arch_ops->cache_regs(vcpu); + + io->count -= io->cur_count; + if (!io->string) { + if (io->in) + memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, + io->size); + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) { + kvm_arch_ops->cache_regs(vcpu); + return r; + } + } + + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. + */ + vcpu->regs[VCPU_REGS_RCX] -= delta; + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) + vcpu->regs[VCPU_REGS_RDI] += delta; + else + vcpu->regs[VCPU_REGS_RSI] += delta; + } + + vcpu->run->io_completed = 0; + + kvm_arch_ops->decache_regs(vcpu); + + if (!io->count) + kvm_arch_ops->skip_emulated_instruction(vcpu); + return 0; +} + +int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int string, int down, + gva_t address, int rep, unsigned port) +{ + unsigned now, in_page; + int i; + int nr_pages = 1; + struct page *page; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + vcpu->pio.count = count; + vcpu->pio.cur_count = count; + vcpu->pio.size = size; + vcpu->pio.in = in; + vcpu->pio.string = string; + vcpu->pio.down = down; + vcpu->pio.guest_page_offset = offset_in_page(address); + vcpu->pio.rep = rep; + + if (!string) { + kvm_arch_ops->cache_regs(vcpu); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); + kvm_arch_ops->decache_regs(vcpu); + return 0; + } + + now = min(count, PAGE_SIZE / size); + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) { + /* + * String I/O straddles page boundary. Pin two guest pages + * so that we satisfy atomicity constraints. Do just one + * transaction to avoid complexity. + */ + nr_pages = 2; + now = 1; + } + if (down) { + /* + * String I/O in reverse. Yuck. Kill the guest, fix later. + */ + printk(KERN_ERR "kvm: guest string pio down\n"); + inject_gp(vcpu); + return 1; + } + vcpu->run->io.count = now; + vcpu->pio.cur_count = now; + + for (i = 0; i < nr_pages; ++i) { + spin_lock(&vcpu->kvm->lock); + page = gva_to_page(vcpu, address + i * PAGE_SIZE); + if (page) + get_page(page); + vcpu->pio.guest_pages[i] = page; + spin_unlock(&vcpu->kvm->lock); + if (!page) { + inject_gp(vcpu); + free_pio_guest_pages(vcpu); + return 1; + } + } + + if (!vcpu->pio.in) + return pio_copy_data(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_setup_pio); + static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + sigset_t sigsaved; vcpu_load(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; - if (kvm_run->emulated) { - kvm_arch_ops->skip_emulated_instruction(vcpu); - kvm_run->emulated = 0; - } - - if (kvm_run->mmio_completed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; + if (kvm_run->io_completed) { + if (vcpu->pio.count) { + r = complete_pio(vcpu); + if (r) + goto out; + } else { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + } } vcpu->mmio_needed = 0; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { + kvm_arch_ops->cache_regs(vcpu); + vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + kvm_arch_ops->decache_regs(vcpu); + } + r = kvm_arch_ops->run(vcpu, kvm_run); +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + vcpu_put(vcpu); return r; } @@ -1665,16 +1980,6 @@ static int kvm_vcpu_ioctl_set_sregs(stru vcpu_load(vcpu); - set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - dt.limit = sregs->idt.limit; dt.base = sregs->idt.base; kvm_arch_ops->set_idt(vcpu, &dt); @@ -1697,7 +2002,7 @@ #endif kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; - kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); + kvm_arch_ops->set_cr0(vcpu, sregs->cr0); mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_arch_ops->set_cr4(vcpu, sregs->cr4); @@ -1714,6 +2019,16 @@ #endif if (vcpu->irq_pending[i]) __set_bit(i, &vcpu->irq_summary); + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + vcpu_put(vcpu); return 0; @@ -1887,6 +2202,36 @@ static int kvm_vcpu_ioctl_debug_guest(st return r; } +static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (pgoff == 0) + page = virt_to_page(vcpu->run); + else if (pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->pio_data); + else + return NOPAGE_SIGBUS; + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .nopage = kvm_vcpu_nopage, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; @@ -1899,6 +2244,7 @@ static struct file_operations kvm_vcpu_f .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, }; /* @@ -1947,6 +2293,7 @@ static int kvm_vm_ioctl_create_vcpu(stru { int r; struct kvm_vcpu *vcpu; + struct page *page; r = -EINVAL; if (!valid_vcpu(n)) @@ -1961,6 +2308,18 @@ static int kvm_vm_ioctl_create_vcpu(stru return -EEXIST; } + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_unlock; + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + r = -ENOMEM; + if (!page) + goto out_free_run; + vcpu->pio_data = page_address(page); + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, FX_IMAGE_ALIGN); vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; @@ -1990,11 +2349,107 @@ static int kvm_vm_ioctl_create_vcpu(stru out_free_vcpus: kvm_free_vcpu(vcpu); +out_free_run: + free_page((unsigned long)vcpu->run); + vcpu->run = NULL; +out_unlock: mutex_unlock(&vcpu->mutex); out: return r; } +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + vcpu->cpuid_nent = cpuid->nent; + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +/* + * fxsave fpu state. Taken from x86_64/processor.h. To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2003,21 +2458,12 @@ static long kvm_vcpu_ioctl(struct file * int r = -EINVAL; switch (ioctl) { - case KVM_RUN: { - struct kvm_run kvm_run; - - r = -EFAULT; - if (copy_from_user(&kvm_run, argp, sizeof kvm_run)) + case KVM_RUN: + r = -EINVAL; + if (arg) goto out; - r = kvm_vcpu_ioctl_run(vcpu, &kvm_run); - if (r < 0 && r != -EINTR) - goto out; - if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) { - r = -EFAULT; - goto out; - } + r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); break; - } case KVM_GET_REGS: { struct kvm_regs kvm_regs; @@ -2113,6 +2559,66 @@ static long kvm_vcpu_ioctl(struct file * case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); break; + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } + case KVM_GET_FPU: { + struct kvm_fpu fpu; + + memset(&fpu, 0, sizeof fpu); + r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &fpu, sizeof fpu)) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu fpu; + + r = -EFAULT; + if (copy_from_user(&fpu, argp, sizeof fpu)) + goto out; + r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2155,6 +2661,17 @@ static long kvm_vm_ioctl(struct file *fi goto out; break; } + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; + + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } default: ; } @@ -2168,15 +2685,11 @@ static struct page *kvm_vm_nopage(struct { struct kvm *kvm = vma->vm_file->private_data; unsigned long pgoff; - struct kvm_memory_slot *slot; struct page *page; *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - slot = gfn_to_memslot(kvm, pgoff); - if (!slot) - return NOPAGE_SIGBUS; - page = gfn_to_page(slot, pgoff); + page = gfn_to_page(kvm, pgoff); if (!page) return NOPAGE_SIGBUS; get_page(page); @@ -2248,13 +2761,19 @@ static long kvm_dev_ioctl(struct file *f unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; - int r = -EINVAL; + long r = -EINVAL; switch (ioctl) { case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; r = KVM_API_VERSION; break; case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; r = kvm_dev_ioctl_create_vm(); break; case KVM_GET_MSR_INDEX_LIST: { @@ -2284,6 +2803,18 @@ static long kvm_dev_ioctl(struct file *f r = 0; break; } + case KVM_CHECK_EXTENSION: + /* + * No extensions defined at present. + */ + r = 0; + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = 2 * PAGE_SIZE; + break; default: ; } @@ -2299,7 +2830,7 @@ static struct file_operations kvm_charde }; static struct miscdevice kvm_dev = { - MISC_DYNAMIC_MINOR, + KVM_MINOR, "kvm", &kvm_chardev_ops, }; diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index 624f1ca..a1a9eba 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -28,8 +28,6 @@ struct vcpu_svm { struct svm_cpu_data *svm_data; uint64_t asid_generation; - unsigned long cr0; - unsigned long cr4; unsigned long db_regs[NUM_DB_REGS]; u64 next_rip; diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e85b4c7..35135ec 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -390,13 +390,11 @@ static void rmap_write_protect(struct kv { struct kvm *kvm = vcpu->kvm; struct page *page; - struct kvm_memory_slot *slot; struct kvm_rmap_desc *desc; u64 *spte; - slot = gfn_to_memslot(kvm, gfn); - BUG_ON(!slot); - page = gfn_to_page(slot, gfn); + page = gfn_to_page(kvm, gfn); + BUG_ON(!page); while (page_private(page)) { if (!(page_private(page) & 1)) @@ -437,9 +435,8 @@ static void kvm_mmu_free_page(struct kvm struct kvm_mmu_page *page_head = page_header(page_hpa); ASSERT(is_empty_shadow_page(page_hpa)); - list_del(&page_head->link); page_head->page_hpa = page_hpa; - list_add(&page_head->link, &vcpu->free_pages); + list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -457,11 +454,9 @@ static struct kvm_mmu_page *kvm_mmu_allo return NULL; page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); - list_del(&page->link); - list_add(&page->link, &vcpu->kvm->active_mmu_pages); + list_move(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->page_hpa)); page->slot_bitmap = 0; - page->global = 1; page->multimapped = 0; page->parent_pte = parent_pte; --vcpu->kvm->n_free_mmu_pages; @@ -569,6 +564,7 @@ static struct kvm_mmu_page *kvm_mmu_get_ gva_t gaddr, unsigned level, int metaphysical, + unsigned hugepage_access, u64 *parent_pte) { union kvm_mmu_page_role role; @@ -582,6 +578,7 @@ static struct kvm_mmu_page *kvm_mmu_get_ role.glevels = vcpu->mmu.root_level; role.level = level; role.metaphysical = metaphysical; + role.hugepage_access = hugepage_access; if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; @@ -669,10 +666,8 @@ static void kvm_mmu_zap_page(struct kvm_ if (!page->root_count) { hlist_del(&page->hash_link); kvm_mmu_free_page(vcpu, page->page_hpa); - } else { - list_del(&page->link); - list_add(&page->link, &vcpu->kvm->active_mmu_pages); - } + } else + list_move(&page->link, &vcpu->kvm->active_mmu_pages); } static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -714,14 +709,12 @@ hpa_t safe_gpa_to_hpa(struct kvm_vcpu *v hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - struct kvm_memory_slot *slot; struct page *page; ASSERT((gpa & HPA_ERR_MASK) == 0); - slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!slot) + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) return gpa | HPA_ERR_MASK; - page = gfn_to_page(slot, gpa >> PAGE_SHIFT); return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | (gpa & (PAGE_SIZE-1)); } @@ -735,6 +728,15 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, return gpa_to_hpa(vcpu, gpa); } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); +} + static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } @@ -772,7 +774,7 @@ static int nonpaging_map(struct kvm_vcpu >> PAGE_SHIFT; new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, v, level - 1, - 1, &table[index]); + 1, 0, &table[index]); if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); return -ENOMEM; @@ -827,7 +829,7 @@ #ifdef CONFIG_X86_64 ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, NULL); + PT64_ROOT_LEVEL, 0, 0, NULL); root = page->page_hpa; ++page->root_count; vcpu->mmu.root_hpa = root; @@ -844,7 +846,7 @@ #endif root_gfn = 0; page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), - NULL); + 0, NULL); root = page->page_hpa; ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; @@ -918,11 +920,6 @@ static void paging_new_cr3(struct kvm_vc kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } -static void mark_pagetable_nonglobal(void *shadow_pte) -{ - page_header(__pa(shadow_pte))->global = 0; -} - static inline void set_pte_common(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, @@ -940,9 +937,6 @@ static inline void set_pte_common(struct *shadow_pte |= access_bits; - if (!(*shadow_pte & PT_GLOBAL_MASK)) - mark_pagetable_nonglobal(shadow_pte); - if (is_error_hpa(paddr)) { *shadow_pte |= gaddr; *shadow_pte |= PT_SHADOW_IO_MARK; @@ -1315,6 +1309,23 @@ void kvm_mmu_slot_remove_write_access(st } } +void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + struct kvm_mmu_page *page; + + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu, page); + } + + mmu_free_memory_caches(vcpu); + kvm_arch_ops->tlb_flush(vcpu); + init_kvm_mmu(vcpu); +} + #ifdef AUDIT static const char *audit_msg; @@ -1359,7 +1370,7 @@ static void audit_mappings_page(struct k static void audit_mappings(struct kvm_vcpu *vcpu) { - int i; + unsigned i; if (vcpu->mmu.root_level == 4) audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index f3bcee9..b94010d 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -148,8 +148,7 @@ #endif break; } - if (walker->level != 3 || is_long_mode(vcpu)) - walker->inherited_ar &= walker->table[index]; + walker->inherited_ar &= walker->table[index]; table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); kunmap_atomic(walker->table, KM_USER0); @@ -248,6 +247,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu u64 shadow_pte; int metaphysical; gfn_t table_gfn; + unsigned hugepage_access = 0; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) @@ -277,6 +277,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; + hugepage_access = *guest_ent; + hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; + hugepage_access >>= PT_WRITABLE_SHIFT; table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; } else { @@ -284,7 +287,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu table_gfn = walker->table_gfn[level - 2]; } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, shadow_ent); + metaphysical, hugepage_access, + shadow_ent); shadow_addr = shadow_page->page_hpa; shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 3d8ea7a..b7e1410 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -44,6 +44,10 @@ #define SEG_TYPE_BUSY_TSS16 3 #define KVM_EFER_LMA (1 << 10) #define KVM_EFER_LME (1 << 8) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_DEATURE_SVML (1 << 2) + unsigned long iopm_base; unsigned long msrpm_base; @@ -68,6 +72,7 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +static uint32_t svm_features; struct svm_init_data { int cpu; @@ -82,6 +87,11 @@ #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * #define MAX_INST_SIZE 15 +static inline u32 svm_has(u32 feat) +{ + return svm_features & feat; +} + static unsigned get_addr_size(struct kvm_vcpu *vcpu) { struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; @@ -203,13 +213,6 @@ static void inject_ud(struct kvm_vcpu *v UD_VECTOR; } -static void inject_db(struct kvm_vcpu *vcpu) -{ - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_TYPE_EXEPT | - DB_VECTOR; -} - static int is_page_fault(uint32_t info) { info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; @@ -309,6 +312,7 @@ #endif svm_data->asid_generation = 1; svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; + svm_features = cpuid_edx(SVM_CPUID_FUNC); asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); gdt = (struct desc_struct *)gdt_descr.address; @@ -459,7 +463,6 @@ static void init_vmcb(struct vmcb *vmcb) { struct vmcb_control_area *control = &vmcb->control; struct vmcb_save_area *save = &vmcb->save; - u64 tsc; control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | @@ -511,13 +514,16 @@ static void init_vmcb(struct vmcb *vmcb) (1ULL << INTERCEPT_VMSAVE) | (1ULL << INTERCEPT_STGI) | (1ULL << INTERCEPT_CLGI) | - (1ULL << INTERCEPT_SKINIT); + (1ULL << INTERCEPT_SKINIT) | + (1ULL << INTERCEPT_MONITOR) | + (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; control->msrpm_base_pa = msrpm_base; - rdtscll(tsc); - control->tsc_offset = -tsc; + control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; + if (svm_has(SVM_FEATURE_LBRV)) + control->lbr_ctl = 1ULL; init_seg(&save->es); init_seg(&save->ss); @@ -576,12 +582,14 @@ static int svm_create_vcpu(struct kvm_vc vcpu->svm->vmcb = page_address(page); memset(vcpu->svm->vmcb, 0, PAGE_SIZE); vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - vcpu->svm->cr0 = 0x00000010; vcpu->svm->asid_generation = 0; memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); init_vmcb(vcpu->svm->vmcb); fx_init(vcpu); + vcpu->apic_base = 0xfee00000 | + /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | + MSR_IA32_APICBASE_ENABLE; return 0; @@ -602,11 +610,26 @@ static void svm_free_vcpu(struct kvm_vcp static void svm_vcpu_load(struct kvm_vcpu *vcpu) { - get_cpu(); + int cpu; + + cpu = get_cpu(); + if (unlikely(cpu != vcpu->cpu)) { + u64 tsc_this, delta; + + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + rdtscll(tsc_this); + delta = vcpu->host_tsc - tsc_this; + vcpu->svm->vmcb->control.tsc_offset += delta; + vcpu->cpu = cpu; + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { + rdtscll(vcpu->host_tsc); put_cpu(); } @@ -733,9 +756,10 @@ #ifdef CONFIG_X86_64 } } #endif - vcpu->svm->cr0 = cr0; - vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK | CR0_WP_MASK; vcpu->cr0 = cr0; + cr0 |= CR0_PG_MASK | CR0_WP_MASK; + cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); + vcpu->svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -981,7 +1005,7 @@ static int io_get_override(struct kvm_vc return 0; } -static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) +static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address) { unsigned long addr_mask; unsigned long *reg; @@ -1025,38 +1049,38 @@ static unsigned long io_adress(struct kv static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? - int _in = io_info & SVM_IOIO_TYPE_MASK; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address = 0; ++kvm_stat.io_exits; vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; - kvm_run->exit_reason = KVM_EXIT_IO; - kvm_run->io.port = io_info >> 16; - kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); - kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; - kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + port = io_info >> 16; + size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + string = (io_info & SVM_IOIO_STR_MASK) != 0; + rep = (io_info & SVM_IOIO_REP_MASK) != 0; + count = 1; + down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; - if (kvm_run->io.string) { + if (string) { unsigned addr_mask; - addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); + addr_mask = io_adress(vcpu, in, &address); if (!addr_mask) { printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); return 1; } - if (kvm_run->io.rep) { - kvm_run->io.count - = vcpu->regs[VCPU_REGS_RCX] & addr_mask; - kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags - & X86_EFLAGS_DF) != 0; - } - } else - kvm_run->io.value = vcpu->svm->vmcb->save.rax; - return 0; + if (rep) + count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1078,7 +1102,8 @@ static int halt_interception(struct kvm_ static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vcpu->svm->vmcb->save.rip += 3; + vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } @@ -1098,8 +1123,8 @@ static int task_switch_interception(stru static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1288,6 +1313,8 @@ static int (*svm_exit_handlers[])(struct [SVM_EXIT_STGI] = invalid_op_interception, [SVM_EXIT_CLGI] = invalid_op_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, + [SVM_EXIT_MONITOR] = invalid_op_interception, + [SVM_EXIT_MWAIT] = invalid_op_interception, }; @@ -1295,8 +1322,6 @@ static int handle_exit(struct kvm_vcpu * { u32 exit_code = vcpu->svm->vmcb->control.exit_code; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; - if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " @@ -1606,8 +1631,9 @@ #endif vcpu->svm->next_rip = 0; if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vcpu->svm->vmcb->control.exit_code; post_kvm_run_save(vcpu, kvm_run); return 0; } @@ -1617,12 +1643,14 @@ #endif if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } kvm_resched(vcpu); @@ -1711,7 +1739,6 @@ static struct kvm_arch_ops svm_arch_ops .get_cs_db_l_bits = svm_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, .set_cr0 = svm_set_cr0, - .set_cr0_no_modeswitch = svm_set_cr0, .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h index df731c3..5e93814 100644 --- a/drivers/kvm/svm.h +++ b/drivers/kvm/svm.h @@ -44,6 +44,9 @@ enum { INTERCEPT_RDTSCP, INTERCEPT_ICEBP, INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, }; @@ -298,6 +301,9 @@ #define SVM_EXIT_SKINIT 0x086 #define SVM_EXIT_RDTSCP 0x087 #define SVM_EXIT_ICEBP 0x088 #define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_ERR -1 diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index fbbf9d6..61a6116 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -712,6 +712,8 @@ static void enter_rmode(struct kvm_vcpu vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); @@ -786,22 +788,6 @@ #endif vcpu->cr0 = cr0; } -/* - * Used when restoring the VM to avoid corrupting segment registers - */ -static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) - enter_rmode(vcpu); - - vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); - update_exception_bitmap(vcpu); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->cr0 = cr0; -} - static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); @@ -878,7 +864,14 @@ static void vmx_set_segment(struct kvm_v vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (var->unusable) + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; @@ -933,9 +926,9 @@ static int init_rmode_tss(struct kvm* kv gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; char *page; - p1 = _gfn_to_page(kvm, fn++); - p2 = _gfn_to_page(kvm, fn++); - p3 = _gfn_to_page(kvm, fn); + p1 = gfn_to_page(kvm, fn++); + p2 = gfn_to_page(kvm, fn++); + p3 = gfn_to_page(kvm, fn); if (!p1 || !p2 || !p3) { kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); @@ -1138,7 +1131,6 @@ #endif vcpu->guest_msrs[j] = vcpu->host_msrs[j]; ++vcpu->nmsrs; } - printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, @@ -1394,7 +1386,7 @@ static int handle_triple_fault(struct kv return 0; } -static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) { u64 inst; gva_t rip; @@ -1439,33 +1431,35 @@ static int get_io_count(struct kvm_vcpu done: countr_size *= 8; *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); return 1; } static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address; ++kvm_stat.io_exits; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - kvm_run->exit_reason = KVM_EXIT_IO; - if (exit_qualification & 8) - kvm_run->io.direction = KVM_EXIT_IO_IN; - else - kvm_run->io.direction = KVM_EXIT_IO_OUT; - kvm_run->io.size = (exit_qualification & 7) + 1; - kvm_run->io.string = (exit_qualification & 16) != 0; - kvm_run->io.string_down - = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - kvm_run->io.rep = (exit_qualification & 32) != 0; - kvm_run->io.port = exit_qualification >> 16; - if (kvm_run->io.string) { - if (!get_io_count(vcpu, &kvm_run->io.count)) + in = (exit_qualification & 8) != 0; + size = (exit_qualification & 7) + 1; + string = (exit_qualification & 16) != 0; + down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + count = 1; + rep = (exit_qualification & 32) != 0; + port = exit_qualification >> 16; + address = 0; + if (string) { + if (rep && !get_io_count(vcpu, &count)) return 1; - kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); - } else - kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ - return 0; + address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static void @@ -1583,8 +1577,8 @@ static int handle_dr(struct kvm_vcpu *vc static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1658,7 +1652,7 @@ static int handle_halt(struct kvm_vcpu * static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } @@ -1920,10 +1914,10 @@ #endif asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - kvm_run->exit_type = 0; if (fail) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { /* @@ -1933,19 +1927,20 @@ #endif profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); vcpu->launched = 1; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { /* Give scheduler a change to reschedule. */ if (signal_pending(current)) { ++kvm_stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { ++kvm_stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } @@ -2064,7 +2059,6 @@ static struct kvm_arch_ops vmx_arch_ops .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, .set_cr0 = vmx_set_cr0, - .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 7513cdd..bcf872b 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -833,8 +833,9 @@ done_prefixes: dst.ptr = (unsigned long *)cr2; dst.bytes = (d & ByteOp) ? 1 : op_bytes; if (d & BitOp) { - dst.ptr += src.val / BITS_PER_LONG; - dst.bytes = sizeof(long); + unsigned long mask = ~(dst.bytes * 8 - 1); + + dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; } if (!(d & Mov) && /* optimisation - avoid slow emulated read */ ((rc = ops->read_emulated((unsigned long)dst.ptr, diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e81e301..b35b593 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -99,6 +99,7 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h +header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 275354f..07bf353 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -11,7 +11,7 @@ #define __LINUX_KVM_H #include #include -#define KVM_API_VERSION 4 +#define KVM_API_VERSION 10 /* * Architectural interrupt line count, and the size of the bitmap needed @@ -33,37 +33,41 @@ struct kvm_memory_region { /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL - -#define KVM_EXIT_TYPE_FAIL_ENTRY 1 -#define KVM_EXIT_TYPE_VM_EXIT 2 +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; enum kvm_exit_reason { KVM_EXIT_UNKNOWN = 0, KVM_EXIT_EXCEPTION = 1, KVM_EXIT_IO = 2, - KVM_EXIT_CPUID = 3, + KVM_EXIT_HYPERCALL = 3, KVM_EXIT_DEBUG = 4, KVM_EXIT_HLT = 5, KVM_EXIT_MMIO = 6, KVM_EXIT_IRQ_WINDOW_OPEN = 7, KVM_EXIT_SHUTDOWN = 8, + KVM_EXIT_FAIL_ENTRY = 9, + KVM_EXIT_INTR = 10, }; -/* for KVM_RUN */ +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ - __u32 emulated; /* skip current instruction */ - __u32 mmio_completed; /* mmio request completed */ + __u32 io_completed; /* mmio/pio request completed */ __u8 request_interrupt_window; - __u8 padding1[7]; + __u8 padding1[3]; /* out */ - __u32 exit_type; __u32 exit_reason; __u32 instruction_length; __u8 ready_for_interrupt_injection; __u8 if_flag; - __u16 padding2; + __u8 padding2[6]; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8; @@ -72,29 +76,26 @@ struct kvm_run { union { /* KVM_EXIT_UNKNOWN */ struct { - __u32 hardware_exit_reason; + __u64 hardware_exit_reason; } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { __u32 exception; __u32 error_code; } ex; /* KVM_EXIT_IO */ - struct { + struct kvm_io { #define KVM_EXIT_IO_IN 0 #define KVM_EXIT_IO_OUT 1 __u8 direction; __u8 size; /* bytes */ - __u8 string; - __u8 string_down; - __u8 rep; - __u8 pad; __u16 port; - __u64 count; - union { - __u64 address; - __u32 value; - }; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ } io; struct { } debug; @@ -105,6 +106,13 @@ #define KVM_EXIT_IO_OUT 1 __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; }; }; @@ -118,6 +126,21 @@ struct kvm_regs { __u64 rip, rflags; }; +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + struct kvm_segment { __u64 base; __u32 limit; @@ -210,38 +233,74 @@ struct kvm_dirty_log { }; }; +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + #define KVMIO 0xAE /* * ioctls for /dev/kvm fds: */ -#define KVM_GET_API_VERSION _IO(KVMIO, 1) -#define KVM_CREATE_VM _IO(KVMIO, 2) /* returns a VM fd */ -#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) +#define KVM_GET_API_VERSION _IO(KVMIO, 0x00) +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list) +/* + * Check if a kvm extension is available. Argument is extension number, + * return is 1 (yes) or 0 (no, sorry). + */ +#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03) +/* + * Get size for mmap(vcpu_fd) + */ +#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ /* * ioctls for VM fds */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) +#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. */ -#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int) -#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) +#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) /* * ioctls for vcpu fds */ -#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) -#define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) -#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) -#define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) -#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) -#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) -#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) -#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) -#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) -#define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) +#define KVM_RUN _IO(KVMIO, 0x80) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs) +#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) +#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) +#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) +#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) +#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) +#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) #endif diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 326da7d..dff9ea3 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -29,6 +29,7 @@ #define MISC_DYNAMIC_MINOR 255 #define TUN_MINOR 200 #define HPET_MINOR 228 +#define KVM_MINOR 232 struct device;