diff --git a/.gitignore b/.gitignore index 5d56a3f..9dacde0 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ modules.builtin include/config include/linux/version.h include/generated +arch/*/include/generated # stgit generated dirs patches-* diff --git a/.mailmap b/.mailmap index 5a6dd59..353ad56 100644 --- a/.mailmap +++ b/.mailmap @@ -32,6 +32,7 @@ Brian Avery Brian King Christoph Hellwig Corey Minyard +Damian Hobson-Garcia David Brownell David Woodhouse Dmitry Eremin-Solenikov diff --git a/CREDITS b/CREDITS index dca6abc..a7ea8e3 100644 --- a/CREDITS +++ b/CREDITS @@ -328,7 +328,7 @@ S: Haifa, Israel N: Johannes Berg E: johannes@sipsolutions.net W: http://johannes.sipsolutions.net/ -P: 1024D/9AB78CA5 AD02 0176 4E29 C137 1DF6 08D2 FC44 CF86 9AB7 8CA5 +P: 4096R/7BF9099A C0EB C440 F6DA 091C 884D 8532 E0F3 73F3 7BF9 099A D: powerpc & 802.11 hacker N: Stephen R. van den Berg (AKA BuGless) @@ -2943,6 +2943,10 @@ S: Kasarmikatu 11 A4 S: 70110 Kuopio S: Finland +N: Tobias Ringström +E: tori@unhappy.mine.nu +D: Davicom DM9102(A)/DM9132/DM9801 fast ethernet driver + N: Luca Risolia E: luca.risolia@studio.unibo.it P: 1024D/FCE635A4 88E8 F32F 7244 68BA 3958 5D40 99DA 5D2A FCE6 35A4 @@ -3913,6 +3917,10 @@ S: Flandernstrasse 101 S: D-73732 Esslingen S: Germany +N: Roman Zippel +E: zippel@linux-m68k.org +D: AFFS and HFS filesystems, m68k maintainer, new kernel configuration in 2.5 + N: Leonard N. Zubkoff W: http://www.dandelion.com/Linux/ D: BusLogic SCSI driver diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index c17cd4b..1b777b9 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -328,8 +328,6 @@ sysrq.txt - info on the magic SysRq key. telephony/ - directory with info on telephony (e.g. voice over IP) support. -uml/ - - directory with information about User Mode Linux. unicode.txt - info on the Unicode character/font mapping used in Linux. unshare.txt diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb deleted file mode 100644 index 9c49d8e..0000000 --- a/Documentation/ABI/obsolete/o2cb +++ /dev/null @@ -1,11 +0,0 @@ -What: /sys/o2cb symlink -Date: Dec 2005 -KernelVersion: 2.6.16 -Contact: ocfs2-devel@oss.oracle.com -Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will - be removed when new versions of ocfs2-tools which know to look - in /sys/fs/o2cb are sufficiently prevalent. Don't code new - software to look here, it should try /sys/fs/o2cb instead. - See Documentation/ABI/stable/o2cb for more information on usage. -Users: ocfs2-tools. It's sufficient to mail proposed changes to - ocfs2-devel@oss.oracle.com. diff --git a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-koneplus b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-koneplus new file mode 100644 index 0000000..c2a270b --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-koneplus @@ -0,0 +1,10 @@ +What: /sys/bus/usb/devices/-:./::./koneplus/roccatkoneplus/startup_profile +Date: October 2010 +Contact: Stefan Achatz +Description: The integer value of this attribute ranges from 0-4. + When read, this attribute returns the number of the actual + profile. This value is persistent, so its equivalent to the + profile that's active when the mouse is powered on next time. + When written, this file sets the number of the startup profile + and the mouse activates this profile immediately. + Please use actual_profile, it does the same thing. diff --git a/Documentation/ABI/removed/o2cb b/Documentation/ABI/removed/o2cb new file mode 100644 index 0000000..7f5daa4 --- /dev/null +++ b/Documentation/ABI/removed/o2cb @@ -0,0 +1,10 @@ +What: /sys/o2cb symlink +Date: May 2011 +KernelVersion: 2.6.40 +Contact: ocfs2-devel@oss.oracle.com +Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is + removed when new versions of ocfs2-tools which know to look + in /sys/fs/o2cb are sufficiently prevalent. Don't code new + software to look here, it should try /sys/fs/o2cb instead. +Users: ocfs2-tools. It's sufficient to mail proposed changes to + ocfs2-devel@oss.oracle.com. diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 4873c75..c1eb41c 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -142,3 +142,67 @@ Description: with the previous I/O request are enabled. When set to 2, all merge tries are disabled. The default value is 0 - which enables all types of merge tries. + +What: /sys/block//discard_alignment +Date: May 2011 +Contact: Martin K. Petersen +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + device is offset from the internal allocation unit's + natural alignment. + +What: /sys/block///discard_alignment +Date: May 2011 +Contact: Martin K. Petersen +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + partition is offset from the internal allocation unit's + natural alignment. + +What: /sys/block//queue/discard_granularity +Date: May 2011 +Contact: Martin K. Petersen +Description: + Devices that support discard functionality may + internally allocate space using units that are bigger + than the logical block size. The discard_granularity + parameter indicates the size of the internal allocation + unit in bytes if reported by the device. Otherwise the + discard_granularity will be set to match the device's + physical block size. A discard_granularity of 0 means + that the device does not support discard functionality. + +What: /sys/block//queue/discard_max_bytes +Date: May 2011 +Contact: Martin K. Petersen +Description: + Devices that support discard functionality may have + internal limits on the number of bytes that can be + trimmed or unmapped in a single operation. Some storage + protocols also have inherent limits on the number of + blocks that can be described in a single command. The + discard_max_bytes parameter is set by the device driver + to the maximum number of bytes that can be discarded in + a single operation. Discard requests issued to the + device must not exceed this limit. A discard_max_bytes + value of 0 means that the device does not support + discard functionality. + +What: /sys/block//queue/discard_zeroes_data +Date: May 2011 +Contact: Martin K. Petersen +Description: + Devices that support discard functionality may return + stale or random data when a previously discarded block + is read back. This can cause problems if the filesystem + expects discarded blocks to be explicitly cleared. If a + device reports that it deterministically returns zeroes + when a discarded area is read the discard_zeroes_data + parameter will be set to one. Otherwise it will be 0 and + the result of reading a discarded area is undefined. diff --git a/Documentation/ABI/testing/sysfs-bus-bcma b/Documentation/ABI/testing/sysfs-bus-bcma new file mode 100644 index 0000000..06b62ba --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-bcma @@ -0,0 +1,31 @@ +What: /sys/bus/bcma/devices/.../manuf +Date: May 2011 +KernelVersion: 2.6.40 +Contact: Rafał Miłecki +Description: + Each BCMA core has it's manufacturer id. See + include/linux/bcma/bcma.h for possible values. + +What: /sys/bus/bcma/devices/.../id +Date: May 2011 +KernelVersion: 2.6.40 +Contact: Rafał Miłecki +Description: + There are a few types of BCMA cores, they can be identified by + id field. + +What: /sys/bus/bcma/devices/.../rev +Date: May 2011 +KernelVersion: 2.6.40 +Contact: Rafał Miłecki +Description: + BCMA cores of the same type can still slightly differ depending + on their revision. Use it for detailed programming. + +What: /sys/bus/bcma/devices/.../class +Date: May 2011 +KernelVersion: 2.6.40 +Contact: Rafał Miłecki +Description: + Each BCMA core is identified by few fields, including class it + belongs to. See include/linux/bcma/bcma.h for possible values. diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 36bf454..349ecf2 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -74,6 +74,15 @@ Description: hot-remove the PCI device and any of its children. Depends on CONFIG_HOTPLUG. +What: /sys/bus/pci/devices/.../pci_bus/.../rescan +Date: May 2011 +Contact: Linux PCI developers +Description: + Writing a non-zero value to this attribute will + force a rescan of the bus and all child buses, + and re-discover devices removed earlier from this + part of the device tree. Depends on CONFIG_HOTPLUG. + What: /sys/bus/pci/devices/.../rescan Date: January 2009 Contact: Linux PCI developers diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 7564e88..e7be75b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -183,21 +183,21 @@ Description: Discover and change clock speed of CPUs to learn how to control the knobs. -What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X -Date: August 2008 +What: /sys/devices/system/cpu/cpu*/cache/index3/cache_disable_{0,1} +Date: August 2008 KernelVersion: 2.6.27 -Contact: mark.langsdorf@amd.com -Description: These files exist in every cpu's cache index directories. - There are currently 2 cache_disable_# files in each - directory. Reading from these files on a supported - processor will return that cache disable index value - for that processor and node. Writing to one of these - files will cause the specificed cache index to be disabled. - - Currently, only AMD Family 10h Processors support cache index - disable, and only for their L3 caches. See the BIOS and - Kernel Developer's Guide at - http://support.amd.com/us/Embedded_TechDocs/31116-Public-GH-BKDG_3-28_5-28-09.pdf - for formatting information and other details on the - cache index disable. -Users: joachim.deguara@amd.com +Contact: discuss@x86-64.org +Description: Disable L3 cache indices + + These files exist in every CPU's cache/index3 directory. Each + cache_disable_{0,1} file corresponds to one disable slot which + can be used to disable a cache index. Reading from these files + on a processor with this functionality will return the currently + disabled index for that node. There is one L3 structure per + node, or per internal node on MCM machines. Writing a valid + index to one of these files will cause the specificed cache + index to be disabled. + + All AMD processors with L3 caches provide this functionality. + For details, see BKDGs at + http://developer.amd.com/documentation/guides/Pages/default.aspx diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus b/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus index 326e054..c1b53b8 100644 --- a/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus +++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus @@ -1,9 +1,12 @@ What: /sys/bus/usb/devices/-:./::./koneplus/roccatkoneplus/actual_profile Date: October 2010 Contact: Stefan Achatz -Description: When read, this file returns the number of the actual profile in - range 0-4. - This file is readonly. +Description: The integer value of this attribute ranges from 0-4. + When read, this attribute returns the number of the actual + profile. This value is persistent, so its equivalent to the + profile that's active when the mouse is powered on next time. + When written, this file sets the number of the startup profile + and the mouse activates this profile immediately. Users: http://roccat.sourceforge.net What: /sys/bus/usb/devices/-:./::./koneplus/roccatkoneplus/firmware_version @@ -89,16 +92,6 @@ Description: The mouse has a tracking- and a distance-control-unit. These This file is writeonly. Users: http://roccat.sourceforge.net -What: /sys/bus/usb/devices/-:./::./koneplus/roccatkoneplus/startup_profile -Date: October 2010 -Contact: Stefan Achatz -Description: The integer value of this attribute ranges from 0-4. - When read, this attribute returns the number of the profile - that's active when the mouse is powered on. - When written, this file sets the number of the startup profile - and the mouse activates this profile immediately. -Users: http://roccat.sourceforge.net - What: /sys/bus/usb/devices/-:./::./koneplus/roccatkoneplus/tcu Date: October 2010 Contact: Stefan Achatz diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi index ba9da95..c78f9ab 100644 --- a/Documentation/ABI/testing/sysfs-firmware-dmi +++ b/Documentation/ABI/testing/sysfs-firmware-dmi @@ -14,14 +14,15 @@ Description: DMI is structured as a large table of entries, where each entry has a common header indicating the type and - length of the entry, as well as 'handle' that is - supposed to be unique amongst all entries. + length of the entry, as well as a firmware-provided + 'handle' that is supposed to be unique amongst all + entries. Some entries are required by the specification, but many others are optional. In general though, users should never expect to find a specific entry type on their system unless they know for certain what their firmware - is doing. Machine to machine will vary. + is doing. Machine to machine experiences will vary. Multiple entries of the same type are allowed. In order to handle these duplicate entry types, each entry is @@ -67,25 +68,24 @@ Description: and the two terminating nul characters. type : The type of the entry. This value is the same as found in the directory name. It indicates - how the rest of the entry should be - interpreted. + how the rest of the entry should be interpreted. instance: The instance ordinal of the entry for the given type. This value is the same as found in the parent directory name. - position: The position of the entry within the entirety - of the entirety. + position: The ordinal position (zero-based) of the entry + within the entirety of the DMI entry table. === Entry Specialization === Some entry types may have other information available in - sysfs. + sysfs. Not all types are specialized. --- Type 15 - System Event Log --- This entry allows the firmware to export a log of events the system has taken. This information is typically backed by nvram, but the implementation - details are abstracted by this table. This entries data + details are abstracted by this table. This entry's data is exported in the directory: /sys/firmware/dmi/entries/15-0/system_event_log diff --git a/Documentation/ABI/testing/sysfs-firmware-gsmi b/Documentation/ABI/testing/sysfs-firmware-gsmi new file mode 100644 index 0000000..0faa0aa --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-gsmi @@ -0,0 +1,58 @@ +What: /sys/firmware/gsmi +Date: March 2011 +Contact: Mike Waychison +Description: + Some servers used internally at Google have firmware + that provides callback functionality via explicit SMI + triggers. Some of the callbacks are similar to those + provided by the EFI runtime services page, but due to + historical reasons this different entry-point has been + used. + + The gsmi driver implements the kernel's abstraction for + these firmware callbacks. Currently, this functionality + is limited to handling the system event log and getting + access to EFI-style variables stored in nvram. + + Layout: + + /sys/firmware/gsmi/vars: + + This directory has the same layout (and + underlying implementation as /sys/firmware/efi/vars. + See Documentation/ABI/*/sysfs-firmware-efi-vars + for more information on how to interact with + this structure. + + /sys/firmware/gsmi/append_to_eventlog - write-only: + + This file takes a binary blob and passes it onto + the firmware to be timestamped and appended to + the system eventlog. The binary format is + interpreted by the firmware and may change from + platform to platform. The only kernel-enforced + requirement is that the blob be prefixed with a + 32bit host-endian type used as part of the + firmware call. + + /sys/firmware/gsmi/clear_config - write-only: + + Writing any value to this file will cause the + entire firmware configuration to be reset to + "factory defaults". Callers should assume that + a reboot is required for the configuration to be + cleared. + + /sys/firmware/gsmi/clear_eventlog - write-only: + + This file is used to clear out a portion/the + whole of the system event log. Values written + should be values between 1 and 100 inclusive (in + ASCII) representing the fraction of the log to + clear. Not all platforms support fractional + clearing though, and this writes to this file + will error out if the firmware doesn't like your + submitted fraction. + + Callers should assume that a reboot is needed + for this operation to complete. diff --git a/Documentation/ABI/testing/sysfs-firmware-log b/Documentation/ABI/testing/sysfs-firmware-log new file mode 100644 index 0000000..9b58e7c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-log @@ -0,0 +1,7 @@ +What: /sys/firmware/log +Date: February 2011 +Contact: Mike Waychison +Description: + The /sys/firmware/log is a binary file that represents a + read-only copy of the firmware's log if one is + available. diff --git a/Documentation/ABI/testing/sysfs-kernel-fscaps b/Documentation/ABI/testing/sysfs-kernel-fscaps new file mode 100644 index 0000000..50a3033 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-fscaps @@ -0,0 +1,8 @@ +What: /sys/kernel/fscaps +Date: February 2011 +KernelVersion: 2.6.38 +Contact: Ludwig Nussel +Description + Shows whether file system capabilities are honored + when executing a binary + diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache new file mode 100644 index 0000000..662ae64 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache @@ -0,0 +1,11 @@ +What: /sys/kernel/mm/cleancache/ +Date: April 2011 +Contact: Dan Magenheimer +Description: + /sys/kernel/mm/cleancache/ contains a number of files which + record a count of various cleancache operations + (sum across all filesystems): + succ_gets + failed_gets + puts + flushes diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 194ca44..b464d12 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -158,3 +158,17 @@ Description: successful, will make the kernel abort a subsequent transition to a sleep state if any wakeup events are reported after the write has returned. + +What: /sys/power/reserved_size +Date: May 2011 +Contact: Rafael J. Wysocki +Description: + The /sys/power/reserved_size file allows user space to control + the amount of memory reserved for allocations made by device + drivers during the "device freeze" stage of hibernation. It can + be written a string representing a non-negative integer that + will be used as the amount of memory to reserve for allocations + made by device drivers' "freeze" callbacks, in bytes. + + Reading from this file will display the current value, which is + set to 1 MB by default. diff --git a/Documentation/ABI/testing/sysfs-ptp b/Documentation/ABI/testing/sysfs-ptp new file mode 100644 index 0000000..d40d2b5 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-ptp @@ -0,0 +1,98 @@ +What: /sys/class/ptp/ +Date: September 2010 +Contact: Richard Cochran +Description: + This directory contains files and directories + providing a standardized interface to the ancillary + features of PTP hardware clocks. + +What: /sys/class/ptp/ptpN/ +Date: September 2010 +Contact: Richard Cochran +Description: + This directory contains the attributes of the Nth PTP + hardware clock registered into the PTP class driver + subsystem. + +What: /sys/class/ptp/ptpN/clock_name +Date: September 2010 +Contact: Richard Cochran +Description: + This file contains the name of the PTP hardware clock + as a human readable string. + +What: /sys/class/ptp/ptpN/max_adjustment +Date: September 2010 +Contact: Richard Cochran +Description: + This file contains the PTP hardware clock's maximum + frequency adjustment value (a positive integer) in + parts per billion. + +What: /sys/class/ptp/ptpN/n_alarms +Date: September 2010 +Contact: Richard Cochran +Description: + This file contains the number of periodic or one shot + alarms offer by the PTP hardware clock. + +What: /sys/class/ptp/ptpN/n_external_timestamps +Date: September 2010 +Contact: Richard Cochran +Description: + This file contains the number of external timestamp + channels offered by the PTP hardware clock. + +What: /sys/class/ptp/ptpN/n_periodic_outputs +Date: September 2010 +Contact: Richard Cochran +Description: + This file contains the number of programmable periodic + output channels offered by the PTP hardware clock. + +What: /sys/class/ptp/ptpN/pps_avaiable +Date: September 2010 +Contact: Richard Cochran +Description: + This file indicates whether the PTP hardware clock + supports a Pulse Per Second to the host CPU. Reading + "1" means that the PPS is supported, while "0" means + not supported. + +What: /sys/class/ptp/ptpN/extts_enable +Date: September 2010 +Contact: Richard Cochran +Description: + This write-only file enables or disables external + timestamps. To enable external timestamps, write the + channel index followed by a "1" into the file. + To disable external timestamps, write the channel + index followed by a "0" into the file. + +What: /sys/class/ptp/ptpN/fifo +Date: September 2010 +Contact: Richard Cochran +Description: + This file provides timestamps on external events, in + the form of three integers: channel index, seconds, + and nanoseconds. + +What: /sys/class/ptp/ptpN/period +Date: September 2010 +Contact: Richard Cochran +Description: + This write-only file enables or disables periodic + outputs. To enable a periodic output, write five + integers into the file: channel index, start time + seconds, start time nanoseconds, period seconds, and + period nanoseconds. To disable a periodic output, set + all the seconds and nanoseconds values to zero. + +What: /sys/class/ptp/ptpN/pps_enable +Date: September 2010 +Contact: Richard Cochran +Description: + This write-only file enables or disables delivery of + PPS events to the Linux PPS subsystem. To enable PPS + events, write a "1" into the file. To disable events, + write a "0" into the file. diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore index c6def35..679034c 100644 --- a/Documentation/DocBook/.gitignore +++ b/Documentation/DocBook/.gitignore @@ -8,3 +8,4 @@ *.dvi *.log *.out +media/ diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 8436b01..3cebfa0 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -73,7 +73,7 @@ installmandocs: mandocs ### #External programs used KERNELDOC = $(srctree)/scripts/kernel-doc -DOCPROC = $(objtree)/scripts/basic/docproc +DOCPROC = $(objtree)/scripts/docproc XMLTOFLAGS = -m $(srctree)/Documentation/DocBook/stylesheet.xsl XMLTOFLAGS += --skip-validation diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 36f63d4..b638e50 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -96,10 +96,10 @@ X!Iinclude/linux/kobject.h Device drivers infrastructure + The Basic Device Driver-Model Structures +!Iinclude/linux/device.h + Device Drivers Base - !Edrivers/base/driver.c !Edrivers/base/core.c !Edrivers/base/class.c diff --git a/Documentation/DocBook/dvb/dvbapi.xml b/Documentation/DocBook/dvb/dvbapi.xml index ad8678d..9fad86c 100644 --- a/Documentation/DocBook/dvb/dvbapi.xml +++ b/Documentation/DocBook/dvb/dvbapi.xml @@ -35,6 +35,14 @@ + 2.0.4 + 2011-05-06 + mcc + + Add more information about DVB APIv5, better describing the frontend GET/SET props ioctl's. + + + 2.0.3 2010-07-03 mcc diff --git a/Documentation/DocBook/dvb/dvbproperty.xml b/Documentation/DocBook/dvb/dvbproperty.xml index 97f397e..52d5e3c 100644 --- a/Documentation/DocBook/dvb/dvbproperty.xml +++ b/Documentation/DocBook/dvb/dvbproperty.xml @@ -1,6 +1,327 @@ -
+
FE_GET_PROPERTY/FE_SET_PROPERTY + +/* Reserved fields should be set to 0 */ +struct dtv_property { + __u32 cmd; + union { + __u32 data; + struct { + __u8 data[32]; + __u32 len; + __u32 reserved1[3]; + void *reserved2; + } buffer; + } u; + int result; +} __attribute__ ((packed)); + +/* num of properties cannot exceed DTV_IOCTL_MAX_MSGS per ioctl */ +#define DTV_IOCTL_MAX_MSGS 64 + +struct dtv_properties { + __u32 num; + struct dtv_property *props; +}; + + +
+FE_GET_PROPERTY +DESCRIPTION + + +This ioctl call returns one or more frontend properties. This call only + requires read-only access to the device. + + +SYNOPSIS + + +int ioctl(int fd, int request = FE_GET_PROPERTY, + dtv_properties ⋆props); + + +PARAMETERS + + +int fd + +File descriptor returned by a previous call to open(). + + +int num + +Equals FE_GET_PROPERTY for this command. + + +struct dtv_property *props + +Points to the location where the front-end property commands are stored. + + +ERRORS + + EINVAL + Invalid parameter(s) received or number of parameters out of the range. + + ENOMEM + Out of memory. + + EFAULT + Failure while copying data from/to userspace. + + EOPNOTSUPP + Property type not supported. + +
+ +
+FE_SET_PROPERTY +DESCRIPTION + + +This ioctl call sets one or more frontend properties. This call only + requires read-only access to the device. + + +SYNOPSIS + + +int ioctl(int fd, int request = FE_SET_PROPERTY, + dtv_properties ⋆props); + + +PARAMETERS + + +int fd + +File descriptor returned by a previous call to open(). + + +int num + +Equals FE_SET_PROPERTY for this command. + + +struct dtv_property *props + +Points to the location where the front-end property commands are stored. + + +ERRORS + + + EINVAL + Invalid parameter(s) received or number of parameters out of the range. + + ENOMEM + Out of memory. + + EFAULT + Failure while copying data from/to userspace. + + EOPNOTSUPP + Property type not supported. + +
+ + +On FE_GET_PROPERTY/FE_SET_PROPERTY, +the actual action is determined by the dtv_property cmd/data pairs. With one single ioctl, is possible to +get/set up to 64 properties. The actual meaning of each property is described on the next sections. + + +The Available frontend property types are: + +#define DTV_UNDEFINED 0 +#define DTV_TUNE 1 +#define DTV_CLEAR 2 +#define DTV_FREQUENCY 3 +#define DTV_MODULATION 4 +#define DTV_BANDWIDTH_HZ 5 +#define DTV_INVERSION 6 +#define DTV_DISEQC_MASTER 7 +#define DTV_SYMBOL_RATE 8 +#define DTV_INNER_FEC 9 +#define DTV_VOLTAGE 10 +#define DTV_TONE 11 +#define DTV_PILOT 12 +#define DTV_ROLLOFF 13 +#define DTV_DISEQC_SLAVE_REPLY 14 +#define DTV_FE_CAPABILITY_COUNT 15 +#define DTV_FE_CAPABILITY 16 +#define DTV_DELIVERY_SYSTEM 17 +#define DTV_ISDBT_PARTIAL_RECEPTION 18 +#define DTV_ISDBT_SOUND_BROADCASTING 19 +#define DTV_ISDBT_SB_SUBCHANNEL_ID 20 +#define DTV_ISDBT_SB_SEGMENT_IDX 21 +#define DTV_ISDBT_SB_SEGMENT_COUNT 22 +#define DTV_ISDBT_LAYERA_FEC 23 +#define DTV_ISDBT_LAYERA_MODULATION 24 +#define DTV_ISDBT_LAYERA_SEGMENT_COUNT 25 +#define DTV_ISDBT_LAYERA_TIME_INTERLEAVING 26 +#define DTV_ISDBT_LAYERB_FEC 27 +#define DTV_ISDBT_LAYERB_MODULATION 28 +#define DTV_ISDBT_LAYERB_SEGMENT_COUNT 29 +#define DTV_ISDBT_LAYERB_TIME_INTERLEAVING 30 +#define DTV_ISDBT_LAYERC_FEC 31 +#define DTV_ISDBT_LAYERC_MODULATION 32 +#define DTV_ISDBT_LAYERC_SEGMENT_COUNT 33 +#define DTV_ISDBT_LAYERC_TIME_INTERLEAVING 34 +#define DTV_API_VERSION 35 +#define DTV_CODE_RATE_HP 36 +#define DTV_CODE_RATE_LP 37 +#define DTV_GUARD_INTERVAL 38 +#define DTV_TRANSMISSION_MODE 39 +#define DTV_HIERARCHY 40 +#define DTV_ISDBT_LAYER_ENABLED 41 +#define DTV_ISDBS_TS_ID 42 + + +
+ Parameters that are common to all Digital TV standards +
+ <constant>DTV_FREQUENCY</constant> + + Central frequency of the channel, in HZ. + + Notes: + 1)For ISDB-T, the channels are usually transmitted with an offset of 143kHz. + E.g. a valid frequncy could be 474143 kHz. The stepping is bound to the bandwidth of + the channel which is 6MHz. + + 2)As in ISDB-Tsb the channel consists of only one or three segments the + frequency step is 429kHz, 3*429 respectively. As for ISDB-T the + central frequency of the channel is expected. +
+ +
+ <constant>DTV_BANDWIDTH_HZ</constant> + + Bandwidth for the channel, in HZ. + + Possible values: + 1712000, + 5000000, + 6000000, + 7000000, + 8000000, + 10000000. + + + Notes: + + 1) For ISDB-T it should be always 6000000Hz (6MHz) + 2) For ISDB-Tsb it can vary depending on the number of connected segments + 3) Bandwidth doesn't apply for DVB-C transmissions, as the bandwidth + for DVB-C depends on the symbol rate + 4) Bandwidth in ISDB-T is fixed (6MHz) or can be easily derived from + other parameters (DTV_ISDBT_SB_SEGMENT_IDX, + DTV_ISDBT_SB_SEGMENT_COUNT). + 5) DVB-T supports 6, 7 and 8MHz. + 6) In addition, DVB-T2 supports 1.172, 5 and 10MHz. +
+ +
+ <constant>DTV_DELIVERY_SYSTEM</constant> + + Specifies the type of Delivery system + + Possible values: + +typedef enum fe_delivery_system { + SYS_UNDEFINED, + SYS_DVBC_ANNEX_AC, + SYS_DVBC_ANNEX_B, + SYS_DVBT, + SYS_DSS, + SYS_DVBS, + SYS_DVBS2, + SYS_DVBH, + SYS_ISDBT, + SYS_ISDBS, + SYS_ISDBC, + SYS_ATSC, + SYS_ATSCMH, + SYS_DMBTH, + SYS_CMMB, + SYS_DAB, + SYS_DVBT2, +} fe_delivery_system_t; + + +
+ +
+ <constant>DTV_TRANSMISSION_MODE</constant> + + Specifies the number of carriers used by the standard + + Possible values are: + +typedef enum fe_transmit_mode { + TRANSMISSION_MODE_2K, + TRANSMISSION_MODE_8K, + TRANSMISSION_MODE_AUTO, + TRANSMISSION_MODE_4K, + TRANSMISSION_MODE_1K, + TRANSMISSION_MODE_16K, + TRANSMISSION_MODE_32K, +} fe_transmit_mode_t; + + + Notes: + 1) ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. It is called + 'mode' in the standard: Mode 1 is 2K, mode 2 is 4K, mode 3 is 8K + + 2) If DTV_TRANSMISSION_MODE is set the TRANSMISSION_MODE_AUTO the + hardware will try to find the correct FFT-size (if capable) and will + use TMCC to fill in the missing parameters. + 3) DVB-T specifies 2K and 8K as valid sizes. + 4) DVB-T2 specifies 1K, 2K, 4K, 8K, 16K and 32K. +
+ +
+ <constant>DTV_GUARD_INTERVAL</constant> + + Possible values are: + +typedef enum fe_guard_interval { + GUARD_INTERVAL_1_32, + GUARD_INTERVAL_1_16, + GUARD_INTERVAL_1_8, + GUARD_INTERVAL_1_4, + GUARD_INTERVAL_AUTO, + GUARD_INTERVAL_1_128, + GUARD_INTERVAL_19_128, + GUARD_INTERVAL_19_256, +} fe_guard_interval_t; + + + Notes: + 1) If DTV_GUARD_INTERVAL is set the GUARD_INTERVAL_AUTO the hardware will + try to find the correct guard interval (if capable) and will use TMCC to fill + in the missing parameters. + 2) Intervals 1/128, 19/128 and 19/256 are used only for DVB-T2 at present +
+
+
ISDB-T frontend This section describes shortly what are the possible parameters in the Linux @@ -32,73 +353,6 @@ Parameters used by ISDB-T and ISDB-Tsb. -
- Parameters that are common with DVB-T and ATSC - -
- <constant>DTV_FREQUENCY</constant> - - Central frequency of the channel. - - For ISDB-T the channels are usually transmitted with an offset of 143kHz. E.g. a - valid frequncy could be 474143 kHz. The stepping is bound to the bandwidth of - the channel which is 6MHz. - - As in ISDB-Tsb the channel consists of only one or three segments the - frequency step is 429kHz, 3*429 respectively. As for ISDB-T the - central frequency of the channel is expected. -
- -
- <constant>DTV_BANDWIDTH_HZ</constant> (optional) - - Possible values: - - For ISDB-T it should be always 6000000Hz (6MHz) - For ISDB-Tsb it can vary depending on the number of connected segments - - Note: Hardware specific values might be given here, but standard - applications should not bother to set a value to this field as - standard demods are ignoring it anyway. - - Bandwidth in ISDB-T is fixed (6MHz) or can be easily derived from - other parameters (DTV_ISDBT_SB_SEGMENT_IDX, - DTV_ISDBT_SB_SEGMENT_COUNT). -
- -
- <constant>DTV_DELIVERY_SYSTEM</constant> - - Possible values: SYS_ISDBT -
- -
- <constant>DTV_TRANSMISSION_MODE</constant> - - ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. It is called - 'mode' in the standard: Mode 1 is 2K, mode 2 is 4K, mode 3 is 8K - - Possible values: TRANSMISSION_MODE_2K, TRANSMISSION_MODE_8K, - TRANSMISSION_MODE_AUTO, TRANSMISSION_MODE_4K - - If DTV_TRANSMISSION_MODE is set the TRANSMISSION_MODE_AUTO the - hardware will try to find the correct FFT-size (if capable) and will - use TMCC to fill in the missing parameters. - - TRANSMISSION_MODE_4K is added at the same time as the other new parameters. -
- -
- <constant>DTV_GUARD_INTERVAL</constant> - - Possible values: GUARD_INTERVAL_1_32, GUARD_INTERVAL_1_16, GUARD_INTERVAL_1_8, - GUARD_INTERVAL_1_4, GUARD_INTERVAL_AUTO - - If DTV_GUARD_INTERVAL is set the GUARD_INTERVAL_AUTO the hardware will - try to find the correct guard interval (if capable) and will use TMCC to fill - in the missing parameters. -
-
ISDB-T only parameters @@ -314,5 +568,20 @@
+
+ DVB-T2 parameters + + This section covers parameters that apply only to the DVB-T2 delivery method. DVB-T2 + support is currently in the early stages development so expect this section to grow + and become more detailed with time. + +
+ <constant>DTV_DVBT2_PLP_ID</constant> + + DVB-T2 supports Physical Layer Pipes (PLP) to allow transmission of + many data types via a single multiplex. The API will soon support this + at which point this section will be expanded. +
+
diff --git a/Documentation/DocBook/dvb/frontend.h.xml b/Documentation/DocBook/dvb/frontend.h.xml index d08e0d4..d792f78 100644 --- a/Documentation/DocBook/dvb/frontend.h.xml +++ b/Documentation/DocBook/dvb/frontend.h.xml @@ -176,14 +176,20 @@ typedef enum fe_transmit_mode { TRANSMISSION_MODE_2K, TRANSMISSION_MODE_8K, TRANSMISSION_MODE_AUTO, - TRANSMISSION_MODE_4K + TRANSMISSION_MODE_4K, + TRANSMISSION_MODE_1K, + TRANSMISSION_MODE_16K, + TRANSMISSION_MODE_32K, } fe_transmit_mode_t; typedef enum fe_bandwidth { BANDWIDTH_8_MHZ, BANDWIDTH_7_MHZ, BANDWIDTH_6_MHZ, - BANDWIDTH_AUTO + BANDWIDTH_AUTO, + BANDWIDTH_5_MHZ, + BANDWIDTH_10_MHZ, + BANDWIDTH_1_712_MHZ, } fe_bandwidth_t; @@ -192,7 +198,10 @@ typedef enum fe_guard_interval { GUARD_INTERVAL_1_16, GUARD_INTERVAL_1_8, GUARD_INTERVAL_1_4, - GUARD_INTERVAL_AUTO + GUARD_INTERVAL_AUTO, + GUARD_INTERVAL_1_128, + GUARD_INTERVAL_19_128, + GUARD_INTERVAL_19_256, } fe_guard_interval_t; @@ -306,7 +315,9 @@ struct dvb_frontend_event { #define DTV_ISDBS_TS_ID 42 -#define DTV_MAX_COMMAND DTV_ISDBS_TS_ID +#define DTV_DVBT2_PLP_ID 43 + +#define DTV_MAX_COMMAND DTV_DVBT2_PLP_ID typedef enum fe_pilot { PILOT_ON, @@ -338,6 +349,7 @@ typedef enum fe_delivery_system { SYS_DMBTH, SYS_CMMB, SYS_DAB, + SYS_DVBT2, } fe_delivery_system_t; struct dtv_cmds_h { diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl index fb10fd0..b342234 100644 --- a/Documentation/DocBook/genericirq.tmpl +++ b/Documentation/DocBook/genericirq.tmpl @@ -191,8 +191,8 @@ Whenever an interrupt triggers, the lowlevel arch code calls into the generic interrupt code by calling desc->handle_irq(). - This highlevel IRQ handling function only uses desc->chip primitives - referenced by the assigned chip descriptor structure. + This highlevel IRQ handling function only uses desc->irq_data.chip + primitives referenced by the assigned chip descriptor structure.
@@ -206,11 +206,11 @@ enable_irq() disable_irq_nosync() (SMP only) synchronize_irq() (SMP only) - set_irq_type() - set_irq_wake() - set_irq_data() - set_irq_chip() - set_irq_chip_data() + irq_set_irq_type() + irq_set_irq_wake() + irq_set_handler_data() + irq_set_chip() + irq_set_chip_data() See the autogenerated function documentation for details. @@ -225,6 +225,8 @@ handle_fasteoi_irq handle_simple_irq handle_percpu_irq + handle_edge_eoi_irq + handle_bad_irq The interrupt flow handlers (either predefined or architecture specific) are assigned to specific interrupts by the architecture @@ -241,13 +243,13 @@ default_enable(struct irq_data *data) { - desc->chip->irq_unmask(data); + desc->irq_data.chip->irq_unmask(data); } default_disable(struct irq_data *data) { if (!delay_disable(data)) - desc->chip->irq_mask(data); + desc->irq_data.chip->irq_mask(data); } default_ack(struct irq_data *data) @@ -284,9 +286,9 @@ noop(struct irq_data *data)) The following control flow is implemented (simplified excerpt): -desc->chip->irq_mask(); -handle_IRQ_event(desc->action); -desc->chip->irq_unmask(); +desc->irq_data.chip->irq_mask_ack(); +handle_irq_event(desc->action); +desc->irq_data.chip->irq_unmask(); @@ -300,8 +302,8 @@ desc->chip->irq_unmask(); The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); -desc->chip->irq_eoi(); +handle_irq_event(desc->action); +desc->irq_data.chip->irq_eoi(); @@ -315,17 +317,17 @@ desc->chip->irq_eoi(); The following control flow is implemented (simplified excerpt): if (desc->status & running) { - desc->chip->irq_mask(); + desc->irq_data.chip->irq_mask_ack(); desc->status |= pending | masked; return; } -desc->chip->irq_ack(); +desc->irq_data.chip->irq_ack(); desc->status |= running; do { if (desc->status & masked) - desc->chip->irq_unmask(); + desc->irq_data.chip->irq_unmask(); desc->status &= ~pending; - handle_IRQ_event(desc->action); + handle_irq_event(desc->action); } while (status & pending); desc->status &= ~running; @@ -344,7 +346,7 @@ desc->status &= ~running; The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); +handle_irq_event(desc->action); @@ -362,12 +364,29 @@ handle_IRQ_event(desc->action); The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); -if (desc->chip->irq_eoi) - desc->chip->irq_eoi(); +if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(); +handle_irq_event(desc->action); +if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(); + + EOI Edge IRQ flow handler + + handle_edge_eoi_irq provides an abnomination of the edge + handler which is solely used to tame a badly wreckaged + irq controller on powerpc/cell. + + + + Bad IRQ flow handler + + handle_bad_irq is used for spurious interrupts which + have no real handler assigned.. + + Quirks and optimizations @@ -410,6 +429,7 @@ if (desc->chip->irq_eoi) irq_mask_ack() - Optional, recommended for performance irq_mask() irq_unmask() + irq_eoi() - Optional, required for eoi flow handlers irq_retrigger() - Optional irq_set_type() - Optional irq_set_wake() - Optional @@ -424,32 +444,24 @@ if (desc->chip->irq_eoi) __do_IRQ entry point - The original implementation __do_IRQ() is an alternative entry - point for all types of interrupts. + The original implementation __do_IRQ() was an alternative entry + point for all types of interrupts. It not longer exists. This handler turned out to be not suitable for all interrupt hardware and was therefore reimplemented with split - functionality for egde/level/simple/percpu interrupts. This is not + functionality for edge/level/simple/percpu interrupts. This is not only a functional optimization. It also shortens code paths for interrupts. - - To make use of the split implementation, replace the call to - __do_IRQ by a call to desc->handle_irq() and associate - the appropriate handler function to desc->handle_irq(). - In most cases the generic handler implementations should - be sufficient. - Locking on SMP The locking of chip registers is up to the architecture that - defines the chip primitives. There is a chip->lock field that can be used - for serialization, but the generic layer does not touch it. The per-irq - structure is protected via desc->lock, by the generic layer. + defines the chip primitives. The per-irq structure is + protected via desc->lock, by the generic layer. diff --git a/Documentation/DocBook/media-entities.tmpl b/Documentation/DocBook/media-entities.tmpl index fea63b4..c8abb23 100644 --- a/Documentation/DocBook/media-entities.tmpl +++ b/Documentation/DocBook/media-entities.tmpl @@ -270,6 +270,7 @@ + @@ -295,6 +296,7 @@ + diff --git a/Documentation/DocBook/v4l/pixfmt-m420.xml b/Documentation/DocBook/v4l/pixfmt-m420.xml new file mode 100644 index 0000000..ce4bc01 --- /dev/null +++ b/Documentation/DocBook/v4l/pixfmt-m420.xml @@ -0,0 +1,147 @@ + + + V4L2_PIX_FMT_M420 ('M420') + &manvol; + + + V4L2_PIX_FMT_M420 + Format with ½ horizontal and vertical chroma + resolution, also known as YUV 4:2:0. Hybrid plane line-interleaved + layout. + + + Description + + M420 is a YUV format with ½ horizontal and vertical chroma + subsampling (YUV 4:2:0). Pixels are organized as interleaved luma and + chroma planes. Two lines of luma data are followed by one line of chroma + data. + The luma plane has one byte per pixel. The chroma plane contains + interleaved CbCr pixels subsampled by ½ in the horizontal and + vertical directions. Each CbCr pair belongs to four pixels. For example, +Cb0/Cr0 belongs to +Y'00, Y'01, +Y'10, Y'11. + + All line lengths are identical: if the Y lines include pad bytes + so do the CbCr lines. + + + <constant>V4L2_PIX_FMT_M420</constant> 4 × 4 +pixel image + + + Byte Order. + Each cell is one byte. + + + + + + start + 0: + Y'00 + Y'01 + Y'02 + Y'03 + + + start + 4: + Y'10 + Y'11 + Y'12 + Y'13 + + + start + 8: + Cb00 + Cr00 + Cb01 + Cr01 + + + start + 16: + Y'20 + Y'21 + Y'22 + Y'23 + + + start + 20: + Y'30 + Y'31 + Y'32 + Y'33 + + + start + 24: + Cb10 + Cr10 + Cb11 + Cr11 + + + + + + + + + Color Sample Location. + + + + + + + 01 + 23 + + + 0 + YY + YY + + + + C + C + + + 1 + YY + YY + + + + + + 2 + YY + YY + + + + C + C + + + 3 + YY + YY + + + + + + + + + + + diff --git a/Documentation/DocBook/v4l/pixfmt-y10b.xml b/Documentation/DocBook/v4l/pixfmt-y10b.xml new file mode 100644 index 0000000..adb0ad8 --- /dev/null +++ b/Documentation/DocBook/v4l/pixfmt-y10b.xml @@ -0,0 +1,43 @@ + + + V4L2_PIX_FMT_Y10BPACK ('Y10B') + &manvol; + + + V4L2_PIX_FMT_Y10BPACK + Grey-scale image as a bit-packed array + + + Description + + This is a packed grey-scale image format with a depth of 10 bits per + pixel. Pixels are stored in a bit-packed array of 10bit bits per pixel, + with no padding between them and with the most significant bits coming + first from the left. + + + <constant>V4L2_PIX_FMT_Y10BPACK</constant> 4 pixel data stream taking 5 bytes + + + Bit-packed representation + pixels cross the byte boundary and have a ratio of 5 bytes for each 4 + pixels. + + + + + + Y'00[9:2] + Y'00[1:0]Y'01[9:4] + Y'01[3:0]Y'02[9:6] + Y'02[5:0]Y'03[9:8] + Y'03[7:0] + + + + + + + + + diff --git a/Documentation/DocBook/v4l/pixfmt.xml b/Documentation/DocBook/v4l/pixfmt.xml index 40af4be..dbfe3b0 100644 --- a/Documentation/DocBook/v4l/pixfmt.xml +++ b/Documentation/DocBook/v4l/pixfmt.xml @@ -697,6 +697,7 @@ information. &sub-grey; &sub-y10; &sub-y12; + &sub-y10b; &sub-y16; &sub-yuyv; &sub-uyvy; @@ -712,6 +713,7 @@ information. &sub-nv12m; &sub-nv12mt; &sub-nv16; + &sub-m420;
diff --git a/Documentation/DocBook/v4l/subdev-formats.xml b/Documentation/DocBook/v4l/subdev-formats.xml index d7ccd25..a26b10c 100644 --- a/Documentation/DocBook/v4l/subdev-formats.xml +++ b/Documentation/DocBook/v4l/subdev-formats.xml @@ -2522,5 +2522,51 @@
+ +
+ JPEG Compressed Formats + + Those data formats consist of an ordered sequence of 8-bit bytes + obtained from JPEG compression process. Additionally to the + _JPEG prefix the format code is made of + the following information. + + The number of bus samples per entropy encoded byte. + The bus width. + + + For instance, for a JPEG baseline process and an 8-bit bus width + the format will be named V4L2_MBUS_FMT_JPEG_1X8. + + + + The following table lists existing JPEG compressed formats. + + + JPEG Formats + + + + + + + Identifier + Code + Remarks + + + + + V4L2_MBUS_FMT_JPEG_1X8 + 0x4001 + Besides of its usage for the parallel bus this format is + recommended for transmission of JPEG data over MIPI CSI bus + using the User Defined 8-bit Data types. + + + + +
+
diff --git a/Documentation/DocBook/v4l/videodev2.h.xml b/Documentation/DocBook/v4l/videodev2.h.xml index 2b796a2..c50536a 100644 --- a/Documentation/DocBook/v4l/videodev2.h.xml +++ b/Documentation/DocBook/v4l/videodev2.h.xml @@ -311,6 +311,9 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ') /* 10 Greyscale */ #define V4L2_PIX_FMT_Y16 v4l2_fourcc('Y', '1', '6', ' ') /* 16 Greyscale */ +/* Grey bit-packed formats */ +#define V4L2_PIX_FMT_Y10BPACK v4l2_fourcc('Y', '1', '0', 'B') /* 10 Greyscale bit-packed */ + /* Palette formats */ #define V4L2_PIX_FMT_PAL8 v4l2_fourcc('P', 'A', 'L', '8') /* 8 8-bit palette */ @@ -333,6 +336,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YUV420 v4l2_fourcc('Y', 'U', '1', '2') /* 12 YUV 4:2:0 */ #define V4L2_PIX_FMT_HI240 v4l2_fourcc('H', 'I', '2', '4') /* 8 8-bit color */ #define V4L2_PIX_FMT_HM12 v4l2_fourcc('H', 'M', '1', '2') /* 8 YUV 4:2:0 16x16 macroblocks */ +#define V4L2_PIX_FMT_M420 v4l2_fourcc('M', '4', '2', '0') /* 12 YUV 4:2:0 2 lines y, 1 line uv interleaved */ /* two planes -- one Y, one Cr + Cb interleaved */ #define V4L2_PIX_FMT_NV12 v4l2_fourcc('N', 'V', '1', '2') /* 12 Y/CbCr 4:2:0 */ diff --git a/Documentation/HOWTO b/Documentation/HOWTO index 365bda9..81bc1a9 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO @@ -209,7 +209,7 @@ tools. One such tool that is particularly recommended is the Linux Cross-Reference project, which is able to present source code in a self-referential, indexed webpage format. An excellent up-to-date repository of the kernel code may be found at: - http://users.sosdg.org/~qiyong/lxr/ + http://lxr.linux.no/+trees The development process diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt index b4a615b..7890fae 100644 --- a/Documentation/IRQ-affinity.txt +++ b/Documentation/IRQ-affinity.txt @@ -4,10 +4,11 @@ ChangeLog: SMP IRQ affinity -/proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted -for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed -to turn off all CPUs, and if an IRQ controller does not support IRQ -affinity then the value will not change from the default 0xffffffff. +/proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify +which target CPUs are permitted for a given IRQ source. It's a bitmask +(smp_affinity) or cpu list (smp_affinity_list) of allowed CPUs. It's not +allowed to turn off all CPUs, and if an IRQ controller does not support +IRQ affinity then the value will not change from the default of all cpus. /proc/irq/default_smp_affinity specifies default affinity mask that applies to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask @@ -54,3 +55,11 @@ round-trip min/avg/max = 0.1/0.5/585.4 ms This time around IRQ44 was delivered only to the last four processors. i.e counters for the CPU0-3 did not change. +Here is an example of limiting that same irq (44) to cpus 1024 to 1031: + +[root@moon 44]# echo 1024-1031 > smp_affinity +[root@moon 44]# cat smp_affinity +1024-1031 + +Note that to do this with a bitmask would require 32 bitmasks of zero +to follow the pertinent one. diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 71b6f50..1d7a885 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -21,7 +21,7 @@ rcu.txt RTFP.txt - List of RCU papers (bibliography) going back to 1980. stallwarn.txt - - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) + - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) trace.txt diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 862c08e..4e95920 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -1,22 +1,25 @@ Using RCU's CPU Stall Detector -The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables -RCU's CPU stall detector, which detects conditions that unduly delay -RCU grace periods. The stall detector's idea of what constitutes -"unduly delayed" is controlled by a set of C preprocessor macros: +The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall +detector, which detects conditions that unduly delay RCU grace periods. +This module parameter enables CPU stall detection by default, but +may be overridden via boot-time parameter or at runtime via sysfs. +The stall detector's idea of what constitutes "unduly delayed" is +controlled by a set of kernel configuration variables and cpp macros: -RCU_SECONDS_TILL_STALL_CHECK +CONFIG_RCU_CPU_STALL_TIMEOUT - This macro defines the period of time that RCU will wait from - the beginning of a grace period until it issues an RCU CPU - stall warning. This time period is normally ten seconds. + This kernel configuration parameter defines the period of time + that RCU will wait from the beginning of a grace period until it + issues an RCU CPU stall warning. This time period is normally + ten seconds. RCU_SECONDS_TILL_STALL_RECHECK This macro defines the period of time that RCU will wait after issuing a stall warning until it issues another stall warning - for the same stall. This time period is normally set to thirty - seconds. + for the same stall. This time period is normally set to three + times the check interval plus thirty seconds. RCU_STALL_RAT_DELAY diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 6a8c73f..c078ad4 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -10,34 +10,46 @@ for rcutree and next for rcutiny. CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats -These implementations of RCU provides five debugfs files under the -top-level directory RCU: rcu/rcudata (which displays fields in struct -rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of -rcu/rcudata), rcu/rcugp (which displays grace-period counters), -rcu/rcuhier (which displays the struct rcu_node hierarchy), and -rcu/rcu_pending (which displays counts of the reasons that the -rcu_pending() function decided that there was core RCU work to do). +These implementations of RCU provides several debugfs files under the +top-level directory "rcu": + +rcu/rcudata: + Displays fields in struct rcu_data. +rcu/rcudata.csv: + Comma-separated values spreadsheet version of rcudata. +rcu/rcugp: + Displays grace-period counters. +rcu/rcuhier: + Displays the struct rcu_node hierarchy. +rcu/rcu_pending: + Displays counts of the reasons rcu_pending() decided that RCU had + work to do. +rcu/rcutorture: + Displays rcutorture test progress. +rcu/rcuboost: + Displays RCU boosting statistics. Only present if + CONFIG_RCU_BOOST=y. The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 - 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 - 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 - 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 - 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 - 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 - 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 - 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 + 0 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 + 1 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 + 2 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 + 3 c=20942 g=20943 pq=1 pqc=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 + 4 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 + 5 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 + 6 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 + 7 c=20897 g=20897 pq=1 pqc=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 rcu_bh: - 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 - 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 - 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 - 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 0 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 + 1 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 + 2 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 + 3 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 + 4 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 + 5 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 + 6 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 + 7 c=1474 g=1474 pq=1 pqc=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -52,17 +64,18 @@ o The number at the beginning of each line is the CPU number. substantially larger than the number of actual CPUs. o "c" is the count of grace periods that this CPU believes have - completed. CPUs in dynticks idle mode may lag quite a ways - behind, for example, CPU 4 under "rcu_sched" above, which has - slept through the past 25 RCU grace periods. It is not unusual - to see CPUs lagging by thousands of grace periods. + completed. Offlined CPUs and CPUs in dynticks idle mode may + lag quite a ways behind, for example, CPU 6 under "rcu_sched" + above, which has been offline through not quite 40,000 RCU grace + periods. It is not unusual to see CPUs lagging by thousands of + grace periods. o "g" is the count of grace periods that this CPU believes have - started. Again, CPUs in dynticks idle mode may lag behind. - If the "c" and "g" values are equal, this CPU has already - reported a quiescent state for the last RCU grace period that - it is aware of, otherwise, the CPU believes that it owes RCU a - quiescent state. + started. Again, offlined CPUs and CPUs in dynticks idle mode + may lag behind. If the "c" and "g" values are equal, this CPU + has already reported a quiescent state for the last RCU grace + period that it is aware of, otherwise, the CPU believes that it + owes RCU a quiescent state. o "pq" indicates that this CPU has passed through a quiescent state for the current grace period. It is possible for "pq" to be @@ -81,7 +94,8 @@ o "pqc" indicates which grace period the last-observed quiescent the next grace period! o "qp" indicates that RCU still expects a quiescent state from - this CPU. + this CPU. Offlined CPUs and CPUs in dyntick idle mode might + well have qp=1, which is OK: RCU is still ignoring them. o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the @@ -108,7 +122,7 @@ o "df" is the number of times that some other CPU has forced a o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being - offline. In a perfect world, this might neve happen, but it + offline. In a perfect world, this might never happen, but it turns out that offlining and onlining a CPU can take several grace periods, and so there is likely to be an extended period of time when RCU believes that the CPU is online when it really is not. @@ -125,6 +139,62 @@ o "ql" is the number of RCU callbacks currently residing on of what state they are in (new, waiting for grace period to start, waiting for grace period to end, ready to invoke). +o "qs" gives an indication of the state of the callback queue + with four characters: + + "N" Indicates that there are callbacks queued that are not + ready to be handled by the next grace period, and thus + will be handled by the grace period following the next + one. + + "R" Indicates that there are callbacks queued that are + ready to be handled by the next grace period. + + "W" Indicates that there are callbacks queued that are + waiting on the current grace period. + + "D" Indicates that there are callbacks queued that have + already been handled by a prior grace period, and are + thus waiting to be invoked. Note that callbacks in + the process of being invoked are not counted here. + Callbacks in the process of being invoked are those + that have been removed from the rcu_data structures + queues by rcu_do_batch(), but which have not yet been + invoked. + + If there are no callbacks in a given one of the above states, + the corresponding character is replaced by ".". + +o "kt" is the per-CPU kernel-thread state. The digit preceding + the first slash is zero if there is no work pending and 1 + otherwise. The character between the first pair of slashes is + as follows: + + "S" The kernel thread is stopped, in other words, all + CPUs corresponding to this rcu_node structure are + offline. + + "R" The kernel thread is running. + + "W" The kernel thread is waiting because there is no work + for it to do. + + "O" The kernel thread is waiting because it has been + forced off of its designated CPU or because its + ->cpus_allowed mask permits it to run on other than + its designated CPU. + + "Y" The kernel thread is yielding to avoid hogging CPU. + + "?" Unknown value, indicates a bug. + + The number after the final slash is the CPU that the kthread + is actually running on. + +o "ktl" is the low-order 16 bits (in hexadecimal) of the count of + the number of times that this CPU's per-CPU kthread has gone + through its loop servicing invoke_rcu_cpu_kthread() requests. + o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. @@ -174,14 +244,14 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 -1/1 .>. 0:127 ^0 -3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 -3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 +1/1 ..>. 0:127 ^0 +3/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 +3/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 rcu_bh: c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 -0/1 .>. 0:127 ^0 -0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 -0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 +0/1 ..>. 0:127 ^0 +0/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 +0/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 This is once again split into "rcu_sched" and "rcu_bh" portions, and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional @@ -240,13 +310,20 @@ o Each element of the form "1/1 0:127 ^0" represents one struct current grace period. o The characters separated by the ">" indicate the state - of the blocked-tasks lists. A "T" preceding the ">" + of the blocked-tasks lists. A "G" preceding the ">" indicates that at least one task blocked in an RCU read-side critical section blocks the current grace - period, while a "." preceding the ">" indicates otherwise. - The character following the ">" indicates similarly for - the next grace period. A "T" should appear in this - field only for rcu-preempt. + period, while a "E" preceding the ">" indicates that + at least one task blocked in an RCU read-side critical + section blocks the current expedited grace period. + A "T" character following the ">" indicates that at + least one task is blocked within an RCU read-side + critical section, regardless of whether any current + grace period (expedited or normal) is inconvenienced. + A "." character appears if the corresponding condition + does not hold, so that "..>." indicates that no tasks + are blocked. In contrast, "GE>T" indicates maximal + inconvenience from blocked tasks. o The numbers separated by the ":" are the range of CPUs served by this struct rcu_node. This can be helpful @@ -328,6 +405,113 @@ o "nn" is the number of times that this CPU needed nothing. Alert is due to short-circuit evaluation in rcu_pending(). +The output of "cat rcu/rcutorture" looks as follows: + +rcutorture test sequence: 0 (test in progress) +rcutorture update version number: 615 + +The first line shows the number of rcutorture tests that have completed +since boot. If a test is currently running, the "(test in progress)" +string will appear as shown above. The second line shows the number of +update cycles that the current test has started, or zero if there is +no test in progress. + + +The output of "cat rcu/rcuboost" looks as follows: + +0:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f + balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16 +6:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f + balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6 + +This information is output only for rcu_preempt. Each two-line entry +corresponds to a leaf rcu_node strcuture. The fields are as follows: + +o "n:m" is the CPU-number range for the corresponding two-line + entry. In the sample output above, the first entry covers + CPUs zero through five and the second entry covers CPUs 6 + and 7. + +o "tasks=TNEB" gives the state of the various segments of the + rnp->blocked_tasks list: + + "T" This indicates that there are some tasks that blocked + while running on one of the corresponding CPUs while + in an RCU read-side critical section. + + "N" This indicates that some of the blocked tasks are preventing + the current normal (non-expedited) grace period from + completing. + + "E" This indicates that some of the blocked tasks are preventing + the current expedited grace period from completing. + + "B" This indicates that some of the blocked tasks are in + need of RCU priority boosting. + + Each character is replaced with "." if the corresponding + condition does not hold. + +o "kt" is the state of the RCU priority-boosting kernel + thread associated with the corresponding rcu_node structure. + The state can be one of the following: + + "S" The kernel thread is stopped, in other words, all + CPUs corresponding to this rcu_node structure are + offline. + + "R" The kernel thread is running. + + "W" The kernel thread is waiting because there is no work + for it to do. + + "Y" The kernel thread is yielding to avoid hogging CPU. + + "?" Unknown value, indicates a bug. + +o "ntb" is the number of tasks boosted. + +o "neb" is the number of tasks boosted in order to complete an + expedited grace period. + +o "nnb" is the number of tasks boosted in order to complete a + normal (non-expedited) grace period. When boosting a task + that was blocking both an expedited and a normal grace period, + it is counted against the expedited total above. + +o "j" is the low-order 16 bits of the jiffies counter in + hexadecimal. + +o "bt" is the low-order 16 bits of the value that the jiffies + counter will have when we next start boosting, assuming that + the current grace period does not end beforehand. This is + also in hexadecimal. + +o "balk: nt" counts the number of times we didn't boost (in + other words, we balked) even though it was time to boost because + there were no blocked tasks to boost. This situation occurs + when there is one blocked task on one rcu_node structure and + none on some other rcu_node structure. + +o "egt" counts the number of times we balked because although + there were blocked tasks, none of them were blocking the + current grace period, whether expedited or otherwise. + +o "bt" counts the number of times we balked because boosting + had already been initiated for the current grace period. + +o "nb" counts the number of times we balked because there + was at least one task blocking the current non-expedited grace + period that never had blocked. If it is already running, it + just won't help to boost its priority! + +o "ny" counts the number of times we balked because it was + not yet time to start boosting. + +o "nos" counts the number of times we balked for other + reasons, e.g., the grace period ended first. + + CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats These implementations of RCU provides a single debugfs file under the @@ -394,9 +578,9 @@ o "neb" is the number of expedited grace periods that have had o "nnb" is the number of normal grace periods that have had to resort to RCU priority boosting since boot. -o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. +o "j" is the low-order 16 bits of the jiffies counter in hexadecimal. -o "bt" is the low-order 12 bits of the value that the jiffies counter +o "bt" is the low-order 16 bits of the value that the jiffies counter will have at the next time that boosting is scheduled to begin. o In the line beginning with "normal balk", the fields are as follows: diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index e439cd0..569f353 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -714,10 +714,11 @@ Jeff Garzik, "Linux kernel patch submission format". Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer". - - - - + + + + + NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people! diff --git a/Documentation/blockdev/cciss.txt b/Documentation/blockdev/cciss.txt index 89698e8..c00c6a5 100644 --- a/Documentation/blockdev/cciss.txt +++ b/Documentation/blockdev/cciss.txt @@ -169,3 +169,18 @@ is issued which positions the tape to a known position. Typically you must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example) before i/o can proceed again to a tape drive which was reset. +There is a cciss_tape_cmds module parameter which can be used to make cciss +allocate more commands for use by tape drives. Ordinarily only a few commands +(6) are allocated for tape drives because tape drives are slow and +infrequently used and the primary purpose of Smart Array controllers is to +act as a RAID controller for disk drives, so the vast majority of commands +are allocated for disk devices. However, if you have more than a few tape +drives attached to a smart array, the default number of commands may not be +enought (for example, if you have 8 tape drives, you could only rewind 6 +at one time with the default number of commands.) The cciss_tape_cmds module +parameter allows more commands (up to 16 more) to be allocated for use by +tape drives. For example: + + insmod cciss.ko cciss_tape_cmds=16 + +Or, as a kernel boot parameter passed in via grub: cciss.cciss_tape_cmds=8 diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 9164ae3..9b728dc 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -16,7 +16,7 @@ on all processors in the system. Don't let this scare you into thinking SMP cache/tlb flushing must be so inefficient, this is in fact an area where many optimizations are possible. For example, if it can be proven that a user address space has never executed -on a cpu (see vma->cpu_vm_mask), one need not perform a flush +on a cpu (see mm_cpumask()), one need not perform a flush for this address space on that cpu. First, the TLB flushing interfaces, since they are the simplest. The diff --git a/Documentation/devicetree/bindings/crypto/fsl-sec4.txt b/Documentation/devicetree/bindings/crypto/fsl-sec4.txt new file mode 100644 index 0000000..bf57ecd --- /dev/null +++ b/Documentation/devicetree/bindings/crypto/fsl-sec4.txt @@ -0,0 +1,397 @@ +===================================================================== +SEC 4 Device Tree Binding +Copyright (C) 2008-2011 Freescale Semiconductor Inc. + + CONTENTS + -Overview + -SEC 4 Node + -Job Ring Node + -Run Time Integrity Check (RTIC) Node + -Run Time Integrity Check (RTIC) Memory Node + -Secure Non-Volatile Storage (SNVS) Node + -Full Example + +NOTE: the SEC 4 is also known as Freescale's Cryptographic Accelerator +Accelerator and Assurance Module (CAAM). + +===================================================================== +Overview + +DESCRIPTION + +SEC 4 h/w can process requests from 2 types of sources. +1. DPAA Queue Interface (HW interface between Queue Manager & SEC 4). +2. Job Rings (HW interface between cores & SEC 4 registers). + +High Speed Data Path Configuration: + +HW interface between QM & SEC 4 and also BM & SEC 4, on DPAA-enabled parts +such as the P4080. The number of simultaneous dequeues the QI can make is +equal to the number of Descriptor Controller (DECO) engines in a particular +SEC version. E.g., the SEC 4.0 in the P4080 has 5 DECOs and can thus +dequeue from 5 subportals simultaneously. + +Job Ring Data Path Configuration: + +Each JR is located on a separate 4k page, they may (or may not) be made visible +in the memory partition devoted to a particular core. The P4080 has 4 JRs, so +up to 4 JRs can be configured; and all 4 JRs process requests in parallel. + +===================================================================== +SEC 4 Node + +Description + + Node defines the base address of the SEC 4 block. + This block specifies the address range of all global + configuration registers for the SEC 4 block. It + also receives interrupts from the Run Time Integrity Check + (RTIC) function within the SEC 4 block. + +PROPERTIES + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,sec-v4.0" + + - #address-cells + Usage: required + Value type: + Definition: A standard property. Defines the number of cells + for representing physical addresses in child nodes. + + - #size-cells + Usage: required + Value type: + Definition: A standard property. Defines the number of cells + for representing the size of physical addresses in + child nodes. + + - reg + Usage: required + Value type: + Definition: A standard property. Specifies the physical + address and length of the SEC4 configuration registers. + registers + + - ranges + Usage: required + Value type: + Definition: A standard property. Specifies the physical address + range of the SEC 4.0 register space (-SNVS not included). A + triplet that includes the child address, parent address, & + length. + + - interrupts + Usage: required + Value type: + Definition: Specifies the interrupts generated by this + device. The value of the interrupts property + consists of one interrupt specifier. The format + of the specifier is defined by the binding document + describing the node's interrupt parent. + + - interrupt-parent + Usage: (required if interrupt property is defined) + Value type: + Definition: A single value that points + to the interrupt parent to which the child domain + is being mapped. + + Note: All other standard properties (see the ePAPR) are allowed + but are optional. + + +EXAMPLE + crypto@300000 { + compatible = "fsl,sec-v4.0"; + #address-cells = <1>; + #size-cells = <1>; + reg = <0x300000 0x10000>; + ranges = <0 0x300000 0x10000>; + interrupt-parent = <&mpic>; + interrupts = <92 2>; + }; + +===================================================================== +Job Ring (JR) Node + + Child of the crypto node defines data processing interface to SEC 4 + across the peripheral bus for purposes of processing + cryptographic descriptors. The specified address + range can be made visible to one (or more) cores. + The interrupt defined for this node is controlled within + the address range of this node. + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,sec-v4.0-job-ring" + + - reg + Usage: required + Value type: + Definition: Specifies a two JR parameters: an offset from + the parent physical address and the length the JR registers. + + - fsl,liodn + Usage: optional-but-recommended + Value type: + Definition: + Specifies the LIODN to be used in conjunction with + the ppid-to-liodn table that specifies the PPID to LIODN mapping. + Needed if the PAMU is used. Value is a 12 bit value + where value is a LIODN ID for this JR. This property is + normally set by boot firmware. + + - interrupts + Usage: required + Value type: + Definition: Specifies the interrupts generated by this + device. The value of the interrupts property + consists of one interrupt specifier. The format + of the specifier is defined by the binding document + describing the node's interrupt parent. + + - interrupt-parent + Usage: (required if interrupt property is defined) + Value type: + Definition: A single value that points + to the interrupt parent to which the child domain + is being mapped. + +EXAMPLE + jr@1000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x1000 0x1000>; + fsl,liodn = <0x081>; + interrupt-parent = <&mpic>; + interrupts = <88 2>; + }; + + +===================================================================== +Run Time Integrity Check (RTIC) Node + + Child node of the crypto node. Defines a register space that + contains up to 5 sets of addresses and their lengths (sizes) that + will be checked at run time. After an initial hash result is + calculated, these addresses are checked by HW to monitor any + change. If any memory is modified, a Security Violation is + triggered (see SNVS definition). + + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,sec-v4.0-rtic". + + - #address-cells + Usage: required + Value type: + Definition: A standard property. Defines the number of cells + for representing physical addresses in child nodes. Must + have a value of 1. + + - #size-cells + Usage: required + Value type: + Definition: A standard property. Defines the number of cells + for representing the size of physical addresses in + child nodes. Must have a value of 1. + + - reg + Usage: required + Value type: + Definition: A standard property. Specifies a two parameters: + an offset from the parent physical address and the length + the SEC4 registers. + + - ranges + Usage: required + Value type: + Definition: A standard property. Specifies the physical address + range of the SEC 4 register space (-SNVS not included). A + triplet that includes the child address, parent address, & + length. + +EXAMPLE + rtic@6000 { + compatible = "fsl,sec-v4.0-rtic"; + #address-cells = <1>; + #size-cells = <1>; + reg = <0x6000 0x100>; + ranges = <0x0 0x6100 0xe00>; + }; + +===================================================================== +Run Time Integrity Check (RTIC) Memory Node + A child node that defines individual RTIC memory regions that are used to + perform run-time integrity check of memory areas that should not modified. + The node defines a register that contains the memory address & + length (combined) and a second register that contains the hash result + in big endian format. + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,sec-v4.0-rtic-memory". + + - reg + Usage: required + Value type: + Definition: A standard property. Specifies two parameters: + an offset from the parent physical address and the length: + + 1. The location of the RTIC memory address & length registers. + 2. The location RTIC hash result. + + - fsl,rtic-region + Usage: optional-but-recommended + Value type: + Definition: + Specifies the HW address (36 bit address) for this region + followed by the length of the HW partition to be checked; + the address is represented as a 64 bit quantity followed + by a 32 bit length. + + - fsl,liodn + Usage: optional-but-recommended + Value type: + Definition: + Specifies the LIODN to be used in conjunction with + the ppid-to-liodn table that specifies the PPID to LIODN + mapping. Needed if the PAMU is used. Value is a 12 bit value + where value is a LIODN ID for this RTIC memory region. This + property is normally set by boot firmware. + +EXAMPLE + rtic-a@0 { + compatible = "fsl,sec-v4.0-rtic-memory"; + reg = <0x00 0x20 0x100 0x80>; + fsl,liodn = <0x03c>; + fsl,rtic-region = <0x12345678 0x12345678 0x12345678>; + }; + +===================================================================== +Secure Non-Volatile Storage (SNVS) Node + + Node defines address range and the associated + interrupt for the SNVS function. This function + monitors security state information & reports + security violations. + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,sec-v4.0-mon". + + - reg + Usage: required + Value type: + Definition: A standard property. Specifies the physical + address and length of the SEC4 configuration + registers. + + - interrupts + Usage: required + Value type: + Definition: Specifies the interrupts generated by this + device. The value of the interrupts property + consists of one interrupt specifier. The format + of the specifier is defined by the binding document + describing the node's interrupt parent. + + - interrupt-parent + Usage: (required if interrupt property is defined) + Value type: + Definition: A single value that points + to the interrupt parent to which the child domain + is being mapped. + +EXAMPLE + sec_mon@314000 { + compatible = "fsl,sec-v4.0-mon"; + reg = <0x314000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <93 2>; + }; + +===================================================================== +FULL EXAMPLE + + crypto: crypto@300000 { + compatible = "fsl,sec-v4.0"; + #address-cells = <1>; + #size-cells = <1>; + reg = <0x300000 0x10000>; + ranges = <0 0x300000 0x10000>; + interrupt-parent = <&mpic>; + interrupts = <92 2>; + + sec_jr0: jr@1000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x1000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <88 2>; + }; + + sec_jr1: jr@2000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x2000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <89 2>; + }; + + sec_jr2: jr@3000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x3000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <90 2>; + }; + + sec_jr3: jr@4000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x4000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <91 2>; + }; + + rtic@6000 { + compatible = "fsl,sec-v4.0-rtic"; + #address-cells = <1>; + #size-cells = <1>; + reg = <0x6000 0x100>; + ranges = <0x0 0x6100 0xe00>; + + rtic_a: rtic-a@0 { + compatible = "fsl,sec-v4.0-rtic-memory"; + reg = <0x00 0x20 0x100 0x80>; + }; + + rtic_b: rtic-b@20 { + compatible = "fsl,sec-v4.0-rtic-memory"; + reg = <0x20 0x20 0x200 0x80>; + }; + + rtic_c: rtic-c@40 { + compatible = "fsl,sec-v4.0-rtic-memory"; + reg = <0x40 0x20 0x300 0x80>; + }; + + rtic_d: rtic-d@60 { + compatible = "fsl,sec-v4.0-rtic-memory"; + reg = <0x60 0x20 0x500 0x80>; + }; + }; + }; + + sec_mon: sec_mon@314000 { + compatible = "fsl,sec-v4.0-mon"; + reg = <0x314000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <93 2>; + }; + +===================================================================== diff --git a/Documentation/devicetree/bindings/net/can/fsl-flexcan.txt b/Documentation/devicetree/bindings/net/can/fsl-flexcan.txt new file mode 100755 index 0000000..1a729f0 --- /dev/null +++ b/Documentation/devicetree/bindings/net/can/fsl-flexcan.txt @@ -0,0 +1,61 @@ +CAN Device Tree Bindings +------------------------ +2011 Freescale Semiconductor, Inc. + +fsl,flexcan-v1.0 nodes +----------------------- +In addition to the required compatible-, reg- and interrupt-properties, you can +also specify which clock source shall be used for the controller. + +CPI Clock- Can Protocol Interface Clock + This CLK_SRC bit of CTRL(control register) selects the clock source to + the CAN Protocol Interface(CPI) to be either the peripheral clock + (driven by the PLL) or the crystal oscillator clock. The selected clock + is the one fed to the prescaler to generate the Serial Clock (Sclock). + The PRESDIV field of CTRL(control register) controls a prescaler that + generates the Serial Clock (Sclock), whose period defines the + time quantum used to compose the CAN waveform. + +Can Engine Clock Source + There are two sources for CAN clock + - Platform Clock It represents the bus clock + - Oscillator Clock + + Peripheral Clock (PLL) + -------------- + | + --------- ------------- + | |CPI Clock | Prescaler | Sclock + | |---------------->| (1.. 256) |------------> + --------- ------------- + | | + -------------- ---------------------CLK_SRC + Oscillator Clock + +- fsl,flexcan-clock-source : CAN Engine Clock Source.This property selects + the peripheral clock. PLL clock is fed to the + prescaler to generate the Serial Clock (Sclock). + Valid values are "oscillator" and "platform" + "oscillator": CAN engine clock source is oscillator clock. + "platform" The CAN engine clock source is the bus clock + (platform clock). + +- fsl,flexcan-clock-divider : for the reference and system clock, an additional + clock divider can be specified. +- clock-frequency: frequency required to calculate the bitrate for FlexCAN. + +Note: + - v1.0 of flexcan-v1.0 represent the IP block version for P1010 SOC. + - P1010 does not have oscillator as the Clock Source.So the default + Clock Source is platform clock. +Examples: + + can0@1c000 { + compatible = "fsl,flexcan-v1.0"; + reg = <0x1c000 0x1000>; + interrupts = <48 0x2>; + interrupt-parent = <&mpic>; + fsl,flexcan-clock-source = "platform"; + fsl,flexcan-clock-divider = <2>; + clock-frequency = ; + }; diff --git a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt index edb7ae1..2c6be03 100644 --- a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt +++ b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt @@ -74,3 +74,57 @@ Example: interrupt-parent = <&mpic>; phy-handle = <&phy0> }; + +* Gianfar PTP clock nodes + +General Properties: + + - compatible Should be "fsl,etsec-ptp" + - reg Offset and length of the register set for the device + - interrupts There should be at least two interrupts. Some devices + have as many as four PTP related interrupts. + +Clock Properties: + + - fsl,tclk-period Timer reference clock period in nanoseconds. + - fsl,tmr-prsc Prescaler, divides the output clock. + - fsl,tmr-add Frequency compensation value. + - fsl,tmr-fiper1 Fixed interval period pulse generator. + - fsl,tmr-fiper2 Fixed interval period pulse generator. + - fsl,max-adj Maximum frequency adjustment in parts per billion. + + These properties set the operational parameters for the PTP + clock. You must choose these carefully for the clock to work right. + Here is how to figure good values: + + TimerOsc = system clock MHz + tclk_period = desired clock period nanoseconds + NominalFreq = 1000 / tclk_period MHz + FreqDivRatio = TimerOsc / NominalFreq (must be greater that 1.0) + tmr_add = ceil(2^32 / FreqDivRatio) + OutputClock = NominalFreq / tmr_prsc MHz + PulseWidth = 1 / OutputClock microseconds + FiperFreq1 = desired frequency in Hz + FiperDiv1 = 1000000 * OutputClock / FiperFreq1 + tmr_fiper1 = tmr_prsc * tclk_period * FiperDiv1 - tclk_period + max_adj = 1000000000 * (FreqDivRatio - 1.0) - 1 + + The calculation for tmr_fiper2 is the same as for tmr_fiper1. The + driver expects that tmr_fiper1 will be correctly set to produce a 1 + Pulse Per Second (PPS) signal, since this will be offered to the PPS + subsystem to synchronize the Linux clock. + +Example: + + ptp_clock@24E00 { + compatible = "fsl,etsec-ptp"; + reg = <0x24E00 0xB0>; + interrupts = <12 0x8 13 0x8>; + interrupt-parent = < &ipic >; + fsl,tclk-period = <10>; + fsl,tmr-prsc = <100>; + fsl,tmr-add = <0x999999A4>; + fsl,tmr-fiper1 = <0x3B9AC9F6>; + fsl,tmr-fiper2 = <0x00018696>; + fsl,max-adj = <659999998>; + }; diff --git a/Documentation/devicetree/bindings/powerpc/fsl/ifc.txt b/Documentation/devicetree/bindings/powerpc/fsl/ifc.txt new file mode 100644 index 0000000..939a26d --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/ifc.txt @@ -0,0 +1,76 @@ +Integrated Flash Controller + +Properties: +- name : Should be ifc +- compatible : should contain "fsl,ifc". The version of the integrated + flash controller can be found in the IFC_REV register at + offset zero. + +- #address-cells : Should be either two or three. The first cell is the + chipselect number, and the remaining cells are the + offset into the chipselect. +- #size-cells : Either one or two, depending on how large each chipselect + can be. +- reg : Offset and length of the register set for the device +- interrupts : IFC has two interrupts. The first one is the "common" + interrupt(CM_EVTER_STAT), and second is the NAND interrupt + (NAND_EVTER_STAT). + +- ranges : Each range corresponds to a single chipselect, and covers + the entire access window as configured. + +Child device nodes describe the devices connected to IFC such as NOR (e.g. +cfi-flash) and NAND (fsl,ifc-nand). There might be board specific devices +like FPGAs, CPLDs, etc. + +Example: + + ifc@ffe1e000 { + compatible = "fsl,ifc", "simple-bus"; + #address-cells = <2>; + #size-cells = <1>; + reg = <0x0 0xffe1e000 0 0x2000>; + interrupts = <16 2 19 2>; + + /* NOR, NAND Flashes and CPLD on board */ + ranges = <0x0 0x0 0x0 0xee000000 0x02000000 + 0x1 0x0 0x0 0xffa00000 0x00010000 + 0x3 0x0 0x0 0xffb00000 0x00020000>; + + flash@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x2000000>; + bank-width = <2>; + device-width = <1>; + + partition@0 { + /* 32MB for user data */ + reg = <0x0 0x02000000>; + label = "NOR Data"; + }; + }; + + flash@1,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,ifc-nand"; + reg = <0x1 0x0 0x10000>; + + partition@0 { + /* This location must not be altered */ + /* 1MB for u-boot Bootloader Image */ + reg = <0x0 0x00100000>; + label = "NAND U-Boot Image"; + read-only; + }; + }; + + cpld@3,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,p1010rdb-cpld"; + reg = <0x3 0x0 0x000001f>; + }; + }; diff --git a/Documentation/devicetree/bindings/powerpc/fsl/mpic-timer.txt b/Documentation/devicetree/bindings/powerpc/fsl/mpic-timer.txt new file mode 100644 index 0000000..df41958 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/mpic-timer.txt @@ -0,0 +1,38 @@ +* Freescale MPIC timers + +Required properties: +- compatible: "fsl,mpic-global-timer" + +- reg : Contains two regions. The first is the main timer register bank + (GTCCRxx, GTBCRxx, GTVPRxx, GTDRxx). The second is the timer control + register (TCRx) for the group. + +- fsl,available-ranges: use style section to define which + timer interrupts can be used. This property is optional; without this, + all timers within the group can be used. + +- interrupts: one interrupt per timer in the group, in order, starting + with timer zero. If timer-available-ranges is present, only the + interrupts that correspond to available timers shall be present. + +Example: + /* Note that this requires #interrupt-cells to be 4 */ + timer0: timer@41100 { + compatible = "fsl,mpic-global-timer"; + reg = <0x41100 0x100 0x41300 4>; + + /* Another AMP partition is using timers 0 and 1 */ + fsl,available-ranges = <2 2>; + + interrupts = <2 0 3 0 + 3 0 3 0>; + }; + + timer1: timer@42100 { + compatible = "fsl,mpic-global-timer"; + reg = <0x42100 0x100 0x42300 4>; + interrupts = <4 0 3 0 + 5 0 3 0 + 6 0 3 0 + 7 0 3 0>; + }; diff --git a/Documentation/devicetree/bindings/powerpc/fsl/mpic.txt b/Documentation/devicetree/bindings/powerpc/fsl/mpic.txt index 4f61458..2cf38bd 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/mpic.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/mpic.txt @@ -190,7 +190,7 @@ EXAMPLE 4 */ timer0: timer@41100 { compatible = "fsl,mpic-global-timer"; - reg = <0x41100 0x100>; + reg = <0x41100 0x100 0x41300 4>; interrupts = <0 0 3 0 1 0 3 0 2 0 3 0 diff --git a/Documentation/devicetree/bindings/powerpc/nintendo/wii.txt b/Documentation/devicetree/bindings/powerpc/nintendo/wii.txt index a7e155a..36afa32 100644 --- a/Documentation/devicetree/bindings/powerpc/nintendo/wii.txt +++ b/Documentation/devicetree/bindings/powerpc/nintendo/wii.txt @@ -127,7 +127,7 @@ Nintendo Wii device tree - reg : should contain the SDHCI registers location and length - interrupts : should contain the SDHCI interrupt -1.j) The Inter-Processsor Communication (IPC) node +1.j) The Inter-Processor Communication (IPC) node Represent the Inter-Processor Communication interface. This interface enables communications between the Broadway and the Starlet processors. diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 470d3db..dfa6fc6 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -1,6 +1,8 @@ *.a *.aux *.bin +*.bz2 +*.cis *.cpio *.csp *.dsp @@ -8,6 +10,8 @@ *.elf *.eps *.fw +*.gcno +*.gcov *.gen.S *.gif *.grep @@ -19,14 +23,20 @@ *.ko *.log *.lst +*.lzma +*.lzo +*.mo *.moc *.mod.c *.o *.o.* +*.order *.orig *.out +*.patch *.pdf *.png +*.pot *.ps *.rej *.s @@ -39,16 +49,22 @@ *.tex *.ver *.xml +*.xz *_MODULES *_vga16.c *~ +\#*# *.9 -*.9.gz .* +.*.d .mm 53c700_d.h CVS ChangeSet +GPATH +GRTAGS +GSYMS +GTAGS Image Kerntypes Module.markers @@ -57,15 +73,14 @@ PENDING SCCS System.map* TAGS +aconf +af_names.h aic7*reg.h* aic7*reg_print.c* aic7*seq.h* aicasm aicdb.h* -altivec1.c -altivec2.c -altivec4.c -altivec8.c +altivec*.c asm-offsets.h asm_offsets.h autoconf.h* @@ -80,6 +95,7 @@ btfixupprep build bvmlinux bzImage* +capability_names.h capflags.c classlist.h* comp*.log @@ -88,7 +104,8 @@ conf config config-* config_data.h* -config_data.gz* +config.mak +config.mak.autogen conmakehash consolemap_deftbl.c* cpustr.h @@ -96,7 +113,9 @@ crc32table.h* cscope.* defkeymap.c devlist.h* +dnotify_test docproc +dslm elf2ecoff elfconfig.h* evergreen_reg_safe.h @@ -105,6 +124,7 @@ flask.h fore200e_mkfirm fore200e_pca_fw.c* gconf +gconf.glade.h gen-devlist gen_crc32table gen_init_cpio @@ -112,11 +132,12 @@ generated genheaders genksyms *_gray256.c +hpet_example +hugepage-mmap +hugepage-shm ihex2fw ikconfig.h* inat-tables.c -initramfs_data.cpio -initramfs_data.cpio.gz initramfs_list int16.c int1.c @@ -133,15 +154,19 @@ kxgettext lkc_defs.h lex.c lex.*.c +linux logo_*.c logo_*_clut224.c logo_*_mono.c lxdialog +mach mach-types mach-types.h machtypes.h map +map_hugetlb maui_boot.h +media mconf miboot* mk_elfconfig @@ -150,23 +175,29 @@ mkbugboot mkcpustr mkdep mkprep +mkregtable mktables mktree modpost modules.builtin modules.order modversions.h* +nconf ncscope.* offset.h offsets.h oui.c* +page-types parse.c parse.h patches* pca200e.bin pca200e_ecd.bin2 -piggy.gz +perf.data +perf.data.old +perf-archive piggyback +piggy.gzip piggy.S pnmtologo ppc_defs.h* @@ -177,10 +208,9 @@ r200_reg_safe.h r300_reg_safe.h r420_reg_safe.h r600_reg_safe.h -raid6altivec*.c -raid6int*.c -raid6tables.c +recordmcount relocs +rlim_names.h rn50_reg_safe.h rs600_reg_safe.h rv515_reg_safe.h @@ -194,6 +224,7 @@ split-include syscalltab.h tables.c tags +test_get_len tftpboot.img timeconst.h times.h* @@ -210,10 +241,13 @@ vdso32.so.dbg vdso64.lds vdso64.so.dbg version.h* +vmImage vmlinux vmlinux-* vmlinux.aout +vmlinux.bin.all vmlinux.lds +vmlinuz voffset.h vsyscall.lds vsyscall_32.lds diff --git a/Documentation/driver-model/bus.txt b/Documentation/driver-model/bus.txt index 5001b75..6754b2d 100644 --- a/Documentation/driver-model/bus.txt +++ b/Documentation/driver-model/bus.txt @@ -3,24 +3,7 @@ Bus Types Definition ~~~~~~~~~~ - -struct bus_type { - char * name; - - struct subsystem subsys; - struct kset drivers; - struct kset devices; - - struct bus_attribute * bus_attrs; - struct device_attribute * dev_attrs; - struct driver_attribute * drv_attrs; - - int (*match)(struct device * dev, struct device_driver * drv); - int (*hotplug) (struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); - int (*suspend)(struct device * dev, pm_message_t state); - int (*resume)(struct device * dev); -}; +See the kerneldoc for the struct bus_type. int bus_register(struct bus_type * bus); diff --git a/Documentation/driver-model/class.txt b/Documentation/driver-model/class.txt index 548505f..1fefc48 100644 --- a/Documentation/driver-model/class.txt +++ b/Documentation/driver-model/class.txt @@ -27,22 +27,7 @@ The device class structure looks like: typedef int (*devclass_add)(struct device *); typedef void (*devclass_remove)(struct device *); -struct device_class { - char * name; - rwlock_t lock; - u32 devnum; - struct list_head node; - - struct list_head drivers; - struct list_head intf_list; - - struct driver_dir_entry dir; - struct driver_dir_entry device_dir; - struct driver_dir_entry driver_dir; - - devclass_add add_device; - devclass_remove remove_device; -}; +See the kerneldoc for the struct class. A typical device class definition would look like: diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt index a124f31..b2ff426 100644 --- a/Documentation/driver-model/device.txt +++ b/Documentation/driver-model/device.txt @@ -2,96 +2,7 @@ The Basic Device Structure ~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct device { - struct list_head g_list; - struct list_head node; - struct list_head bus_list; - struct list_head driver_list; - struct list_head intf_list; - struct list_head children; - struct device * parent; - - char name[DEVICE_NAME_SIZE]; - char bus_id[BUS_ID_SIZE]; - - spinlock_t lock; - atomic_t refcount; - - struct bus_type * bus; - struct driver_dir_entry dir; - - u32 class_num; - - struct device_driver *driver; - void *driver_data; - void *platform_data; - - u32 current_state; - unsigned char *saved_state; - - void (*release)(struct device * dev); -}; - -Fields -~~~~~~ -g_list: Node in the global device list. - -node: Node in device's parent's children list. - -bus_list: Node in device's bus's devices list. - -driver_list: Node in device's driver's devices list. - -intf_list: List of intf_data. There is one structure allocated for - each interface that the device supports. - -children: List of child devices. - -parent: *** FIXME *** - -name: ASCII description of device. - Example: " 3Com Corporation 3c905 100BaseTX [Boomerang]" - -bus_id: ASCII representation of device's bus position. This - field should be a name unique across all devices on the - bus type the device belongs to. - - Example: PCI bus_ids are in the form of - :. - This name is unique across all PCI devices in the system. - -lock: Spinlock for the device. - -refcount: Reference count on the device. - -bus: Pointer to struct bus_type that device belongs to. - -dir: Device's sysfs directory. - -class_num: Class-enumerated value of the device. - -driver: Pointer to struct device_driver that controls the device. - -driver_data: Driver-specific data. - -platform_data: Platform data specific to the device. - - Example: for devices on custom boards, as typical of embedded - and SOC based hardware, Linux often uses platform_data to point - to board-specific structures describing devices and how they - are wired. That can include what ports are available, chip - variants, which GPIO pins act in what additional roles, and so - on. This shrinks the "Board Support Packages" (BSPs) and - minimizes board-specific #ifdefs in drivers. - -current_state: Current power state of the device. - -saved_state: Pointer to saved state of the device. This is usable by - the device driver controlling the device. - -release: Callback to free the device after all references have - gone away. This should be set by the allocator of the - device (i.e. the bus driver that discovered the device). +See the kerneldoc for the struct device. Programming Interface diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt index d2cd6fb..4421135 100644 --- a/Documentation/driver-model/driver.txt +++ b/Documentation/driver-model/driver.txt @@ -1,23 +1,7 @@ Device Drivers -struct device_driver { - char * name; - struct bus_type * bus; - - struct completion unloaded; - struct kobject kobj; - list_t devices; - - struct module *owner; - - int (*probe) (struct device * dev); - int (*remove) (struct device * dev); - - int (*suspend) (struct device * dev, pm_message_t state); - int (*resume) (struct device * dev); -}; - +See the kerneldoc for the struct device_driver. Allocation diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 492e81d..ff31b1c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -35,17 +35,6 @@ Who: Luis R. Rodriguez --------------------------- -What: AR9170USB -When: 2.6.40 - -Why: This driver is deprecated and the firmware is no longer - maintained. The replacement driver "carl9170" has been - around for a while, so the devices are still supported. - -Who: Christian Lamparter - ---------------------------- - What: IRQF_SAMPLE_RANDOM Check: IRQF_SAMPLE_RANDOM When: July 2009 @@ -226,7 +215,7 @@ Who: Zhang Rui What: CONFIG_ACPI_PROCFS_POWER When: 2.6.39 Why: sysfs I/F for ACPI power devices, including AC and Battery, - has been working in upstream kenrel since 2.6.24, Sep 2007. + has been working in upstream kernel since 2.6.24, Sep 2007. In 2.6.37, we make the sysfs I/F always built in and this option disabled by default. Remove this option and the ACPI power procfs interface in 2.6.39. @@ -273,16 +262,6 @@ Who: Michael Buesch --------------------------- -What: /sys/o2cb symlink -When: January 2010 -Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb - exists as a symlink for backwards compatibility for old versions of - ocfs2-tools. 2 years should be sufficient time to phase in new versions - which know to look in /sys/fs/o2cb. -Who: ocfs2-devel@oss.oracle.com - ---------------------------- - What: Ability for non root users to shm_get hugetlb pages based on mlock resource limits When: 2.6.31 @@ -405,16 +384,6 @@ Who: anybody or Florian Mickler ---------------------------- -What: capifs -When: February 2011 -Files: drivers/isdn/capi/capifs.* -Why: udev fully replaces this special file system that only contains CAPI - NCCI TTY device nodes. User space (pppdcapiplugin) works without - noticing the difference. -Who: Jan Kiszka - ----------------------------- - What: KVM paravirt mmu host support When: January 2011 Why: The paravirt mmu host support is slower than non-paravirt mmu, both @@ -460,14 +429,6 @@ Who: Thomas Gleixner ---------------------------- -What: The acpi_sleep=s4_nonvs command line option -When: 2.6.37 -Files: arch/x86/kernel/acpi/sleep.c -Why: superseded by acpi_sleep=nonvs -Who: Rafael J. Wysocki - ----------------------------- - What: PCI DMA unmap state API When: August 2012 Why: PCI DMA unmap state API (include/linux/pci-dma.h) was replaced @@ -580,3 +541,26 @@ Why: These legacy callbacks should no longer be used as i2c-core offers Who: Jean Delvare ---------------------------- + +What: Support for UVCIOC_CTRL_ADD in the uvcvideo driver +When: 2.6.42 +Why: The information passed to the driver by this ioctl is now queried + dynamically from the device. +Who: Laurent Pinchart + +---------------------------- + +What: Support for UVCIOC_CTRL_MAP_OLD in the uvcvideo driver +When: 2.6.42 +Why: Used only by applications compiled against older driver versions. + Superseded by UVCIOC_CTRL_MAP which supports V4L2 menu controls. +Who: Laurent Pinchart + +---------------------------- + +What: Support for UVCIOC_CTRL_GET and UVCIOC_CTRL_SET in the uvcvideo driver +When: 2.6.42 +Why: Superseded by the UVCIOC_CTRL_QUERY ioctl. +Who: Laurent Pinchart + +---------------------------- diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index b22abba..13de64c 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -25,6 +25,8 @@ Other applications are described in the following papers: http://xcpu.org/papers/cellfs-talk.pdf * PROSE I/O: Using 9p to enable Application Partitions http://plan9.escet.urjc.es/iwp9/cready/PROSE_iwp9_2006.pdf + * VirtFS: A Virtualization Aware File System pass-through + http://goo.gl/3WPDg USAGE ===== @@ -130,31 +132,20 @@ OPTIONS RESOURCES ========= -Our current recommendation is to use Inferno (http://www.vitanuova.com/nferno/index.html) -as the 9p server. You can start a 9p server under Inferno by issuing the -following command: - ; styxlisten -A tcp!*!564 export '#U*' +Protocol specifications are maintained on github: +http://ericvh.github.com/9p-rfc/ -The -A specifies an unauthenticated export. The 564 is the port # (you may -have to choose a higher port number if running as a normal user). The '#U*' -specifies exporting the root of the Linux name space. You may specify a -subset of the namespace by extending the path: '#U*'/tmp would just export -/tmp. For more information, see the Inferno manual pages covering styxlisten -and export. +9p client and server implementations are listed on +http://9p.cat-v.org/implementations -A Linux version of the 9p server is now maintained under the npfs project -on sourceforge (http://sourceforge.net/projects/npfs). The currently -maintained version is the single-threaded version of the server (named spfs) -available from the same SVN repository. +A 9p2000.L server is being developed by LLNL and can be found +at http://code.google.com/p/diod/ There are user and developer mailing lists available through the v9fs project on sourceforge (http://sourceforge.net/projects/v9fs). -A stand-alone version of the module (which should build for any 2.6 kernel) -is available via (http://github.com/ericvh/9p-sac/tree/master) - -News and other information is maintained on SWiK (http://swik.net/v9fs) -and the Wiki (http://sf.net/apps/mediawiki/v9fs/index.php). +News and other information is maintained on a Wiki. +(http://sf.net/apps/mediawiki/v9fs/index.php). Bug reports may be issued through the kernel.org bugzilla (http://bugzilla.kernel.org) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index c79ec58..3ae9bc9 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support. noacl This option disables POSIX Access Control List support. -reservation - -noreservation - bsddf (*) Make 'df' act like BSD. minixdf Make 'df' act like Minix. diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 9ed920a..7618a28 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs. intr (*) Allow signals to interrupt cluster operations. nointr Do not allow signals to interrupt cluster operations. +noatime Do not update access time. +relatime(*) Update atime if the previous atime is older than + mtime or ctime +strictatime Always update atime, but the minimum update interval + is specified by atime_quantum. atime_quantum=60(*) OCFS2 will not update atime unless this number of seconds has passed since the last update. - Set to zero to always update atime. + Set to zero to always update atime. This option need + work with strictatime. data=ordered (*) All data are forced directly out to the main file system prior to its metadata being committed to the journal. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b0b814d..f481780 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -574,6 +574,12 @@ The contents of each smp_affinity file is the same by default: > cat /proc/irq/0/smp_affinity ffffffff +There is an alternate interface, smp_affinity_list which allows specifying +a cpu range instead of a bitmask: + + > cat /proc/irq/0/smp_affinity_list + 1024-1031 + The default_smp_affinity mask applies to all non-active IRQs, which are the IRQs which have not yet been allocated/activated, and hence which lack a /proc/irq/[0-9]* directory. @@ -583,12 +589,13 @@ reports itself as being attached. This hardware locality information does not include information about any possible driver locality preference. prof_cpu_mask specifies which CPUs are to be profiled by the system wide -profiler. Default value is ffffffff (all cpus). +profiler. Default value is ffffffff (all cpus if there are only 32 of them). The way IRQs are routed is handled by the IO-APIC, and it's Round Robin between all the CPUs which are allowed to handle it. As usual the kernel has more info than you and does a better job than you, so the defaults are the -best choice for almost everyone. +best choice for almost everyone. [Note this applies only to those IO-APIC's +that support "Round Robin" interrupt distribution.] There are three more important subdirectories in /proc: net, scsi, and sys. The general rule is that the contents, or even the existence of these @@ -836,7 +843,6 @@ Provides counts of softirq handlers serviced since boot time, for each cpu. TASKLET: 0 0 0 290 SCHED: 27035 26983 26971 26746 HRTIMER: 0 0 0 0 - RCU: 1678 1769 2178 2250 1.3 IDE devices in /proc/ide diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index d7b13b0..8e4fab6 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -115,28 +115,8 @@ ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs Module Parameters for Debugging =============================== -When UBIFS has been compiled with debugging enabled, there are 3 module +When UBIFS has been compiled with debugging enabled, there are 2 module parameters that are available to control aspects of testing and debugging. -The parameters are unsigned integers where each bit controls an option. -The parameters are: - -debug_msgs Selects which debug messages to display, as follows: - - Message Type Flag value - - General messages 1 - Journal messages 2 - Mount messages 4 - Commit messages 8 - LEB search messages 16 - Budgeting messages 32 - Garbage collection messages 64 - Tree Node Cache (TNC) messages 128 - LEB properties (lprops) messages 256 - Input/output messages 512 - Log messages 1024 - Scan messages 2048 - Recovery messages 4096 debug_chks Selects extra checks that UBIFS can do while running: @@ -154,11 +134,9 @@ debug_tsts Selects a mode of testing, as follows: Test mode Flag value - Force in-the-gaps method 2 Failure mode for recovery testing 4 -For example, set debug_msgs to 5 to display General messages and Mount -messages. +For example, set debug_chks to 3 to enable general and TNC checks. References diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 7bff3e4..3fc0c31 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -39,6 +39,12 @@ When mounting an XFS filesystem, the following options are accepted. drive level write caching to be enabled, for devices that support write barriers. + discard + Issue command to let the block device reclaim space freed by the + filesystem. This is useful for SSD devices, thinly provisioned + LUNs and virtual machine images, but may have a performance + impact. This option is incompatible with the nodelaylog option. + dmapi Enable the DMAPI (Data Management API) event callouts. Use with the "mtpt" option. diff --git a/Documentation/hid/hiddev.txt b/Documentation/hid/hiddev.txt new file mode 100644 index 0000000..6e8c9f1 --- /dev/null +++ b/Documentation/hid/hiddev.txt @@ -0,0 +1,205 @@ +Care and feeding of your Human Interface Devices + +INTRODUCTION + +In addition to the normal input type HID devices, USB also uses the +human interface device protocols for things that are not really human +interfaces, but have similar sorts of communication needs. The two big +examples for this are power devices (especially uninterruptable power +supplies) and monitor control on higher end monitors. + +To support these disparate requirements, the Linux USB system provides +HID events to two separate interfaces: +* the input subsystem, which converts HID events into normal input +device interfaces (such as keyboard, mouse and joystick) and a +normalised event interface - see Documentation/input/input.txt +* the hiddev interface, which provides fairly raw HID events + +The data flow for a HID event produced by a device is something like +the following : + + usb.c ---> hid-core.c ----> hid-input.c ----> [keyboard/mouse/joystick/event] + | + | + --> hiddev.c ----> POWER / MONITOR CONTROL + +In addition, other subsystems (apart from USB) can potentially feed +events into the input subsystem, but these have no effect on the hid +device interface. + +USING THE HID DEVICE INTERFACE + +The hiddev interface is a char interface using the normal USB major, +with the minor numbers starting at 96 and finishing at 111. Therefore, +you need the following commands: +mknod /dev/usb/hiddev0 c 180 96 +mknod /dev/usb/hiddev1 c 180 97 +mknod /dev/usb/hiddev2 c 180 98 +mknod /dev/usb/hiddev3 c 180 99 +mknod /dev/usb/hiddev4 c 180 100 +mknod /dev/usb/hiddev5 c 180 101 +mknod /dev/usb/hiddev6 c 180 102 +mknod /dev/usb/hiddev7 c 180 103 +mknod /dev/usb/hiddev8 c 180 104 +mknod /dev/usb/hiddev9 c 180 105 +mknod /dev/usb/hiddev10 c 180 106 +mknod /dev/usb/hiddev11 c 180 107 +mknod /dev/usb/hiddev12 c 180 108 +mknod /dev/usb/hiddev13 c 180 109 +mknod /dev/usb/hiddev14 c 180 110 +mknod /dev/usb/hiddev15 c 180 111 + +So you point your hiddev compliant user-space program at the correct +interface for your device, and it all just works. + +Assuming that you have a hiddev compliant user-space program, of +course. If you need to write one, read on. + + +THE HIDDEV API +This description should be read in conjunction with the HID +specification, freely available from http://www.usb.org, and +conveniently linked of http://www.linux-usb.org. + +The hiddev API uses a read() interface, and a set of ioctl() calls. + +HID devices exchange data with the host computer using data +bundles called "reports". Each report is divided into "fields", +each of which can have one or more "usages". In the hid-core, +each one of these usages has a single signed 32 bit value. + +read(): +This is the event interface. When the HID device's state changes, +it performs an interrupt transfer containing a report which contains +the changed value. The hid-core.c module parses the report, and +returns to hiddev.c the individual usages that have changed within +the report. In its basic mode, the hiddev will make these individual +usage changes available to the reader using a struct hiddev_event: + + struct hiddev_event { + unsigned hid; + signed int value; + }; + +containing the HID usage identifier for the status that changed, and +the value that it was changed to. Note that the structure is defined +within , along with some other useful #defines and +structures. The HID usage identifier is a composite of the HID usage +page shifted to the 16 high order bits ORed with the usage code. The +behavior of the read() function can be modified using the HIDIOCSFLAG +ioctl() described below. + + +ioctl(): +This is the control interface. There are a number of controls: + +HIDIOCGVERSION - int (read) +Gets the version code out of the hiddev driver. + +HIDIOCAPPLICATION - (none) +This ioctl call returns the HID application usage associated with the +hid device. The third argument to ioctl() specifies which application +index to get. This is useful when the device has more than one +application collection. If the index is invalid (greater or equal to +the number of application collections this device has) the ioctl +returns -1. You can find out beforehand how many application +collections the device has from the num_applications field from the +hiddev_devinfo structure. + +HIDIOCGCOLLECTIONINFO - struct hiddev_collection_info (read/write) +This returns a superset of the information above, providing not only +application collections, but all the collections the device has. It +also returns the level the collection lives in the hierarchy. +The user passes in a hiddev_collection_info struct with the index +field set to the index that should be returned. The ioctl fills in +the other fields. If the index is larger than the last collection +index, the ioctl returns -1 and sets errno to -EINVAL. + +HIDIOCGDEVINFO - struct hiddev_devinfo (read) +Gets a hiddev_devinfo structure which describes the device. + +HIDIOCGSTRING - struct hiddev_string_descriptor (read/write) +Gets a string descriptor from the device. The caller must fill in the +"index" field to indicate which descriptor should be returned. + +HIDIOCINITREPORT - (none) +Instructs the kernel to retrieve all input and feature report values +from the device. At this point, all the usage structures will contain +current values for the device, and will maintain it as the device +changes. Note that the use of this ioctl is unnecessary in general, +since later kernels automatically initialize the reports from the +device at attach time. + +HIDIOCGNAME - string (variable length) +Gets the device name + +HIDIOCGREPORT - struct hiddev_report_info (write) +Instructs the kernel to get a feature or input report from the device, +in order to selectively update the usage structures (in contrast to +INITREPORT). + +HIDIOCSREPORT - struct hiddev_report_info (write) +Instructs the kernel to send a report to the device. This report can +be filled in by the user through HIDIOCSUSAGE calls (below) to fill in +individual usage values in the report before sending the report in full +to the device. + +HIDIOCGREPORTINFO - struct hiddev_report_info (read/write) +Fills in a hiddev_report_info structure for the user. The report is +looked up by type (input, output or feature) and id, so these fields +must be filled in by the user. The ID can be absolute -- the actual +report id as reported by the device -- or relative -- +HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT | +report_id) for the next report after report_id. Without a-priori +information about report ids, the right way to use this ioctl is to +use the relative IDs above to enumerate the valid IDs. The ioctl +returns non-zero when there is no more next ID. The real report ID is +filled into the returned hiddev_report_info structure. + +HIDIOCGFIELDINFO - struct hiddev_field_info (read/write) +Returns the field information associated with a report in a +hiddev_field_info structure. The user must fill in report_id and +report_type in this structure, as above. The field_index should also +be filled in, which should be a number from 0 and maxfield-1, as +returned from a previous HIDIOCGREPORTINFO call. + +HIDIOCGUCODE - struct hiddev_usage_ref (read/write) +Returns the usage_code in a hiddev_usage_ref structure, given that +given its report type, report id, field index, and index within the +field have already been filled into the structure. + +HIDIOCGUSAGE - struct hiddev_usage_ref (read/write) +Returns the value of a usage in a hiddev_usage_ref structure. The +usage to be retrieved can be specified as above, or the user can +choose to fill in the report_type field and specify the report_id as +HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be +filled in with the report and field information associated with this +usage if it is found. + +HIDIOCSUSAGE - struct hiddev_usage_ref (write) +Sets the value of a usage in an output report. The user fills in +the hiddev_usage_ref structure as above, but additionally fills in +the value field. + +HIDIOGCOLLECTIONINDEX - struct hiddev_usage_ref (write) +Returns the collection index associated with this usage. This +indicates where in the collection hierarchy this usage sits. + +HIDIOCGFLAG - int (read) +HIDIOCSFLAG - int (write) +These operations respectively inspect and replace the mode flags +that influence the read() call above. The flags are as follows: + + HIDDEV_FLAG_UREF - read() calls will now return + struct hiddev_usage_ref instead of struct hiddev_event. + This is a larger structure, but in situations where the + device has more than one usage in its reports with the + same usage code, this mode serves to resolve such + ambiguity. + + HIDDEV_FLAG_REPORT - This flag can only be used in conjunction + with HIDDEV_FLAG_UREF. With this flag set, when the device + sends a report, a struct hiddev_usage_ref will be returned + to read() filled in with the report_type and report_id, but + with field_index set to FIELD_INDEX_NONE. This serves as + additional notification when the device has sent a report. diff --git a/Documentation/hid/hidraw.txt b/Documentation/hid/hidraw.txt new file mode 100644 index 0000000..029e6cb --- /dev/null +++ b/Documentation/hid/hidraw.txt @@ -0,0 +1,119 @@ + HIDRAW - Raw Access to USB and Bluetooth Human Interface Devices + ================================================================== + +The hidraw driver provides a raw interface to USB and Bluetooth Human +Interface Devices (HIDs). It differs from hiddev in that reports sent and +received are not parsed by the HID parser, but are sent to and received from +the device unmodified. + +Hidraw should be used if the userspace application knows exactly how to +communicate with the hardware device, and is able to construct the HID +reports manually. This is often the case when making userspace drivers for +custom HID devices. + +Hidraw is also useful for communicating with non-conformant HID devices +which send and receive data in a way that is inconsistent with their report +descriptors. Because hiddev parses reports which are sent and received +through it, checking them against the device's report descriptor, such +communication with these non-conformant devices is impossible using hiddev. +Hidraw is the only alternative, short of writing a custom kernel driver, for +these non-conformant devices. + +A benefit of hidraw is that its use by userspace applications is independent +of the underlying hardware type. Currently, Hidraw is implemented for USB +and Bluetooth. In the future, as new hardware bus types are developed which +use the HID specification, hidraw will be expanded to add support for these +new bus types. + +Hidraw uses a dynamic major number, meaning that udev should be relied on to +create hidraw device nodes. Udev will typically create the device nodes +directly under /dev (eg: /dev/hidraw0). As this location is distribution- +and udev rule-dependent, applications should use libudev to locate hidraw +devices attached to the system. There is a tutorial on libudev with a +working example at: + http://www.signal11.us/oss/udev/ + +The HIDRAW API +--------------- + +read() +------- +read() will read a queued report received from the HID device. On USB +devices, the reports read using read() are the reports sent from the device +on the INTERRUPT IN endpoint. By default, read() will block until there is +a report available to be read. read() can be made non-blocking, by passing +the O_NONBLOCK flag to open(), or by setting the O_NONBLOCK flag using +fcntl(). + +On a device which uses numbered reports, the first byte of the returned data +will be the report number; the report data follows, beginning in the second +byte. For devices which do not use numbered reports, the report data +will begin at the first byte. + +write() +-------- +The write() function will write a report to the device. For USB devices, if +the device has an INTERRUPT OUT endpoint, the report will be sent on that +endpoint. If it does not, the report will be sent over the control endpoint, +using a SET_REPORT transfer. + +The first byte of the buffer passed to write() should be set to the report +number. If the device does not use numbered reports, the first byte should +be set to 0. The report data itself should begin at the second byte. + +ioctl() +-------- +Hidraw supports the following ioctls: + +HIDIOCGRDESCSIZE: Get Report Descriptor Size +This ioctl will get the size of the device's report descriptor. + +HIDIOCGRDESC: Get Report Descriptor +This ioctl returns the device's report descriptor using a +hidraw_report_descriptor struct. Make sure to set the size field of the +hidraw_report_descriptor struct to the size returned from HIDIOCGRDESCSIZE. + +HIDIOCGRAWINFO: Get Raw Info +This ioctl will return a hidraw_devinfo struct containing the bus type, the +vendor ID (VID), and product ID (PID) of the device. The bus type can be one +of: + BUS_USB + BUS_HIL + BUS_BLUETOOTH + BUS_VIRTUAL +which are defined in linux/input.h. + +HIDIOCGRAWNAME(len): Get Raw Name +This ioctl returns a string containing the vendor and product strings of +the device. The returned string is Unicode, UTF-8 encoded. + +HIDIOCGRAWPHYS(len): Get Physical Address +This ioctl returns a string representing the physical address of the device. +For USB devices, the string contains the physical path to the device (the +USB controller, hubs, ports, etc). For Bluetooth devices, the string +contains the hardware (MAC) address of the device. + +HIDIOCSFEATURE(len): Send a Feature Report +This ioctl will send a feature report to the device. Per the HID +specification, feature reports are always sent using the control endpoint. +Set the first byte of the supplied buffer to the report number. For devices +which do not use numbered reports, set the first byte to 0. The report data +begins in the second byte. Make sure to set len accordingly, to one more +than the length of the report (to account for the report number). + +HIDIOCGFEATURE(len): Get a Feature Report +This ioctl will request a feature report from the device using the control +endpoint. The first byte of the supplied buffer should be set to the report +number of the requested report. For devices which do not use numbered +reports, set the first byte to 0. The report will be returned starting at +the first byte of the buffer (ie: the report number is not returned). + +Example +--------- +In samples/, find hid-example.c, which shows examples of read(), write(), +and all the ioctls for hidraw. The code may be used by anyone for any +purpose, and can serve as a starting point for developing applications using +hidraw. + +Document by: + Alan Ott , Signal 11 Software diff --git a/Documentation/hwmon/adm1275 b/Documentation/hwmon/adm1275 new file mode 100644 index 0000000..6a3a647 --- /dev/null +++ b/Documentation/hwmon/adm1275 @@ -0,0 +1,60 @@ +Kernel driver adm1275 +===================== + +Supported chips: + * Analog Devices ADM1275 + Prefix: 'adm1275' + Addresses scanned: - + Datasheet: www.analog.com/static/imported-files/data_sheets/ADM1275.pdf + +Author: Guenter Roeck + + +Description +----------- + +This driver supports hardware montoring for Analog Devices ADM1275 Hot-Swap +Controller and Digital Power Monitor. + +The ADM1275 is a hot-swap controller that allows a circuit board to be removed +from or inserted into a live backplane. It also features current and voltage +readback via an integrated 12-bit analog-to-digital converter (ADC), accessed +using a PMBus. interface. + +The driver is a client driver to the core PMBus driver. Please see +Documentation/hwmon/pmbus for details on PMBus client drivers. + + +Usage Notes +----------- + +This driver does not auto-detect devices. You will have to instantiate the +devices explicitly. Please see Documentation/i2c/instantiating-devices for +details. + + +Platform data support +--------------------- + +The driver supports standard PMBus driver platform data. Please see +Documentation/hwmon/pmbus for details. + + +Sysfs entries +------------- + +The following attributes are supported. Limits are read-write; all other +attributes are read-only. + +in1_label "vin1" or "vout1" depending on chip variant and + configuration. +in1_input Measured voltage. From READ_VOUT register. +in1_min Minumum Voltage. From VOUT_UV_WARN_LIMIT register. +in1_max Maximum voltage. From VOUT_OV_WARN_LIMIT register. +in1_min_alarm Voltage low alarm. From VOLTAGE_UV_WARNING status. +in1_max_alarm Voltage high alarm. From VOLTAGE_OV_WARNING status. + +curr1_label "iout1" +curr1_input Measured current. From READ_IOUT register. +curr1_max Maximum current. From IOUT_OC_WARN_LIMIT register. +curr1_max_alarm Current high alarm. From IOUT_OC_WARN_LIMIT register. diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp index 25568f8..f85e913 100644 --- a/Documentation/hwmon/coretemp +++ b/Documentation/hwmon/coretemp @@ -15,8 +15,13 @@ Author: Rudolf Marek Description ----------- +This driver permits reading the DTS (Digital Temperature Sensor) embedded +inside Intel CPUs. This driver can read both the per-core and per-package +temperature using the appropriate sensors. The per-package sensor is new; +as of now, it is present only in the SandyBridge platform. The driver will +show the temperature of all cores inside a package under a single device +directory inside hwmon. -This driver permits reading temperature sensor embedded inside Intel Core CPU. Temperature is measured in degrees Celsius and measurement resolution is 1 degree C. Valid temperatures are from 0 to TjMax degrees C, because the actual value of temperature register is in fact a delta from TjMax. @@ -27,13 +32,15 @@ mechanism will perform actions to forcibly cool down the processor. Alarm may be raised, if the temperature grows enough (more than TjMax) to trigger the Out-Of-Spec bit. Following table summarizes the exported sysfs files: -temp1_input - Core temperature (in millidegrees Celsius). -temp1_max - All cooling devices should be turned on (on Core2). -temp1_crit - Maximum junction temperature (in millidegrees Celsius). -temp1_crit_alarm - Set when Out-of-spec bit is set, never clears. +All Sysfs entries are named with their core_id (represented here by 'X'). +tempX_input - Core temperature (in millidegrees Celsius). +tempX_max - All cooling devices should be turned on (on Core2). +tempX_crit - Maximum junction temperature (in millidegrees Celsius). +tempX_crit_alarm - Set when Out-of-spec bit is set, never clears. Correct CPU operation is no longer guaranteed. -temp1_label - Contains string "Core X", where X is processor - number. +tempX_label - Contains string "Core X", where X is processor + number. For Package temp, this will be "Physical id Y", + where Y is the package number. The TjMax temperature is set to 85 degrees C if undocumented model specific register (UMSR) 0xee has bit 30 set. If not the TjMax is 100 degrees C as diff --git a/Documentation/hwmon/emc6w201 b/Documentation/hwmon/emc6w201 new file mode 100644 index 0000000..32f355a --- /dev/null +++ b/Documentation/hwmon/emc6w201 @@ -0,0 +1,42 @@ +Kernel driver emc6w201 +====================== + +Supported chips: + * SMSC EMC6W201 + Prefix: 'emc6w201' + Addresses scanned: I2C 0x2c, 0x2d, 0x2e + Datasheet: Not public + +Author: Jean Delvare + + +Description +----------- + +From the datasheet: + +"The EMC6W201 is an environmental monitoring device with automatic fan +control capability and enhanced system acoustics for noise suppression. +This ACPI compliant device provides hardware monitoring for up to six +voltages (including its own VCC) and five external thermal sensors, +measures the speed of up to five fans, and controls the speed of +multiple DC fans using three Pulse Width Modulator (PWM) outputs. Note +that it is possible to control more than three fans by connecting two +fans to one PWM output. The EMC6W201 will be available in a 36-pin +QFN package." + +The device is functionally close to the EMC6D100 series, but is +register-incompatible. + +The driver currently only supports the monitoring of the voltages, +temperatures and fan speeds. Limits can be changed. Alarms are not +supported, and neither is fan speed control. + + +Known Systems With EMC6W201 +--------------------------- + +The EMC6W201 is a rare device, only found on a few systems, made in +2005 and 2006. Known systems with this device: +* Dell Precision 670 workstation +* Gigabyte 2CEWH mainboard diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg index df02245..84d2623 100644 --- a/Documentation/hwmon/f71882fg +++ b/Documentation/hwmon/f71882fg @@ -6,6 +6,10 @@ Supported chips: Prefix: 'f71808e' Addresses scanned: none, address read from Super I/O config space Datasheet: Not public + * Fintek F71808A + Prefix: 'f71808a' + Addresses scanned: none, address read from Super I/O config space + Datasheet: Not public * Fintek F71858FG Prefix: 'f71858fg' Addresses scanned: none, address read from Super I/O config space diff --git a/Documentation/hwmon/fam15h_power b/Documentation/hwmon/fam15h_power new file mode 100644 index 0000000..a92918e --- /dev/null +++ b/Documentation/hwmon/fam15h_power @@ -0,0 +1,37 @@ +Kernel driver fam15h_power +========================== + +Supported chips: +* AMD Family 15h Processors + + Prefix: 'fam15h_power' + Addresses scanned: PCI space + Datasheets: + BIOS and Kernel Developer's Guide (BKDG) For AMD Family 15h Processors + (not yet published) + +Author: Andreas Herrmann + +Description +----------- + +This driver permits reading of registers providing power information +of AMD Family 15h processors. + +For AMD Family 15h processors the following power values can be +calculated using different processor northbridge function registers: + +* BasePwrWatts: Specifies in watts the maximum amount of power + consumed by the processor for NB and logic external to the core. +* ProcessorPwrWatts: Specifies in watts the maximum amount of power + the processor can support. +* CurrPwrWatts: Specifies in watts the current amount of power being + consumed by the processor. + +This driver provides ProcessorPwrWatts and CurrPwrWatts: +* power1_crit (ProcessorPwrWatts) +* power1_input (CurrPwrWatts) + +On multi-node processors the calculated value is for the entire +package and not for a single node. Thus the driver creates sysfs +attributes only for internal node0 of a multi-node processor. diff --git a/Documentation/hwmon/k10temp b/Documentation/hwmon/k10temp index d2b56a4..0393c89 100644 --- a/Documentation/hwmon/k10temp +++ b/Documentation/hwmon/k10temp @@ -11,6 +11,7 @@ Supported chips: Socket S1G2: Athlon (X2), Sempron (X2), Turion X2 (Ultra) * AMD Family 12h processors: "Llano" * AMD Family 14h processors: "Brazos" (C/E/G-Series) +* AMD Family 15h processors: "Bulldozer" Prefix: 'k10temp' Addresses scanned: PCI space @@ -40,7 +41,7 @@ Description ----------- This driver permits reading of the internal temperature sensor of AMD -Family 10h/11h/12h/14h processors. +Family 10h/11h/12h/14h/15h processors. All these processors have a sensor, but on those for Socket F or AM2+, the sensor may return inconsistent values (erratum 319). The driver diff --git a/Documentation/hwmon/max16065 b/Documentation/hwmon/max16065 new file mode 100644 index 0000000..44b4f61 --- /dev/null +++ b/Documentation/hwmon/max16065 @@ -0,0 +1,98 @@ +Kernel driver max16065 +====================== + +Supported chips: + * Maxim MAX16065, MAX16066 + Prefixes: 'max16065', 'max16066' + Addresses scanned: - + Datasheet: + http://datasheets.maxim-ic.com/en/ds/MAX16065-MAX16066.pdf + * Maxim MAX16067 + Prefix: 'max16067' + Addresses scanned: - + Datasheet: + http://datasheets.maxim-ic.com/en/ds/MAX16067.pdf + * Maxim MAX16068 + Prefix: 'max16068' + Addresses scanned: - + Datasheet: + http://datasheets.maxim-ic.com/en/ds/MAX16068.pdf + * Maxim MAX16070/MAX16071 + Prefixes: 'max16070', 'max16071' + Addresses scanned: - + Datasheet: + http://datasheets.maxim-ic.com/en/ds/MAX16070-MAX16071.pdf + + +Author: Guenter Roeck + + +Description +----------- + +[From datasheets] The MAX16065/MAX16066 flash-configurable system managers +monitor and sequence multiple system voltages. The MAX16065/MAX16066 can also +accurately monitor (+/-2.5%) one current channel using a dedicated high-side +current-sense amplifier. The MAX16065 manages up to twelve system voltages +simultaneously, and the MAX16066 manages up to eight supply voltages. + +The MAX16067 flash-configurable system manager monitors and sequences multiple +system voltages. The MAX16067 manages up to six system voltages simultaneously. + +The MAX16068 flash-configurable system manager monitors and manages up to six +system voltages simultaneously. + +The MAX16070/MAX16071 flash-configurable system monitors supervise multiple +system voltages. The MAX16070/MAX16071 can also accurately monitor (+/-2.5%) +one current channel using a dedicated high-side current-sense amplifier. The +MAX16070 monitors up to twelve system voltages simultaneously, and the MAX16071 +monitors up to eight supply voltages. + +Each monitored channel has its own low and high critical limits. MAX16065, +MAX16066, MAX16070, and MAX16071 support an additional limit which is +configurable as either low or high secondary limit. MAX16065, MAX16066, +MAX16070, and MAX16071 also support supply current monitoring. + + +Usage Notes +----------- + +This driver does not probe for devices, since there is no register which +can be safely used to identify the chip. You will have to instantiate +the devices explicitly. Please see Documentation/i2c/instantiating-devices for +details. + + +Sysfs entries +------------- + +in[0-11]_input Input voltage measurements. + +in12_input Voltage on CSP (Current Sense Positive) pin. + Only if the chip supports current sensing and if + current sensing is enabled. + +in[0-11]_min Low warning limit. + Supported on MAX16065, MAX16066, MAX16070, and MAX16071 + only. + +in[0-11]_max High warning limit. + Supported on MAX16065, MAX16066, MAX16070, and MAX16071 + only. + + Either low or high warning limits are supported + (depending on chip configuration), but not both. + +in[0-11]_lcrit Low critical limit. + +in[0-11]_crit High critical limit. + +in[0-11]_alarm Input voltage alarm. + +curr1_input Current sense input; only if the chip supports current + sensing and if current sensing is enabled. + Displayed current assumes 0.001 Ohm current sense + resistor. + +curr1_alarm Overcurrent alarm; only if the chip supports current + sensing and if current sensing is enabled. diff --git a/Documentation/hwmon/max6642 b/Documentation/hwmon/max6642 new file mode 100644 index 0000000..afbd3e4 --- /dev/null +++ b/Documentation/hwmon/max6642 @@ -0,0 +1,21 @@ +Kernel driver max6642 +===================== + +Supported chips: + * Maxim MAX6642 + Prefix: 'max6642' + Addresses scanned: I2C 0x48-0x4f + Datasheet: Publicly available at the Maxim website + http://datasheets.maxim-ic.com/en/ds/MAX6642.pdf + +Authors: + Per Dalen + +Description +----------- + +The MAX6642 is a digital temperature sensor. It senses its own temperature as +well as the temperature on one external diode. + +All temperature values are given in degrees Celsius. Resolution +is 0.25 degree for the local temperature and for the remote temperature. diff --git a/Documentation/hwmon/max6650 b/Documentation/hwmon/max6650 index c565650..58d9644 100644 --- a/Documentation/hwmon/max6650 +++ b/Documentation/hwmon/max6650 @@ -2,9 +2,13 @@ Kernel driver max6650 ===================== Supported chips: - * Maxim 6650 / 6651 + * Maxim MAX6650 Prefix: 'max6650' - Addresses scanned: I2C 0x1b, 0x1f, 0x48, 0x4b + Addresses scanned: none + Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6650-MAX6651.pdf + * Maxim MAX6651 + Prefix: 'max6651' + Addresses scanned: none Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6650-MAX6651.pdf Authors: @@ -15,10 +19,10 @@ Authors: Description ----------- -This driver implements support for the Maxim 6650/6651 +This driver implements support for the Maxim MAX6650 and MAX6651. -The 2 devices are very similar, but the Maxim 6550 has a reduced feature -set, e.g. only one fan-input, instead of 4 for the 6651. +The 2 devices are very similar, but the MAX6550 has a reduced feature +set, e.g. only one fan-input, instead of 4 for the MAX6651. The driver is not able to distinguish between the 2 devices. @@ -36,6 +40,13 @@ fan1_div rw sets the speed range the inputs can handle. Legal values are 1, 2, 4, and 8. Use lower values for faster fans. +Usage notes +----------- + +This driver does not auto-detect devices. You will have to instantiate the +devices explicitly. Please see Documentation/i2c/instantiating-devices for +details. + Module parameters ----------------- diff --git a/Documentation/hwmon/pkgtemp b/Documentation/hwmon/pkgtemp deleted file mode 100644 index c8e1fb0..0000000 --- a/Documentation/hwmon/pkgtemp +++ /dev/null @@ -1,36 +0,0 @@ -Kernel driver pkgtemp -====================== - -Supported chips: - * Intel family - Prefix: 'pkgtemp' - CPUID: - Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual - Volume 3A: System Programming Guide - -Author: Fenghua Yu - -Description ------------ - -This driver permits reading package level temperature sensor embedded inside -Intel CPU package. The sensors can be in core, uncore, memory controller, or -other components in a package. The feature is first implemented in Intel Sandy -Bridge platform. - -Temperature is measured in degrees Celsius and measurement resolution is -1 degree C. Valid temperatures are from 0 to TjMax degrees C, because the actual -value of temperature register is in fact a delta from TjMax. - -Temperature known as TjMax is the maximum junction temperature of package. -We get this from MSR_IA32_TEMPERATURE_TARGET. If the MSR is not accessible, -we define TjMax as 100 degrees Celsius. At this temperature, protection -mechanism will perform actions to forcibly cool down the package. Alarm -may be raised, if the temperature grows enough (more than TjMax) to trigger -the Out-Of-Spec bit. Following table summarizes the exported sysfs files: - -temp1_input - Package temperature (in millidegrees Celsius). -temp1_max - All cooling devices should be turned on. -temp1_crit - Maximum junction temperature (in millidegrees Celsius). -temp1_crit_alarm - Set when Out-of-spec bit is set, never clears. - Correct CPU operation is no longer guaranteed. diff --git a/Documentation/hwmon/sht15 b/Documentation/hwmon/sht15 new file mode 100644 index 0000000..02850bd --- /dev/null +++ b/Documentation/hwmon/sht15 @@ -0,0 +1,74 @@ +Kernel driver sht15 +=================== + +Authors: + * Wouter Horre + * Jonathan Cameron + * Vivien Didelot + * Jerome Oufella + +Supported chips: + * Sensirion SHT10 + Prefix: 'sht10' + + * Sensirion SHT11 + Prefix: 'sht11' + + * Sensirion SHT15 + Prefix: 'sht15' + + * Sensirion SHT71 + Prefix: 'sht71' + + * Sensirion SHT75 + Prefix: 'sht75' + +Datasheet: Publicly available at the Sensirion website +http://www.sensirion.ch/en/pdf/product_information/Datasheet-humidity-sensor-SHT1x.pdf + +Description +----------- + +The SHT10, SHT11, SHT15, SHT71, and SHT75 are humidity and temperature +sensors. + +The devices communicate using two GPIO lines. + +Supported resolutions for the measurements are 14 bits for temperature and 12 +bits for humidity, or 12 bits for temperature and 8 bits for humidity. + +The humidity calibration coefficients are programmed into an OTP memory on the +chip. These coefficients are used to internally calibrate the signals from the +sensors. Disabling the reload of those coefficients allows saving 10ms for each +measurement and decrease power consumption, while loosing on precision. + +Some options may be set directly in the sht15_platform_data structure +or via sysfs attributes. + +Notes: + * The regulator supply name is set to "vcc". + * If a CRC validation fails, a soft reset command is sent, which resets + status register to its hardware default value, but the driver will try to + restore the previous device configuration. + +Platform data +------------- + +* checksum: + set it to true to enable CRC validation of the readings (default to false). +* no_otp_reload: + flag to indicate not to reload from OTP (default to false). +* low_resolution: + flag to indicate the temp/humidity resolution to use (default to false). + +Sysfs interface +--------------- + +* temp1_input: temperature input +* humidity1_input: humidity input +* heater_enable: write 1 in this attribute to enable the on-chip heater, + 0 to disable it. Be careful not to enable the heater + for too long. +* temp1_fault: if 1, this means that the voltage is low (below 2.47V) and + measurement may be invalid. +* humidity1_fault: same as temp1_fault. diff --git a/Documentation/hwmon/ucd9000 b/Documentation/hwmon/ucd9000 new file mode 100644 index 0000000..40ca6db --- /dev/null +++ b/Documentation/hwmon/ucd9000 @@ -0,0 +1,110 @@ +Kernel driver ucd9000 +===================== + +Supported chips: + * TI UCD90120, UCD90124, UCD9090, and UCD90910 + Prefixes: 'ucd90120', 'ucd90124', 'ucd9090', 'ucd90910' + Addresses scanned: - + Datasheets: + http://focus.ti.com/lit/ds/symlink/ucd90120.pdf + http://focus.ti.com/lit/ds/symlink/ucd90124.pdf + http://focus.ti.com/lit/ds/symlink/ucd9090.pdf + http://focus.ti.com/lit/ds/symlink/ucd90910.pdf + +Author: Guenter Roeck + + +Description +----------- + +From datasheets: + +The UCD90120 Power Supply Sequencer and System Health Monitor monitors and +sequences up to 12 independent voltage rails. The device integrates a 12-bit +ADC with a 2.5V internal reference for monitoring up to 13 power supply voltage, +current, or temperature inputs. + +The UCD90124 is a 12-rail PMBus/I2C addressable power-supply sequencer and +system-health monitor. The device integrates a 12-bit ADC for monitoring up to +13 power-supply voltage, current, or temperature inputs. Twenty-six GPIO pins +can be used for power supply enables, power-on reset signals, external +interrupts, cascading, or other system functions. Twelve of these pins offer PWM +functionality. Using these pins, the UCD90124 offers support for fan control, +margining, and general-purpose PWM functions. + +The UCD9090 is a 10-rail PMBus/I2C addressable power-supply sequencer and +monitor. The device integrates a 12-bit ADC for monitoring up to 10 power-supply +voltage inputs. Twenty-three GPIO pins can be used for power supply enables, +power-on reset signals, external interrupts, cascading, or other system +functions. Ten of these pins offer PWM functionality. Using these pins, the +UCD9090 offers support for margining, and general-purpose PWM functions. + +The UCD90910 is a ten-rail I2C / PMBus addressable power-supply sequencer and +system-health monitor. The device integrates a 12-bit ADC for monitoring up to +13 power-supply voltage, current, or temperature inputs. + +This driver is a client driver to the core PMBus driver. Please see +Documentation/hwmon/pmbus for details on PMBus client drivers. + + +Usage Notes +----------- + +This driver does not auto-detect devices. You will have to instantiate the +devices explicitly. Please see Documentation/i2c/instantiating-devices for +details. + + +Platform data support +--------------------- + +The driver supports standard PMBus driver platform data. Please see +Documentation/hwmon/pmbus for details. + + +Sysfs entries +------------- + +The following attributes are supported. Limits are read-write; all other +attributes are read-only. + +in[1-12]_label "vout[1-12]". +in[1-12]_input Measured voltage. From READ_VOUT register. +in[1-12]_min Minumum Voltage. From VOUT_UV_WARN_LIMIT register. +in[1-12]_max Maximum voltage. From VOUT_OV_WARN_LIMIT register. +in[1-12]_lcrit Critical minumum Voltage. VOUT_UV_FAULT_LIMIT register. +in[1-12]_crit Critical maximum voltage. From VOUT_OV_FAULT_LIMIT register. +in[1-12]_min_alarm Voltage low alarm. From VOLTAGE_UV_WARNING status. +in[1-12]_max_alarm Voltage high alarm. From VOLTAGE_OV_WARNING status. +in[1-12]_lcrit_alarm Voltage critical low alarm. From VOLTAGE_UV_FAULT status. +in[1-12]_crit_alarm Voltage critical high alarm. From VOLTAGE_OV_FAULT status. + +curr[1-12]_label "iout[1-12]". +curr[1-12]_input Measured current. From READ_IOUT register. +curr[1-12]_max Maximum current. From IOUT_OC_WARN_LIMIT register. +curr[1-12]_lcrit Critical minumum output current. From IOUT_UC_FAULT_LIMIT + register. +curr[1-12]_crit Critical maximum current. From IOUT_OC_FAULT_LIMIT register. +curr[1-12]_max_alarm Current high alarm. From IOUT_OC_WARNING status. +curr[1-12]_crit_alarm Current critical high alarm. From IOUT_OC_FAULT status. + + For each attribute index, either voltage or current is + reported, but not both. If voltage or current is + reported depends on the chip configuration. + +temp[1-2]_input Measured temperatures. From READ_TEMPERATURE_1 and + READ_TEMPERATURE_2 registers. +temp[1-2]_max Maximum temperature. From OT_WARN_LIMIT register. +temp[1-2]_crit Critical high temperature. From OT_FAULT_LIMIT register. +temp[1-2]_max_alarm Temperature high alarm. +temp[1-2]_crit_alarm Temperature critical high alarm. + +fan[1-4]_input Fan RPM. +fan[1-4]_alarm Fan alarm. +fan[1-4]_fault Fan fault. + + Fan attributes are only available on chips supporting + fan control (UCD90124, UCD90910). Attribute files are + created only for enabled fans. + Note that even though UCD90910 supports up to 10 fans, + only up to four fans are currently supported. diff --git a/Documentation/hwmon/ucd9200 b/Documentation/hwmon/ucd9200 new file mode 100644 index 0000000..3c58607 --- /dev/null +++ b/Documentation/hwmon/ucd9200 @@ -0,0 +1,112 @@ +Kernel driver ucd9200 +===================== + +Supported chips: + * TI UCD9220, UCD9222, UCD9224, UCD9240, UCD9244, UCD9246, and UCD9248 + Prefixes: 'ucd9220', 'ucd9222', 'ucd9224', 'ucd9240', 'ucd9244', 'ucd9246', + 'ucd9248' + Addresses scanned: - + Datasheets: + http://focus.ti.com/lit/ds/symlink/ucd9220.pdf + http://focus.ti.com/lit/ds/symlink/ucd9222.pdf + http://focus.ti.com/lit/ds/symlink/ucd9224.pdf + http://focus.ti.com/lit/ds/symlink/ucd9240.pdf + http://focus.ti.com/lit/ds/symlink/ucd9244.pdf + http://focus.ti.com/lit/ds/symlink/ucd9246.pdf + http://focus.ti.com/lit/ds/symlink/ucd9248.pdf + +Author: Guenter Roeck + + +Description +----------- + +[From datasheets] UCD9220, UCD9222, UCD9224, UCD9240, UCD9244, UCD9246, and +UCD9248 are multi-rail, multi-phase synchronous buck digital PWM controllers +designed for non-isolated DC/DC power applications. The devices integrate +dedicated circuitry for DC/DC loop management with flash memory and a serial +interface to support configuration, monitoring and management. + +This driver is a client driver to the core PMBus driver. Please see +Documentation/hwmon/pmbus for details on PMBus client drivers. + + +Usage Notes +----------- + +This driver does not auto-detect devices. You will have to instantiate the +devices explicitly. Please see Documentation/i2c/instantiating-devices for +details. + + +Platform data support +--------------------- + +The driver supports standard PMBus driver platform data. Please see +Documentation/hwmon/pmbus for details. + + +Sysfs entries +------------- + +The following attributes are supported. Limits are read-write; all other +attributes are read-only. + +in1_label "vin". +in1_input Measured voltage. From READ_VIN register. +in1_min Minumum Voltage. From VIN_UV_WARN_LIMIT register. +in1_max Maximum voltage. From VIN_OV_WARN_LIMIT register. +in1_lcrit Critical minumum Voltage. VIN_UV_FAULT_LIMIT register. +in1_crit Critical maximum voltage. From VIN_OV_FAULT_LIMIT register. +in1_min_alarm Voltage low alarm. From VIN_UV_WARNING status. +in1_max_alarm Voltage high alarm. From VIN_OV_WARNING status. +in1_lcrit_alarm Voltage critical low alarm. From VIN_UV_FAULT status. +in1_crit_alarm Voltage critical high alarm. From VIN_OV_FAULT status. + +in[2-5]_label "vout[1-4]". +in[2-5]_input Measured voltage. From READ_VOUT register. +in[2-5]_min Minumum Voltage. From VOUT_UV_WARN_LIMIT register. +in[2-5]_max Maximum voltage. From VOUT_OV_WARN_LIMIT register. +in[2-5]_lcrit Critical minumum Voltage. VOUT_UV_FAULT_LIMIT register. +in[2-5]_crit Critical maximum voltage. From VOUT_OV_FAULT_LIMIT register. +in[2-5]_min_alarm Voltage low alarm. From VOLTAGE_UV_WARNING status. +in[2-5]_max_alarm Voltage high alarm. From VOLTAGE_OV_WARNING status. +in[2-5]_lcrit_alarm Voltage critical low alarm. From VOLTAGE_UV_FAULT status. +in[2-5]_crit_alarm Voltage critical high alarm. From VOLTAGE_OV_FAULT status. + +curr1_label "iin". +curr1_input Measured current. From READ_IIN register. + +curr[2-5]_label "iout[1-4]". +curr[2-5]_input Measured current. From READ_IOUT register. +curr[2-5]_max Maximum current. From IOUT_OC_WARN_LIMIT register. +curr[2-5]_lcrit Critical minumum output current. From IOUT_UC_FAULT_LIMIT + register. +curr[2-5]_crit Critical maximum current. From IOUT_OC_FAULT_LIMIT register. +curr[2-5]_max_alarm Current high alarm. From IOUT_OC_WARNING status. +curr[2-5]_crit_alarm Current critical high alarm. From IOUT_OC_FAULT status. + +power1_input Measured input power. From READ_PIN register. +power1_label "pin" + +power[2-5]_input Measured output power. From READ_POUT register. +power[2-5]_label "pout[1-4]" + + The number of output voltage, current, and power + attribute sets is determined by the number of enabled + rails. See chip datasheets for details. + +temp[1-5]_input Measured temperatures. From READ_TEMPERATURE_1 and + READ_TEMPERATURE_2 registers. + temp1 is the chip internal temperature. temp[2-5] are + rail temperatures. temp[2-5] attributes are only + created for enabled rails. See chip datasheets for + details. +temp[1-5]_max Maximum temperature. From OT_WARN_LIMIT register. +temp[1-5]_crit Critical high temperature. From OT_FAULT_LIMIT register. +temp[1-5]_max_alarm Temperature high alarm. +temp[1-5]_crit_alarm Temperature critical high alarm. + +fan1_input Fan RPM. ucd9240 only. +fan1_alarm Fan alarm. ucd9240 only. +fan1_fault Fan fault. ucd9240 only. diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801 index 6df6976..2871fd5 100644 --- a/Documentation/i2c/busses/i2c-i801 +++ b/Documentation/i2c/busses/i2c-i801 @@ -19,6 +19,7 @@ Supported adapters: * Intel 6 Series (PCH) * Intel Patsburg (PCH) * Intel DH89xxCC (PCH) + * Intel Panther Point (PCH) Datasheets: Publicly available at the Intel website On Intel Patsburg and later chipsets, both the normal host SMBus controller diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 5ebf5af..5aa5337 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -38,7 +38,7 @@ static struct i2c_driver foo_driver = { .name = "foo", }, - .id_table = foo_ids, + .id_table = foo_idtable, .probe = foo_probe, .remove = foo_remove, /* if device autodetection is needed: */ diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt index 56941ae..db798af 100644 --- a/Documentation/input/elantech.txt +++ b/Documentation/input/elantech.txt @@ -34,7 +34,8 @@ Contents Currently the Linux Elantech touchpad driver is aware of two different hardware versions unimaginatively called version 1 and version 2. Version 1 is found in "older" laptops and uses 4 bytes per packet. Version 2 seems to -be introduced with the EeePC and uses 6 bytes per packet. +be introduced with the EeePC and uses 6 bytes per packet, and provides +additional features such as position of two fingers, and width of the touch. The driver tries to support both hardware versions and should be compatible with the Xorg Synaptics touchpad driver and its graphical configuration @@ -94,18 +95,44 @@ Currently the Linux Elantech touchpad driver provides two extra knobs under can check these bits and reject any packet that appears corrupted. Using this knob you can bypass that check. - It is not known yet whether hardware version 2 provides the same parity - bits. Hence checking is disabled by default. Currently even turning it on - will do nothing. - + Hardware version 2 does not provide the same parity bits. Only some basic + data consistency checking can be done. For now checking is disabled by + default. Currently even turning it on will do nothing. ///////////////////////////////////////////////////////////////////////////// +3. Differentiating hardware versions + ================================= + +To detect the hardware version, read the version number as param[0].param[1].param[2] + + 4 bytes version: (after the arrow is the name given in the Dell-provided driver) + 02.00.22 => EF013 + 02.06.00 => EF019 +In the wild, there appear to be more versions, such as 00.01.64, 01.00.21, +02.00.00, 02.00.04, 02.00.06. + + 6 bytes: + 02.00.30 => EF113 + 02.08.00 => EF023 + 02.08.XX => EF123 + 02.0B.00 => EF215 + 04.01.XX => Scroll_EF051 + 04.02.XX => EF051 +In the wild, there appear to be more versions, such as 04.03.01, 04.04.11. There +appears to be almost no difference, except for EF113, which does not report +pressure/width and has different data consistency checks. + +Probably all the versions with param[0] <= 01 can be considered as +4 bytes/firmware 1. The versions < 02.08.00, with the exception of 02.00.30, as +4 bytes/firmware 2. Everything >= 02.08.00 can be considered as 6 bytes. + +///////////////////////////////////////////////////////////////////////////// -3. Hardware version 1 +4. Hardware version 1 ================== -3.1 Registers +4.1 Registers ~~~~~~~~~ By echoing a hexadecimal value to a register it contents can be altered. @@ -168,7 +195,7 @@ For example: smart edge activation area width? -3.2 Native relative mode 4 byte packet format +4.2 Native relative mode 4 byte packet format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ byte 0: @@ -226,9 +253,13 @@ byte 3: positive = down -3.3 Native absolute mode 4 byte packet format +4.3 Native absolute mode 4 byte packet format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +EF013 and EF019 have a special behaviour (due to a bug in the firmware?), and +when 1 finger is touching, the first 2 position reports must be discarded. +This counting is reset whenever a different number of fingers is reported. + byte 0: firmware version 1.x: @@ -279,11 +310,11 @@ byte 3: ///////////////////////////////////////////////////////////////////////////// -4. Hardware version 2 +5. Hardware version 2 ================== -4.1 Registers +5.1 Registers ~~~~~~~~~ By echoing a hexadecimal value to a register it contents can be altered. @@ -316,16 +347,41 @@ For example: 0x7f = never i.e. tap again to release) -4.2 Native absolute mode 6 byte packet format +5.2 Native absolute mode 6 byte packet format ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -4.2.1 One finger touch +5.2.1 Parity checking and packet re-synchronization +There is no parity checking, however some consistency checks can be performed. + +For instance for EF113: + SA1= packet[0]; + A1 = packet[1]; + B1 = packet[2]; + SB1= packet[3]; + C1 = packet[4]; + D1 = packet[5]; + if( (((SA1 & 0x3C) != 0x3C) && ((SA1 & 0xC0) != 0x80)) || // check Byte 1 + (((SA1 & 0x0C) != 0x0C) && ((SA1 & 0xC0) == 0x80)) || // check Byte 1 (one finger pressed) + (((SA1 & 0xC0) != 0x80) && (( A1 & 0xF0) != 0x00)) || // check Byte 2 + (((SB1 & 0x3E) != 0x38) && ((SA1 & 0xC0) != 0x80)) || // check Byte 4 + (((SB1 & 0x0E) != 0x08) && ((SA1 & 0xC0) == 0x80)) || // check Byte 4 (one finger pressed) + (((SA1 & 0xC0) != 0x80) && (( C1 & 0xF0) != 0x00)) ) // check Byte 5 + // error detected + +For all the other ones, there are just a few constant bits: + if( ((packet[0] & 0x0C) != 0x04) || + ((packet[3] & 0x0f) != 0x02) ) + // error detected + + +In case an error is detected, all the packets are shifted by one (and packet[0] is discarded). + +5.2.1 One/Three finger touch ~~~~~~~~~~~~~~~~ byte 0: bit 7 6 5 4 3 2 1 0 - n1 n0 . . . . R L + n1 n0 w3 w2 . . R L L, R = 1 when Left, Right mouse button pressed n1..n0 = numbers of fingers on touchpad @@ -333,24 +389,40 @@ byte 0: byte 1: bit 7 6 5 4 3 2 1 0 - . . . . . x10 x9 x8 + p7 p6 p5 p4 . x10 x9 x8 byte 2: bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x4 x2 x1 x0 + x7 x6 x5 x4 x3 x2 x1 x0 x10..x0 = absolute x value (horizontal) byte 3: bit 7 6 5 4 3 2 1 0 - . . . . . . . . + n4 vf w1 w0 . . . b2 + + n4 = set if more than 3 fingers (only in 3 fingers mode) + vf = a kind of flag ? (only on EF123, 0 when finger is over one + of the buttons, 1 otherwise) + w3..w0 = width of the finger touch (not EF113) + b2 (on EF113 only, 0 otherwise), b2.R.L indicates one button pressed: + 0 = none + 1 = Left + 2 = Right + 3 = Middle (Left and Right) + 4 = Forward + 5 = Back + 6 = Another one + 7 = Another one byte 4: bit 7 6 5 4 3 2 1 0 - . . . . . . y9 y8 + p3 p1 p2 p0 . . y9 y8 + + p7..p0 = pressure (not EF113) byte 5: @@ -363,6 +435,11 @@ byte 5: 4.2.2 Two finger touch ~~~~~~~~~~~~~~~~ +Note that the two pairs of coordinates are not exactly the coordinates of the +two fingers, but only the pair of the lower-left and upper-right coordinates. +So the actual fingers might be situated on the other diagonal of the square +defined by these two points. + byte 0: bit 7 6 5 4 3 2 1 0 @@ -376,14 +453,14 @@ byte 1: bit 7 6 5 4 3 2 1 0 ax7 ax6 ax5 ax4 ax3 ax2 ax1 ax0 - ax8..ax0 = first finger absolute x value + ax8..ax0 = lower-left finger absolute x value byte 2: bit 7 6 5 4 3 2 1 0 ay7 ay6 ay5 ay4 ay3 ay2 ay1 ay0 - ay8..ay0 = first finger absolute y value + ay8..ay0 = lower-left finger absolute y value byte 3: @@ -395,11 +472,11 @@ byte 4: bit 7 6 5 4 3 2 1 0 bx7 bx6 bx5 bx4 bx3 bx2 bx1 bx0 - bx8..bx0 = second finger absolute x value + bx8..bx0 = upper-right finger absolute x value byte 5: bit 7 6 5 4 3 2 1 0 by7 by8 by5 by4 by3 by2 by1 by0 - by8..by0 = second finger absolute y value + by8..by0 = upper-right finger absolute y value diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt index 943e8f6..92e68bc 100644 --- a/Documentation/input/rotary-encoder.txt +++ b/Documentation/input/rotary-encoder.txt @@ -9,6 +9,9 @@ peripherals with two wires. The outputs are phase-shifted by 90 degrees and by triggering on falling and rising edges, the turn direction can be determined. +Some encoders have both outputs low in stable states, whereas others also have +a stable state with both outputs high (half-period mode). + The phase diagram of these two outputs look like this: _____ _____ _____ @@ -26,6 +29,8 @@ The phase diagram of these two outputs look like this: |<-------->| one step + |<-->| + one step (half-period mode) For more information, please see http://en.wikipedia.org/wiki/Rotary_encoder @@ -34,6 +39,13 @@ For more information, please see 1. Events / state machine ------------------------- +In half-period mode, state a) and c) above are used to determine the +rotational direction based on the last stable state. Events are reported in +states b) and d) given that the new stable state is different from the last +(i.e. the rotation was not reversed half-way). + +Otherwise, the following apply: + a) Rising edge on channel A, channel B in low state This state is used to recognize a clockwise turn @@ -96,6 +108,7 @@ static struct rotary_encoder_platform_data my_rotary_encoder_info = { .gpio_b = GPIO_ROTARY_B, .inverted_a = 0, .inverted_b = 0, + .half_period = false, }; static struct platform_device rotary_encoder_device = { diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index a0a5d82..3a46e36 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -166,7 +166,6 @@ Code Seq#(hex) Include File Comments 'T' all arch/x86/include/asm/ioctls.h conflict! 'T' C0-DF linux/if_tun.h conflict! 'U' all sound/asound.h conflict! -'U' 00-0F drivers/media/video/uvc/uvcvideo.h conflict! 'U' 00-CF linux/uinput.h conflict! 'U' 00-EF linux/usbdevice_fs.h 'U' C0-CF drivers/bluetooth/hci_uart.h @@ -259,6 +258,7 @@ Code Seq#(hex) Include File Comments 't' 80-8F linux/isdn_ppp.h 't' 90 linux/toshiba.h 'u' 00-1F linux/smb_fs.h gone +'u' 20-3F linux/uvcvideo.h USB video class host driver 'v' 00-1F linux/ext2_fs.h conflict! 'v' 00-1F linux/fs.h conflict! 'v' 00-0F linux/sonypi.h conflict! @@ -304,6 +304,7 @@ Code Seq#(hex) Include File Comments 0xB0 all RATIO devices in development: 0xB1 00-1F PPPoX +0xB3 00 linux/mmc/ioctl.h 0xC0 00-0F linux/usb/iowarrior.h 0xCB 00-1F CBM serial IEC bus in development: diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO index b63301a..050d37f 100644 --- a/Documentation/ja_JP/HOWTO +++ b/Documentation/ja_JP/HOWTO @@ -11,14 +11,14 @@ for non English (read: Japanese) speakers and is not intended as a fork. So if you have any comments or updates for this file, please try to update the original English file first. -Last Updated: 2008/10/24 +Last Updated: 2011/03/31 ================================== これは、 -linux-2.6.28/Documentation/HOWTO +linux-2.6.38/Documentation/HOWTO の和訳です。 翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ > -翻訳日: 2008/10/24 +翻訳日: 2011/3/28 翻訳者: Tsugikazu Shibata 校正者: 松倉さん 小林 雅典さん (Masanori Kobayasi) @@ -256,8 +256,8 @@ Linux カーネルの開発プロセスは現在幾つかの異なるメイン - メインの 2.6.x カーネルツリー - 2.6.x.y -stable カーネルツリー - 2.6.x -git カーネルパッチ - - 2.6.x -mm カーネルパッチ - サブシステム毎のカーネルツリーとパッチ + - 統合テストのための 2.6.x -next カーネルツリー 2.6.x カーネルツリー ----------------- @@ -268,9 +268,9 @@ Linux カーネルの開発プロセスは現在幾つかの異なるメイン - 新しいカーネルがリリースされた直後に、2週間の特別期間が設けられ、 この期間中に、メンテナ達は Linus に大きな差分を送ることができます。 - このような差分は通常 -mm カーネルに数週間含まれてきたパッチです。 + このような差分は通常 -next カーネルに数週間含まれてきたパッチです。 大きな変更は git(カーネルのソース管理ツール、詳細は - http://git.or.cz/ 参照) を使って送るのが好ましいやり方ですが、パッ + http://git-scm.com/ 参照) を使って送るのが好ましいやり方ですが、パッ チファイルの形式のまま送るのでも十分です。 - 2週間後、-rc1 カーネルがリリースされ、この後にはカーネル全体の安定 @@ -333,86 +333,44 @@ git リポジトリで管理されているLinus のカーネルツリーの毎 れは -rc カーネルと比べて、パッチが大丈夫かどうかも確認しないで自動的 に生成されるので、より実験的です。 -2.6.x -mm カーネルパッチ ------------------------- - -Andrew Morton によってリリースされる実験的なカーネルパッチ群です。 -Andrew は個別のサブシステムカーネルツリーとパッチを全て集めてきて -linux-kernel メーリングリストで収集された多数のパッチと同時に一つにま -とめます。 -このツリーは新機能とパッチが検証される場となります。ある期間の間パッチ -が -mm に入って価値を証明されたら、Andrew やサブシステムメンテナが、 -メインラインへ入れるように Linus にプッシュします。 - -メインカーネルツリーに含めるために Linus に送る前に、すべての新しいパッ -チが -mm ツリーでテストされることが強く推奨されています。マージウィン -ドウが開く前に -mm ツリーに現れなかったパッチはメインラインにマージさ -れることは困難になります。 - -これらのカーネルは安定して動作すべきシステムとして使うのには適切ではあ -りませんし、カーネルブランチの中でももっとも動作にリスクが高いものです。 - -もしあなたが、カーネル開発プロセスの支援をしたいと思っているのであれば、 -どうぞこれらのカーネルリリースをテストに使ってみて、そしてもし問題があ -れば、またもし全てが正しく動作したとしても、linux-kernel メーリングリ -ストにフィードバックを提供してください。 - -すべての他の実験的パッチに加えて、これらのカーネルは通常リリース時点で -メインラインの -git カーネルに含まれる全ての変更も含んでいます。 - --mm カーネルは決まったスケジュールではリリースされません、しかし通常幾 -つかの -mm カーネル (1 から 3 が普通)が各-rc カーネルの間にリリースさ -れます。 - サブシステム毎のカーネルツリーとパッチ ------------------------------------------- -カーネルの様々な領域で何が起きているかを見られるようにするため、多くの -カーネルサブシステム開発者は彼らの開発ツリーを公開しています。これらの -ツリーは説明したように -mm カーネルリリースに入れ込まれます。 - -以下はさまざまなカーネルツリーの中のいくつかのリスト- - - git ツリー- - - Kbuild の開発ツリー、Sam Ravnborg - git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git - - - ACPI の開発ツリー、 Len Brown - git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git - - - Block の開発ツリー、Jens Axboe - git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git - - - DRM の開発ツリー、Dave Airlie - git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git - - - ia64 の開発ツリー、Tony Luck - git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git - - - infiniband, Roland Dreier - git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git - - - libata, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git - - - ネットワークドライバ, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git - - - pcmcia, Dominik Brodowski - git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git - - - SCSI, James Bottomley - git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git - - - x86, Ingo Molnar - git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git - - quilt ツリー- - - USB, ドライバコアと I2C, Greg Kroah-Hartman - kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ +それぞれのカーネルサブシステムのメンテナ達は --- そして多くのカーネル +サブシステムの開発者達も --- 各自の最新の開発状況をソースリポジトリに +公開しています。そのため、自分とは異なる領域のカーネルで何が起きている +かを他の人が見られるようになっています。開発が早く進んでいる領域では、 +開発者は自身の投稿がどのサブシステムカーネルツリーを元にしているか質問 +されるので、その投稿とすでに進行中の他の作業との衝突が避けられます。 + +大部分のこれらのリポジトリは git ツリーです。しかしその他の SCM や +quilt シリーズとして公開されているパッチキューも使われています。これら +のサブシステムリポジトリのアドレスは MAINTAINERS ファイルにリストされ +ています。これらの多くは http://git.kernel.org/ で参照することができま +す。 - その他のカーネルツリーは http://git.kernel.org/ と MAINTAINERS ファ - イルに一覧表があります。 +提案されたパッチがこのようなサブシステムツリーにコミットされる前に、メー +リングリストで事前にレビューにかけられます(以下の対応するセクションを +参照)。いくつかのカーネルサブシステムでは、このレビューは patchwork +というツールによって追跡されます。Patchwork は web インターフェイスに +よってパッチ投稿の表示、パッチへのコメント付けや改訂などができ、そして +メンテナはパッチに対して、レビュー中、受付済み、拒否というようなマーク +をつけることができます。大部分のこれらの patchwork のサイトは +http://patchwork.kernel.org/ でリストされています。 + +統合テストのための 2.6.x -next カーネルツリー +--------------------------------------------- + +サブシステムツリーの更新内容がメインラインの 2.6.x ツリーにマージされ +る前に、それらは統合テストされる必要があります。この目的のため、実質的 +に全サブシステムツリーからほぼ毎日プルされてできる特別なテスト用のリ +ポジトリが存在します- + http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git + http://linux.f-seidel.de/linux-next/pmwiki/ + +このやり方によって、-next カーネルは次のマージ機会でどんなものがメイン +ラインカーネルにマージされるか、おおまかなの展望を提供します。-next +カーネルの実行テストを行う冒険好きなテスターは大いに歓迎されます バグレポート ------------- @@ -673,10 +631,9 @@ Linux カーネルコミュニティは、一度に大量のコードの塊を じところからスタートしたのですから。 Paolo Ciarrocchi に感謝、彼は彼の書いた "Development Process" -(http://linux.tar.bz/articles/2.6-development_process)セクショ -ンをこのテキストの原型にすることを許可してくれました。 -Rundy Dunlap と Gerrit Huizenga はメーリングリストでやるべきこととやっ -てはいけないことのリストを提供してくれました。 +(http://lwn.net/Articles/94386/) セクションをこのテキストの原型にする +ことを許可してくれました。Rundy Dunlap と Gerrit Huizenga はメーリング +リストでやるべきこととやってはいけないことのリストを提供してくれました。 以下の人々のレビュー、コメント、貢献に感謝。 Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers, Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt index 7c2a89b..68e32bb 100644 --- a/Documentation/kbuild/kbuild.txt +++ b/Documentation/kbuild/kbuild.txt @@ -201,3 +201,16 @@ KBUILD_ENABLE_EXTRA_GCC_CHECKS -------------------------------------------------- If enabled over the make command line with "W=1", it turns on additional gcc -W... options for more extensive build-time checking. + +KBUILD_BUILD_TIMESTAMP +-------------------------------------------------- +Setting this to a date string overrides the timestamp used in the +UTS_VERSION definition (uname -v in the running kernel). The value has to +be a string that can be passed to date -d. The default value +is the output of the date command at one point during build. + +KBUILD_BUILD_USER, KBUILD_BUILD_HOST +-------------------------------------------------- +These two variables allow to override the user@host string displayed during +boot and in /proc/version. The default value is the output of the commands +whoami and host, respectively. diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index b507d61..44e2649 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -113,6 +113,13 @@ applicable everywhere (see syntax). That will limit the usefulness but on the other hand avoid the illegal configurations all over. +- limiting menu display: "visible if" + This attribute is only applicable to menu blocks, if the condition is + false, the menu block is not displayed to the user (the symbols + contained there can still be selected by other symbols, though). It is + similar to a conditional "prompt" attribude for individual menu + entries. Default value of "visible" is true. + - numerical ranges: "range" ["if" ] This allows to limit the range of possible input values for int and hex symbols. The user can only input a value which is larger than @@ -303,7 +310,8 @@ menu: "endmenu" This defines a menu block, see "Menu structure" above for more -information. The only possible options are dependencies. +information. The only possible options are dependencies and "visible" +attributes. if: @@ -381,3 +389,25 @@ config FOO limits FOO to module (=m) or disabled (=n). +Kconfig symbol existence +~~~~~~~~~~~~~~~~~~~~~~~~ +The following two methods produce the same kconfig symbol dependencies +but differ greatly in kconfig symbol existence (production) in the +generated config file. + +case 1: + +config FOO + tristate "about foo" + depends on BAR + +vs. case 2: + +if BAR +config FOO + tristate "about foo" +endif + +In case 1, the symbol FOO will always exist in the config file (given +no other dependencies). In case 2, the symbol FOO will only exist in +the config file if BAR is enabled. diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt index cca46b1..c313d71 100644 --- a/Documentation/kbuild/kconfig.txt +++ b/Documentation/kbuild/kconfig.txt @@ -48,11 +48,6 @@ KCONFIG_OVERWRITECONFIG If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not break symlinks when .config is a symlink to somewhere else. -KCONFIG_NOTIMESTAMP --------------------------------------------------- -If this environment variable exists and is non-null, the timestamp line -in generated .config files is omitted. - ______________________________________________________________________ Environment variables for '{allyes/allmod/allno/rand}config' diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 5d145bb..47435e5 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -40,11 +40,13 @@ This document describes the Linux kernel Makefiles. --- 6.6 Commands useful for building a boot image --- 6.7 Custom kbuild commands --- 6.8 Preprocessing linker scripts + --- 6.9 Generic header files === 7 Kbuild syntax for exported headers --- 7.1 header-y --- 7.2 objhdr-y --- 7.3 destination-y + --- 7.4 generic-y === 8 Kbuild Variables === 9 Makefile language @@ -499,6 +501,18 @@ more details, with real examples. gcc >= 3.00. For gcc < 3.00, -malign-functions=4 is used. Note: cc-option-align uses KBUILD_CFLAGS for $(CC) options + cc-disable-warning + cc-disable-warning checks if gcc supports a given warning and returns + the commandline switch to disable it. This special function is needed, + because gcc 4.4 and later accept any unknown -Wno-* option and only + warn about it if there is another warning in the source file. + + Example: + KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) + + In the above example, -Wno-unused-but-set-variable will be added to + KBUILD_CFLAGS only if gcc really accepts it. + cc-version cc-version returns a numerical version of the $(CC) compiler version. The format is where both are two digits. So for example @@ -955,6 +969,11 @@ When kbuild executes, the following steps are followed (roughly): used when linking modules. This is often a linker script. From commandline LDFLAGS_MODULE shall be used (see kbuild.txt). + KBUILD_ARFLAGS Options for $(AR) when creating archives + + $(KBUILD_ARFLAGS) set by the top level Makefile to "D" (deterministic + mode) if this option is supported by $(AR). + --- 6.2 Add prerequisites to archprepare: The archprepare: rule is used to list prerequisites that need to be @@ -1209,6 +1228,14 @@ When kbuild executes, the following steps are followed (roughly): The kbuild infrastructure for *lds file are used in several architecture-specific files. +--- 6.9 Generic header files + + The directory include/asm-generic contains the header files + that may be shared between individual architectures. + The recommended approach how to use a generic header file is + to list the file in the Kbuild file. + See "7.4 generic-y" for further info on syntax etc. + === 7 Kbuild syntax for exported headers The kernel include a set of headers that is exported to userspace. @@ -1265,6 +1292,32 @@ See subsequent chapter for the syntax of the Kbuild file. In the example above all exported headers in the Kbuild file will be located in the directory "include/linux" when exported. + --- 7.4 generic-y + + If an architecture uses a verbatim copy of a header from + include/asm-generic then this is listed in the file + arch/$(ARCH)/include/asm/Kbuild like this: + + Example: + #arch/x86/include/asm/Kbuild + generic-y += termios.h + generic-y += rtc.h + + During the prepare phase of the build a wrapper include + file is generated in the directory: + + arch/$(ARCH)/include/generated/asm + + When a header is exported where the architecture uses + the generic header a similar wrapper is generated as part + of the set of exported headers in the directory: + + usr/include/asm + + The generated wrapper will in both cases look like the following: + + Example: termios.h + #include === 8 Kbuild Variables diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a92..5438a2d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -245,7 +245,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, s4_nonvs, sci_force_enable } + old_ordering, nonvs, sci_force_enable } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep @@ -1664,6 +1664,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings + nosmep [X86] + Disable SMEP (Supervisor Mode Execution Protection) + even if it is supported by processor. + noexec32 [X86-64] This affects only 32-bit executables. noexec32=on: enable non-executable mappings (default) @@ -1773,9 +1777,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nosoftlockup [KNL] Disable the soft-lockup detector. - noswapaccount [KNL] Disable accounting of swap in memory resource - controller. (See Documentation/cgroups/memory.txt) - nosync [HW,M68K] Disables sync negotiation for all devices. notsc [BUGS=X86-32] Disable Time Stamp Counter @@ -2581,6 +2582,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. bytes of sense data); c = FIX_CAPACITY (decrease the reported device capacity by one sector); + d = NO_READ_DISC_INFO (don't use + READ_DISC_INFO command); + e = NO_READ_CAPACITY_16 (don't use + READ_CAPACITY_16 command); h = CAPACITY_HEURISTICS (decrease the reported device capacity by one sector if the number is odd); diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt deleted file mode 100644 index 9bef4e4..0000000 --- a/Documentation/kvm/api.txt +++ /dev/null @@ -1,1451 +0,0 @@ -The Definitive KVM (Kernel-based Virtual Machine) API Documentation -=================================================================== - -1. General description - -The kvm API is a set of ioctls that are issued to control various aspects -of a virtual machine. The ioctls belong to three classes - - - System ioctls: These query and set global attributes which affect the - whole kvm subsystem. In addition a system ioctl is used to create - virtual machines - - - VM ioctls: These query and set attributes that affect an entire virtual - machine, for example memory layout. In addition a VM ioctl is used to - create virtual cpus (vcpus). - - Only run VM ioctls from the same process (address space) that was used - to create the VM. - - - vcpu ioctls: These query and set attributes that control the operation - of a single virtual cpu. - - Only run vcpu ioctls from the same thread that was used to create the - vcpu. - -2. File descriptors - -The kvm API is centered around file descriptors. An initial -open("/dev/kvm") obtains a handle to the kvm subsystem; this handle -can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descriptor which can be used to issue VM -ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu -and return a file descriptor pointing to it. Finally, ioctls on a vcpu -fd can be used to control the vcpu, including the important task of -actually running guest code. - -In general file descriptors can be migrated among processes by means -of fork() and the SCM_RIGHTS facility of unix domain socket. These -kinds of tricks are explicitly not supported by kvm. While they will -not cause harm to the host, their actual behavior is not guaranteed by -the API. The only supported use is one virtual machine per process, -and one vcpu per thread. - -3. Extensions - -As of Linux 2.6.22, the KVM ABI has been stabilized: no backward -incompatible change are allowed. However, there is an extension -facility that allows backward-compatible extensions to the API to be -queried and used. - -The extension mechanism is not based on on the Linux version number. -Instead, kvm defines extension identifiers and a facility to query -whether a particular extension identifier is available. If it is, a -set of ioctls is available for application use. - -4. API description - -This section describes ioctls that can be used to control kvm guests. -For each ioctl, the following information is provided along with a -description: - - Capability: which KVM extension provides this ioctl. Can be 'basic', - which means that is will be provided by any kernel that supports - API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which - means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4). - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Type: system, vm, or vcpu. - - Parameters: what parameters are accepted by the ioctl. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - -4.1 KVM_GET_API_VERSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: the constant KVM_API_VERSION (=12) - -This identifies the API version as the stable kvm API. It is not -expected that this number will change. However, Linux 2.6.20 and -2.6.21 report earlier versions; these are not documented and not -supported. Applications should refuse to run if KVM_GET_API_VERSION -returns a value other than 12. If this check passes, all ioctls -described as 'basic' will be available. - -4.2 KVM_CREATE_VM - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: a VM fd that can be used to control the new virtual machine. - -The new VM has no virtual cpus and no memory. An mmap() of a VM fd -will access the virtual machine's physical address space; offset zero -corresponds to guest physical address zero. Use of mmap() on a VM fd -is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is -available. - -4.3 KVM_GET_MSR_INDEX_LIST - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_msr_list (in/out) -Returns: 0 on success; -1 on error -Errors: - E2BIG: the msr index list is to be to fit in the array specified by - the user. - -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - -This ioctl returns the guest msrs that are supported. The list varies -by kvm version and host processor, but does not change otherwise. The -user fills in the size of the indices array in nmsrs, and in return -kvm adjusts nmsrs to reflect the actual number of msrs and fills in -the indices array with their numbers. - -Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are -not returned in the MSR list, as different vcpus can have a different number -of banks, as set via the KVM_X86_SETUP_MCE ioctl. - -4.4 KVM_CHECK_EXTENSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: extension identifier (KVM_CAP_*) -Returns: 0 if unsupported; 1 (or some other positive integer) if supported - -The API allows the application to query about extensions to the core -kvm API. Userspace passes an extension identifier (an integer) and -receives an integer that describes the extension availability. -Generally 0 means no and 1 means yes, but some extensions may report -additional information in the integer return value. - -4.5 KVM_GET_VCPU_MMAP_SIZE - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: size of vcpu mmap area, in bytes - -The KVM_RUN ioctl (cf.) communicates with userspace via a shared -memory region. This ioctl returns the size of that region. See the -KVM_RUN documentation for details. - -4.6 KVM_SET_MEMORY_REGION - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_memory_region (in) -Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - -4.7 KVM_CREATE_VCPU - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: vcpu id (apic id on x86) -Returns: vcpu fd on success, -1 on error - -This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). - -4.8 KVM_GET_DIRTY_LOG (vm ioctl) - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_dirty_log (in/out) -Returns: 0 on success, -1 on error - -/* for KVM_GET_DIRTY_LOG */ -struct kvm_dirty_log { - __u32 slot; - __u32 padding; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -Given a memory slot, return a bitmap containing any pages dirtied -since the last call to this ioctl. Bit 0 is the first page in the -memory slot. Ensure the entire structure is cleared to avoid padding -issues. - -4.9 KVM_SET_MEMORY_ALIAS - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_memory_alias (in) -Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - -4.10 KVM_RUN - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error -Errors: - EINTR: an unmasked signal is pending - -This ioctl is used to run a guest virtual cpu. While there are no -explicit parameters, there is an implicit parameter block that can be -obtained by mmap()ing the vcpu fd at offset 0, with the size given by -KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct -kvm_run' (see below). - -4.11 KVM_GET_REGS - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_regs (out) -Returns: 0 on success, -1 on error - -Reads the general purpose registers from the vcpu. - -/* x86 */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -4.12 KVM_SET_REGS - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_regs (in) -Returns: 0 on success, -1 on error - -Writes the general purpose registers into the vcpu. - -See KVM_GET_REGS for the data structure. - -4.13 KVM_GET_SREGS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_sregs (out) -Returns: 0 on success, -1 on error - -Reads special registers from the vcpu. - -/* x86 */ -struct kvm_sregs { - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -interrupt_bitmap is a bitmap of pending external interrupts. At most -one bit may be set. This interrupt has been acknowledged by the APIC -but not yet injected into the cpu core. - -4.14 KVM_SET_SREGS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_sregs (in) -Returns: 0 on success, -1 on error - -Writes special registers into the vcpu. See KVM_GET_SREGS for the -data structures. - -4.15 KVM_TRANSLATE - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_translation (in/out) -Returns: 0 on success, -1 on error - -Translates a virtual address according to the vcpu's current address -translation mode. - -struct kvm_translation { - /* in */ - __u64 linear_address; - - /* out */ - __u64 physical_address; - __u8 valid; - __u8 writeable; - __u8 usermode; - __u8 pad[5]; -}; - -4.16 KVM_INTERRUPT - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error - -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. - -/* for KVM_INTERRUPT */ -struct kvm_interrupt { - /* in */ - __u32 irq; -}; - -X86: - -Note 'irq' is an interrupt vector, not an interrupt pin or line. - -PPC: - -Queues an external interrupt to be injected. This ioctl is overleaded -with 3 different irq values: - -a) KVM_INTERRUPT_SET - - This injects an edge type external interrupt into the guest once it's ready - to receive interrupts. When injected, the interrupt is done. - -b) KVM_INTERRUPT_UNSET - - This unsets any pending interrupt. - - Only available with KVM_CAP_PPC_UNSET_IRQ. - -c) KVM_INTERRUPT_SET_LEVEL - - This injects a level type external interrupt into the guest context. The - interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET - is triggered. - - Only available with KVM_CAP_PPC_IRQ_LEVEL. - -Note that any value for 'irq' other than the ones stated above is invalid -and incurs unexpected behavior. - -4.17 KVM_DEBUG_GUEST - -Capability: basic -Architectures: none -Type: vcpu ioctl -Parameters: none) -Returns: -1 on error - -Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. - -4.18 KVM_GET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in/out) -Returns: 0 on success, -1 on error - -Reads model-specific registers from the vcpu. Supported msr indices can -be obtained using KVM_GET_MSR_INDEX_LIST. - -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array) and the 'index' member of each array entry. -kvm will fill in the 'data' member. - -4.19 KVM_SET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in) -Returns: 0 on success, -1 on error - -Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the -data structures. - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array), and the 'index' and 'data' members of each -array entry. - -4.20 KVM_SET_CPUID - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid (in) -Returns: 0 on success, -1 on error - -Defines the vcpu responses to the cpuid instruction. Applications -should use the KVM_SET_CPUID2 ioctl if available. - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - -4.21 KVM_SET_SIGNAL_MASK - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_signal_mask (in) -Returns: 0 on success, -1 on error - -Defines which signals are blocked during execution of KVM_RUN. This -signal mask temporarily overrides the threads signal mask. Any -unblocked signal received (except SIGKILL and SIGSTOP, which retain -their traditional behaviour) will cause KVM_RUN to return with -EINTR. - -Note the signal will only be delivered if not blocked by the original -signal mask. - -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; -}; - -4.22 KVM_GET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (out) -Returns: 0 on success, -1 on error - -Reads the floating point state from the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -4.23 KVM_SET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (in) -Returns: 0 on success, -1 on error - -Writes the floating point state to the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -4.24 KVM_CREATE_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Creates an interrupt controller model in the kernel. On x86, creates a virtual -ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a -local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 -only go to the IOAPIC. On ia64, a IOSAPIC is created. - -4.25 KVM_IRQ_LINE - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irq_level -Returns: 0 on success, -1 on error - -Sets the level of a GSI input to the interrupt controller model in the kernel. -Requires that an interrupt controller model has been previously created with -KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level -to be set to 1 and then back to 0. - -struct kvm_irq_level { - union { - __u32 irq; /* GSI */ - __s32 status; /* not used for KVM_IRQ_LEVEL */ - }; - __u32 level; /* 0 or 1 */ -}; - -4.26 KVM_GET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irqchip (in/out) -Returns: 0 on success, -1 on error - -Reads the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP into a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - -4.27 KVM_SET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irqchip (in) -Returns: 0 on success, -1 on error - -Sets the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP from a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - -4.28 KVM_XEN_HVM_CONFIG - -Capability: KVM_CAP_XEN_HVM -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_xen_hvm_config (in) -Returns: 0 on success, -1 on error - -Sets the MSR that the Xen HVM guest uses to initialize its hypercall -page, and provides the starting address and size of the hypercall -blobs in userspace. When the guest writes the MSR, kvm copies one -page of a blob (32- or 64-bit, depending on the vcpu mode) to guest -memory. - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; - -4.29 KVM_GET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (out) -Returns: 0 on success, -1 on error - -Gets the current timestamp of kvmclock as seen by the current guest. In -conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - -4.30 KVM_SET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (in) -Returns: 0 on success, -1 on error - -Sets the current timestamp of kvmclock to the value specified in its parameter. -In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - -4.31 KVM_GET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_vcpu_event (out) -Returns: 0 on success, -1 on error - -Gets currently pending exceptions, interrupts, and NMIs as well as related -states of the vcpu. - -struct kvm_vcpu_events { - struct { - __u8 injected; - __u8 nr; - __u8 has_error_code; - __u8 pad; - __u32 error_code; - } exception; - struct { - __u8 injected; - __u8 nr; - __u8 soft; - __u8 shadow; - } interrupt; - struct { - __u8 injected; - __u8 pending; - __u8 masked; - __u8 pad; - } nmi; - __u32 sipi_vector; - __u32 flags; -}; - -KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that -interrupt.shadow contains a valid state. Otherwise, this field is undefined. - -4.32 KVM_SET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_vcpu_event (in) -Returns: 0 on success, -1 on error - -Set pending exceptions, interrupts, and NMIs as well as related states of the -vcpu. - -See KVM_GET_VCPU_EVENTS for the data structure. - -Fields that may be modified asynchronously by running VCPUs can be excluded -from the update. These fields are nmi.pending and sipi_vector. Keep the -corresponding bits in the flags field cleared to suppress overwriting the -current in-kernel state. The bits are: - -KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel -KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector - -If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in -the flags field to signal that interrupt.shadow contains a valid state and -shall be written into the VCPU. - -4.33 KVM_GET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (out) -Returns: 0 on success, -1 on error - -Reads debug registers from the vcpu. - -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - -4.34 KVM_SET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (in) -Returns: 0 on success, -1 on error - -Writes debug registers into the vcpu. - -See KVM_GET_DEBUGREGS for the data structure. The flags field is unused -yet and must be cleared on entry. - -4.35 KVM_SET_USER_MEMORY_REGION - -Capability: KVM_CAP_USER_MEM -Architectures: all -Type: vm ioctl -Parameters: struct kvm_userspace_memory_region (in) -Returns: 0 on success, -1 on error - -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL - -This ioctl allows the user to create or modify a guest physical memory -slot. When changing an existing slot, it may be moved in the guest -physical memory space, or its flags may be modified. It may not be -resized. Slots may not overlap in guest physical address space. - -Memory for the region is taken starting at the address denoted by the -field userspace_addr, which must point at user addressable memory for -the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. - -It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr -be identical. This allows large pages in the guest to be backed by large -pages in the host. - -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which -instructs kvm to keep track of writes to memory within the slot. See -the KVM_GET_DIRTY_LOG ioctl. - -When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory -region are automatically reflected into the guest. For example, an mmap() -that affects the region will be made visible immediately. Another example -is madvise(MADV_DROP). - -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - -4.36 KVM_SET_TSS_ADDR - -Capability: KVM_CAP_SET_TSS_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long tss_address (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a three-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -4.37 KVM_ENABLE_CAP - -Capability: KVM_CAP_ENABLE_CAP -Architectures: ppc -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -+Not all extensions are enabled by default. Using this ioctl the application -can enable an extension, making it available to the guest. - -On systems that do not support this ioctl, it always fails. On systems that -do support it, it only works for extensions that are supported for enablement. - -To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should -be used. - -struct kvm_enable_cap { - /* in */ - __u32 cap; - -The capability that is supposed to get enabled. - - __u32 flags; - -A bitfield indicating future enhancements. Has to be 0 for now. - - __u64 args[4]; - -Arguments for enabling a feature. If a feature needs initial values to -function properly, this is the place to put them. - - __u8 pad[64]; -}; - -4.38 KVM_GET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, ia64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (out) -Returns: 0 on success; -1 on error - -struct kvm_mp_state { - __u32 mp_state; -}; - -Returns the vcpu's current "multiprocessing state" (though also valid on -uniprocessor guests). - -Possible values are: - - - KVM_MP_STATE_RUNNABLE: the vcpu is currently running - - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) - which has not yet received an INIT signal - - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is - now ready for a SIPI - - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and - is waiting for an interrupt - - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accessible via KVM_GET_VCPU_EVENTS) - -This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel -irqchip, the multiprocessing state must be maintained by userspace. - -4.39 KVM_SET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, ia64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (in) -Returns: 0 on success; -1 on error - -Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for -arguments. - -This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel -irqchip, the multiprocessing state must be maintained by userspace. - -4.40 KVM_SET_IDENTITY_MAP_ADDR - -Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long identity (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a one-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -4.41 KVM_SET_BOOT_CPU_ID - -Capability: KVM_CAP_SET_BOOT_CPU_ID -Architectures: x86, ia64 -Type: vm ioctl -Parameters: unsigned long vcpu_id -Returns: 0 on success, -1 on error - -Define which vcpu is the Bootstrap Processor (BSP). Values are the same -as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default -is vcpu 0. - -4.42 KVM_GET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (out) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy current vcpu's xsave struct to the userspace. - -4.43 KVM_SET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (in) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy userspace's xsave struct to the kernel. - -4.44 KVM_GET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (out) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would copy current vcpu's xcrs to the userspace. - -4.45 KVM_SET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (in) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would set vcpu's xcr to the value userspace specified. - -4.46 KVM_GET_SUPPORTED_CPUID - -Capability: KVM_CAP_EXT_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are supported by both the hardware -and kvm. Userspace can use the information returned by this ioctl to -construct cpuid information (for KVM_SET_CPUID2) that is consistent with -hardware, kernel, and userspace capabilities, and with user requirements (for -example, the user may wish to constrain cpuid to emulate older hardware, -or for feature consistency across a cluster). - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe the cpu -capabilities, an error (E2BIG) is returned. If the number is too high, -the 'nent' field is adjusted and an error (ENOMEM) is returned. If the -number is just right, the 'nent' field is adjusted to the number of valid -entries in the 'entries' array, which is then filled. - -The entries returned are the host cpuid as returned by the cpuid instruction, -with unknown or unsupported features masked out. Some features (for example, -x2apic), may not be present in the host cpu, but are exposed by kvm if it can -emulate them efficiently. The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -4.47 KVM_PPC_GET_PVINFO - -Capability: KVM_CAP_PPC_GET_PVINFO -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_pvinfo (out) -Returns: 0 on success, !0 on error - -struct kvm_ppc_pvinfo { - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -This ioctl fetches PV specific information that need to be passed to the guest -using the device tree or other means from vm context. - -For now the only implemented piece of information distributed here is an array -of 4 instructions that make up a hypercall. - -If any additional field gets added to this structure later on, a bit for that -additional piece of information will be set in the flags bitmap. - -4.48 KVM_ASSIGN_PCI_DEVICE - -Capability: KVM_CAP_DEVICE_ASSIGNMENT -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. -Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) - -4.49 KVM_DEASSIGN_PCI_DEVICE - -Capability: KVM_CAP_DEVICE_DEASSIGNMENT -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -4.50 KVM_ASSIGN_DEV_IRQ - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; - __u32 guest_irq; - __u32 flags; - union { - struct { - __u32 addr_lo; - __u32 addr_hi; - __u32 data; - } guest_msi; - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -4.51 KVM_DEASSIGN_DEV_IRQ - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - -4.52 KVM_SET_GSI_ROUTING - -Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_irq_routing (in) -Returns: 0 on success, -1 on error - -Sets the GSI routing table entries, overwriting any previously set entries. - -struct kvm_irq_routing { - __u32 nr; - __u32 flags; - struct kvm_irq_routing_entry entries[0]; -}; - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_entry { - __u32 gsi; - __u32 type; - __u32 flags; - __u32 pad; - union { - struct kvm_irq_routing_irqchip irqchip; - struct kvm_irq_routing_msi msi; - __u32 pad[8]; - } u; -}; - -/* gsi routing entry types */ -#define KVM_IRQ_ROUTING_IRQCHIP 1 -#define KVM_IRQ_ROUTING_MSI 2 - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_irqchip { - __u32 irqchip; - __u32 pin; -}; - -struct kvm_irq_routing_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - __u32 pad; -}; - -4.53 KVM_ASSIGN_SET_MSIX_NR - -Capability: KVM_CAP_DEVICE_MSIX -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. This service can -only be called once in the lifetime of an assigned device. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY - -Capability: KVM_CAP_DEVICE_MSIX -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -5. The kvm_run structure - -Application code obtains a pointer to the kvm_run structure by -mmap()ing a vcpu fd. From that point, application code can control -execution by changing fields in kvm_run prior to calling the KVM_RUN -ioctl, and obtain information about the reason KVM_RUN returned by -looking up structure members. - -struct kvm_run { - /* in */ - __u8 request_interrupt_window; - -Request that KVM_RUN return when it becomes possible to inject external -interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - - __u8 padding1[7]; - - /* out */ - __u32 exit_reason; - -When KVM_RUN has returned successfully (return value 0), this informs -application code why KVM_RUN has returned. Allowable values for this -field are detailed below. - - __u8 ready_for_interrupt_injection; - -If request_interrupt_window has been specified, this field indicates -an interrupt can be injected now with KVM_INTERRUPT. - - __u8 if_flag; - -The value of the current interrupt flag. Only valid if in-kernel -local APIC is not used. - - __u8 padding2[2]; - - /* in (pre_kvm_run), out (post_kvm_run) */ - __u64 cr8; - -The value of the cr8 register. Only valid if in-kernel local APIC is -not used. Both input and output. - - __u64 apic_base; - -The value of the APIC BASE msr. Only valid if in-kernel local -APIC is not used. Both input and output. - - union { - /* KVM_EXIT_UNKNOWN */ - struct { - __u64 hardware_exit_reason; - } hw; - -If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown -reasons. Further architecture-specific information is available in -hardware_exit_reason. - - /* KVM_EXIT_FAIL_ENTRY */ - struct { - __u64 hardware_entry_failure_reason; - } fail_entry; - -If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due -to unknown reasons. Further architecture-specific information is -available in hardware_entry_failure_reason. - - /* KVM_EXIT_EXCEPTION */ - struct { - __u32 exception; - __u32 error_code; - } ex; - -Unused. - - /* KVM_EXIT_IO */ - struct { -#define KVM_EXIT_IO_IN 0 -#define KVM_EXIT_IO_OUT 1 - __u8 direction; - __u8 size; /* bytes */ - __u16 port; - __u32 count; - __u64 data_offset; /* relative to kvm_run start */ - } io; - -If exit_reason is KVM_EXIT_IO, then the vcpu has -executed a port I/O instruction which could not be satisfied by kvm. -data_offset describes where the data is located (KVM_EXIT_IO_OUT) or -where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. - - struct { - struct kvm_debug_exit_arch arch; - } debug; - -Unused. - - /* KVM_EXIT_MMIO */ - struct { - __u64 phys_addr; - __u8 data[8]; - __u32 len; - __u8 is_write; - } mmio; - -If exit_reason is KVM_EXIT_MMIO, then the vcpu has -executed a memory-mapped I/O instruction which could not be satisfied -by kvm. The 'data' member contains the written data if 'is_write' is -true, and should be filled by application code otherwise. - -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. - - /* KVM_EXIT_HYPERCALL */ - struct { - __u64 nr; - __u64 args[6]; - __u64 ret; - __u32 longmode; - __u32 pad; - } hypercall; - -Unused. This was once used for 'hypercall to userspace'. To implement -such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). -Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. - - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; - -To be documented (KVM_TPR_ACCESS_REPORTING). - - /* KVM_EXIT_S390_SIEIC */ - struct { - __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ - __u16 ipa; - __u32 ipb; - } s390_sieic; - -s390 specific. - - /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 - __u64 s390_reset_flags; - -s390 specific. - - /* KVM_EXIT_DCR */ - struct { - __u32 dcrn; - __u32 data; - __u8 is_write; - } dcr; - -powerpc specific. - - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; - -MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch -hypercalls and exit with this exit struct that contains all the guest gprs. - -If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. -Userspace can now handle the hypercall and when it's done modify the gprs as -necessary. Upon guest entry all guest GPRs will then be replaced by the values -in this struct. - - /* Fix the size of the union. */ - char padding[256]; - }; -}; diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt deleted file mode 100644 index 8820685..0000000 --- a/Documentation/kvm/cpuid.txt +++ /dev/null @@ -1,45 +0,0 @@ -KVM CPUID bits -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -A guest running on a kvm host, can check some of its features using -cpuid. This is not always guaranteed to work, since userspace can -mask-out some, or even all KVM-related cpuid features before launching -a guest. - -KVM cpuid functions are: - -function: KVM_CPUID_SIGNATURE (0x40000000) -returns : eax = 0, - ebx = 0x4b4d564b, - ecx = 0x564b4d56, - edx = 0x4d. -Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". -This function queries the presence of KVM cpuid leafs. - - -function: define KVM_CPUID_FEATURES (0x40000001) -returns : ebx, ecx, edx = 0 - eax = and OR'ed group of (1 << flag), where each flags is: - - -flag || value || meaning -============================================================================= -KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs - || || 0x11 and 0x12. ------------------------------------------------------------------------------- -KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays - || || on PIO operations. ------------------------------------------------------------------------------- -KVM_FEATURE_MMU_OP || 2 || deprecated. ------------------------------------------------------------------------------- -KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs - || || 0x4b564d00 and 0x4b564d01 ------------------------------------------------------------------------------- -KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by - || || writing to msr 0x4b564d02 ------------------------------------------------------------------------------- -KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side - || || per-cpu warps are expected in - || || kvmclock. ------------------------------------------------------------------------------- diff --git a/Documentation/kvm/locking.txt b/Documentation/kvm/locking.txt deleted file mode 100644 index 3b4cd3b..0000000 --- a/Documentation/kvm/locking.txt +++ /dev/null @@ -1,25 +0,0 @@ -KVM Lock Overview -================= - -1. Acquisition Orders ---------------------- - -(to be written) - -2. Reference ------------- - -Name: kvm_lock -Type: raw_spinlock -Arch: any -Protects: - vm_list - - hardware virtualization enable/disable -Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. - -Name: kvm_arch::tsc_write_lock -Type: raw_spinlock -Arch: x86 -Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - - tsc offset in vmcb -Comment: 'raw' because updating the tsc offsets must not be preempted. diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt deleted file mode 100644 index f46aa58..0000000 --- a/Documentation/kvm/mmu.txt +++ /dev/null @@ -1,348 +0,0 @@ -The x86 kvm shadow mmu -====================== - -The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible -for presenting a standard x86 mmu to the guest, while translating guest -physical addresses to host physical addresses. - -The mmu code attempts to satisfy the following requirements: - -- correctness: the guest should not be able to determine that it is running - on an emulated mmu except for timing (we attempt to comply - with the specification, not emulate the characteristics of - a particular implementation such as tlb size) -- security: the guest must not be able to touch host memory not assigned - to it -- performance: minimize the performance penalty imposed by the mmu -- scaling: need to scale to large memory and large vcpu guests -- hardware: support the full range of x86 virtualization hardware -- integration: Linux memory management code must be in control of guest memory - so that swapping, page migration, page merging, transparent - hugepages, and similar features work without change -- dirty tracking: report writes to guest memory to enable live migration - and framebuffer-based displays -- footprint: keep the amount of pinned kernel memory low (most memory - should be shrinkable) -- reliability: avoid multipage or GFP_ATOMIC allocations - -Acronyms -======== - -pfn host page frame number -hpa host physical address -hva host virtual address -gfn guest frame number -gpa guest physical address -gva guest virtual address -ngpa nested guest physical address -ngva nested guest virtual address -pte page table entry (used also to refer generically to paging structure - entries) -gpte guest pte (referring to gfns) -spte shadow pte (referring to pfns) -tdp two dimensional paging (vendor neutral term for NPT and EPT) - -Virtual and real hardware supported -=================================== - -The mmu supports first-generation mmu hardware, which allows an atomic switch -of the current paging mode and cr3 during guest entry, as well as -two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware -it exposes is the traditional 2/3/4 level x86 mmu, with support for global -pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support -exposing NPT capable hardware on NPT capable hosts. - -Translation -=========== - -The primary job of the mmu is to program the processor's mmu to translate -addresses for the guest. Different translations are required at different -times: - -- when guest paging is disabled, we translate guest physical addresses to - host physical addresses (gpa->hpa) -- when guest paging is enabled, we translate guest virtual addresses, to - guest physical addresses, to host physical addresses (gva->gpa->hpa) -- when the guest launches a guest of its own, we translate nested guest - virtual addresses, to nested guest physical addresses, to guest physical - addresses, to host physical addresses (ngva->ngpa->gpa->hpa) - -The primary challenge is to encode between 1 and 3 translations into hardware -that support only 1 (traditional) and 2 (tdp) translations. When the -number of required translations matches the hardware, the mmu operates in -direct mode; otherwise it operates in shadow mode (see below). - -Memory -====== - -Guest memory (gpa) is part of the user address space of the process that is -using kvm. Userspace defines the translation between guest addresses and user -addresses (gpa->hva); note that two gpas may alias to the same hva, but not -vice versa. - -These hvas may be backed using any method available to the host: anonymous -memory, file backed memory, and device memory. Memory might be paged by the -host at any time. - -Events -====== - -The mmu is driven by events, some from the guest, some from the host. - -Guest generated events: -- writes to control registers (especially cr3) -- invlpg/invlpga instruction execution -- access to missing or protected translations - -Host generated events: -- changes in the gpa->hpa translation (either through gpa->hva changes or - through hva->hpa changes) -- memory pressure (the shrinker) - -Shadow pages -============ - -The principal data structure is the shadow page, 'struct kvm_mmu_page'. A -shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A -shadow page may contain a mix of leaf and nonleaf sptes. - -A nonleaf spte allows the hardware mmu to reach the leaf pages and -is not related to a translation directly. It points to other shadow pages. - -A leaf spte corresponds to either one or two translations encoded into -one paging structure entry. These are always the lowest level of the -translation stack, with optional higher level translations left to NPT/EPT. -Leaf ptes point at guest pages. - -The following table shows translations encoded by leaf ptes, with higher-level -translations in parentheses: - - Non-nested guests: - nonpaging: gpa->hpa - paging: gva->gpa->hpa - paging, tdp: (gva->)gpa->hpa - Nested guests: - non-tdp: ngva->gpa->hpa (*) - tdp: (ngva->)ngpa->gpa->hpa - -(*) the guest hypervisor will encode the ngva->gpa translation into its page - tables if npt is not present - -Shadow pages contain the following information: - role.level: - The level in the shadow paging hierarchy that this shadow page belongs to. - 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. - role.direct: - If set, leaf sptes reachable from this page are for a linear range. - Examples include real mode translation, large guest pages backed by small - host pages, and gpa->hpa translations when NPT or EPT is active. - The linear range starts at (gfn << PAGE_SHIFT) and its size is determined - by role.level (2MB for first level, 1GB for second level, 0.5TB for third - level, 256TB for fourth level) - If clear, this page corresponds to a guest page table denoted by the gfn - field. - role.quadrant: - When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit - sptes. That means a guest page table contains more ptes than the host, - so multiple shadow pages are needed to shadow one guest page. - For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the - first or second 512-gpte block in the guest page table. For second-level - page tables, each 32-bit gpte is converted to two 64-bit sptes - (since each first-level guest page is shadowed by two first-level - shadow pages) so role.quadrant takes values in the range 0..3. Each - quadrant maps 1GB virtual address space. - role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. - role.invalid: - The page is invalid and should not be used. It is a root page that is - currently pinned (by a cpu hardware register pointing to it); once it is - unpinned it will be destroyed. - role.cr4_pae: - Contains the value of cr4.pae for which the page is valid (e.g. whether - 32-bit or 64-bit gptes are in use). - role.nxe: - Contains the value of efer.nxe for which the page is valid. - role.cr0_wp: - Contains the value of cr0.wp for which the page is valid. - gfn: - Either the guest page table containing the translations shadowed by this - page, or the base page frame for linear translations. See role.direct. - spt: - A pageful of 64-bit sptes containing the translations for this page. - Accessed by both kvm and hardware. - The page pointed to by spt will have its page->private pointing back - at the shadow page structure. - sptes in spt point either at guest pages, or at lower-level shadow pages. - Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point - at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. - The spt array forms a DAG structure with the shadow page as a node, and - guest pages as leaves. - gfns: - An array of 512 guest frame numbers, one for each present pte. Used to - perform a reverse map from a pte to a gfn. When role.direct is set, any - element of this array can be calculated from the gfn field when used, in - this case, the array of gfns is not allocated. See role.direct and gfn. - slot_bitmap: - A bitmap containing one bit per memory slot. If the page contains a pte - mapping a page from memory slot n, then bit n of slot_bitmap will be set - (if a page is aliased among several slots, then it is not guaranteed that - all slots will be marked). - Used during dirty logging to avoid scanning a shadow page if none if its - pages need tracking. - root_count: - A counter keeping track of how many hardware registers (guest cr3 or - pdptrs) are now pointing at the page. While this counter is nonzero, the - page cannot be destroyed. See role.invalid. - multimapped: - Whether there exist multiple sptes pointing at this page. - parent_pte/parent_ptes: - If multimapped is zero, parent_pte points at the single spte that points at - this page's spt. Otherwise, parent_ptes points at a data structure - with a list of parent_ptes. - unsync: - If true, then the translations in this page may not match the guest's - translation. This is equivalent to the state of the tlb when a pte is - changed but before the tlb entry is flushed. Accordingly, unsync ptes - are synchronized when the guest executes invlpg or flushes its tlb by - other means. Valid for leaf pages. - unsync_children: - How many sptes in the page point at pages that are unsync (or have - unsynchronized children). - unsync_child_bitmap: - A bitmap indicating which sptes in spt point (directly or indirectly) at - pages that may be unsynchronized. Used to quickly locate all unsychronized - pages reachable from a given page. - -Reverse map -=========== - -The mmu maintains a reverse mapping whereby all ptes mapping a page can be -reached given its gfn. This is used, for example, when swapping out a page. - -Synchronized and unsynchronized pages -===================================== - -The guest uses two events to synchronize its tlb and page tables: tlb flushes -and page invalidations (invlpg). - -A tlb flush means that we need to synchronize all sptes reachable from the -guest's cr3. This is expensive, so we keep all guest page tables write -protected, and synchronize sptes to gptes when a gpte is written. - -A special case is when a guest page table is reachable from the current -guest cr3. In this case, the guest is obliged to issue an invlpg instruction -before using the translation. We take advantage of that by removing write -protection from the guest page, and allowing the guest to modify it freely. -We synchronize modified gptes when the guest invokes invlpg. This reduces -the amount of emulation we have to do when the guest modifies multiple gptes, -or when the a guest page is no longer used as a page table and is used for -random guest data. - -As a side effect we have to resynchronize all reachable unsynchronized shadow -pages on a tlb flush. - - -Reaction to events -================== - -- guest page fault (or npt page fault, or ept violation) - -This is the most complicated event. The cause of a page fault can be: - - - a true guest fault (the guest translation won't allow the access) (*) - - access to a missing translation - - access to a protected translation - - when logging dirty pages, memory is write protected - - synchronized shadow pages are write protected (*) - - access to untranslatable memory (mmio) - - (*) not applicable in direct mode - -Handling a page fault is performed as follows: - - - if needed, walk the guest page tables to determine the guest translation - (gva->gpa or ngpa->gpa) - - if permissions are insufficient, reflect the fault back to the guest - - determine the host page - - if this is an mmio request, there is no host page; call the emulator - to emulate the instruction instead - - walk the shadow page table to find the spte for the translation, - instantiating missing intermediate page tables as necessary - - try to unsynchronize the page - - if successful, we can let the guest continue and modify the gpte - - emulate the instruction - - if failed, unshadow the page and let the guest continue - - update any translations that were modified by the instruction - -invlpg handling: - - - walk the shadow page hierarchy and drop affected translations - - try to reinstantiate the indicated translation in the hope that the - guest will use it in the near future - -Guest control register updates: - -- mov to cr3 - - look up new shadow roots - - synchronize newly reachable shadow pages - -- mov to cr0/cr4/efer - - set up mmu context for new paging mode - - look up new shadow roots - - synchronize newly reachable shadow pages - -Host translation updates: - - - mmu notifier called with updated hva - - look up affected sptes through reverse map - - drop (or update) translations - -Emulating cr0.wp -================ - -If tdp is not enabled, the host must keep cr0.wp=1 so page write protection -works for the guest kernel, not guest guest userspace. When the guest -cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, -we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the -semantics require allowing any guest kernel access plus user read access). - -We handle this by mapping the permissions to two possible sptes, depending -on fault type: - -- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, - disallows user access) -- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel - write access) - -(user write faults generate a #PF) - -Large pages -=========== - -The mmu supports all combinations of large and small guest and host pages. -Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as -two separate 2M pages, on both guest and host, since the mmu always uses PAE -paging. - -To instantiate a large spte, four constraints must be satisfied: - -- the spte must point to a large host page -- the guest pte must be a large pte of at least equivalent size (if tdp is - enabled, there is no guest pte and this condition is satisified) -- if the spte will be writeable, the large page frame may not overlap any - write-protected pages -- the guest page must be wholly contained by a single memory slot - -To check the last two conditions, the mmu maintains a ->write_count set of -arrays for each memory slot and large page size. Every write protected page -causes its write_count to be incremented, thus preventing instantiation of -a large spte. The frames at the end of an unaligned memory slot have -artificically inflated ->write_counts so they can never be instantiated. - -Further reading -=============== - -- NPT presentation from KVM Forum 2008 - http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf - diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt deleted file mode 100644 index d079aed..0000000 --- a/Documentation/kvm/msr.txt +++ /dev/null @@ -1,187 +0,0 @@ -KVM-specific MSRs. -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -KVM makes use of some custom MSRs to service some requests. - -Custom MSRs have a range reserved for them, that goes from -0x4b564d00 to 0x4b564dff. There are MSRs outside this area, -but they are deprecated and their use is discouraged. - -Custom MSR list --------- - -The current supported Custom MSR list is: - -MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 - - data: 4-byte alignment physical address of a memory area which must be - in guest RAM. This memory is expected to hold a copy of the following - structure: - - struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - } __attribute__((__packed__)); - - whose data will be filled in by the hypervisor. The hypervisor is only - guaranteed to update this data at the moment of MSR write. - Users that want to reliably query this information more than once have - to write more than once to this MSR. Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - sec: number of seconds for wallclock. - - nsec: number of nanoseconds for wallclock. - - Note that although MSRs are per-CPU entities, the effect of this - particular MSR is global. - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 - - data: 4-byte aligned physical address of a memory area which must be in - guest RAM, plus an enable bit in bit 0. This memory is expected to hold - a copy of the following structure: - - struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; - } __attribute__((__packed__)); /* 32 bytes */ - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. - - Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - tsc_timestamp: the tsc value at the current VCPU at the time - of the update of this structure. Guests can subtract this value - from current tsc to derive a notion of elapsed time since the - structure update. - - system_time: a host notion of monotonic time, including sleep - time at the time this structure was last updated. Unit is - nanoseconds. - - tsc_to_system_mul: a function of the tsc frequency. One has - to multiply any tsc-related quantity by this value to get - a value in nanoseconds, besides dividing by 2^tsc_shift - - tsc_shift: cycle to nanosecond divider, as a power of two, to - allow for shift rights. One has to shift right any tsc-related - quantity by this value to get a value in nanoseconds, besides - multiplying by tsc_to_system_mul. - - With this information, guests can derive per-CPU time by - doing: - - time = (current_tsc - tsc_timestamp) - time = (time * tsc_to_system_mul) >> tsc_shift - time = time + system_time - - flags: bits in this field indicate extended capabilities - coordinated between the guest and the hypervisor. Availability - of specific flags has to be checked in 0x40000001 cpuid leaf. - Current flags are: - - flag bit | cpuid bit | meaning - ------------------------------------------------------------- - | | time measures taken across - 0 | 24 | multiple cpus are guaranteed to - | | be monotonic - ------------------------------------------------------------- - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - - -MSR_KVM_WALL_CLOCK: 0x11 - - data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME: 0x12 - - data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - - The suggested algorithm for detecting kvmclock presence is then: - - if (!kvm_para_available()) /* refer to cpuid.txt */ - return NON_PRESENT; - - flags = cpuid_eax(0x40000001); - if (flags & 3) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - return PRESENT; - } else if (flags & 0) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; - return PRESENT; - } else - return NON_PRESENT; - -MSR_KVM_ASYNC_PF_EN: 0x4b564d02 - data: Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu 0 when - disabled. Bit 2 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. - - First 4 byte of 64 byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. - - If APF is disabled while there are outstanding APFs, they will - not be delivered. - - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt deleted file mode 100644 index 3ab969c..0000000 --- a/Documentation/kvm/ppc-pv.txt +++ /dev/null @@ -1,196 +0,0 @@ -The PPC KVM paravirtual interface -================================= - -The basic execution principle by which KVM on PowerPC works is to run all kernel -space code in PR=1 which is user space. This way we trap all privileged -instructions and can emulate them accordingly. - -Unfortunately that is also the downfall. There are quite some privileged -instructions that needlessly return us to the hypervisor even though they -could be handled differently. - -This is what the PPC PV interface helps with. It takes privileged instructions -and transforms them into unprivileged ones with some help from the hypervisor. -This cuts down virtualization costs by about 50% on some of my benchmarks. - -The code for that interface can be found in arch/powerpc/kernel/kvm* - -Querying for existence -====================== - -To find out if we're running on KVM or not, we leverage the device tree. When -Linux is running on KVM, a node /hypervisor exists. That node contains a -compatible property with the value "linux,kvm". - -Once you determined you're running under a PV capable KVM, you can now use -hypercalls as described below. - -KVM hypercalls -============== - -Inside the device tree's /hypervisor node there's a property called -'hypercall-instructions'. This property contains at most 4 opcodes that make -up the hypercall. To call a hypercall, just call these instructions. - -The parameters are as follows: - - Register IN OUT - - r0 - volatile - r3 1st parameter Return code - r4 2nd parameter 1st output value - r5 3rd parameter 2nd output value - r6 4th parameter 3rd output value - r7 5th parameter 4th output value - r8 6th parameter 5th output value - r9 7th parameter 6th output value - r10 8th parameter 7th output value - r11 hypercall number 8th output value - r12 - volatile - -Hypercall definitions are shared in generic code, so the same hypercall numbers -apply for x86 and powerpc alike with the exception that each KVM hypercall -also needs to be ORed with the KVM vendor code which is (42 << 16). - -Return codes can be as follows: - - Code Meaning - - 0 Success - 12 Hypercall not implemented - <0 Error - -The magic page -============== - -To enable communication between the hypervisor and guest there is a new shared -page that contains parts of supervisor visible register state. The guest can -map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. - -With this hypercall issued the guest always gets the magic page mapped at the -desired location in effective and physical address space. For now, we always -map the page to -4096. This way we can access it using absolute load and store -functions. The following instruction reads the first field of the magic page: - - ld rX, -4096(0) - -The interface is designed to be extensible should there be need later to add -additional registers to the magic page. If you add fields to the magic page, -also define a new hypercall feature to indicate that the host can give you more -registers. Only if the host supports the additional features, make use of them. - -The magic page has the following layout as described in -arch/powerpc/include/asm/kvm_para.h: - -struct kvm_vcpu_arch_shared { - __u64 scratch1; - __u64 scratch2; - __u64 scratch3; - __u64 critical; /* Guest may not get interrupts if == r1 */ - __u64 sprg0; - __u64 sprg1; - __u64 sprg2; - __u64 sprg3; - __u64 srr0; - __u64 srr1; - __u64 dar; - __u64 msr; - __u32 dsisr; - __u32 int_pending; /* Tells the guest if we have an interrupt */ -}; - -Additions to the page must only occur at the end. Struct fields are always 32 -or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. - -Magic page features -=================== - -When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, -a second return value is passed to the guest. This second return value contains -a bitmap of available features inside the magic page. - -The following enhancements to the magic page are currently available: - - KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page - -For enhanced features in the magic page, please check for the existence of the -feature before using them! - -MSR bits -======== - -The MSR contains bits that require hypervisor intervention and bits that do -not require direct hypervisor intervention because they only get interpreted -when entering the guest or don't have any impact on the hypervisor's behavior. - -The following bits are safe to be set inside the guest: - - MSR_EE - MSR_RI - MSR_CR - MSR_ME - -If any other bit changes in the MSR, please still use mtmsr(d). - -Patched instructions -==================== - -The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions -respectively on 32 bit systems with an added offset of 4 to accommodate for big -endianness. - -The following is a list of mapping the Linux kernel performs when running as -guest. Implementing any of those mappings is optional, as the instruction traps -also act on the shared page. So calling privileged instructions still works as -before. - -From To -==== == - -mfmsr rX ld rX, magic_page->msr -mfsprg rX, 0 ld rX, magic_page->sprg0 -mfsprg rX, 1 ld rX, magic_page->sprg1 -mfsprg rX, 2 ld rX, magic_page->sprg2 -mfsprg rX, 3 ld rX, magic_page->sprg3 -mfsrr0 rX ld rX, magic_page->srr0 -mfsrr1 rX ld rX, magic_page->srr1 -mfdar rX ld rX, magic_page->dar -mfdsisr rX lwz rX, magic_page->dsisr - -mtmsr rX std rX, magic_page->msr -mtsprg 0, rX std rX, magic_page->sprg0 -mtsprg 1, rX std rX, magic_page->sprg1 -mtsprg 2, rX std rX, magic_page->sprg2 -mtsprg 3, rX std rX, magic_page->sprg3 -mtsrr0 rX std rX, magic_page->srr0 -mtsrr1 rX std rX, magic_page->srr1 -mtdar rX std rX, magic_page->dar -mtdsisr rX stw rX, magic_page->dsisr - -tlbsync nop - -mtmsrd rX, 0 b -mtmsr rX b - -mtmsrd rX, 1 b - -[Book3S only] -mtsrin rX, rY b - -[BookE only] -wrteei [0|1] b - - -Some instructions require more logic to determine what's going on than a load -or store instruction can deliver. To enable patching of those, we keep some -RAM around where we can live translate instructions to. What happens is the -following: - - 1) copy emulation code to memory - 2) patch that code to fit the emulated instruction - 3) patch that code to return to the original pc + 4 - 4) patch the original instruction to branch to the new code - -That way we can inject an arbitrary amount of code as replacement for a single -instruction. This allows us to check for pending interrupts when setting EE=1 -for example. diff --git a/Documentation/kvm/review-checklist.txt b/Documentation/kvm/review-checklist.txt deleted file mode 100644 index 730475a..0000000 --- a/Documentation/kvm/review-checklist.txt +++ /dev/null @@ -1,38 +0,0 @@ -Review checklist for kvm patches -================================ - -1. The patch must follow Documentation/CodingStyle and - Documentation/SubmittingPatches. - -2. Patches should be against kvm.git master branch. - -3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/kvm/api.txt - - the API must be discoverable using KVM_CHECK_EXTENSION - -4. New state must include support for save/restore. - -5. New features must default to off (userspace should explicitly request them). - Performance improvements can and should default to on. - -6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 - -7. Emulator changes should be accompanied by unit tests for qemu-kvm.git - kvm/test directory. - -8. Changes should be vendor neutral when possible. Changes to common code - are better than duplicating changes to vendor code. - -9. Similarly, prefer changes to arch independent code than to arch dependent - code. - -10. User/kernel interfaces and guest/host interfaces must be 64-bit clean - (all variables and sizes naturally aligned on 64-bit; use specific types - only - u64 rather than ulong). - -11. New guest visible features must either be documented in a hardware manual - or be accompanied by documentation. - -12. Features must be robust against reset and kexec - for example, shared - host/guest memory must be unshared to prevent the host from writing to - guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt deleted file mode 100644 index df89463..0000000 --- a/Documentation/kvm/timekeeping.txt +++ /dev/null @@ -1,612 +0,0 @@ - - Timekeeping Virtualization for X86-Based Architectures - - Zachary Amsden - Copyright (c) 2010, Red Hat. All rights reserved. - -1) Overview -2) Timing Devices -3) TSC Hardware -4) Virtualization Problems - -========================================================================= - -1) Overview - -One of the most complicated parts of the X86 platform, and specifically, -the virtualization of this platform is the plethora of timing devices available -and the complexity of emulating those devices. In addition, virtualization of -time introduces a new set of challenges because it introduces a multiplexed -division of time beyond the control of the guest CPU. - -First, we will describe the various timekeeping hardware available, then -present some of the problems which arise and solutions available, giving -specific recommendations for certain classes of KVM guests. - -The purpose of this document is to collect data and information relevant to -timekeeping which may be difficult to find elsewhere, specifically, -information relevant to KVM and hardware-based virtualization. - -========================================================================= - -2) Timing Devices - -First we discuss the basic hardware devices available. TSC and the related -KVM clock are special enough to warrant a full exposition and are described in -the following section. - -2.1) i8254 - PIT - -One of the first timer devices available is the programmable interrupt timer, -or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three -channels which can be programmed to deliver periodic or one-shot interrupts. -These three channels can be configured in different modes and have individual -counters. Channel 1 and 2 were not available for general use in the original -IBM PC, and historically were connected to control RAM refresh and the PC -speaker. Now the PIT is typically integrated as part of an emulated chipset -and a separate physical PIT is not used. - -The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done -using single or multiple byte access to the I/O ports. There are 6 modes -available, but not all modes are available to all timers, as only timer 2 -has a connected gate input, required for modes 1 and 5. The gate line is -controlled by port 61h, bit 0, as illustrated in the following diagram. - - -------------- ---------------- -| | | | -| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 -| Clock | | | | - -------------- | +->| GATE TIMER 0 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM - | | | (aka /dev/null) - | +->| GATE TIMER 1 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> Port 61h, bit 5 - | | | -Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ - ---------------- _| )--|LPF|---Speaker - / *---- \___/ -Port 61h, bit 1 -----------------------------------/ - -The timer modes are now described. - -Mode 0: Single Timeout. This is a one-shot software timeout that counts down - when the gate is high (always true for timers 0 and 1). When the count - reaches zero, the output goes high. - -Mode 1: Triggered One-shot. The output is initially set high. When the gate - line is set high, a countdown is initiated (which does not stop if the gate is - lowered), during which the output is set low. When the count reaches zero, - the output goes high. - -Mode 2: Rate Generator. The output is initially set high. When the countdown - reaches 1, the output goes low for one count and then returns high. The value - is reloaded and the countdown automatically resumes. If the gate line goes - low, the count is halted. If the output is low when the gate is lowered, the - output automatically goes high (this only affects timer 2). - -Mode 3: Square Wave. This generates a high / low square wave. The count - determines the length of the pulse, which alternates between high and low - when zero is reached. The count only proceeds when gate is high and is - automatically reloaded on reaching zero. The count is decremented twice at - each clock to generate a full high / low cycle at the full periodic rate. - If the count is even, the clock remains high for N/2 counts and low for N/2 - counts; if the clock is odd, the clock is high for (N+1)/2 counts and low - for (N-1)/2 counts. Only even values are latched by the counter, so odd - values are not observed when reading. This is the intended mode for timer 2, - which generates sine-like tones by low-pass filtering the square wave output. - -Mode 4: Software Strobe. After programming this mode and loading the counter, - the output remains high until the counter reaches zero. Then the output - goes low for 1 clock cycle and returns high. The counter is not reloaded. - Counting only occurs when gate is high. - -Mode 5: Hardware Strobe. After programming and loading the counter, the - output remains high. When the gate is raised, a countdown is initiated - (which does not stop if the gate is lowered). When the counter reaches zero, - the output goes low for 1 clock cycle and then returns high. The counter is - not reloaded. - -In addition to normal binary counting, the PIT supports BCD counting. The -command port, 0x43 is used to set the counter and mode for each of the three -timers. - -PIT commands, issued to port 0x43, using the following bit encoding: - -Bit 7-4: Command (See table below) -Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) -Bit 0 : Binary (0) / BCD (1) - -Command table: - -0000 - Latch Timer 0 count for port 0x40 - sample and hold the count to be read in port 0x40; - additional commands ignored until counter is read; - mode bits ignored. - -0001 - Set Timer 0 LSB mode for port 0x40 - set timer to read LSB only and force MSB to zero; - mode bits set timer mode - -0010 - Set Timer 0 MSB mode for port 0x40 - set timer to read MSB only and force LSB to zero; - mode bits set timer mode - -0011 - Set Timer 0 16-bit mode for port 0x40 - set timer to read / write LSB first, then MSB; - mode bits set timer mode - -0100 - Latch Timer 1 count for port 0x41 - as described above -0101 - Set Timer 1 LSB mode for port 0x41 - as described above -0110 - Set Timer 1 MSB mode for port 0x41 - as described above -0111 - Set Timer 1 16-bit mode for port 0x41 - as described above - -1000 - Latch Timer 2 count for port 0x42 - as described above -1001 - Set Timer 2 LSB mode for port 0x42 - as described above -1010 - Set Timer 2 MSB mode for port 0x42 - as described above -1011 - Set Timer 2 16-bit mode for port 0x42 as described above - -1101 - General counter latch - Latch combination of counters into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - Bit 0 = Unused - -1110 - Latch timer status - Latch combination of counter mode into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - - The output of ports 0x40-0x42 following this command will be: - - Bit 7 = Output pin - Bit 6 = Count loaded (0 if timer has expired) - Bit 5-4 = Read / Write mode - 01 = MSB only - 10 = LSB only - 11 = LSB / MSB (16-bit) - Bit 3-1 = Mode - Bit 0 = Binary (0) / BCD mode (1) - -2.2) RTC - -The second device which was available in the original PC was the MC146818 real -time clock. The original device is now obsolete, and usually emulated by the -system chipset, sometimes by an HPET and some frankenstein IRQ routing. - -The RTC is accessed through CMOS variables, which uses an index register to -control which bytes are read. Since there is only one index register, read -of the CMOS and read of the RTC require lock protection (in addition, it is -dangerous to allow userspace utilities such as hwclock to have direct RTC -access, as they could corrupt kernel reads and writes of CMOS memory). - -The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt -can function as a periodic timer, an additional once a day alarm, and can issue -interrupts after an update of the CMOS registers by the MC146818 is complete. -The type of interrupt is signalled in the RTC status registers. - -The RTC will update the current time fields by battery power even while the -system is off. The current time fields should not be read while an update is -in progress, as indicated in the status register. - -The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be -programmed to a 32kHz divider if the RTC is to count seconds. - -This is the RAM map originally used for the RTC/CMOS: - -Location Size Description ------------------------------------------- -00h byte Current second (BCD) -01h byte Seconds alarm (BCD) -02h byte Current minute (BCD) -03h byte Minutes alarm (BCD) -04h byte Current hour (BCD) -05h byte Hours alarm (BCD) -06h byte Current day of week (BCD) -07h byte Current day of month (BCD) -08h byte Current month (BCD) -09h byte Current year (BCD) -0Ah byte Register A - bit 7 = Update in progress - bit 6-4 = Divider for clock - 000 = 4.194 MHz - 001 = 1.049 MHz - 010 = 32 kHz - 10X = test modes - 110 = reset / disable - 111 = reset / disable - bit 3-0 = Rate selection for periodic interrupt - 000 = periodic timer disabled - 001 = 3.90625 uS - 010 = 7.8125 uS - 011 = .122070 mS - 100 = .244141 mS - ... - 1101 = 125 mS - 1110 = 250 mS - 1111 = 500 mS -0Bh byte Register B - bit 7 = Run (0) / Halt (1) - bit 6 = Periodic interrupt enable - bit 5 = Alarm interrupt enable - bit 4 = Update-ended interrupt enable - bit 3 = Square wave interrupt enable - bit 2 = BCD calendar (0) / Binary (1) - bit 1 = 12-hour mode (0) / 24-hour mode (1) - bit 0 = 0 (DST off) / 1 (DST enabled) -OCh byte Register C (read only) - bit 7 = interrupt request flag (IRQF) - bit 6 = periodic interrupt flag (PF) - bit 5 = alarm interrupt flag (AF) - bit 4 = update interrupt flag (UF) - bit 3-0 = reserved -ODh byte Register D (read only) - bit 7 = RTC has power - bit 6-0 = reserved -32h byte Current century BCD (*) - (*) location vendor specific and now determined from ACPI global tables - -2.3) APIC - -On Pentium and later processors, an on-board timer is available to each CPU -as part of the Advanced Programmable Interrupt Controller. The APIC is -accessed through memory-mapped registers and provides interrupt service to each -CPU, used for IPIs and local timer interrupts. - -Although in theory the APIC is a safe and stable source for local interrupts, -in practice, many bugs and glitches have occurred due to the special nature of -the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect -the use of the APIC and that workarounds may be required. In addition, some of -these workarounds pose unique constraints for virtualization - requiring either -extra overhead incurred from extra reads of memory-mapped I/O or additional -functionality that may be more computationally expensive to implement. - -Since the APIC is documented quite well in the Intel and AMD manuals, we will -avoid repetition of the detail here. It should be pointed out that the APIC -timer is programmed through the LVT (local vector timer) register, is capable -of one-shot or periodic operation, and is based on the bus clock divided down -by the programmable divider register. - -2.4) HPET - -HPET is quite complex, and was originally intended to replace the PIT / RTC -support of the X86 PC. It remains to be seen whether that will be the case, as -the de facto standard of PC hardware is to emulate these older devices. Some -systems designated as legacy free may support only the HPET as a hardware timer -device. - -The HPET spec is rather loose and vague, requiring at least 3 hardware timers, -but allowing implementation freedom to support many more. It also imposes no -fixed rate on the timer frequency, but does impose some extremal values on -frequency, error and slew. - -In general, the HPET is recommended as a high precision (compared to PIT /RTC) -time source which is independent of local variation (as there is only one HPET -in any given system). The HPET is also memory-mapped, and its presence is -indicated through ACPI tables by the BIOS. - -Detailed specification of the HPET is beyond the current scope of this -document, as it is also very well documented elsewhere. - -2.5) Offboard Timers - -Several cards, both proprietary (watchdog boards) and commonplace (e1000) have -timing chips built into the cards which may have registers which are accessible -to kernel or user drivers. To the author's knowledge, using these to generate -a clocksource for a Linux or other kernel has not yet been attempted and is in -general frowned upon as not playing by the agreed rules of the game. Such a -timer device would require additional support to be virtualized properly and is -not considered important at this time as no known operating system does this. - -========================================================================= - -3) TSC Hardware - -The TSC or time stamp counter is relatively simple in theory; it counts -instruction cycles issued by the processor, which can be used as a measure of -time. In practice, due to a number of problems, it is the most complicated -timekeeping device to use. - -The TSC is represented internally as a 64-bit MSR which can be read with the -RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware -limitations made it possible to write the TSC, but generally on old hardware it -was only possible to write the low 32-bits of the 64-bit counter, and the upper -32-bits of the counter were cleared. Now, however, on Intel processors family -0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction -has been lifted and all 64-bits are writable. On AMD systems, the ability to -write the TSC MSR is not an architectural guarantee. - -The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by -means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. - -Some vendors have implemented an additional instruction, RDTSCP, which returns -atomically not just the TSC, but an indicator which corresponds to the -processor number. This can be used to index into an array of TSC variables to -determine offset information in SMP systems where TSCs are not synchronized. -The presence of this instruction must be determined by consulting CPUID feature -bits. - -Both VMX and SVM provide extension fields in the virtualization hardware which -allows the guest visible TSC to be offset by a constant. Newer implementations -promise to allow the TSC to additionally be scaled, but this hardware is not -yet widely available. - -3.1) TSC synchronization - -The TSC is a CPU-local clock in most implementations. This means, on SMP -platforms, the TSCs of different CPUs may start at different times depending -on when the CPUs are powered on. Generally, CPUs on the same die will share -the same clock, however, this is not always the case. - -The BIOS may attempt to resynchronize the TSCs during the poweron process and -the operating system or other system software may attempt to do this as well. -Several hardware limitations make the problem worse - if it is not possible to -write the full 64-bits of the TSC, it may be impossible to match the TSC in -newly arriving CPUs to that of the rest of the system, resulting in -unsynchronized TSCs. This may be done by BIOS or system software, but in -practice, getting a perfectly synchronized TSC will not be possible unless all -values are read from the same clock, which generally only is possible on single -socket systems or those with special hardware support. - -3.2) TSC and CPU hotplug - -As touched on already, CPUs which arrive later than the boot time of the system -may not have a TSC value that is synchronized with the rest of the system. -Either system software, BIOS, or SMM code may actually try to establish the TSC -to a value matching the rest of the system, but a perfect match is usually not -a guarantee. This can have the effect of bringing a system from a state where -TSC is synchronized back to a state where TSC synchronization flaws, however -small, may be exposed to the OS and any virtualization environment. - -3.3) TSC and multi-socket / NUMA - -Multi-socket systems, especially large multi-socket systems are likely to have -individual clocksources rather than a single, universally distributed clock. -Since these clocks are driven by different crystals, they will not have -perfectly matched frequency, and temperature and electrical variations will -cause the CPU clocks, and thus the TSCs to drift over time. Depending on the -exact clock and bus design, the drift may or may not be fixed in absolute -error, and may accumulate over time. - -In addition, very large systems may deliberately slew the clocks of individual -cores. This technique, known as spread-spectrum clocking, reduces EMI at the -clock frequency and harmonics of it, which may be required to pass FCC -standards for telecommunications and computer equipment. - -It is recommended not to trust the TSCs to remain synchronized on NUMA or -multiple socket systems for these reasons. - -3.4) TSC and C-states - -C-states, or idling states of the processor, especially C1E and deeper sleep -states may be problematic for TSC as well. The TSC may stop advancing in such -a state, resulting in a TSC which is behind that of other CPUs when execution -is resumed. Such CPUs must be detected and flagged by the operating system -based on CPU and chipset identifications. - -The TSC in such a case may be corrected by catching it up to a known external -clocksource. - -3.5) TSC frequency change / P-states - -To make things slightly more interesting, some CPUs may change frequency. They -may or may not run the TSC at the same rate, and because the frequency change -may be staggered or slewed, at some points in time, the TSC rate may not be -known other than falling within a range of values. In this case, the TSC will -not be a stable time source, and must be calibrated against a known, stable, -external clock to be a usable source of time. - -Whether the TSC runs at a constant rate or scales with the P-state is model -dependent and must be determined by inspecting CPUID, chipset or vendor -specific MSR fields. - -In addition, some vendors have known bugs where the P-state is actually -compensated for properly during normal operation, but when the processor is -inactive, the P-state may be raised temporarily to service cache misses from -other processors. In such cases, the TSC on halted CPUs could advance faster -than that of non-halted processors. AMD Turion processors are known to have -this problem. - -3.6) TSC and STPCLK / T-states - -External signals given to the processor may also have the effect of stopping -the TSC. This is typically done for thermal emergency power control to prevent -an overheating condition, and typically, there is no way to detect that this -condition has happened. - -3.7) TSC virtualization - VMX - -VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET -field specified in the VMCS. Special instructions must be used to read and -write the VMCS field. - -3.8) TSC virtualization - SVM - -SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, SVM allows passing through the host TSC plus an additional offset -field specified in the SVM control block. - -3.9) TSC feature bits in Linux - -In summary, there is no way to guarantee the TSC remains in perfect -synchronization unless it is explicitly guaranteed by the architecture. Even -if so, the TSCs in multi-sockets or NUMA systems may still run independently -despite being locally consistent. - -The following feature bits are used by Linux to signal various TSC attributes, -but they can only be taken to be meaningful for UP or single node systems. - -X86_FEATURE_TSC : The TSC is available in hardware -X86_FEATURE_RDTSCP : The RDTSCP instruction is available -X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states -X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states -X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) - -4) Virtualization Problems - -Timekeeping is especially problematic for virtualization because a number of -challenges arise. The most obvious problem is that time is now shared between -the host and, potentially, a number of virtual machines. Thus the virtual -operating system does not run with 100% usage of the CPU, despite the fact that -it may very well make that assumption. It may expect it to remain true to very -exacting bounds when interrupt sources are disabled, but in reality only its -virtual interrupt sources are disabled, and the machine may still be preempted -at any time. This causes problems as the passage of real time, the injection -of machine interrupts and the associated clock sources are no longer completely -synchronized with real time. - -This same problem can occur on native harware to a degree, as SMM mode may -steal cycles from the naturally on X86 systems when SMM mode is used by the -BIOS, but not in such an extreme fashion. However, the fact that SMM mode may -cause similar problems to virtualization makes it a good justification for -solving many of these problems on bare metal. - -4.1) Interrupt clocking - -One of the most immediate problems that occurs with legacy operating systems -is that the system timekeeping routines are often designed to keep track of -time by counting periodic interrupts. These interrupts may come from the PIT -or the RTC, but the problem is the same: the host virtualization engine may not -be able to deliver the proper number of interrupts per second, and so guest -time may fall behind. This is especially problematic if a high interrupt rate -is selected, such as 1000 HZ, which is unfortunately the default for many Linux -guests. - -There are three approaches to solving this problem; first, it may be possible -to simply ignore it. Guests which have a separate time source for tracking -'wall clock' or 'real time' may not need any adjustment of their interrupts to -maintain proper time. If this is not sufficient, it may be necessary to inject -additional interrupts into the guest in order to increase the effective -interrupt rate. This approach leads to complications in extreme conditions, -where host load or guest lag is too much to compensate for, and thus another -solution to the problem has risen: the guest may need to become aware of lost -ticks and compensate for them internally. Although promising in theory, the -implementation of this policy in Linux has been extremely error prone, and a -number of buggy variants of lost tick compensation are distributed across -commonly used Linux systems. - -Windows uses periodic RTC clocking as a means of keeping time internally, and -thus requires interrupt slewing to keep proper time. It does use a low enough -rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in -practice. - -4.2) TSC sampling and serialization - -As the highest precision time source available, the cycle counter of the CPU -has aroused much interest from developers. As explained above, this timer has -many problems unique to its nature as a local, potentially unstable and -potentially unsynchronized source. One issue which is not unique to the TSC, -but is highlighted because of its very precise nature is sampling delay. By -definition, the counter, once read is already old. However, it is also -possible for the counter to be read ahead of the actual use of the result. -This is a consequence of the superscalar execution of the instruction stream, -which may execute instructions out of order. Such execution is called -non-serialized. Forcing serialized execution is necessary for precise -measurement with the TSC, and requires a serializing instruction, such as CPUID -or an MSR read. - -Since CPUID may actually be virtualized by a trap and emulate mechanism, this -serialization can pose a performance issue for hardware virtualization. An -accurate time stamp counter reading may therefore not always be available, and -it may be necessary for an implementation to guard against "backwards" reads of -the TSC as seen from other CPUs, even in an otherwise perfectly synchronized -system. - -4.3) Timespec aliasing - -Additionally, this lack of serialization from the TSC poses another challenge -when using results of the TSC when measured against another time source. As -the TSC is much higher precision, many possible values of the TSC may be read -while another clock is still expressing the same value. - -That is, you may read (T,T+10) while external clock C maintains the same value. -Due to non-serialized reads, you may actually end up with a range which -fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but -calibrated against an external value may have a range of valid values. -Re-calibrating this computation may actually cause time, as computed after the -calibration, to go backwards, compared with time computed before the -calibration. - -This problem is particularly pronounced with an internal time source in Linux, -the kernel time, which is expressed in the theoretically high resolution -timespec - but which advances in much larger granularity intervals, sometimes -at the rate of jiffies, and possibly in catchup modes, at a much larger step. - -This aliasing requires care in the computation and recalibration of kvmclock -and any other values derived from TSC computation (such as TSC virtualization -itself). - -4.4) Migration - -Migration of a virtual machine raises problems for timekeeping in two ways. -First, the migration itself may take time, during which interrupts cannot be -delivered, and after which, the guest time may need to be caught up. NTP may -be able to help to some degree here, as the clock correction required is -typically small enough to fall in the NTP-correctable window. - -An additional concern is that timers based off the TSC (or HPET, if the raw bus -clock is exposed) may now be running at different rates, requiring compensation -in some way in the hypervisor by virtualizing these timers. In addition, -migrating to a faster machine may preclude the use of a passthrough TSC, as a -faster clock cannot be made visible to a guest without the potential of time -advancing faster than usual. A slower clock is less of a problem, as it can -always be caught up to the original rate. KVM clock avoids these problems by -simply storing multipliers and offsets against the TSC for the guest to convert -back into nanosecond resolution values. - -4.5) Scheduling - -Since scheduling may be based on precise timing and firing of interrupts, the -scheduling algorithms of an operating system may be adversely affected by -virtualization. In theory, the effect is random and should be universally -distributed, but in contrived as well as real scenarios (guest device access, -causes of virtualization exits, possible context switch), this may not always -be the case. The effect of this has not been well studied. - -In an attempt to work around this, several implementations have provided a -paravirtualized scheduler clock, which reveals the true amount of CPU time for -which a virtual machine has been running. - -4.6) Watchdogs - -Watchdog timers, such as the lock detector in Linux may fire accidentally when -running under hardware virtualization due to timer interrupts being delayed or -misinterpretation of the passage of real time. Usually, these warnings are -spurious and can be ignored, but in some circumstances it may be necessary to -disable such detection. - -4.7) Delays and precision timing - -Precise timing and delays may not be possible in a virtualized system. This -can happen if the system is controlling physical hardware, or issues delays to -compensate for slower I/O to and from devices. The first issue is not solvable -in general for a virtualized system; hardware control software can't be -adequately virtualized without a full real-time operating system, which would -require an RT aware virtualization platform. - -The second issue may cause performance problems, but this is unlikely to be a -significant issue. In many cases these delays may be eliminated through -configuration or paravirtualization. - -4.8) Covert channels and leaks - -In addition to the above problems, time information will inevitably leak to the -guest about the host in anything but a perfect implementation of virtualized -time. This may allow the guest to infer the presence of a hypervisor (as in a -red-pill type detection), and it may allow information to leak between guests -by using CPU utilization itself as a signalling channel. Preventing such -problems would require completely isolated virtual time which may not track -real time any longer. This may be useful in certain security or QA contexts, -but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore deleted file mode 100644 index 115587f..0000000 --- a/Documentation/lguest/.gitignore +++ /dev/null @@ -1 +0,0 @@ -lguest diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile deleted file mode 100644 index bebac6b..0000000 --- a/Documentation/lguest/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# This creates the demonstration utility "lguest" which runs a Linux guest. -# Missing headers? Add "-I../../include -I../../arch/x86/include" -CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE - -all: lguest - -clean: - rm -f lguest diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract deleted file mode 100644 index 7730bb6..0000000 --- a/Documentation/lguest/extract +++ /dev/null @@ -1,58 +0,0 @@ -#! /bin/sh - -set -e - -PREFIX=$1 -shift - -trap 'rm -r $TMPDIR' 0 -TMPDIR=`mktemp -d` - -exec 3>/dev/null -for f; do - while IFS=" -" read -r LINE; do - case "$LINE" in - *$PREFIX:[0-9]*:\**) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 - ;; - *$PREFIX:[0-9]*) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 - ;; - *:\**) - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 - echo >&3 - exec 3>/dev/null - ;; - *) - /bin/echo "$LINE" >&3 - ;; - esac - done < $f - echo >&3 - exec 3>/dev/null -done - -LASTFILE="" -for f in $TMPDIR/*; do - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) - echo "[ $LASTFILE ]" - fi - cat $f -done - diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c deleted file mode 100644 index d9da7e1..0000000 --- a/Documentation/lguest/lguest.c +++ /dev/null @@ -1,2095 +0,0 @@ -/*P:100 - * This is the Launcher code, a simple program which lays out the "physical" - * memory for the new Guest by mapping the kernel image and the virtual - * devices, then opens /dev/lguest to tell the kernel about the Guest and - * control it. -:*/ -#define _LARGEFILE64_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "../../include/linux/lguest_launcher.h" -/*L:110 - * We can ignore the 42 include files we need for this program, but I do want - * to draw attention to the use of kernel-style types. - * - * As Linus said, "C is a Spartan language, and so should your naming be." I - * like these abbreviations, so we define them here. Note that u64 is always - * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. - */ -typedef unsigned long long u64; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint8_t u8; -/*:*/ - -#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ -#define BRIDGE_PFX "bridge:" -#ifndef SIOCBRADDIF -#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ -#endif -/* We can have up to 256 pages for devices. */ -#define DEVICE_PAGES 256 -/* This will occupy 3 pages: it must be a power of 2. */ -#define VIRTQUEUE_NUM 256 - -/*L:120 - * verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. - */ -static bool verbose; -#define verbose(args...) \ - do { if (verbose) printf(args); } while(0) -/*:*/ - -/* The pointer to the start of guest memory. */ -static void *guest_base; -/* The maximum guest physical address allowed, and maximum possible. */ -static unsigned long guest_limit, guest_max; -/* The /dev/lguest file descriptor. */ -static int lguest_fd; - -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - -/* This is our list of devices. */ -struct device_list { - /* Counter to assign interrupt numbers. */ - unsigned int next_irq; - - /* Counter to print out convenient device numbers. */ - unsigned int device_num; - - /* The descriptor page for the devices. */ - u8 *descpage; - - /* A single linked list of devices. */ - struct device *dev; - /* And a pointer to the last device for easy append. */ - struct device *lastdev; -}; - -/* The list of Guest devices, based on command line arguments. */ -static struct device_list devices; - -/* The device structure describes a single device. */ -struct device { - /* The linked-list pointer. */ - struct device *next; - - /* The device's descriptor, as mapped into the Guest. */ - struct lguest_device_desc *desc; - - /* We can't trust desc values once Guest has booted: we use these. */ - unsigned int feature_len; - unsigned int num_vq; - - /* The name of this device, for --verbose. */ - const char *name; - - /* Any queues attached to this device */ - struct virtqueue *vq; - - /* Is it operational */ - bool running; - - /* Does Guest want an intrrupt on empty? */ - bool irq_on_empty; - - /* Device-specific data. */ - void *priv; -}; - -/* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue { - struct virtqueue *next; - - /* Which device owns me. */ - struct device *dev; - - /* The configuration for this queue. */ - struct lguest_vqconfig config; - - /* The actual ring of buffers. */ - struct vring vring; - - /* Last available index we saw. */ - u16 last_avail_idx; - - /* How many are used since we sent last irq? */ - unsigned int pending_used; - - /* Eventfd where Guest notifications arrive. */ - int eventfd; - - /* Function for the thread which is servicing this virtqueue. */ - void (*service)(struct virtqueue *vq); - pid_t thread; -}; - -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - -/* The original tty settings to restore on exit. */ -static struct termios orig_term; - -/* - * We have to be careful with barriers: our devices are all run in separate - * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. - */ -#define wmb() __asm__ __volatile__("" : : : "memory") -#define mb() __asm__ __volatile__("" : : : "memory") - -/* - * Convert an iovec element to the given type. - * - * This is a fairly ugly trick: we need to know the size of the type and - * alignment requirement to check the pointer is kosher. It's also nice to - * have the name of the type in case we report failure. - * - * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. - */ -#define convert(iov, type) \ - ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) - -static void *_convert(struct iovec *iov, size_t size, size_t align, - const char *name) -{ - if (iov->iov_len != size) - errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); - if ((unsigned long)iov->iov_base % align != 0) - errx(1, "Bad alignment %p for %s", iov->iov_base, name); - return iov->iov_base; -} - -/* Wrapper for the last available index. Makes it easier to change. */ -#define lg_last_avail(vq) ((vq)->last_avail_idx) - -/* - * The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. - */ -#define cpu_to_le16(v16) (v16) -#define cpu_to_le32(v32) (v32) -#define cpu_to_le64(v64) (v64) -#define le16_to_cpu(v16) (v16) -#define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v64) (v64) - -/* Is this iovec empty? */ -static bool iov_empty(const struct iovec iov[], unsigned int num_iov) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) - if (iov[i].iov_len) - return false; - return true; -} - -/* Take len bytes from the front of this iovec. */ -static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) { - unsigned int used; - - used = iov[i].iov_len < len ? iov[i].iov_len : len; - iov[i].iov_base += used; - iov[i].iov_len -= used; - len -= used; - } - assert(len == 0); -} - -/* The device virtqueue descriptors are followed by feature bitmasks. */ -static u8 *get_feature_bits(struct device *dev) -{ - return (u8 *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig); -} - -/*L:100 - * The Launcher code itself takes us out into userspace, that scary place where - * pointers run wild and free! Unfortunately, like most userspace programs, - * it's quite boring (which is why everyone likes to hack on the kernel!). - * Perhaps if you make up an Lguest Drinking Game at this point, it will get - * you through this section. Or, maybe not. - * - * The Launcher sets up a big chunk of memory to be the Guest's "physical" - * memory and stores it in "guest_base". In other words, Guest physical == - * Launcher virtual with an offset. - * - * This can be tough to get your head around, but usually it just means that we - * use these trivial conversion functions when the Guest gives us its - * "physical" addresses: - */ -static void *from_guest_phys(unsigned long addr) -{ - return guest_base + addr; -} - -static unsigned long to_guest_phys(const void *addr) -{ - return (addr - guest_base); -} - -/*L:130 - * Loading the Kernel. - * - * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: - */ -static int open_or_die(const char *name, int flags) -{ - int fd = open(name, flags); - if (fd < 0) - err(1, "Failed to open %s", name); - return fd; -} - -/* map_zeroed_pages() takes a number of pages. */ -static void *map_zeroed_pages(unsigned int num) -{ - int fd = open_or_die("/dev/zero", O_RDONLY); - void *addr; - - /* - * We use a private mapping (ie. if we write to the page, it will be - * copied). We allocate an extra two pages PROT_NONE to act as guard - * pages against read/write attempts that exceed allocated space. - */ - addr = mmap(NULL, getpagesize() * (num+2), - PROT_NONE, MAP_PRIVATE, fd, 0); - - if (addr == MAP_FAILED) - err(1, "Mmapping %u pages of /dev/zero", num); - - if (mprotect(addr + getpagesize(), getpagesize() * num, - PROT_READ|PROT_WRITE) == -1) - err(1, "mprotect rw %u pages failed", num); - - /* - * One neat mmap feature is that you can close the fd, and it - * stays mapped. - */ - close(fd); - - /* Return address after PROT_NONE page */ - return addr + getpagesize(); -} - -/* Get some more pages for a device. */ -static void *get_pages(unsigned int num) -{ - void *addr = from_guest_phys(guest_limit); - - guest_limit += num * getpagesize(); - if (guest_limit > guest_max) - errx(1, "Not enough memory for devices"); - return addr; -} - -/* - * This routine is used to load the kernel or initrd. It tries mmap, but if - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. - */ -static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) -{ - ssize_t r; - - /* - * We map writable even though for some segments are marked read-only. - * The kernel really wants to be writable: it patches its own - * instructions. - * - * MAP_PRIVATE means that the page won't be copied until a write is - * done to it. This allows us to share untouched memory between - * Guests. - */ - if (mmap(addr, len, PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) - return; - - /* pread does a seek and a read in one shot: saves a few lines. */ - r = pread(fd, addr, len, offset); - if (r != len) - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); -} - -/* - * This routine takes an open vmlinux image, which is in ELF, and maps it into - * the Guest memory. ELF = Embedded Linking Format, which is the format used - * by all modern binaries on Linux including the kernel. - * - * The ELF headers give *two* addresses: a physical address, and a virtual - * address. We use the physical address; the Guest will map itself to the - * virtual address. - * - * We return the starting address. - */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) -{ - Elf32_Phdr phdr[ehdr->e_phnum]; - unsigned int i; - - /* - * Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. - */ - if (ehdr->e_type != ET_EXEC - || ehdr->e_machine != EM_386 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) - errx(1, "Malformed elf header"); - - /* - * An ELF executable contains an ELF header and a number of "program" - * headers which indicate which parts ("segments") of the program to - * load where. - */ - - /* We read in all the program headers at once: */ - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) - err(1, "Seeking to program headers"); - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) - err(1, "Reading program headers"); - - /* - * Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. - */ - for (i = 0; i < ehdr->e_phnum; i++) { - /* If this isn't a loadable segment, we ignore it */ - if (phdr[i].p_type != PT_LOAD) - continue; - - verbose("Section %i: size %i addr %p\n", - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - - /* We map this section of the file at its physical address. */ - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), - phdr[i].p_offset, phdr[i].p_filesz); - } - - /* The entry point is given in the ELF header. */ - return ehdr->e_entry; -} - -/*L:150 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed - * to jump into it and it will unpack itself. We used to have to perform some - * hairy magic because the unpacking code scared me. - * - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote - * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! - */ -static unsigned long load_bzimage(int fd) -{ - struct boot_params boot; - int r; - /* Modern bzImages get loaded at 1M. */ - void *p = from_guest_phys(0x100000); - - /* - * Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) - */ - lseek(fd, 0, SEEK_SET); - read(fd, &boot, sizeof(boot)); - - /* Inside the setup_hdr, we expect the magic "HdrS" */ - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) - errx(1, "This doesn't look like a bzImage to me"); - - /* Skip over the extra sectors of the header. */ - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); - - /* Now read everything into memory. in nice big chunks. */ - while ((r = read(fd, p, 65536)) > 0) - p += r; - - /* Finally, code32_start tells us where to enter the kernel. */ - return boot.hdr.code32_start; -} - -/*L:140 - * Loading the kernel is easy when it's a "vmlinux", but most kernels - * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. - */ -static unsigned long load_kernel(int fd) -{ - Elf32_Ehdr hdr; - - /* Read in the first few bytes. */ - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) - err(1, "Reading kernel"); - - /* If it's an ELF file, it starts with "\177ELF" */ - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr); - - /* Otherwise we assume it's a bzImage, and try to load it. */ - return load_bzimage(fd); -} - -/* - * This is a trivial little helper to align pages. Andi Kleen hated it because - * it calls getpagesize() twice: "it's dumb code." - * - * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. - */ -static inline unsigned long page_align(unsigned long addr) -{ - /* Add upwards and truncate downwards. */ - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); -} - -/*L:180 - * An "initial ram disk" is a disk image loaded into memory along with the - * kernel which the kernel can use to boot from without needing any drivers. - * Most distributions now use this as standard: the initrd contains the code to - * load the appropriate driver modules for the current machine. - * - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). - */ -static unsigned long load_initrd(const char *name, unsigned long mem) -{ - int ifd; - struct stat st; - unsigned long len; - - ifd = open_or_die(name, O_RDONLY); - /* fstat() is needed to get the file size. */ - if (fstat(ifd, &st) < 0) - err(1, "fstat() on initrd '%s'", name); - - /* - * We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. - */ - len = page_align(st.st_size); - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* - * Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. - */ - close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); - - /* We return the initrd size. */ - return len; -} -/*:*/ - -/* - * Simple routine to roll all the commandline arguments together with spaces - * between them. - */ -static void concat(char *dst, char *args[]) -{ - unsigned int i, len = 0; - - for (i = 0; args[i]; i++) { - if (i) { - strcat(dst+len, " "); - len++; - } - strcpy(dst+len, args[i]); - len += strlen(args[i]); - } - /* In case it's empty. */ - dst[len] = '\0'; -} - -/*L:185 - * This is where we actually tell the kernel to initialize the Guest. We - * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. - */ -static void tell_kernel(unsigned long start) -{ - unsigned long args[] = { LHREQ_INITIALIZE, - (unsigned long)guest_base, - guest_limit / getpagesize(), start }; - verbose("Guest: %p - %p (%#lx)\n", - guest_base, guest_base + guest_limit, guest_limit); - lguest_fd = open_or_die("/dev/lguest", O_RDWR); - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Writing to /dev/lguest"); -} -/*:*/ - -/*L:200 - * Device Handling. - * - * When the Guest gives us a buffer, it sends an array of addresses and sizes. - * We need to make sure it's not trying to reach into the Launcher itself, so - * we have a convenient routine which checks it and exits with an error message - * if something funny is going on: - */ -static void *_check_pointer(unsigned long addr, unsigned int size, - unsigned int line) -{ - /* - * Check if the requested address and size exceeds the allocated memory, - * or addr + size wraps around. - */ - if ((addr + size) > guest_limit || (addr + size) < addr) - errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* - * We return a pointer for the caller's convenience, now we know it's - * safe to use. - */ - return from_guest_phys(addr); -} -/* A macro which transparently hands the line number to the real function. */ -#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) - -/* - * Each buffer in the virtqueues is actually a chain of descriptors. This - * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. - */ -static unsigned next_desc(struct vring_desc *desc, - unsigned int i, unsigned int max) -{ - unsigned int next; - - /* If this descriptor says it doesn't chain, we're done. */ - if (!(desc[i].flags & VRING_DESC_F_NEXT)) - return max; - - /* Check they're not leading us off end of descriptors. */ - next = desc[i].next; - /* Make sure compiler knows to grab that: we don't want it changing! */ - wmb(); - - if (next >= max) - errx(1, "Desc next is %u", next); - - return next; -} - -/* - * This actually sends the interrupt for this virtqueue, if we've used a - * buffer. - */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; - - /* Don't inform them if nothing used. */ - if (!vq->pending_used) - return; - vq->pending_used = 0; - - /* If they don't want an interrupt, don't send one... */ - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { - /* ... unless they've asked us to force one on empty. */ - if (!vq->dev->irq_on_empty - || lg_last_avail(vq) != vq->vring.avail->idx) - return; - } - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->config.irq); -} - -/* - * This looks in the virtqueue for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function waits if necessary, and returns the descriptor number found. - */ -static unsigned wait_for_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) -{ - unsigned int i, head, max; - struct vring_desc *desc; - u16 last_avail = lg_last_avail(vq); - - /* There's nothing available? */ - while (last_avail == vq->vring.avail->idx) { - u64 event; - - /* - * Since we're about to sleep, now is a good time to tell the - * Guest about what we've used up to now. - */ - trigger_irq(vq); - - /* OK, now we need to know about added descriptors. */ - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - - /* - * They could have slipped one in as we were doing that: make - * sure it's written, then check again. - */ - mb(); - if (last_avail != vq->vring.avail->idx) { - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - break; - } - - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) - errx(1, "Event read failed?"); - - /* We don't need to be notified again. */ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - } - - /* Check it isn't doing very strange things with descriptor numbers. */ - if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) - errx(1, "Guest moved used index from %u to %u", - last_avail, vq->vring.avail->idx); - - /* - * Grab the next descriptor number they're advertising, and increment - * the index we've seen. - */ - head = vq->vring.avail->ring[last_avail % vq->vring.num]; - lg_last_avail(vq)++; - - /* If their number is silly, that's a fatal mistake. */ - if (head >= vq->vring.num) - errx(1, "Guest says index %u is available", head); - - /* When we start there are none of either input nor output. */ - *out_num = *in_num = 0; - - max = vq->vring.num; - desc = vq->vring.desc; - i = head; - - /* - * If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. - */ - if (desc[i].flags & VRING_DESC_F_INDIRECT) { - if (desc[i].len % sizeof(struct vring_desc)) - errx(1, "Invalid size for indirect buffer table"); - - max = desc[i].len / sizeof(struct vring_desc); - desc = check_pointer(desc[i].addr, desc[i].len); - i = 0; - } - - do { - /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = desc[i].len; - iov[*out_num + *in_num].iov_base - = check_pointer(desc[i].addr, desc[i].len); - /* If this is an input descriptor, increment that count. */ - if (desc[i].flags & VRING_DESC_F_WRITE) - (*in_num)++; - else { - /* - * If it's an output descriptor, they're all supposed - * to come before any input descriptors. - */ - if (*in_num) - errx(1, "Descriptor has out after in"); - (*out_num)++; - } - - /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > max) - errx(1, "Looped descriptor"); - } while ((i = next_desc(desc, i, max)) != max); - - return head; -} - -/* - * After we've used one of their buffers, we tell the Guest about it. Sometime - * later we'll want to send them an interrupt using trigger_irq(); note that - * wait_for_vq_desc() does that for us if it has to wait. - */ -static void add_used(struct virtqueue *vq, unsigned int head, int len) -{ - struct vring_used_elem *used; - - /* - * The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. - */ - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; - used->id = head; - used->len = len; - /* Make sure buffer is written before we update index. */ - wmb(); - vq->vring.used->idx++; - vq->pending_used++; -} - -/* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) -{ - add_used(vq, head, len); - trigger_irq(vq); -} - -/* - * The Console - * - * We associate some data with the console for our exit hack. - */ -struct console_abort { - /* How many times have they hit ^C? */ - int count; - /* When did they start? */ - struct timeval start; -}; - -/* This is the routine which handles console input (ie. stdin). */ -static void console_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num; - struct console_abort *abort = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* Make sure there's a descriptor available. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - errx(1, "Output buffers in console in queue?"); - - /* Read into it. This is where we usually wait. */ - len = readv(STDIN_FILENO, iov, in_num); - if (len <= 0) { - /* Ran out of input? */ - warnx("Failed to get console input, ignoring console."); - /* - * For simplicity, dying threads kill the whole Launcher. So - * just nap here. - */ - for (;;) - pause(); - } - - /* Tell the Guest we used a buffer. */ - add_used_and_trigger(vq, head, len); - - /* - * Three ^C within one second? Exit. - * - * This is such a hack, but works surprisingly well. Each ^C has to - * be in a buffer by itself, so they can't be too fast. But we check - * that we get three within about a second, so they can't be too - * slow. - */ - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { - abort->count = 0; - return; - } - - abort->count++; - if (abort->count == 1) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - /* Kill all Launcher processes with SIGINT, like normal ^C */ - if (now.tv_sec <= abort->start.tv_sec+1) - kill(0, SIGINT); - abort->count = 0; - } -} - -/* This is the routine which handles console output (ie. stdout). */ -static void console_output(struct virtqueue *vq) -{ - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here, for the Guest to give us something. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in console output queue?"); - - /* writev can return a partial write, so we loop here. */ - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); - iov_consume(iov, out, len); - } - - /* - * We're finished with that buffer: if we're going to sleep, - * wait_for_vq_desc() will prod the Guest with an interrupt. - */ - add_used(vq, head, 0); -} - -/* - * The Network - * - * Handling output for network is also simple: we get all the output buffers - * and write them to /dev/net/tun. - */ -struct net_info { - int tunfd; -}; - -static void net_output(struct virtqueue *vq) -{ - struct net_info *net_info = vq->dev->priv; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here for the Guest to give us a packet. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in net output queue?"); - /* - * Send the whole thing through to /dev/net/tun. It expects the exact - * same format: what a coincidence! - */ - if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); - - /* - * Done with that one; wait_for_vq_desc() will send the interrupt if - * all packets are processed. - */ - add_used(vq, head, 0); -} - -/* - * Handling network input is a bit trickier, because I've tried to optimize it. - * - * First we have a helper routine which tells is if from this file descriptor - * (ie. the /dev/net/tun device) will block: - */ -static bool will_block(int fd) -{ - fd_set fdset; - struct timeval zero = { 0, 0 }; - FD_ZERO(&fdset); - FD_SET(fd, &fdset); - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; -} - -/* - * This handles packets coming in from the tun device to our Guest. Like all - * service routines, it gets called again as soon as it returns, so you don't - * see a while(1) loop here. - */ -static void net_input(struct virtqueue *vq) -{ - int len; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - struct net_info *net_info = vq->dev->priv; - - /* - * Get a descriptor to write an incoming packet into. This will also - * send an interrupt if they're out of descriptors. - */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (out) - errx(1, "Output buffers in net input queue?"); - - /* - * If it looks like we'll block reading from the tun device, send them - * an interrupt. - */ - if (vq->pending_used && will_block(net_info->tunfd)) - trigger_irq(vq); - - /* - * Read in the packet. This is where we normally wait (when there's no - * incoming network traffic). - */ - len = readv(net_info->tunfd, iov, in); - if (len <= 0) - err(1, "Failed to read from tun."); - - /* - * Mark that packet buffer as used, but don't interrupt here. We want - * to wait until we've done as much work as we can. - */ - add_used(vq, head, len); -} -/*:*/ - -/* This is the helper to create threads: run the service routine in a loop. */ -static int do_thread(void *_vq) -{ - struct virtqueue *vq = _vq; - - for (;;) - vq->service(vq); - return 0; -} - -/* - * When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! - */ -static void kill_launcher(int signal) -{ - kill(0, SIGTERM); -} - -static void reset_device(struct device *dev) -{ - struct virtqueue *vq; - - verbose("Resetting device %s\n", dev->name); - - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); - - /* We're going to be explicitly killing threads, so ignore them. */ - signal(SIGCHLD, SIG_IGN); - - /* Zero out the virtqueues, get rid of their threads */ - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->thread != (pid_t)-1) { - kill(vq->thread, SIGTERM); - waitpid(vq->thread, NULL, 0); - vq->thread = (pid_t)-1; - } - memset(vq->vring.desc, 0, - vring_size(vq->config.num, LGUEST_VRING_ALIGN)); - lg_last_avail(vq) = 0; - } - dev->running = false; - - /* Now we care if threads die. */ - signal(SIGCHLD, (void *)kill_launcher); -} - -/*L:216 - * This actually creates the thread which services the virtqueue for a device. - */ -static void create_thread(struct virtqueue *vq) -{ - /* - * Create stack for thread. Since the stack grows upwards, we point - * the stack pointer to the end of this region. - */ - char *stack = malloc(32768); - unsigned long args[] = { LHREQ_EVENTFD, - vq->config.pfn*getpagesize(), 0 }; - - /* Create a zero-initialized eventfd. */ - vq->eventfd = eventfd(0, 0); - if (vq->eventfd < 0) - err(1, "Creating eventfd"); - args[2] = vq->eventfd; - - /* - * Attach an eventfd to this virtqueue: it will go off when the Guest - * does an LHCALL_NOTIFY for this vq. - */ - if (write(lguest_fd, &args, sizeof(args)) != 0) - err(1, "Attaching eventfd"); - - /* - * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so - * we get a signal if it dies. - */ - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); - if (vq->thread == (pid_t)-1) - err(1, "Creating clone"); - - /* We close our local copy now the child has it. */ - close(vq->eventfd); -} - -static bool accepted_feature(struct device *dev, unsigned int bit) -{ - const u8 *features = get_feature_bits(dev) + dev->feature_len; - - if (dev->feature_len < bit / CHAR_BIT) - return false; - return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); -} - -static void start_device(struct device *dev) -{ - unsigned int i; - struct virtqueue *vq; - - verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev)[i]); - verbose(", accepted"); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev) - [dev->feature_len+i]); - - dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); - - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->service) - create_thread(vq); - } - dev->running = true; -} - -static void cleanup_devices(void) -{ - struct device *dev; - - for (dev = devices.dev; dev; dev = dev->next) - reset_device(dev); - - /* If we saved off the original terminal settings, restore them now. */ - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/* When the Guest tells us they updated the status field, we handle it. */ -static void update_device_status(struct device *dev) -{ - /* A zero status is a reset, otherwise it's a set of flags. */ - if (dev->desc->status == 0) - reset_device(dev); - else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { - warnx("Device %s configuration FAILED", dev->name); - if (dev->running) - reset_device(dev); - } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); - } -} - -/*L:215 - * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In - * particular, it's used to notify us of device status changes during boot. - */ -static void handle_output(unsigned long addr) -{ - struct device *i; - - /* Check each device. */ - for (i = devices.dev; i; i = i->next) { - struct virtqueue *vq; - - /* - * Notifications to device descriptors mean they updated the - * device status. - */ - if (from_guest_phys(addr) == i->desc) { - update_device_status(i); - return; - } - - /* - * Devices *can* be used before status is set to DRIVER_OK. - * The original plan was that they would never do this: they - * would always finish setting up their status bits before - * actually touching the virtqueues. In practice, we allowed - * them to, and they do (eg. the disk probes for partition - * tables as part of initialization). - * - * If we see this, we start the device: once it's running, we - * expect the device to catch all the notifications. - */ - for (vq = i->vq; vq; vq = vq->next) { - if (addr != vq->config.pfn*getpagesize()) - continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - /* This just calls create_thread() for each virtqueue */ - start_device(i); - return; - } - } - - /* - * Early console write is done using notify on a nul-terminated string - * in Guest memory. It's also great for hacking debugging messages - * into a Guest. - */ - if (addr >= guest_limit) - errx(1, "Bad NOTIFY %#lx", addr); - - write(STDOUT_FILENO, from_guest_phys(addr), - strnlen(from_guest_phys(addr), guest_limit - addr)); -} - -/*L:190 - * Device Setup - * - * All devices need a descriptor so the Guest knows it exists, and a "struct - * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. - */ - -/* - * The layout of the device page is a "struct lguest_device_desc" followed by a - * number of virtqueue descriptors, then two sets of feature bits, then an - * array of configuration bytes. This routine returns the configuration - * pointer. - */ -static u8 *device_config(const struct device *dev) -{ - return (void *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig) - + dev->feature_len * 2; -} - -/* - * This routine allocates a new "struct lguest_device_desc" from descriptor - * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. - */ -static struct lguest_device_desc *new_dev_desc(u16 type) -{ - struct lguest_device_desc d = { .type = type }; - void *p; - - /* Figure out where the next device config is, based on the last one. */ - if (devices.lastdev) - p = device_config(devices.lastdev) - + devices.lastdev->desc->config_len; - else - p = devices.descpage; - - /* We only have one page for all the descriptors. */ - if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) - errx(1, "Too many devices"); - - /* p might not be aligned, so we memcpy in. */ - return memcpy(p, &d, sizeof(d)); -} - -/* - * Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. - */ -static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*service)(struct virtqueue *)) -{ - unsigned int pages; - struct virtqueue **i, *vq = malloc(sizeof(*vq)); - void *p; - - /* First we need some memory for this virtqueue. */ - pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) - / getpagesize(); - p = get_pages(pages); - - /* Initialize the virtqueue */ - vq->next = NULL; - vq->last_avail_idx = 0; - vq->dev = dev; - - /* - * This is the routine the service thread will run, and its Process ID - * once it's running. - */ - vq->service = service; - vq->thread = (pid_t)-1; - - /* Initialize the configuration. */ - vq->config.num = num_descs; - vq->config.irq = devices.next_irq++; - vq->config.pfn = to_guest_phys(p) / getpagesize(); - - /* Initialize the vring. */ - vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - - /* - * Append virtqueue to this device's descriptor. We use - * device_config() to get the end of the device's current virtqueues; - * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. - */ - assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); - memcpy(device_config(dev), &vq->config, sizeof(vq->config)); - dev->num_vq++; - dev->desc->num_vq++; - - verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - - /* - * Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. - */ - for (i = &dev->vq; *i; i = &(*i)->next); - *i = vq; -} - -/* - * The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. - */ -static void add_feature(struct device *dev, unsigned bit) -{ - u8 *features = get_feature_bits(dev); - - /* We can't extend the feature bits once we've added config bytes */ - if (dev->desc->feature_len <= bit / CHAR_BIT) { - assert(dev->desc->config_len == 0); - dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; - } - - features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); -} - -/* - * This routine sets the configuration fields for an existing device's - * descriptor. It only works for the last device, but that's OK because that's - * how we use it. - */ -static void set_config(struct device *dev, unsigned len, const void *conf) -{ - /* Check we haven't overflowed our single page. */ - if (device_config(dev) + len > devices.descpage + getpagesize()) - errx(1, "Too many devices"); - - /* Copy in the config information, and store the length. */ - memcpy(device_config(dev), conf, len); - dev->desc->config_len = len; - - /* Size must fit in config_len field (8 bits)! */ - assert(dev->desc->config_len == len); -} - -/* - * This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. We - * don't actually start the service threads until later. - * - * See what I mean about userspace being boring? - */ -static struct device *new_device(const char *name, u16 type) -{ - struct device *dev = malloc(sizeof(*dev)); - - /* Now we populate the fields one at a time. */ - dev->desc = new_dev_desc(type); - dev->name = name; - dev->vq = NULL; - dev->feature_len = 0; - dev->num_vq = 0; - dev->running = false; - - /* - * Append to device list. Prepending to a single-linked list is - * easier, but the user expects the devices to be arranged on the bus - * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. - */ - if (devices.lastdev) - devices.lastdev->next = dev; - else - devices.dev = dev; - devices.lastdev = dev; - - return dev; -} - -/* - * Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. - */ -static void setup_console(void) -{ - struct device *dev; - - /* If we can save the initial standard input settings... */ - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { - struct termios term = orig_term; - /* - * Then we turn off echo, line buffering and ^C etc: We want a - * raw input stream to the Guest. - */ - term.c_lflag &= ~(ISIG|ICANON|ECHO); - tcsetattr(STDIN_FILENO, TCSANOW, &term); - } - - dev = new_device("console", VIRTIO_ID_CONSOLE); - - /* We store the console state in dev->priv, and initialize it. */ - dev->priv = malloc(sizeof(struct console_abort)); - ((struct console_abort *)dev->priv)->count = 0; - - /* - * The console needs two virtqueues: the input then the output. When - * they put something the input queue, we make sure we're listening to - * stdin. When they put something in the output queue, we write it to - * stdout. - */ - add_virtqueue(dev, VIRTQUEUE_NUM, console_input); - add_virtqueue(dev, VIRTQUEUE_NUM, console_output); - - verbose("device %u: console\n", ++devices.device_num); -} -/*:*/ - -/*M:010 - * Inter-guest networking is an interesting area. Simplest is to have a - * --sharenet= option which opens or creates a named pipe. This can be - * used to send packets to another guest in a 1:1 manner. - * - * More sopisticated is to use one of the tools developed for project like UML - * to do networking. - * - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be - * completely generic ("here's my vring, attach to your vring") and would work - * for any traffic. Of course, namespace and permissions issues need to be - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide - * multiple inter-guest channels behind one interface, although it would - * require some manner of hotplugging new virtio channels. - * - * Finally, we could implement a virtio network switch in the kernel. -:*/ - -static u32 str2ip(const char *ipaddr) -{ - unsigned int b[4]; - - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) - errx(1, "Failed to parse IP address '%s'", ipaddr); - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; -} - -static void str2mac(const char *macaddr, unsigned char mac[6]) -{ - unsigned int m[6]; - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) - errx(1, "Failed to parse mac address '%s'", macaddr); - mac[0] = m[0]; - mac[1] = m[1]; - mac[2] = m[2]; - mac[3] = m[3]; - mac[4] = m[4]; - mac[5] = m[5]; -} - -/* - * This code is "adapted" from libbridge: it attaches the Host end of the - * network device to the bridge device specified by the command line. - * - * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. - */ -static void add_to_bridge(int fd, const char *if_name, const char *br_name) -{ - int ifidx; - struct ifreq ifr; - - if (!*br_name) - errx(1, "must specify bridge name"); - - ifidx = if_nametoindex(if_name); - if (!ifidx) - errx(1, "interface %s does not exist!", if_name); - - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); - ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ifr.ifr_ifindex = ifidx; - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) - err(1, "can't add %s to bridge %s", if_name, br_name); -} - -/* - * This sets up the Host end of the network device with an IP address, brings - * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. - */ -static void configure_device(int fd, const char *tapif, u32 ipaddr) -{ - struct ifreq ifr; - struct sockaddr_in sin; - - memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, tapif); - - /* Don't read these incantations. Just cut & paste them like I did! */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(ipaddr); - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) - err(1, "Setting %s interface address", tapif); - ifr.ifr_flags = IFF_UP; - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) - err(1, "Bringing interface %s up", tapif); -} - -static int get_tun_device(char tapif[IFNAMSIZ]) -{ - struct ifreq ifr; - int netfd; - - /* Start with this zeroed. Messy but sure. */ - memset(&ifr, 0, sizeof(ifr)); - - /* - * We open the /dev/net/tun device and tell it we want a tap device. A - * tap device is like a tun device, only somehow different. To tell - * the truth, I completely blundered my way through this code, but it - * works now! - */ - netfd = open_or_die("/dev/net/tun", O_RDWR); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - strcpy(ifr.ifr_name, "tap%d"); - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) - err(1, "configuring /dev/net/tun"); - - if (ioctl(netfd, TUNSETOFFLOAD, - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) - err(1, "Could not set features for tun device"); - - /* - * We don't need checksums calculated for packets coming in this - * device: trust us! - */ - ioctl(netfd, TUNSETNOCSUM, 1); - - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); - return netfd; -} - -/*L:195 - * Our network is a Host<->Guest network. This can either use bridging or - * routing, but the principle is the same: it uses the "tun" device to inject - * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. - */ -static void setup_tun_net(char *arg) -{ - struct device *dev; - struct net_info *net_info = malloc(sizeof(*net_info)); - int ipfd; - u32 ip = INADDR_ANY; - bool bridging = false; - char tapif[IFNAMSIZ], *p; - struct virtio_net_config conf; - - net_info->tunfd = get_tun_device(tapif); - - /* First we create a new network device. */ - dev = new_device("net", VIRTIO_ID_NET); - dev->priv = net_info; - - /* Network devices need a recv and a send queue, just like console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, net_input); - add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - - /* - * We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! - */ - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - if (ipfd < 0) - err(1, "opening IP socket"); - - /* If the command line was --tunnet=bridge: do bridging. */ - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { - arg += strlen(BRIDGE_PFX); - bridging = true; - } - - /* A mac address may follow the bridge name or IP address */ - p = strchr(arg, ':'); - if (p) { - str2mac(p+1, conf.mac); - add_feature(dev, VIRTIO_NET_F_MAC); - *p = '\0'; - } - - /* arg is now either an IP address or a bridge name */ - if (bridging) - add_to_bridge(ipfd, tapif, arg); - else - ip = str2ip(arg); - - /* Set up the tun device. */ - configure_device(ipfd, tapif, ip); - - add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); - /* Expect Guest to handle everything except UFO */ - add_feature(dev, VIRTIO_NET_F_CSUM); - add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); - add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); - add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); - add_feature(dev, VIRTIO_NET_F_GUEST_ECN); - add_feature(dev, VIRTIO_NET_F_HOST_TSO4); - add_feature(dev, VIRTIO_NET_F_HOST_TSO6); - add_feature(dev, VIRTIO_NET_F_HOST_ECN); - /* We handle indirect ring entries */ - add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); - set_config(dev, sizeof(conf), &conf); - - /* We don't need the socket any more; setup is done. */ - close(ipfd); - - devices.device_num++; - - if (bridging) - verbose("device %u: tun %s attached to bridge: %s\n", - devices.device_num, tapif, arg); - else - verbose("device %u: tun %s: %s\n", - devices.device_num, tapif, arg); -} -/*:*/ - -/* This hangs off device->priv. */ -struct vblk_info { - /* The size of the file. */ - off64_t len; - - /* The file descriptor for the file. */ - int fd; - -}; - -/*L:210 - * The Disk - * - * The disk only has one virtqueue, so it only has one thread. It is really - * simple: the Guest asks for a block number and we read or write that position - * in the file. - * - * Before we serviced each virtqueue in a separate thread, that was unacceptably - * slow: the Guest waits until the read is finished before running anything - * else, even if it could have been doing useful work. - * - * We could have used async I/O, except it's reputed to suck so hard that - * characters actually go missing from your code when you try to use it. - */ -static void blk_request(struct virtqueue *vq) -{ - struct vblk_info *vblk = vq->dev->priv; - unsigned int head, out_num, in_num, wlen; - int ret; - u8 *in; - struct virtio_blk_outhdr *out; - struct iovec iov[vq->vring.num]; - off64_t off; - - /* - * Get the next request, where we normally wait. It triggers the - * interrupt to acknowledge previously serviced requests (if any). - */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - - /* - * Every block request should contain at least one output buffer - * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). - */ - if (out_num == 0 || in_num == 0) - errx(1, "Bad virtblk cmd %u out=%u in=%u", - head, out_num, in_num); - - out = convert(&iov[0], struct virtio_blk_outhdr); - in = convert(&iov[out_num+in_num-1], u8); - /* - * For historical reasons, block operations are expressed in 512 byte - * "sectors". - */ - off = out->sector * 512; - - /* - * In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. - */ - if (out->type & VIRTIO_BLK_T_SCSI_CMD) { - fprintf(stderr, "Scsi commands unsupported\n"); - *in = VIRTIO_BLK_S_UNSUPP; - wlen = sizeof(*in); - } else if (out->type & VIRTIO_BLK_T_OUT) { - /* - * Write - * - * Move to the right location in the block file. This can fail - * if they try to write past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out->sector); - - ret = writev(vblk->fd, iov+1, out_num-1); - verbose("WRITE to sector %llu: %i\n", out->sector, ret); - - /* - * Grr... Now we know how long the descriptor they sent was, we - * make sure they didn't try to write over the end of the block - * file (possibly extending it). - */ - if (ret > 0 && off + ret > vblk->len) { - /* Trim it back to the correct length */ - ftruncate64(vblk->fd, vblk->len); - /* Die, bad Guest, die. */ - errx(1, "Write past end %llu+%u", off, ret); - } - - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else if (out->type & VIRTIO_BLK_T_FLUSH) { - /* Flush */ - ret = fdatasync(vblk->fd); - verbose("FLUSH fdatasync: %i\n", ret); - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else { - /* - * Read - * - * Move to the right location in the block file. This can fail - * if they try to read past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out->sector); - - ret = readv(vblk->fd, iov+1, in_num-1); - verbose("READ from sector %llu: %i\n", out->sector, ret); - if (ret >= 0) { - wlen = sizeof(*in) + ret; - *in = VIRTIO_BLK_S_OK; - } else { - wlen = sizeof(*in); - *in = VIRTIO_BLK_S_IOERR; - } - } - - /* Finished that request. */ - add_used(vq, head, wlen); -} - -/*L:198 This actually sets up a virtual block device. */ -static void setup_block_file(const char *filename) -{ - struct device *dev; - struct vblk_info *vblk; - struct virtio_blk_config conf; - - /* Creat the device. */ - dev = new_device("block", VIRTIO_ID_BLOCK); - - /* The device has one virtqueue, where the Guest places requests. */ - add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); - - /* Allocate the room for our own bookkeeping */ - vblk = dev->priv = malloc(sizeof(*vblk)); - - /* First we open the file and store the length. */ - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); - vblk->len = lseek64(vblk->fd, 0, SEEK_END); - - /* We support FLUSH. */ - add_feature(dev, VIRTIO_BLK_F_FLUSH); - - /* Tell Guest how many sectors this device has. */ - conf.capacity = cpu_to_le64(vblk->len / 512); - - /* - * Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. - */ - add_feature(dev, VIRTIO_BLK_F_SEG_MAX); - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - - /* Don't try to put whole struct: we have 8 bit limit. */ - set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); - - verbose("device %u: virtblock %llu sectors\n", - ++devices.device_num, le64_to_cpu(conf.capacity)); -} - -/*L:211 - * Our random number generator device reads from /dev/random into the Guest's - * input buffers. The usual case is that the Guest doesn't want random numbers - * and so has no buffers although /dev/random is still readable, whereas - * console is the reverse. - * - * The same logic applies, however. - */ -struct rng_info { - int rfd; -}; - -static void rng_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num, totlen = 0; - struct rng_info *rng_info = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* First we need a buffer from the Guests's virtqueue. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - errx(1, "Output buffers in rng?"); - - /* - * Just like the console write, we loop to cover the whole iovec. - * In this case, short reads actually happen quite a bit. - */ - while (!iov_empty(iov, in_num)) { - len = readv(rng_info->rfd, iov, in_num); - if (len <= 0) - err(1, "Read from /dev/random gave %i", len); - iov_consume(iov, in_num, len); - totlen += len; - } - - /* Tell the Guest about the new input. */ - add_used(vq, head, totlen); -} - -/*L:199 - * This creates a "hardware" random number device for the Guest. - */ -static void setup_rng(void) -{ - struct device *dev; - struct rng_info *rng_info = malloc(sizeof(*rng_info)); - - /* Our device's privat info simply contains the /dev/random fd. */ - rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - - /* Create the new device. */ - dev = new_device("rng", VIRTIO_ID_RNG); - dev->priv = rng_info; - - /* The device has one virtqueue, where the Guest places inbufs. */ - add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); - - verbose("device %u: rng\n", devices.device_num++); -} -/* That's the end of device setup. */ - -/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* - * Since we don't track all open fds, we simply close everything beyond - * stderr. - */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - - /* Reset all the devices (kills all threads). */ - cleanup_devices(); - - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} - -/*L:220 - * Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. - */ -static void __attribute__((noreturn)) run_guest(void) -{ - for (;;) { - unsigned long notify_addr; - int readval; - - /* We read from the /dev/lguest device to run the Guest. */ - readval = pread(lguest_fd, ¬ify_addr, - sizeof(notify_addr), cpu_id); - - /* One unsigned long means the Guest did HCALL_NOTIFY */ - if (readval == sizeof(notify_addr)) { - verbose("Notify on address %#lx\n", notify_addr); - handle_output(notify_addr); - /* ENOENT means the Guest died. Reading tells us why. */ - } else if (errno == ENOENT) { - char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); - errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); - /* Anything else means a bug or incompatible change. */ - } else - err(1, "Running guest failed"); - } -} -/*L:240 - * This is the end of the Launcher. The good news: we are over halfway - * through! The bad news: the most fiendish part of the code still lies ahead - * of us. - * - * Are you ready? Take a deep breath and join me in the core of the Host, in - * "make Host". -:*/ - -static struct option opts[] = { - { "verbose", 0, NULL, 'v' }, - { "tunnet", 1, NULL, 't' }, - { "block", 1, NULL, 'b' }, - { "rng", 0, NULL, 'r' }, - { "initrd", 1, NULL, 'i' }, - { "username", 1, NULL, 'u' }, - { "chroot", 1, NULL, 'c' }, - { NULL }, -}; -static void usage(void) -{ - errx(1, "Usage: lguest [--verbose] " - "[--tunnet=(:|bridge::)\n" - "|--block=|--initrd=]...\n" - " vmlinux [args...]"); -} - -/*L:105 The main routine is where the real work begins: */ -int main(int argc, char *argv[]) -{ - /* Memory, code startpoint and size of the (optional) initrd. */ - unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries. */ - int i, c; - /* The boot information for the Guest. */ - struct boot_params *boot; - /* If they specify an initrd file to load. */ - const char *initrd_name = NULL; - - /* Password structure for initgroups/setres[gu]id */ - struct passwd *user_details = NULL; - - /* Directory to chroot to */ - char *chroot_path = NULL; - - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - - /* - * First we initialize the device list. We keep a pointer to the last - * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). - */ - devices.lastdev = NULL; - devices.next_irq = 1; - - /* We're CPU 0. In fact, that's the only CPU possible right now. */ - cpu_id = 0; - - /* - * We need to know how much memory so we can set up the device - * descriptor and memory pages for the devices as we parse the command - * line. So we quickly look through the arguments to find the amount - * of memory now. - */ - for (i = 1; i < argc; i++) { - if (argv[i][0] != '-') { - mem = atoi(argv[i]) * 1024 * 1024; - /* - * We start by mapping anonymous pages over all of - * guest-physical memory range. This fills it with 0, - * and ensures that the Guest won't be killed when it - * tries to access it. - */ - guest_base = map_zeroed_pages(mem / getpagesize() - + DEVICE_PAGES); - guest_limit = mem; - guest_max = mem + DEVICE_PAGES*getpagesize(); - devices.descpage = get_pages(1); - break; - } - } - - /* The options are fairly straight-forward */ - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { - switch (c) { - case 'v': - verbose = true; - break; - case 't': - setup_tun_net(optarg); - break; - case 'b': - setup_block_file(optarg); - break; - case 'r': - setup_rng(); - break; - case 'i': - initrd_name = optarg; - break; - case 'u': - user_details = getpwnam(optarg); - if (!user_details) - err(1, "getpwnam failed, incorrect username?"); - break; - case 'c': - chroot_path = optarg; - break; - default: - warnx("Unknown argument %s", argv[optind]); - usage(); - } - } - /* - * After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. - */ - if (optind + 2 > argc) - usage(); - - verbose("Guest base is at %p\n", guest_base); - - /* We always have a console device */ - setup_console(); - - /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); - - /* Boot information is stashed at physical address 0 */ - boot = from_guest_phys(0); - - /* Map the initrd image if requested (at top of physical memory) */ - if (initrd_name) { - initrd_size = load_initrd(initrd_name, mem); - /* - * These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. - */ - boot->hdr.ramdisk_image = mem - initrd_size; - boot->hdr.ramdisk_size = initrd_size; - /* The bootloader type 0xFF means "unknown"; that's OK. */ - boot->hdr.type_of_loader = 0xFF; - } - - /* - * The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. - */ - boot->e820_entries = 1; - boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* - * The boot header contains a command line pointer: we put the command - * line after the boot header. - */ - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); - /* We use a simple helper to copy the arguments separated by spaces. */ - concat((char *)(boot + 1), argv+optind+2); - - /* Boot protocol version: 2.07 supports the fields for lguest. */ - boot->hdr.version = 0x207; - - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; - - /* Tell the entry path not to try to reload segment registers. */ - boot->hdr.loadflags |= KEEP_SEGMENTS; - - /* - * We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. - */ - tell_kernel(start); - - /* Ensure that we terminate if a device-servicing child dies. */ - signal(SIGCHLD, kill_launcher); - - /* If we exit via err(), this kills all the threads, restores tty. */ - atexit(cleanup_devices); - - /* If requested, chroot to a directory */ - if (chroot_path) { - if (chroot(chroot_path) != 0) - err(1, "chroot(\"%s\") failed", chroot_path); - - if (chdir("/") != 0) - err(1, "chdir(\"/\") failed"); - - verbose("chroot done\n"); - } - - /* If requested, drop privileges */ - if (user_details) { - uid_t u; - gid_t g; - - u = user_details->pw_uid; - g = user_details->pw_gid; - - if (initgroups(user_details->pw_name, g) != 0) - err(1, "initgroups failed"); - - if (setresgid(g, g, g) != 0) - err(1, "setresgid failed"); - - if (setresuid(u, u, u) != 0) - err(1, "setresuid failed"); - - verbose("Dropping privileges completed\n"); - } - - /* Finally, run the Guest. This doesn't return. */ - run_guest(); -} -/*:*/ - -/*M:999 - * Mastery is done: you now know everything I do. - * - * But surely you have seen code, features and bugs in your wanderings which - * you now yearn to attack? That is the real game, and I look forward to you - * patching and forking lguest into the Your-Name-Here-visor. - * - * Farewell, and good coding! - * Rusty Russell. - */ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt deleted file mode 100644 index dad9997..0000000 --- a/Documentation/lguest/lguest.txt +++ /dev/null @@ -1,128 +0,0 @@ - __ - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest - /, /` - or, A Young Coder's Illustrated Hypervisor - \\"--\\ http://lguest.ozlabs.org - -Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, -for Linux developers and users to experiment with virtualization with the -minimum of complexity. Nonetheless, it should have sufficient features to -make it useful for specific tasks, and, of course, you are encouraged to fork -and enhance it (see drivers/lguest/README). - -Features: - -- Kernel module which runs in a normal kernel. -- Simple I/O model for communication. -- Simple program to create new guests. -- Logo contains cute puppies: http://lguest.ozlabs.org - -Developer features: - -- Fun to hack on. -- No ABI: being tied to a specific kernel anyway, you can change anything. -- Many opportunities for improvement or feature implementation. - -Running Lguest: - -- The easiest way to run lguest is to use same kernel as guest and host. - You can configure them differently, but usually it's easiest not to. - - You will need to configure your kernel with the following options: - - "General setup": - "Prompt for development and/or incomplete code/drivers" = Y - (CONFIG_EXPERIMENTAL=y) - - "Processor type and features": - "Paravirtualized guest support" = Y - "Lguest guest support" = Y - "High Memory Support" = off/4GB - "Alignment value to which kernel should be aligned" = 0x100000 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and - CONFIG_PHYSICAL_ALIGN=0x100000) - - "Device Drivers": - "Block devices" - "Virtio block driver (EXPERIMENTAL)" = M/Y - "Network device support" - "Universal TUN/TAP device driver support" = M/Y - "Virtio network driver (EXPERIMENTAL)" = M/Y - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) - - "Virtualization" - "Linux hypervisor example code" = M/Y - (CONFIG_LGUEST=m) - -- A tool called "lguest" is available in this directory: type "make" - to build it. If you didn't build your kernel in-tree, use "make - O=". - -- Create or find a root disk image. There are several useful ones - around, such as the xm-test tiny root image at - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img - - For more serious work, I usually use a distribution ISO image and - install it under qemu, then make multiple copies: - - dd if=/dev/zero of=rootfile bs=1M count=2048 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d - - Make sure that you install a getty on /dev/hvc0 if you want to log in on the - console! - -- "modprobe lg" if you built it as a module. - -- Run an lguest as root: - - Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda - - Explanation: - 64: the amount of memory to use, in MB. - - vmlinux: the kernel image found in the top of your build directory. You - can also use a standard bzImage. - - --tunnet=192.168.19.1: configures a "tap" device for networking with this - IP address. - - --block=rootfile: a file or block device which becomes /dev/vda - inside the guest. - - root=/dev/vda: this (and anything else on the command line) are - kernel boot parameters. - -- Configuring networking. I usually have the host masquerade, using - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > - /proc/sys/net/ipv4/ip_forward". In this example, I would configure - eth0 inside the guest at 192.168.19.2. - - Another method is to bridge the tap device to an external interface - using --tunnet=bridge:, and perhaps run dhcp on the guest - to obtain an IP address. The bridge needs to be configured first: - this option simply adds the tap interface to it. - - A simple example on my system: - - ifconfig eth0 0.0.0.0 - brctl addbr lg0 - ifconfig lg0 up - brctl addif lg0 eth0 - dhclient lg0 - - Then use --tunnet=bridge:lg0 when launching the guest. - - See: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge - - for general information on how to get bridging to work. - -- Random number generation. Using the --rng option will provide a - /dev/hwrng in the guest that will read from the host's /dev/random. - Use this option in conjunction with rng-tools (see ../hw_random.txt) - to provide entropy to the guest kernel's /dev/random. - -There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest - -Good luck! -Rusty Russell rusty@rustcorp.com.au. diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 65f4c79..9c0a80d 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -136,7 +136,7 @@ View the top contending locks: dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 &inode->i_mutex: 161 286 18446744073709 62882.54 1244614.55 3653 20598 18446744073709 62318.60 1693822.74 &zone->lru_lock: 94 94 0.53 7.33 92.10 4366 32690 0.29 59.81 16350.06 - &inode->i_data.i_mmap_lock: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 + &inode->i_data.i_mmap_mutex: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 &q->__queue_lock: 48 50 0.52 31.62 86.31 774 13131 0.17 113.08 12277.52 &rq->rq_lock_key: 43 47 0.74 68.50 170.63 3706 33929 0.22 107.99 17460.62 &rq->rq_lock_key#2: 39 46 0.75 6.68 49.03 2979 32292 0.17 125.17 17137.63 diff --git a/Documentation/mmc/00-INDEX b/Documentation/mmc/00-INDEX index fca586f..93dd7a7 100644 --- a/Documentation/mmc/00-INDEX +++ b/Documentation/mmc/00-INDEX @@ -2,3 +2,5 @@ - this file mmc-dev-attrs.txt - info on SD and MMC device attributes +mmc-dev-parts.txt + - info on SD and MMC device partitions diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt index ff2bd68..8898a95 100644 --- a/Documentation/mmc/mmc-dev-attrs.txt +++ b/Documentation/mmc/mmc-dev-attrs.txt @@ -1,3 +1,13 @@ +SD and MMC Block Device Attributes +================================== + +These attributes are defined for the block devices associated with the +SD or MMC device. + +The following attributes are read/write. + + force_ro Enforce read-only access even if write protect switch is off. + SD and MMC Device Attributes ============================ diff --git a/Documentation/mmc/mmc-dev-parts.txt b/Documentation/mmc/mmc-dev-parts.txt new file mode 100644 index 0000000..2db28b8 --- /dev/null +++ b/Documentation/mmc/mmc-dev-parts.txt @@ -0,0 +1,27 @@ +SD and MMC Device Partitions +============================ + +Device partitions are additional logical block devices present on the +SD/MMC device. + +As of this writing, MMC boot partitions as supported and exposed as +/dev/mmcblkXboot0 and /dev/mmcblkXboot1, where X is the index of the +parent /dev/mmcblkX. + +MMC Boot Partitions +=================== + +Read and write access is provided to the two MMC boot partitions. Due to +the sensitive nature of the boot partition contents, which often store +a bootloader or bootloader configuration tables crucial to booting the +platform, write access is disabled by default to reduce the chance of +accidental bricking. + +To enable write access to /dev/mmcblkXbootY, disable the forced read-only +access with: + +echo 0 > /sys/block/mmcblkXbootY/force_ro + +To re-enable read-only access: + +echo 1 > /sys/block/mmcblkXbootY/force_ro diff --git a/Documentation/networking/batman-adv.txt b/Documentation/networking/batman-adv.txt index ee496eb..88d4afb 100644 --- a/Documentation/networking/batman-adv.txt +++ b/Documentation/networking/batman-adv.txt @@ -1,4 +1,4 @@ -[state: 27-01-2011] +[state: 17-04-2011] BATMAN-ADV ---------- @@ -19,6 +19,7 @@ duce the overhead to a minimum. It does not depend on any (other) network driver, and can be used on wifi as well as ethernet lan, vpn, etc ... (anything with ethernet-style layer 2). + CONFIGURATION ------------- @@ -160,13 +161,13 @@ face. Each entry can/has to have the following values: -> "TQ mac value" - src mac's link quality towards mac address of a neighbor originator's interface which is being used for routing --> "HNA mac" - HNA announced by source mac +-> "TT mac" - TT announced by source mac -> "PRIMARY" - this is a primary interface -> "SEC mac" - secondary mac address of source (requires preceding PRIMARY) The TQ value has a range from 4 to 255 with 255 being the best. -The HNA entries are showing which hosts are connected to the mesh +The TT entries are showing which hosts are connected to the mesh via bat0 or being bridged into the mesh network. The PRIMARY/SEC values are only applied on primary interfaces @@ -199,7 +200,7 @@ abled during run time. Following log_levels are defined: 0 - All debug output disabled 1 - Enable messages related to routing / flooding / broadcasting -2 - Enable route or hna added / changed / deleted +2 - Enable route or tt entry added / changed / deleted 3 - Enable all messages The debug output can be changed at runtime using the file @@ -207,7 +208,7 @@ The debug output can be changed at runtime using the file # echo 2 > /sys/class/net/bat0/mesh/log_level -will enable debug messages for when routes or HNAs change. +will enable debug messages for when routes or TTs change. BATCTL diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index e27202b..675612f 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -1,7 +1,7 @@ Linux Ethernet Bonding Driver HOWTO - Latest update: 23 September 2009 + Latest update: 27 April 2011 Initial release : Thomas Davis Corrections, HA extensions : 2000/10/03-15 : @@ -585,25 +585,23 @@ mode chosen. num_grat_arp - - Specifies the number of gratuitous ARPs to be issued after a - failover event. One gratuitous ARP is issued immediately after - the failover, subsequent ARPs are sent at a rate of one per link - monitor interval (arp_interval or miimon, whichever is active). - - The valid range is 0 - 255; the default value is 1. This option - affects only the active-backup mode. This option was added for - bonding version 3.3.0. - num_unsol_na - Specifies the number of unsolicited IPv6 Neighbor Advertisements - to be issued after a failover event. One unsolicited NA is issued - immediately after the failover. + Specify the number of peer notifications (gratuitous ARPs and + unsolicited IPv6 Neighbor Advertisements) to be issued after a + failover event. As soon as the link is up on the new slave + (possibly immediately) a peer notification is sent on the + bonding device and each VLAN sub-device. This is repeated at + each link monitor interval (arp_interval or miimon, whichever + is active) if the number is greater than 1. - The valid range is 0 - 255; the default value is 1. This option - affects only the active-backup mode. This option was added for - bonding version 3.4.0. + The valid range is 0 - 255; the default value is 1. These options + affect only the active-backup mode. These options were added for + bonding versions 3.3.0 and 3.4.0 respectively. + + From Linux 2.6.40 and bonding version 3.7.1, these notifications + are generated by the ipv4 and ipv6 code and the numbers of + repetitions cannot be set independently. primary @@ -772,8 +770,17 @@ resend_igmp a failover event. One membership report is issued immediately after the failover, subsequent packets are sent in each 200ms interval. - The valid range is 0 - 255; the default value is 1. This option - was added for bonding version 3.7.0. + The valid range is 0 - 255; the default value is 1. A value of 0 + prevents the IGMP membership report from being issued in response + to the failover event. + + This option is useful for bonding modes balance-rr (0), active-backup + (1), balance-tlb (5) and balance-alb (6), in which a failover can + switch the IGMP traffic from one slave to another. Therefore a fresh + IGMP report must be issued to cause the switch to forward the incoming + IGMP traffic over the newly selected slave. + + This option was added for bonding version 3.7.0. 3. Configuring Bonding Devices ============================== diff --git a/Documentation/networking/igb.txt b/Documentation/networking/igb.txt index 98953c0..9a2a037 100644 --- a/Documentation/networking/igb.txt +++ b/Documentation/networking/igb.txt @@ -93,6 +93,19 @@ Additional Configurations REQUIREMENTS: MSI-X support is required for Multiqueue. If MSI-X is not found, the system will fallback to MSI or to Legacy interrupts. + MAC and VLAN anti-spoofing feature + ---------------------------------- + When a malicious driver attempts to send a spoofed packet, it is dropped by + the hardware and not transmitted. An interrupt is sent to the PF driver + notifying it of the spoof attempt. + + When a spoofed packet is detected the PF driver will send the following + message to the system log (displayed by the "dmesg" command): + + Spoof event(s) detected on VF(n) + + Where n=the VF that attempted to do the spoofing. + Support ======= diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 1971bcf..8888083 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -279,11 +279,15 @@ When the system goes into the standby or memory sleep state, the phases are: time.) Unlike the other suspend-related phases, during the prepare phase the device tree is traversed top-down. - The prepare phase uses only a bus callback. After the callback method - returns, no new children may be registered below the device. The method - may also prepare the device or driver in some way for the upcoming - system power transition, but it should not put the device into a - low-power state. + In addition to that, if device drivers need to allocate additional + memory to be able to hadle device suspend correctly, that should be + done in the prepare phase. + + After the prepare callback method returns, no new children may be + registered below the device. The method may also prepare the device or + driver in some way for the upcoming system power transition (for + example, by allocating additional memory required for this purpose), but + it should not put the device into a low-power state. 2. The suspend methods should quiesce the device to stop it from performing I/O. They also may save the device registers and put it into the diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt index cf98070..c2a4a34 100644 --- a/Documentation/power/notifiers.txt +++ b/Documentation/power/notifiers.txt @@ -1,46 +1,41 @@ Suspend notifiers - (C) 2007 Rafael J. Wysocki , GPL - -There are some operations that device drivers may want to carry out in their -.suspend() routines, but shouldn't, because they can cause the hibernation or -suspend to fail. For example, a driver may want to allocate a substantial amount -of memory (like 50 MB) in .suspend(), but that shouldn't be done after the -swsusp's memory shrinker has run. - -Also, there may be some operations, that subsystems want to carry out before a -hibernation/suspend or after a restore/resume, requiring the system to be fully -functional, so the drivers' .suspend() and .resume() routines are not suitable -for this purpose. For example, device drivers may want to upload firmware to -their devices after a restore from a hibernation image, but they cannot do it by -calling request_firmware() from their .resume() routines (user land processes -are frozen at this point). The solution may be to load the firmware into -memory before processes are frozen and upload it from there in the .resume() -routine. Of course, a hibernation notifier may be used for this purpose. - -The subsystems that have such needs can register suspend notifiers that will be -called upon the following events by the suspend core: + (C) 2007-2011 Rafael J. Wysocki , GPL + +There are some operations that subsystems or drivers may want to carry out +before hibernation/suspend or after restore/resume, but they require the system +to be fully functional, so the drivers' and subsystems' .suspend() and .resume() +or even .prepare() and .complete() callbacks are not suitable for this purpose. +For example, device drivers may want to upload firmware to their devices after +resume/restore, but they cannot do it by calling request_firmware() from their +.resume() or .complete() routines (user land processes are frozen at these +points). The solution may be to load the firmware into memory before processes +are frozen and upload it from there in the .resume() routine. +A suspend/hibernation notifier may be used for this purpose. + +The subsystems or drivers having such needs can register suspend notifiers that +will be called upon the following events by the PM core: PM_HIBERNATION_PREPARE The system is going to hibernate or suspend, tasks will be frozen immediately. PM_POST_HIBERNATION The system memory state has been restored from a - hibernation image or an error occurred during the - hibernation. Device drivers' .resume() callbacks have + hibernation image or an error occurred during + hibernation. Device drivers' restore callbacks have been executed and tasks have been thawed. PM_RESTORE_PREPARE The system is going to restore a hibernation image. - If all goes well the restored kernel will issue a + If all goes well, the restored kernel will issue a PM_POST_HIBERNATION notification. -PM_POST_RESTORE An error occurred during the hibernation restore. - Device drivers' .resume() callbacks have been executed +PM_POST_RESTORE An error occurred during restore from hibernation. + Device drivers' restore callbacks have been executed and tasks have been thawed. -PM_SUSPEND_PREPARE The system is preparing for a suspend. +PM_SUSPEND_PREPARE The system is preparing for suspend. PM_POST_SUSPEND The system has just resumed or an error occurred during - the suspend. Device drivers' .resume() callbacks have - been executed and tasks have been thawed. + suspend. Device drivers' resume callbacks have been + executed and tasks have been thawed. It is generally assumed that whatever the notifiers do for PM_HIBERNATION_PREPARE, should be undone for PM_POST_HIBERNATION. Analogously, diff --git a/Documentation/pti/pti_intel_mid.txt b/Documentation/pti/pti_intel_mid.txt new file mode 100644 index 0000000..e7a5b6d --- /dev/null +++ b/Documentation/pti/pti_intel_mid.txt @@ -0,0 +1,99 @@ +The Intel MID PTI project is HW implemented in Intel Atom +system-on-a-chip designs based on the Parallel Trace +Interface for MIPI P1149.7 cJTAG standard. The kernel solution +for this platform involves the following files: + +./include/linux/pti.h +./drivers/.../n_tracesink.h +./drivers/.../n_tracerouter.c +./drivers/.../n_tracesink.c +./drivers/.../pti.c + +pti.c is the driver that enables various debugging features +popular on platforms from certain mobile manufacturers. +n_tracerouter.c and n_tracesink.c allow extra system information to +be collected and routed to the pti driver, such as trace +debugging data from a modem. Although n_tracerouter +and n_tracesink are a part of the complete PTI solution, +these two line disciplines can work separately from +pti.c and route any data stream from one /dev/tty node +to another /dev/tty node via kernel-space. This provides +a stable, reliable connection that will not break unless +the user-space application shuts down (plus avoids +kernel->user->kernel context switch overheads of routing +data). + +An example debugging usage for this driver system: + *Hook /dev/ttyPTI0 to syslogd. Opening this port will also start + a console device to further capture debugging messages to PTI. + *Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW. + This is where n_tracerouter and n_tracesink are used. + *Hook /dev/pti to a user-level debugging application for writing + to PTI HW. + *Use mipi_* Kernel Driver API in other device drivers for + debugging to PTI by first requesting a PTI write address via + mipi_request_masterchannel(1). + +Below is example pseudo-code on how a 'privileged' application +can hook up n_tracerouter and n_tracesink to any tty on +a system. 'Privileged' means the application has enough +privileges to successfully manipulate the ldisc drivers +but is not just blindly executing as 'root'. Keep in mind +the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter +and n_tracesink line discpline drivers but is a generic +operation for a program to use a line discpline driver +on a tty port other than the default n_tty. + +/////////// To hook up n_tracerouter and n_tracesink ///////// + +// Note that n_tracerouter depends on n_tracesink. +#include +#define ONE_TTY "/dev/ttyOne" +#define TWO_TTY "/dev/ttyTwo" + +// needed global to hand onto ldisc connection +static int g_fd_source = -1; +static int g_fd_sink = -1; + +// these two vars used to grab LDISC values from loaded ldisc drivers +// in OS. Look at /proc/tty/ldiscs to get the right numbers from +// the ldiscs loaded in the system. +int source_ldisc_num, sink_ldisc_num = -1; +int retval; + +g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W +g_fd_sink = open(TWO_TTY, O_RDWR); // must be R/W + +if (g_fd_source <= 0) || (g_fd_sink <= 0) { + // doubt you'll want to use these exact error lines of code + printf("Error on open(). errno: %d\n",errno); + return errno; +} + +retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num); +if (retval < 0) { + printf("Error on ioctl(). errno: %d\n", errno); + return errno; +} + +retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num); +if (retval < 0) { + printf("Error on ioctl(). errno: %d\n", errno); + return errno; +} + +/////////// To disconnect n_tracerouter and n_tracesink //////// + +// First make sure data through the ldiscs has stopped. + +// Second, disconnect ldiscs. This provides a +// little cleaner shutdown on tty stack. +sink_ldisc_num = 0; +source_ldisc_num = 0; +ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num); +ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num); + +// Three, program closes connection, and cleanup: +close(g_fd_uart); +close(g_fd_gadget); +g_fd_uart = g_fd_gadget = NULL; diff --git a/Documentation/ptp/ptp.txt b/Documentation/ptp/ptp.txt new file mode 100644 index 0000000..ae8fef8 --- /dev/null +++ b/Documentation/ptp/ptp.txt @@ -0,0 +1,89 @@ + +* PTP hardware clock infrastructure for Linux + + This patch set introduces support for IEEE 1588 PTP clocks in + Linux. Together with the SO_TIMESTAMPING socket options, this + presents a standardized method for developing PTP user space + programs, synchronizing Linux with external clocks, and using the + ancillary features of PTP hardware clocks. + + A new class driver exports a kernel interface for specific clock + drivers and a user space interface. The infrastructure supports a + complete set of PTP hardware clock functionality. + + + Basic clock operations + - Set time + - Get time + - Shift the clock by a given offset atomically + - Adjust clock frequency + + + Ancillary clock features + - One short or periodic alarms, with signal delivery to user program + - Time stamp external events + - Period output signals configurable from user space + - Synchronization of the Linux system time via the PPS subsystem + +** PTP hardware clock kernel API + + A PTP clock driver registers itself with the class driver. The + class driver handles all of the dealings with user space. The + author of a clock driver need only implement the details of + programming the clock hardware. The clock driver notifies the class + driver of asynchronous events (alarms and external time stamps) via + a simple message passing interface. + + The class driver supports multiple PTP clock drivers. In normal use + cases, only one PTP clock is needed. However, for testing and + development, it can be useful to have more than one clock in a + single system, in order to allow performance comparisons. + +** PTP hardware clock user space API + + The class driver also creates a character device for each + registered clock. User space can use an open file descriptor from + the character device as a POSIX clock id and may call + clock_gettime, clock_settime, and clock_adjtime. These calls + implement the basic clock operations. + + User space programs may control the clock using standardized + ioctls. A program may query, enable, configure, and disable the + ancillary clock features. User space can receive time stamped + events via blocking read() and poll(). One shot and periodic + signals may be configured via the POSIX timer_settime() system + call. + +** Writing clock drivers + + Clock drivers include include/linux/ptp_clock_kernel.h and register + themselves by presenting a 'struct ptp_clock_info' to the + registration method. Clock drivers must implement all of the + functions in the interface. If a clock does not offer a particular + ancillary feature, then the driver should just return -EOPNOTSUPP + from those functions. + + Drivers must ensure that all of the methods in interface are + reentrant. Since most hardware implementations treat the time value + as a 64 bit integer accessed as two 32 bit registers, drivers + should use spin_lock_irqsave/spin_unlock_irqrestore to protect + against concurrent access. This locking cannot be accomplished in + class driver, since the lock may also be needed by the clock + driver's interrupt service routine. + +** Supported hardware + + + Freescale eTSEC gianfar + - 2 Time stamp external triggers, programmable polarity (opt. interrupt) + - 2 Alarm registers (optional interrupt) + - 3 Periodic signals (optional interrupt) + + + National DP83640 + - 6 GPIOs programmable as inputs or outputs + - 6 GPIOs with dedicated functions (LED/JTAG/clock) can also be + used as general inputs or outputs + - GPIO inputs can time stamp external triggers + - GPIO outputs can produce periodic signals + - 1 interrupt pin + + + Intel IXP465 + - Auxiliary Slave/Master Mode Snapshot (optional interrupt) + - Target Time (optional interrupt) diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c new file mode 100644 index 0000000..f59ded0 --- /dev/null +++ b/Documentation/ptp/testptp.c @@ -0,0 +1,381 @@ +/* + * PTP 1588 clock support - User space test program + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DEVICE "/dev/ptp0" + +#ifndef ADJ_SETOFFSET +#define ADJ_SETOFFSET 0x0100 +#endif + +#ifndef CLOCK_INVALID +#define CLOCK_INVALID -1 +#endif + +/* When glibc offers the syscall, this will go away. */ +#include +static int clock_adjtime(clockid_t id, struct timex *tx) +{ + return syscall(__NR_clock_adjtime, id, tx); +} + +static clockid_t get_clockid(int fd) +{ +#define CLOCKFD 3 +#define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD) + + return FD_TO_CLOCKID(fd); +} + +static void handle_alarm(int s) +{ + printf("received signal %d\n", s); +} + +static int install_handler(int signum, void (*handler)(int)) +{ + struct sigaction action; + sigset_t mask; + + /* Unblock the signal. */ + sigemptyset(&mask); + sigaddset(&mask, signum); + sigprocmask(SIG_UNBLOCK, &mask, NULL); + + /* Install the signal handler. */ + action.sa_handler = handler; + action.sa_flags = 0; + sigemptyset(&action.sa_mask); + sigaction(signum, &action, NULL); + + return 0; +} + +static long ppb_to_scaled_ppm(int ppb) +{ + /* + * The 'freq' field in the 'struct timex' is in parts per + * million, but with a 16 bit binary fractional field. + * Instead of calculating either one of + * + * scaled_ppm = (ppb / 1000) << 16 [1] + * scaled_ppm = (ppb << 16) / 1000 [2] + * + * we simply use double precision math, in order to avoid the + * truncation in [1] and the possible overflow in [2]. + */ + return (long) (ppb * 65.536); +} + +static void usage(char *progname) +{ + fprintf(stderr, + "usage: %s [options]\n" + " -a val request a one-shot alarm after 'val' seconds\n" + " -A val request a periodic alarm every 'val' seconds\n" + " -c query the ptp clock's capabilities\n" + " -d name device to open\n" + " -e val read 'val' external time stamp events\n" + " -f val adjust the ptp clock frequency by 'val' ppb\n" + " -g get the ptp clock time\n" + " -h prints this message\n" + " -p val enable output with a period of 'val' nanoseconds\n" + " -P val enable or disable (val=1|0) the system clock PPS\n" + " -s set the ptp clock time from the system time\n" + " -S set the system time from the ptp clock time\n" + " -t val shift the ptp clock time by 'val' seconds\n", + progname); +} + +int main(int argc, char *argv[]) +{ + struct ptp_clock_caps caps; + struct ptp_extts_event event; + struct ptp_extts_request extts_request; + struct ptp_perout_request perout_request; + struct timespec ts; + struct timex tx; + + static timer_t timerid; + struct itimerspec timeout; + struct sigevent sigevent; + + char *progname; + int c, cnt, fd; + + char *device = DEVICE; + clockid_t clkid; + int adjfreq = 0x7fffffff; + int adjtime = 0; + int capabilities = 0; + int extts = 0; + int gettime = 0; + int oneshot = 0; + int periodic = 0; + int perout = -1; + int pps = -1; + int settime = 0; + + progname = strrchr(argv[0], '/'); + progname = progname ? 1+progname : argv[0]; + while (EOF != (c = getopt(argc, argv, "a:A:cd:e:f:ghp:P:sSt:v"))) { + switch (c) { + case 'a': + oneshot = atoi(optarg); + break; + case 'A': + periodic = atoi(optarg); + break; + case 'c': + capabilities = 1; + break; + case 'd': + device = optarg; + break; + case 'e': + extts = atoi(optarg); + break; + case 'f': + adjfreq = atoi(optarg); + break; + case 'g': + gettime = 1; + break; + case 'p': + perout = atoi(optarg); + break; + case 'P': + pps = atoi(optarg); + break; + case 's': + settime = 1; + break; + case 'S': + settime = 2; + break; + case 't': + adjtime = atoi(optarg); + break; + case 'h': + usage(progname); + return 0; + case '?': + default: + usage(progname); + return -1; + } + } + + fd = open(device, O_RDWR); + if (fd < 0) { + fprintf(stderr, "opening %s: %s\n", device, strerror(errno)); + return -1; + } + + clkid = get_clockid(fd); + if (CLOCK_INVALID == clkid) { + fprintf(stderr, "failed to read clock id\n"); + return -1; + } + + if (capabilities) { + if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps)) { + perror("PTP_CLOCK_GETCAPS"); + } else { + printf("capabilities:\n" + " %d maximum frequency adjustment (ppb)\n" + " %d programmable alarms\n" + " %d external time stamp channels\n" + " %d programmable periodic signals\n" + " %d pulse per second\n", + caps.max_adj, + caps.n_alarm, + caps.n_ext_ts, + caps.n_per_out, + caps.pps); + } + } + + if (0x7fffffff != adjfreq) { + memset(&tx, 0, sizeof(tx)); + tx.modes = ADJ_FREQUENCY; + tx.freq = ppb_to_scaled_ppm(adjfreq); + if (clock_adjtime(clkid, &tx)) { + perror("clock_adjtime"); + } else { + puts("frequency adjustment okay"); + } + } + + if (adjtime) { + memset(&tx, 0, sizeof(tx)); + tx.modes = ADJ_SETOFFSET; + tx.time.tv_sec = adjtime; + tx.time.tv_usec = 0; + if (clock_adjtime(clkid, &tx) < 0) { + perror("clock_adjtime"); + } else { + puts("time shift okay"); + } + } + + if (gettime) { + if (clock_gettime(clkid, &ts)) { + perror("clock_gettime"); + } else { + printf("clock time: %ld.%09ld or %s", + ts.tv_sec, ts.tv_nsec, ctime(&ts.tv_sec)); + } + } + + if (settime == 1) { + clock_gettime(CLOCK_REALTIME, &ts); + if (clock_settime(clkid, &ts)) { + perror("clock_settime"); + } else { + puts("set time okay"); + } + } + + if (settime == 2) { + clock_gettime(clkid, &ts); + if (clock_settime(CLOCK_REALTIME, &ts)) { + perror("clock_settime"); + } else { + puts("set time okay"); + } + } + + if (extts) { + memset(&extts_request, 0, sizeof(extts_request)); + extts_request.index = 0; + extts_request.flags = PTP_ENABLE_FEATURE; + if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { + perror("PTP_EXTTS_REQUEST"); + extts = 0; + } else { + puts("external time stamp request okay"); + } + for (; extts; extts--) { + cnt = read(fd, &event, sizeof(event)); + if (cnt != sizeof(event)) { + perror("read"); + break; + } + printf("event index %u at %lld.%09u\n", event.index, + event.t.sec, event.t.nsec); + fflush(stdout); + } + /* Disable the feature again. */ + extts_request.flags = 0; + if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { + perror("PTP_EXTTS_REQUEST"); + } + } + + if (oneshot) { + install_handler(SIGALRM, handle_alarm); + /* Create a timer. */ + sigevent.sigev_notify = SIGEV_SIGNAL; + sigevent.sigev_signo = SIGALRM; + if (timer_create(clkid, &sigevent, &timerid)) { + perror("timer_create"); + return -1; + } + /* Start the timer. */ + memset(&timeout, 0, sizeof(timeout)); + timeout.it_value.tv_sec = oneshot; + if (timer_settime(timerid, 0, &timeout, NULL)) { + perror("timer_settime"); + return -1; + } + pause(); + timer_delete(timerid); + } + + if (periodic) { + install_handler(SIGALRM, handle_alarm); + /* Create a timer. */ + sigevent.sigev_notify = SIGEV_SIGNAL; + sigevent.sigev_signo = SIGALRM; + if (timer_create(clkid, &sigevent, &timerid)) { + perror("timer_create"); + return -1; + } + /* Start the timer. */ + memset(&timeout, 0, sizeof(timeout)); + timeout.it_interval.tv_sec = periodic; + timeout.it_value.tv_sec = periodic; + if (timer_settime(timerid, 0, &timeout, NULL)) { + perror("timer_settime"); + return -1; + } + while (1) { + pause(); + } + timer_delete(timerid); + } + + if (perout >= 0) { + if (clock_gettime(clkid, &ts)) { + perror("clock_gettime"); + return -1; + } + memset(&perout_request, 0, sizeof(perout_request)); + perout_request.index = 0; + perout_request.start.sec = ts.tv_sec + 2; + perout_request.start.nsec = 0; + perout_request.period.sec = 0; + perout_request.period.nsec = perout; + if (ioctl(fd, PTP_PEROUT_REQUEST, &perout_request)) { + perror("PTP_PEROUT_REQUEST"); + } else { + puts("periodic output request okay"); + } + } + + if (pps != -1) { + int enable = pps ? 1 : 0; + if (ioctl(fd, PTP_ENABLE_PPS, enable)) { + perror("PTP_ENABLE_PPS"); + } else { + puts("pps for system time request okay"); + } + } + + close(fd); + return 0; +} diff --git a/Documentation/ptp/testptp.mk b/Documentation/ptp/testptp.mk new file mode 100644 index 0000000..4ef2d97 --- /dev/null +++ b/Documentation/ptp/testptp.mk @@ -0,0 +1,33 @@ +# PTP 1588 clock support - User space test program +# +# Copyright (C) 2010 OMICRON electronics GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +CC = $(CROSS_COMPILE)gcc +INC = -I$(KBUILD_OUTPUT)/usr/include +CFLAGS = -Wall $(INC) +LDLIBS = -lrt +PROGS = testptp + +all: $(PROGS) + +testptp: testptp.o + +clean: + rm -f testptp.o + +distclean: clean + rm -f $(PROGS) diff --git a/Documentation/scsi/LICENSE.qla2xxx b/Documentation/scsi/LICENSE.qla2xxx index 9e15b4f..19e7cd4 100644 --- a/Documentation/scsi/LICENSE.qla2xxx +++ b/Documentation/scsi/LICENSE.qla2xxx @@ -1,11 +1,11 @@ -Copyright (c) 2003-2005 QLogic Corporation -QLogic Linux Fibre Channel HBA Driver +Copyright (c) 2003-2011 QLogic Corporation +QLogic Linux/ESX Fibre Channel HBA Driver -This program includes a device driver for Linux 2.6 that may be +This program includes a device driver for Linux 2.6/ESX that may be distributed with QLogic hardware specific firmware binary file. You may modify and redistribute the device driver code under the -GNU General Public License as published by the Free Software -Foundation (version 2 or a later version). +GNU General Public License (a copy of which is attached hereto as +Exhibit A) published by the Free Software Foundation (version 2). You may redistribute the hardware specific firmware binary file under the following terms: @@ -43,3 +43,285 @@ OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT, TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN COMBINATION WITH THIS PROGRAM. + + +EXHIBIT A + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 9822afb..8975701 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -1230,6 +1230,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. This module supports multiple cards. The driver requires the firmware loader support on kernel. + Module snd-lola + --------------- + + Module for Digigram Lola PCI-e boards + + This module supports multiple cards. + Module snd-lx6464es ------------------- diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 0caf77e..d70c93b 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt @@ -94,7 +94,7 @@ ALC662/663/272 3stack-dig 3-stack (2-channel) with SPDIF 3stack-6ch 3-stack (6-channel) 3stack-6ch-dig 3-stack (6-channel) with SPDIF - 6stack-dig 6-stack with SPDIF + 5stack-dig 5-stack with SPDIF lenovo-101e Lenovo laptop eeepc-p701 ASUS Eeepc P701 eeepc-ep20 ASUS Eeepc EP20 diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt index 847b342..db3be89 100644 --- a/Documentation/stable_api_nonsense.txt +++ b/Documentation/stable_api_nonsense.txt @@ -122,7 +122,7 @@ operating system to suffer. In both of these instances, all developers agreed that these were important changes that needed to be made, and they were made, with -relatively little pain. If Linux had to ensure that it preserve a +relatively little pain. If Linux had to ensure that it will preserve a stable source interface, a new interface would have been created, and the older, broken one would have had to be maintained over time, leading to extra work for the USB developers. Since all Linux USB developers do diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 4af0614..88fd7f5 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -231,13 +231,6 @@ its creation). This directory contains configuration options for the epoll(7) interface. -max_user_instances ------------------- - -This is the maximum number of epoll file descriptors that a single user can -have open at a given time. The default value is 128, and should be enough -for normal users. - max_user_watches ---------------- diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index cbd05ff..3201a70 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -32,6 +32,17 @@ Table : Subdirectories in /proc/sys/net 1. /proc/sys/net/core - Network core options ------------------------------------------------------- +bpf_jit_enable +-------------- + +This enables Berkeley Packet Filter Just in Time compiler. +Currently supported on x86_64 architecture, bpf_jit provides a framework +to speed packet filtering, the one used by tcpdump/libpcap for example. +Values : + 0 - disable the JIT (default value) + 1 - enable the JIT + 2 - enable the JIT and ask the compiler to emit traces on kernel log. + rmem_default ------------ diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 30289fa..96f0ee8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -481,10 +481,10 @@ the DMA zone. Type(A) is called as "Node" order. Type (B) is "Zone" order. "Node order" orders the zonelists by node, then by zone within each node. -Specify "[Nn]ode" for zone order +Specify "[Nn]ode" for node order "Zone Order" orders the zonelists by zone type, then by node within each -zone. Specify "[Zz]one"for zode order. +zone. Specify "[Zz]one" for zone order. Specify "[Dd]efault" to request automatic configuration. Autoconfiguration will select "node" order in following case. diff --git a/Documentation/timers/timers-howto.txt b/Documentation/timers/timers-howto.txt index c9ef29d..038f8c7 100644 --- a/Documentation/timers/timers-howto.txt +++ b/Documentation/timers/timers-howto.txt @@ -24,7 +24,7 @@ ATOMIC CONTEXT: ndelay(unsigned long nsecs) udelay(unsigned long usecs) - mdelay(unsgined long msecs) + mdelay(unsigned long msecs) udelay is the generally preferred API; ndelay-level precision may not actually exist on many non-PC devices. diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 6d27ab8..c83bd6b 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -120,7 +120,6 @@ format: field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; - field:int common_lock_depth; offset:8; size:4; signed:1; field:unsigned long __probe_ip; offset:12; size:4; signed:0; field:int __probe_nargs; offset:16; size:4; signed:1; diff --git a/Documentation/uml/UserModeLinux-HOWTO.txt b/Documentation/uml/UserModeLinux-HOWTO.txt deleted file mode 100644 index 9b7e190..0000000 --- a/Documentation/uml/UserModeLinux-HOWTO.txt +++ /dev/null @@ -1,4579 +0,0 @@ - User Mode Linux HOWTO - User Mode Linux Core Team - Mon Nov 18 14:16:16 EST 2002 - - This document describes the use and abuse of Jeff Dike's User Mode - Linux: a port of the Linux kernel as a normal Intel Linux process. - ______________________________________________________________________ - - Table of Contents - - 1. Introduction - - 1.1 How is User Mode Linux Different? - 1.2 Why Would I Want User Mode Linux? - - 2. Compiling the kernel and modules - - 2.1 Compiling the kernel - 2.2 Compiling and installing kernel modules - 2.3 Compiling and installing uml_utilities - - 3. Running UML and logging in - - 3.1 Running UML - 3.2 Logging in - 3.3 Examples - - 4. UML on 2G/2G hosts - - 4.1 Introduction - 4.2 The problem - 4.3 The solution - - 5. Setting up serial lines and consoles - - 5.1 Specifying the device - 5.2 Specifying the channel - 5.3 Examples - - 6. Setting up the network - - 6.1 General setup - 6.2 Userspace daemons - 6.3 Specifying ethernet addresses - 6.4 UML interface setup - 6.5 Multicast - 6.6 TUN/TAP with the uml_net helper - 6.7 TUN/TAP with a preconfigured tap device - 6.8 Ethertap - 6.9 The switch daemon - 6.10 Slip - 6.11 Slirp - 6.12 pcap - 6.13 Setting up the host yourself - - 7. Sharing Filesystems between Virtual Machines - - 7.1 A warning - 7.2 Using layered block devices - 7.3 Note! - 7.4 Another warning - 7.5 uml_moo : Merging a COW file with its backing file - - 8. Creating filesystems - - 8.1 Create the filesystem file - 8.2 Assign the file to a UML device - 8.3 Creating and mounting the filesystem - - 9. Host file access - - 9.1 Using hostfs - 9.2 hostfs as the root filesystem - 9.3 Building hostfs - - 10. The Management Console - 10.1 version - 10.2 halt and reboot - 10.3 config - 10.4 remove - 10.5 sysrq - 10.6 help - 10.7 cad - 10.8 stop - 10.9 go - - 11. Kernel debugging - - 11.1 Starting the kernel under gdb - 11.2 Examining sleeping processes - 11.3 Running ddd on UML - 11.4 Debugging modules - 11.5 Attaching gdb to the kernel - 11.6 Using alternate debuggers - - 12. Kernel debugging examples - - 12.1 The case of the hung fsck - 12.2 Episode 2: The case of the hung fsck - - 13. What to do when UML doesn't work - - 13.1 Strange compilation errors when you build from source - 13.2 (obsolete) - 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem - 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' - 13.5 UML doesn't work when /tmp is an NFS filesystem - 13.6 UML hangs on boot when compiled with gprof support - 13.7 syslogd dies with a SIGTERM on startup - 13.8 TUN/TAP networking doesn't work on a 2.4 host - 13.9 You can network to the host but not to other machines on the net - 13.10 I have no root and I want to scream - 13.11 UML build conflict between ptrace.h and ucontext.h - 13.12 The UML BogoMips is exactly half the host's BogoMips - 13.13 When you run UML, it immediately segfaults - 13.14 xterms appear, then immediately disappear - 13.15 Any other panic, hang, or strange behavior - - 14. Diagnosing Problems - - 14.1 Case 1 : Normal kernel panics - 14.2 Case 2 : Tracing thread panics - 14.3 Case 3 : Tracing thread panics caused by other threads - 14.4 Case 4 : Hangs - - 15. Thanks - - 15.1 Code and Documentation - 15.2 Flushing out bugs - 15.3 Buglets and clean-ups - 15.4 Case Studies - 15.5 Other contributions - - - ______________________________________________________________________ - - 11.. IInnttrroodduuccttiioonn - - Welcome to User Mode Linux. It's going to be fun. - - - - 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt?? - - Normally, the Linux Kernel talks straight to your hardware (video - card, keyboard, hard drives, etc), and any programs which run ask the - kernel to operate the hardware, like so: - - - - +-----------+-----------+----+ - | Process 1 | Process 2 | ...| - +-----------+-----------+----+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - The User Mode Linux Kernel is different; instead of talking to the - hardware, it talks to a `real' Linux kernel (called the `host kernel' - from now on), like any other program. Programs can then run inside - User-Mode Linux as if they were running under a normal kernel, like - so: - - - - +----------------+ - | Process 2 | ...| - +-----------+----------------+ - | Process 1 | User-Mode Linux| - +----------------------------+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - - 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx?? - - - 1. If User Mode Linux crashes, your host kernel is still fine. - - 2. You can run a usermode kernel as a non-root user. - - 3. You can debug the User Mode Linux like any normal process. - - 4. You can run gprof (profiling) and gcov (coverage testing). - - 5. You can play with your kernel without breaking things. - - 6. You can use it as a sandbox for testing new apps. - - 7. You can try new development kernels safely. - - 8. You can run different distributions simultaneously. - - 9. It's extremely fun. - - - - - - 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess - - - - - 22..11.. CCoommppiilliinngg tthhee kkeerrnneell - - - Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - - the download page - . - - - 3. Make a directory and unpack the kernel into it. - - - - host% - mkdir ~/uml - - - - - - - host% - cd ~/uml - - - - - - - host% - tar -xzvf linux-2.4.0-prerelease.tar.bz2 - - - - - - - 4. Apply the patch using - - - - host% - cd ~/uml/linux - - - - host% - bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 - - - - - - - 5. Run your favorite config; `make xconfig ARCH=um' is the most - convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' - will work as well. The defaults will give you a useful kernel. If - you want to change something, go ahead, it probably won't hurt - anything. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries - will not run. They will immediately segfault. See ``UML on 2G/2G - hosts'' for the scoop on running UML on your system. - - - - 6. Finish with `make linux ARCH=um': the result is a file called - `linux' in the top directory of your source tree. - - Make sure that you don't build this kernel in /usr/src/linux. On some - distributions, /usr/include/asm is a link into this pool. The user- - mode build changes the other end of that link, and things that include - stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - - - 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess - - UML modules are built in the same way as the native kernel (with the - exception of the 'ARCH=um' that you always need for UML): - - - host% make modules ARCH=um - - - - - Any modules that you want to load into this kernel need to be built in - the user-mode pool. Modules from the native kernel won't work. - - You can install them by using ftp or something to copy them into the - virtual machine and dropping them into /lib/modules/`uname -r`. - - You can also get the kernel build process to install them as follows: - - 1. with the kernel not booted, mount the root filesystem in the top - level of the kernel pool: - - - host% mount root_fs mnt -o loop - - - - - - - 2. run - - - host% - make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um - - - - - - - 3. unmount the filesystem - - - host% umount mnt - - - - - - - 4. boot the kernel on it - - - When the system is booted, you can use insmod as usual to get the - modules into the kernel. A number of things have been loaded into UML - as modules, especially filesystems and network protocols and filters, - so most symbols which need to be exported probably already are. - However, if you do find symbols that need exporting, let us - know, and - they'll be "taken care of". - - - - 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess - - Many features of the UML kernel require a user-space helper program, - so a uml_utilities package is distributed separately from the kernel - patch which provides these helpers. Included within this is: - - +o port-helper - Used by consoles which connect to xterms or ports - - +o tunctl - Configuration tool to create and delete tap devices - - +o uml_net - Setuid binary for automatic tap device configuration - - +o uml_switch - User-space virtual switch required for daemon - transport - - The uml_utilities tree is compiled with: - - - host# - make && make install - - - - - Note that UML kernel patches may require a specific version of the - uml_utilities distribution. If you don't keep up with the mailing - lists, ensure that you have the latest release of uml_utilities if you - are experiencing problems with your UML kernel, particularly when - dealing with consoles or command-line switches to the helper programs - - - - - - - - - 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn - - - - 33..11.. RRuunnnniinngg UUMMLL - - It runs on 2.2.15 or later, and all 2.4 kernels. - - - Booting UML is straightforward. Simply run 'linux': it will try to - mount the file `root_fs' in the current directory. You do not need to - run it as root. If your root filesystem is not named `root_fs', then - you need to put a `ubd0=root_fs_whatever' switch on the linux command - line. - - - You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be - used to generate UML-compatible filesystem images from media. - The kernel will boot up and present you with a login prompt. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries will - not run. They will immediately segfault. See ``UML on 2G/2G hosts'' - for the scoop on running UML on your system. - - - - 33..22.. LLooggggiinngg iinn - - - - The prepackaged filesystems have a root account with password 'root' - and a user account with password 'user'. The login banner will - generally tell you how to log in. So, you log in and you will find - yourself inside a little virtual machine. Our filesystems have a - variety of commands and utilities installed (and it is fairly easy to - add more), so you will have a lot of tools with which to poke around - the system. - - There are a couple of other ways to log in: - - +o On a virtual console - - - - Each virtual console that is configured (i.e. the device exists in - /dev and /etc/inittab runs a getty on it) will come up in its own - xterm. If you get tired of the xterms, read ``Setting up serial - lines and consoles'' to see how to attach the consoles to - something else, like host ptys. - - - - +o Over the serial line - - - In the boot output, find a line that looks like: - - - - serial line 0 assigned pty /dev/ptyp1 - - - - - Attach your favorite terminal program to the corresponding tty. I.e. - for minicom, the command would be - - - host% minicom -o -p /dev/ttyp1 - - - - - - - +o Over the net - - - If the network is running, then you can telnet to the virtual - machine and log in to it. See ``Setting up the network'' to learn - about setting up a virtual network. - - When you're done using it, run halt, and the kernel will bring itself - down and the process will exit. - - - 33..33.. EExxaammpplleess - - Here are some examples of UML in action: - - +o A login session - - +o A virtual network - - - - - - - - 44.. UUMMLL oonn 22GG//22GG hhoossttss - - - - - 44..11.. IInnttrroodduuccttiioonn - - - Most Linux machines are configured so that the kernel occupies the - upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and - processes use the lower 3G (0x00000000 - 0xbfffffff). However, some - machine are configured with a 2G/2G split, with the kernel occupying - the upper 2G (0x80000000 - 0xffffffff) and processes using the lower - 2G (0x00000000 - 0x7fffffff). - - - - - 44..22.. TThhee pprroobblleemm - - - The prebuilt UML binaries on this site will not run on 2G/2G hosts - because UML occupies the upper .5G of the 3G process address space - (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right - in the middle of the kernel address space, so UML won't even load - it - will immediately segfault. - - - - - 44..33.. TThhee ssoolluuttiioonn - - - The fix for this is to rebuild UML from source after enabling - CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to - load itself in the top .5G of that smaller process address space, - where it will run fine. See ``Compiling the kernel and modules'' if - you need help building UML from source. - - - - - - - - - - - 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess - - - It is possible to attach UML serial lines and consoles to many types - of host I/O channels by specifying them on the command line. - - - You can attach them to host ptys, ttys, file descriptors, and ports. - This allows you to do things like - - +o have a UML console appear on an unused host console, - - +o hook two virtual machines together by having one attach to a pty - and having the other attach to the corresponding tty - - +o make a virtual machine accessible from the net by attaching a - console to a port on the host. - - - The general format of the command line option is device=channel. - - - - 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee - - Devices are specified with "con" or "ssl" (console or serial line, - respectively), optionally with a device number if you are talking - about a specific device. - - - Using just "con" or "ssl" describes all of the consoles or serial - lines. If you want to talk about console #3 or serial line #10, they - would be "con3" and "ssl10", respectively. - - - A specific device name will override a less general "con=" or "ssl=". - So, for example, you can assign a pty to each of the serial lines - except for the first two like this: - - - ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 - - - - - The specificity of the device name is all that matters; order on the - command line is irrelevant. - - - - 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell - - There are a number of different types of channels to attach a UML - device to, each with a different way of specifying exactly what to - attach to. - - +o pseudo-terminals - device=pty pts terminals - device=pts - - - This will cause UML to allocate a free host pseudo-terminal for the - device. The terminal that it got will be announced in the boot - log. You access it by attaching a terminal program to the - corresponding tty: - - +o screen /dev/pts/n - - +o screen /dev/ttyxx - - +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts - devices - - +o kermit - start it up, 'open' the device, then 'connect' - - - - - - +o terminals - device=tty:tty device file - - - This will make UML attach the device to the specified tty (i.e - - - con1=tty:/dev/tty3 - - - - - will attach UML's console 1 to the host's /dev/tty3). If the tty that - you specify is the slave end of a tty/pty pair, something else must - have already opened the corresponding pty in order for this to work. - - - - - - +o xterms - device=xterm - - - UML will run an xterm and the device will be attached to it. - - - - - - +o Port - device=port:port number - - - This will attach the UML devices to the specified host port. - Attaching console 1 to the host's port 9000 would be done like - this: - - - con1=port:9000 - - - - - Attaching all the serial lines to that port would be done similarly: - - - ssl=port:9000 - - - - - You access these devices by telnetting to that port. Each active tel- - net session gets a different device. If there are more telnets to a - port than UML devices attached to it, then the extra telnet sessions - will block until an existing telnet detaches, or until another device - becomes active (i.e. by being activated in /etc/inittab). - - This channel has the advantage that you can both attach multiple UML - devices to it and know how to access them without reading the UML boot - log. It is also unique in allowing access to a UML from remote - machines without requiring that the UML be networked. This could be - useful in allowing public access to UMLs because they would be - accessible from the net, but wouldn't need any kind of network - filtering or access control because they would have no network access. - - - If you attach the main console to a portal, then the UML boot will - appear to hang. In reality, it's waiting for a telnet to connect, at - which point the boot will proceed. - - - - - - +o already-existing file descriptors - device=file descriptor - - - If you set up a file descriptor on the UML command line, you can - attach a UML device to it. This is most commonly used to put the - main console back on stdin and stdout after assigning all the other - consoles to something else: - - - con0=fd:0,fd:1 con=pts - - - - - - - - - +o Nothing - device=null - - - This allows the device to be opened, in contrast to 'none', but - reads will block, and writes will succeed and the data will be - thrown out. - - - - - - +o None - device=none - - - This causes the device to disappear. - - - - You can also specify different input and output channels for a device - by putting a comma between them: - - - ssl3=tty:/dev/tty2,xterm - - - - - will cause serial line 3 to accept input on the host's /dev/tty3 and - display output on an xterm. That's a silly example - the most common - use of this syntax is to reattach the main console to stdin and stdout - as shown above. - - - If you decide to move the main console away from stdin/stdout, the - initial boot output will appear in the terminal that you're running - UML in. However, once the console driver has been officially - initialized, then the boot output will start appearing wherever you - specified that console 0 should be. That device will receive all - subsequent output. - - - - 55..33.. EExxaammpplleess - - There are a number of interesting things you can do with this - capability. - - - First, this is how you get rid of those bleeding console xterms by - attaching them to host ptys: - - - con=pty con0=fd:0,fd:1 - - - - - This will make a UML console take over an unused host virtual console, - so that when you switch to it, you will see the UML login prompt - rather than the host login prompt: - - - con1=tty:/dev/tty6 - - - - - You can attach two virtual machines together with what amounts to a - serial line as follows: - - Run one UML with a serial line attached to a pty - - - - ssl1=pty - - - - - Look at the boot log to see what pty it got (this example will assume - that it got /dev/ptyp1). - - Boot the other UML with a serial line attached to the corresponding - tty - - - - ssl1=tty:/dev/ttyp1 - - - - - Log in, make sure that it has no getty on that serial line, attach a - terminal program like minicom to it, and you should see the login - prompt of the other virtual machine. - - - 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk - - - - This page describes how to set up the various transports and to - provide a UML instance with network access to the host, other machines - on the local net, and the rest of the net. - - - As of 2.4.5, UML networking has been completely redone to make it much - easier to set up, fix bugs, and add new features. - - - There is a new helper, uml_net, which does the host setup that - requires root privileges. - - - There are currently five transport types available for a UML virtual - machine to exchange packets with other hosts: - - +o ethertap - - +o TUN/TAP - - +o Multicast - - +o a switch daemon - - +o slip - - +o slirp - - +o pcap - - The TUN/TAP, ethertap, slip, and slirp transports allow a UML - instance to exchange packets with the host. They may be directed - to the host or the host may just act as a router to provide access - to other physical or virtual machines. - - - The pcap transport is a synthetic read-only interface, using the - libpcap binary to collect packets from interfaces on the host and - filter them. This is useful for building preconfigured traffic - monitors or sniffers. - - - The daemon and multicast transports provide a completely virtual - network to other virtual machines. This network is completely - disconnected from the physical network unless one of the virtual - machines on it is acting as a gateway. - - - With so many host transports, which one should you use? Here's when - you should use each one: - - +o ethertap - if you want access to the host networking and it is - running 2.2 - - +o TUN/TAP - if you want access to the host networking and it is - running 2.4. Also, the TUN/TAP transport is able to use a - preconfigured device, allowing it to avoid using the setuid uml_net - helper, which is a security advantage. - - +o Multicast - if you want a purely virtual network and you don't want - to set up anything but the UML - - +o a switch daemon - if you want a purely virtual network and you - don't mind running the daemon in order to get somewhat better - performance - - +o slip - there is no particular reason to run the slip backend unless - ethertap and TUN/TAP are just not available for some reason - - +o slirp - if you don't have root access on the host to setup - networking, or if you don't want to allocate an IP to your UML - - +o pcap - not much use for actual network connectivity, but great for - monitoring traffic on the host - - Ethertap is available on 2.4 and works fine. TUN/TAP is preferred - to it because it has better performance and ethertap is officially - considered obsolete in 2.4. Also, the root helper only needs to - run occasionally for TUN/TAP, rather than handling every packet, as - it does with ethertap. This is a slight security advantage since - it provides fewer opportunities for a nasty UML user to somehow - exploit the helper's root privileges. - - - 66..11.. GGeenneerraall sseettuupp - - First, you must have the virtual network enabled in your UML. If are - running a prebuilt kernel from this site, everything is already - enabled. If you build the kernel yourself, under the "Network device - support" menu, enable "Network device support", and then the three - transports. - - - The next step is to provide a network device to the virtual machine. - This is done by describing it on the kernel command line. - - The general format is - - - eth = , - - - - - For example, a virtual ethernet device may be attached to a host - ethertap device as follows: - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - This sets up eth0 inside the virtual machine to attach itself to the - host /dev/tap0, assigns it an ethernet address, and assigns the host - tap0 interface an IP address. - - - - Note that the IP address you assign to the host end of the tap device - must be different than the IP you assign to the eth device inside UML. - If you are short on IPs and don't want to consume two per UML, then - you can reuse the host's eth IP address for the host ends of the tap - devices. Internally, the UMLs must still get unique IPs for their eth - devices. You can also give the UMLs non-routable IPs (192.168.x.x or - 10.x.x.x) and have the host masquerade them. This will let outgoing - connections work, but incoming connections won't without more work, - such as port forwarding from the host. - Also note that when you configure the host side of an interface, it is - only acting as a gateway. It will respond to pings sent to it - locally, but is not useful to do that since it's a host interface. - You are not talking to the UML when you ping that interface and get a - response. - - - You can also add devices to a UML and remove them at runtime. See the - ``The Management Console'' page for details. - - - The sections below describe this in more detail. - - - Once you've decided how you're going to set up the devices, you boot - UML, log in, configure the UML side of the devices, and set up routes - to the outside world. At that point, you will be able to talk to any - other machines, physical or virtual, on the net. - - - If ifconfig inside UML fails and the network refuses to come up, run - tell you what went wrong. - - - - 66..22.. UUsseerrssppaaccee ddaaeemmoonnss - - You will likely need the setuid helper, or the switch daemon, or both. - They are both installed with the RPM and deb, so if you've installed - either, you can skip the rest of this section. - - - If not, then you need to check them out of CVS, build them, and - install them. The helper is uml_net, in CVS /tools/uml_net, and the - daemon is uml_switch, in CVS /tools/uml_router. They are both built - with a plain 'make'. Both need to be installed in a directory that's - in your path - /usr/bin is recommend. On top of that, uml_net needs - to be setuid root. - - - - 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess - - Below, you will see that the TUN/TAP, ethertap, and daemon interfaces - allow you to specify hardware addresses for the virtual ethernet - devices. This is generally not necessary. If you don't have a - specific reason to do it, you probably shouldn't. If one is not - specified on the command line, the driver will assign one based on the - device IP address. It will provide the address fe:fd:nn:nn:nn:nn - where nn.nn.nn.nn is the device IP address. This is nearly always - sufficient to guarantee a unique hardware address for the device. A - couple of exceptions are: - - +o Another set of virtual ethernet devices are on the same network and - they are assigned hardware addresses using a different scheme which - may conflict with the UML IP address-based scheme - - +o You aren't going to use the device for IP networking, so you don't - assign the device an IP address - - If you let the driver provide the hardware address, you should make - sure that the device IP address is known before the interface is - brought up. So, inside UML, this will guarantee that: - - - - UML# - ifconfig eth0 192.168.0.250 up - - - - - If you decide to assign the hardware address yourself, make sure that - the first byte of the address is even. Addresses with an odd first - byte are broadcast addresses, which you don't want assigned to a - device. - - - - 66..44.. UUMMLL iinntteerrffaaccee sseettuupp - - Once the network devices have been described on the command line, you - should boot UML and log in. - - - The first thing to do is bring the interface up: - - - UML# ifconfig ethn ip-address up - - - - - You should be able to ping the host at this point. - - - To reach the rest of the world, you should set a default route to the - host: - - - UML# route add default gw host ip - - - - - Again, with host ip of 192.168.0.4: - - - UML# route add default gw 192.168.0.4 - - - - - This page used to recommend setting a network route to your local net. - This is wrong, because it will cause UML to try to figure out hardware - addresses of the local machines by arping on the interface to the - host. Since that interface is basically a single strand of ethernet - with two nodes on it (UML and the host) and arp requests don't cross - networks, they will fail to elicit any responses. So, what you want - is for UML to just blindly throw all packets at the host and let it - figure out what to do with them, which is what leaving out the network - route and adding the default route does. - - - Note: If you can't communicate with other hosts on your physical - ethernet, it's probably because of a network route that's - automatically set up. If you run 'route -n' and see a route that - looks like this: - - - - - Destination Gateway Genmask Flags Metric Ref Use Iface - 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 - - - - - with a mask that's not 255.255.255.255, then replace it with a route - to your host: - - - UML# - route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 - - - - - - - UML# - route add -host 192.168.0.4 dev eth0 - - - - - This, plus the default route to the host, will allow UML to exchange - packets with any machine on your ethernet. - - - - 66..55.. MMuullttiiccaasstt - - The simplest way to set up a virtual network between multiple UMLs is - to use the mcast transport. This was written by Harald Welte and is - present in UML version 2.4.5-5um and later. Your system must have - multicast enabled in the kernel and there must be a multicast-capable - network device on the host. Normally, this is eth0, but if there is - no ethernet card on the host, then you will likely get strange error - messages when you bring the device up inside UML. - - - To use it, run two UMLs with - - - eth0=mcast - - - - - on their command lines. Log in, configure the ethernet device in each - machine with different IP addresses: - - - UML1# ifconfig eth0 192.168.0.254 - - - - - - - UML2# ifconfig eth0 192.168.0.253 - - - - - and they should be able to talk to each other. - - The full set of command line options for this transport are - - - - ethn=mcast,ethernet address,multicast - address,multicast port,ttl - - - - - Harald's original README is here and explains these in detail, as well as - some other issues. - - - - 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr - - TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the - host. The TUN/TAP backend has been in UML since 2.4.9-3um. - - - The easiest way to get up and running is to let the setuid uml_net - helper do the host setup for you. This involves insmod-ing the tun.o - module if necessary, configuring the device, and setting up IP - forwarding, routing, and proxy arp. If you are new to UML networking, - do this first. If you're concerned about the security implications of - the setuid helper, use it to get up and running, then read the next - section to see how to have UML use a preconfigured tap device, which - avoids the use of uml_net. - - - If you specify an IP address for the host side of the device, the - uml_net helper will do all necessary setup on the host - the only - requirement is that TUN/TAP be available, either built in to the host - kernel or as the tun.o module. - - The format of the command line switch to attach a device to a TUN/TAP - device is - - - eth =tuntap,,, - - - - - For example, this argument will attach the UML's eth0 to the next - available tap device and assign an ethernet address to it based on its - IP address - - - eth0=tuntap,,,192.168.0.254 - - - - - - - Note that the IP address that must be used for the eth device inside - UML is fixed by the routing and proxy arp that is set up on the - TUN/TAP device on the host. You can use a different one, but it won't - work because reply packets won't reach the UML. This is a feature. - It prevents a nasty UML user from doing things like setting the UML IP - to the same as the network's nameserver or mail server. - - - There are a couple potential problems with running the TUN/TAP - transport on a 2.4 host kernel - - +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host - kernel or use the ethertap transport. - - +o With an upgraded kernel, TUN/TAP may fail with - - - File descriptor in bad state - - - - - This is due to a header mismatch between the upgraded kernel and the - kernel that was originally installed on the machine. The fix is to - make sure that /usr/src/linux points to the headers for the running - kernel. - - These were pointed out by Tim Robinson in - name="this uml- - user post"> . - - - - 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee - - If you prefer not to have UML use uml_net (which is somewhat - insecure), with UML 2.4.17-11, you can set up a TUN/TAP device - beforehand. The setup needs to be done as root, but once that's done, - there is no need for root assistance. Setting up the device is done - as follows: - - +o Create the device with tunctl (available from the UML utilities - tarball) - - - - - host# tunctl -u uid - - - - - where uid is the user id or username that UML will be run as. This - will tell you what device was created. - - +o Configure the device IP (change IP addresses and device name to - suit) - - - - - host# ifconfig tap0 192.168.0.254 up - - - - - - +o Set up routing and arping if desired - this is my recipe, there are - other ways of doing the same thing - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' - - host# - route add -host 192.168.0.253 dev tap0 - - - - - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' - - - - - - - host# - arp -Ds 192.168.0.253 eth0 pub - - - - - Note that this must be done every time the host boots - this configu- - ration is not stored across host reboots. So, it's probably a good - idea to stick it in an rc file. An even better idea would be a little - utility which reads the information from a config file and sets up - devices at boot time. - - +o Rather than using up two IPs and ARPing for one of them, you can - also provide direct access to your LAN by the UML by using a - bridge. - - - host# - brctl addbr br0 - - - - - - - host# - ifconfig eth0 0.0.0.0 promisc up - - - - - - - host# - ifconfig tap0 0.0.0.0 promisc up - - - - - - - host# - ifconfig br0 192.168.0.1 netmask 255.255.255.0 up - - - - - - - - host# - brctl stp br0 off - - - - - - - host# - brctl setfd br0 1 - - - - - - - host# - brctl sethello br0 1 - - - - - - - host# - brctl addif br0 eth0 - - - - - - - host# - brctl addif br0 tap0 - - - - - Note that 'br0' should be setup using ifconfig with the existing IP - address of eth0, as eth0 no longer has its own IP. - - +o - - - Also, the /dev/net/tun device must be writable by the user running - UML in order for the UML to use the device that's been configured - for it. The simplest thing to do is - - - host# chmod 666 /dev/net/tun - - - - - Making it world-writable looks bad, but it seems not to be - exploitable as a security hole. However, it does allow anyone to cre- - ate useless tap devices (useless because they can't configure them), - which is a DOS attack. A somewhat more secure alternative would to be - to create a group containing all the users who have preconfigured tap - devices and chgrp /dev/net/tun to that group with mode 664 or 660. - - - +o Once the device is set up, run UML with 'eth0=tuntap,device name' - (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the - mconsole config command). - - +o Bring the eth device up in UML and you're in business. - - If you don't want that tap device any more, you can make it non- - persistent with - - - host# tunctl -d tap device - - - - - Finally, tunctl has a -b (for brief mode) switch which causes it to - output only the name of the tap device it created. This makes it - suitable for capture by a script: - - - host# TAP=`tunctl -u 1000 -b` - - - - - - - 66..88.. EEtthheerrttaapp - - Ethertap is the general mechanism on 2.2 for userspace processes to - exchange packets with the kernel. - - - - To use this transport, you need to describe the virtual network device - on the UML command line. The general format for this is - - - eth =ethertap, , , - - - - - So, the previous example - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - attaches the UML eth0 device to the host /dev/tap0, assigns it the - ethernet address fe:fd:0:0:0:1, and assigns the IP address - 192.168.0.254 to the tap device. - - - - The tap device is mandatory, but the others are optional. If the - ethernet address is omitted, one will be assigned to it. - - - The presence of the tap IP address will cause the helper to run and do - whatever host setup is needed to allow the virtual machine to - communicate with the outside world. If you're not sure you know what - you're doing, this is the way to go. - - - If it is absent, then you must configure the tap device and whatever - arping and routing you will need on the host. However, even in this - case, the uml_net helper still needs to be in your path and it must be - setuid root if you're not running UML as root. This is because the - tap device doesn't support SIGIO, which UML needs in order to use - something as a source of input. So, the helper is used as a - convenient asynchronous IO thread. - - If you're using the uml_net helper, you can ignore the following host - setup - uml_net will do it for you. You just need to make sure you - have ethertap available, either built in to the host kernel or - available as a module. - - - If you want to set things up yourself, you need to make sure that the - appropriate /dev entry exists. If it doesn't, become root and create - it as follows: - - - mknod /dev/tap c 36 + 16 - - - - - For example, this is how to create /dev/tap0: - - - mknod /dev/tap0 c 36 0 + 16 - - - - - You also need to make sure that the host kernel has ethertap support. - If ethertap is enabled as a module, you apparently need to insmod - ethertap once for each ethertap device you want to enable. So, - - - host# - insmod ethertap - - - - - will give you the tap0 interface. To get the tap1 interface, you need - to run - - - host# - insmod ethertap unit=1 -o ethertap1 - - - - - - - - 66..99.. TThhee sswwiittcchh ddaaeemmoonn - - NNoottee: This is the daemon formerly known as uml_router, but which was - renamed so the network weenies of the world would stop growling at me. - - - The switch daemon, uml_switch, provides a mechanism for creating a - totally virtual network. By default, it provides no connection to the - host network (but see -tap, below). - - - The first thing you need to do is run the daemon. Running it with no - arguments will make it listen on a default pair of unix domain - sockets. - - - If you want it to listen on a different pair of sockets, use - - - -unix control socket data socket - - - - - - If you want it to act as a hub rather than a switch, use - - - -hub - - - - - - If you want the switch to be connected to host networking (allowing - the umls to get access to the outside world through the host), use - - - -tap tap0 - - - - - - Note that the tap device must be preconfigured (see "TUN/TAP with a - preconfigured tap device", above). If you're using a different tap - device than tap0, specify that instead of tap0. - - - uml_switch can be backgrounded as follows - - - host% - uml_switch [ options ] < /dev/null > /dev/null - - - - - The reason it doesn't background by default is that it listens to - stdin for EOF. When it sees that, it exits. - - - The general format of the kernel command line switch is - - - - ethn=daemon,ethernet address,socket - type,control socket,data socket - - - - - You can leave off everything except the 'daemon'. You only need to - specify the ethernet address if the one that will be assigned to it - isn't acceptable for some reason. The rest of the arguments describe - how to communicate with the daemon. You should only specify them if - you told the daemon to use different sockets than the default. So, if - you ran the daemon with no arguments, running the UML on the same - machine with - eth0=daemon - - - - - will cause the eth0 driver to attach itself to the daemon correctly. - - - - 66..1100.. SSlliipp - - Slip is another, less general, mechanism for a process to communicate - with the host networking. In contrast to the ethertap interface, - which exchanges ethernet frames with the host and can be used to - transport any higher-level protocol, it can only be used to transport - IP. - - - The general format of the command line switch is - - - - ethn=slip,slip IP - - - - - The slip IP argument is the IP address that will be assigned to the - host end of the slip device. If it is specified, the helper will run - and will set up the host so that the virtual machine can reach it and - the rest of the network. - - - There are some oddities with this interface that you should be aware - of. You should only specify one slip device on a given virtual - machine, and its name inside UML will be 'umn', not 'eth0' or whatever - you specified on the command line. These problems will be fixed at - some point. - - - - 66..1111.. SSlliirrpp - - slirp uses an external program, usually /usr/bin/slirp, to provide IP - only networking connectivity through the host. This is similar to IP - masquerading with a firewall, although the translation is performed in - user-space, rather than by the kernel. As slirp does not set up any - interfaces on the host, or changes routing, slirp does not require - root access or setuid binaries on the host. - - - The general format of the command line switch for slirp is: - - - - ethn=slirp,ethernet address,slirp path - - - - - The ethernet address is optional, as UML will set up the interface - with an ethernet address based upon the initial IP address of the - interface. The slirp path is generally /usr/bin/slirp, although it - will depend on distribution. - - - The slirp program can have a number of options passed to the command - line and we can't add them to the UML command line, as they will be - parsed incorrectly. Instead, a wrapper shell script can be written or - the options inserted into the /.slirprc file. More information on - all of the slirp options can be found in its man pages. - - - The eth0 interface on UML should be set up with the IP 10.2.0.15, - although you can use anything as long as it is not used by a network - you will be connecting to. The default route on UML should be set to - use - - - UML# - route add default dev eth0 - - - - - slirp provides a number of useful IP addresses which can be used by - UML, such as 10.0.2.3 which is an alias for the DNS server specified - in /etc/resolv.conf on the host or the IP given in the 'dns' option - for slirp. - - - Even with a baudrate setting higher than 115200, the slirp connection - is limited to 115200. If you need it to go faster, the slirp binary - needs to be compiled with FULL_BOLT defined in config.h. - - - - 66..1122.. ppccaapp - - The pcap transport is attached to a UML ethernet device on the command - line or with uml_mconsole with the following syntax: - - - - ethn=pcap,host interface,filter - expression,option1,option2 - - - - - The expression and options are optional. - - - The interface is whatever network device on the host you want to - sniff. The expression is a pcap filter expression, which is also what - tcpdump uses, so if you know how to specify tcpdump filters, you will - use the same expressions here. The options are up to two of - 'promisc', control whether pcap puts the host interface into - promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap - expression optimizer is used. - - - Example: - - - - eth0=pcap,eth0,tcp - - eth1=pcap,eth0,!tcp - - - - will cause the UML eth0 to emit all tcp packets on the host eth0 and - the UML eth1 to emit all non-tcp packets on the host eth0. - - - - 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff - - If you don't specify an address for the host side of the ethertap or - slip device, UML won't do any setup on the host. So this is what is - needed to get things working (the examples use a host-side IP of - 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your - own network): - - +o The device needs to be configured with its IP address. Tap devices - are also configured with an mtu of 1484. Slip devices are - configured with a point-to-point address pointing at the UML ip - address. - - - host# ifconfig tap0 arp mtu 1484 192.168.0.251 up - - - - - - - host# - ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up - - - - - - +o If a tap device is being set up, a route is set to the UML IP. - - - UML# route add -host 192.168.0.250 gw 192.168.0.251 - - - - - - +o To allow other hosts on your network to see the virtual machine, - proxy arp is set up for it. - - - host# arp -Ds 192.168.0.250 eth0 pub - - - - - - +o Finally, the host is set up to route packets. - - - host# echo 1 > /proc/sys/net/ipv4/ip_forward - - - - - - - - - - - 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess - - - - - 77..11.. AA wwaarrnniinngg - - Don't attempt to share filesystems simply by booting two UMLs from the - same file. That's the same thing as booting two physical machines - from a shared disk. It will result in filesystem corruption. - - - - 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess - - The way to share a filesystem between two virtual machines is to use - the copy-on-write (COW) layering capability of the ubd block driver. - As of 2.4.6-2um, the driver supports layering a read-write private - device over a read-only shared device. A machine's writes are stored - in the private device, while reads come from either device - the - private one if the requested block is valid in it, the shared one if - not. Using this scheme, the majority of data which is unchanged is - shared between an arbitrary number of virtual machines, each of which - has a much smaller file containing the changes that it has made. With - a large number of UMLs booting from a large root filesystem, this - leads to a huge disk space saving. It will also help performance, - since the host will be able to cache the shared data using a much - smaller amount of memory, so UML disk requests will be served from the - host's memory rather than its disks. - - - - - To add a copy-on-write layer to an existing block device file, simply - add the name of the COW file to the appropriate ubd switch: - - - ubd0=root_fs_cow,root_fs_debian_22 - - - - - where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is - the existing shared filesystem. The COW file need not exist. If it - doesn't, the driver will create and initialize it. Once the COW file - has been initialized, it can be used on its own on the command line: - - - ubd0=root_fs_cow - - - - - The name of the backing file is stored in the COW file header, so it - would be redundant to continue specifying it on the command line. - - - - 77..33.. NNoottee!! - - When checking the size of the COW file in order to see the gobs of - space that you're saving, make sure you use 'ls -ls' to see the actual - disk consumption rather than the length of the file. The COW file is - sparse, so the length will be very different from the disk usage. - Here is a 'ls -l' of a COW file and backing file from one boot and - shutdown: - host% ls -l cow.debian debian2.2 - -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Doesn't look like much saved space, does it? Well, here's 'ls -ls': - - - host% ls -ls cow.debian debian2.2 - 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Now, you can see that the COW file has less than a meg of disk, rather - than 492 meg. - - - - 77..44.. AAnnootthheerr wwaarrnniinngg - - Once a filesystem is being used as a readonly backing file for a COW - file, do not boot directly from it or modify it in any way. Doing so - will invalidate any COW files that are using it. The mtime and size - of the backing file are stored in the COW file header at its creation, - and they must continue to match. If they don't, the driver will - refuse to use the COW file. - - - - - If you attempt to evade this restriction by changing either the - backing file or the COW header by hand, you will get a corrupted - filesystem. - - - - - Among other things, this means that upgrading the distribution in a - backing file and expecting that all of the COW files using it will see - the upgrade will not work. - - - - - 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee - - Depending on how you use UML and COW devices, it may be advisable to - merge the changes in the COW file into the backing file every once in - a while. - - - - - The utility that does this is uml_moo. Its usage is - - - host% uml_moo COW file new backing file - - - - - There's no need to specify the backing file since that information is - already in the COW file header. If you're paranoid, boot the new - merged file, and if you're happy with it, move it over the old backing - file. - - - - - uml_moo creates a new backing file by default as a safety measure. It - also has a destructive merge option which will merge the COW file - directly into its current backing file. This is really only usable - when the backing file only has one COW file associated with it. If - there are multiple COWs associated with a backing file, a -d merge of - one of them will invalidate all of the others. However, it is - convenient if you're short of disk space, and it should also be - noticeably faster than a non-destructive merge. - - - - - uml_moo is installed with the UML deb and RPM. If you didn't install - UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. - - - - - - - - - 88.. CCrreeaattiinngg ffiilleessyysstteemmss - - - You may want to create and mount new UML filesystems, either because - your root filesystem isn't large enough or because you want to use a - filesystem other than ext2. - - - This was written on the occasion of reiserfs being included in the - 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will - talk about reiserfs. This information is generic, and the examples - should be easy to translate to the filesystem of your choice. - - - 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee - - dd is your friend. All you need to do is tell dd to create an empty - file of the appropriate size. I usually make it sparse to save time - and to avoid allocating disk space until it's actually used. For - example, the following command will create a sparse 100 meg file full - of zeroes. - - - host% - dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M - - - - - - - 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee - - Add an argument like the following to the UML command line: - - ubd4=new_filesystem - - - - - making sure that you use an unassigned ubd device number. - - - - 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm - - Make sure that the filesystem is available, either by being built into - the kernel, or available as a module, then boot up UML and log in. If - the root filesystem doesn't have the filesystem utilities (mkfs, fsck, - etc), then get them into UML by way of the net or hostfs. - - - Make the new filesystem on the device assigned to the new file: - - - host# mkreiserfs /dev/ubd/4 - - - <----------- MKREISERFSv2 -----------> - - ReiserFS version 3.6.25 - Block size 4096 bytes - Block count 25856 - Used blocks 8212 - Journal - 8192 blocks (18-8209), journal header is in block 8210 - Bitmaps: 17 - Root block 8211 - Hash function "r5" - ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y - journal size 8192 (from 18) - Initializing journal - 0%....20%....40%....60%....80%....100% - Syncing..done. - - - - - Now, mount it: - - - UML# - mount /dev/ubd/4 /mnt - - - - - and you're in business. - - - - - - - - - - 99.. HHoosstt ffiillee aacccceessss - - - If you want to access files on the host machine from inside UML, you - can treat it as a separate machine and either nfs mount directories - from the host or copy files into the virtual machine with scp or rcp. - However, since UML is running on the host, it can access those - files just like any other process and make them available inside the - virtual machine without needing to use the network. - - - This is now possible with the hostfs virtual filesystem. With it, you - can mount a host directory into the UML filesystem and access the - files contained in it just as you would on the host. - - - 99..11.. UUssiinngg hhoossttffss - - To begin with, make sure that hostfs is available inside the virtual - machine with - - - UML# cat /proc/filesystems - - - - . hostfs should be listed. If it's not, either rebuild the kernel - with hostfs configured into it or make sure that hostfs is built as a - module and available inside the virtual machine, and insmod it. - - - Now all you need to do is run mount: - - - UML# mount none /mnt/host -t hostfs - - - - - will mount the host's / on the virtual machine's /mnt/host. - - - If you don't want to mount the host root directory, then you can - specify a subdirectory to mount with the -o switch to mount: - - - UML# mount none /mnt/home -t hostfs -o /home - - - - - will mount the hosts's /home on the virtual machine's /mnt/home. - - - - 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm - - It's possible to boot from a directory hierarchy on the host using - hostfs rather than using the standard filesystem in a file. - - To start, you need that hierarchy. The easiest way is to loop mount - an existing root_fs file: - - - host# mount root_fs uml_root_dir -o loop - - - - - You need to change the filesystem type of / in etc/fstab to be - 'hostfs', so that line looks like this: - - /dev/ubd/0 / hostfs defaults 1 1 - - - - - Then you need to chown to yourself all the files in that directory - that are owned by root. This worked for me: - - - host# find . -uid 0 -exec chown jdike {} \; - - - - - Next, make sure that your UML kernel has hostfs compiled in, not as a - module. Then run UML with the boot device pointing at that directory: - - - ubd0=/path/to/uml/root/directory - - - - - UML should then boot as it does normally. - - - 99..33.. BBuuiillddiinngg hhoossttffss - - If you need to build hostfs because it's not in your kernel, you have - two choices: - - - - +o Compiling hostfs into the kernel: - - - Reconfigure the kernel and set the 'Host filesystem' option under - - - +o Compiling hostfs as a module: - - - Reconfigure the kernel and set the 'Host filesystem' option under - be in arch/um/fs/hostfs/hostfs.o. Install that in - /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and - - - UML# insmod hostfs - - - - - - - - - - - - - 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee - - - - The UML management console is a low-level interface to the kernel, - somewhat like the i386 SysRq interface. Since there is a full-blown - operating system under UML, there is much greater flexibility possible - than with the SysRq mechanism. - - - There are a number of things you can do with the mconsole interface: - - +o get the kernel version - - +o add and remove devices - - +o halt or reboot the machine - - +o Send SysRq commands - - +o Pause and resume the UML - - - You need the mconsole client (uml_mconsole) which is present in CVS - (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in - 2.4.6. - - - You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. - When you boot UML, you'll see a line like: - - - mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole - - - - - If you specify a unique machine id one the UML command line, i.e. - - - umid=debian - - - - - you'll see this - - - mconsole initialized on /home/jdike/.uml/debian/mconsole - - - - - That file is the socket that uml_mconsole will use to communicate with - UML. Run it with either the umid or the full path as its argument: - - - host% uml_mconsole debian - - - - - or - - - host% uml_mconsole /home/jdike/.uml/debian/mconsole - - - - - You'll get a prompt, at which you can run one of these commands: - - +o version - - +o halt - - +o reboot - - +o config - - +o remove - - +o sysrq - - +o help - - +o cad - - +o stop - - +o go - - - 1100..11.. vveerrssiioonn - - This takes no arguments. It prints the UML version. - - - (mconsole) version - OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 - - - - - There are a couple actual uses for this. It's a simple no-op which - can be used to check that a UML is running. It's also a way of - sending an interrupt to the UML. This is sometimes useful on SMP - hosts, where there's a bug which causes signals to UML to be lost, - often causing it to appear to hang. Sending such a UML the mconsole - version command is a good way to 'wake it up' before networking has - been enabled, as it does not do anything to the function of the UML. - - - - 1100..22.. hhaalltt aanndd rreebboooott - - These take no arguments. They shut the machine down immediately, with - no syncing of disks and no clean shutdown of userspace. So, they are - pretty close to crashing the machine. - - - (mconsole) halt - OK - - - - - - - 1100..33.. ccoonnffiigg - - "config" adds a new device to the virtual machine. Currently the ubd - and network drivers support this. It takes one argument, which is the - device to add, with the same syntax as the kernel command line. - - - - - (mconsole) - config ubd3=/home/jdike/incoming/roots/root_fs_debian22 - - OK - (mconsole) config eth1=mcast - OK - - - - - - - 1100..44.. rreemmoovvee - - "remove" deletes a device from the system. Its argument is just the - name of the device to be removed. The device must be idle in whatever - sense the driver considers necessary. In the case of the ubd driver, - the removed block device must not be mounted, swapped on, or otherwise - open, and in the case of the network driver, the device must be down. - - - (mconsole) remove ubd3 - OK - (mconsole) remove eth1 - OK - - - - - - - 1100..55.. ssyyssrrqq - - This takes one argument, which is a single letter. It calls the - generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in Documentation/sysrq.txt - in your favorite kernel tree to see what letters are valid and what - they do. - - - - 1100..66.. hheellpp - - "help" returns a string listing the valid commands and what each one - does. - - - - 1100..77.. ccaadd - - This invokes the Ctl-Alt-Del action on init. What exactly this ends - up doing is up to /etc/inittab. Normally, it reboots the machine. - With UML, this is usually not desired, so if a halt would be better, - then find the section of inittab that looks like this - - - # What to do when CTRL-ALT-DEL is pressed. - ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now - - - - - and change the command to halt. - - - - 1100..88.. ssttoopp - - This puts the UML in a loop reading mconsole requests until a 'go' - mconsole command is received. This is very useful for making backups - of UML filesystems, as the UML can be stopped, then synced via 'sysrq - s', so that everything is written to the filesystem. You can then copy - the filesystem and then send the UML 'go' via mconsole. - - - Note that a UML running with more than one CPU will have problems - after you send the 'stop' command, as only one CPU will be held in a - mconsole loop and all others will continue as normal. This is a bug, - and will be fixed. - - - - 1100..99.. ggoo - - This resumes a UML after being paused by a 'stop' command. Note that - when the UML has resumed, TCP connections may have timed out and if - the UML is paused for a long period of time, crond might go a little - crazy, running all the jobs it didn't do earlier. - - - - - - - - - 1111.. KKeerrnneell ddeebbuuggggiinngg - - - NNoottee:: The interface that makes debugging, as described here, possible - is present in 2.4.0-test6 kernels and later. - - - Since the user-mode kernel runs as a normal Linux process, it is - possible to debug it with gdb almost like any other process. It is - slightly different because the kernel's threads are already being - ptraced for system call interception, so gdb can't ptrace them. - However, a mechanism has been added to work around that problem. - - - In order to debug the kernel, you need build it from source. See - ``Compiling the kernel and modules'' for information on doing that. - Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during - the config. These will compile the kernel with -g, and enable the - ptrace proxy so that gdb works with UML, respectively. - - - - - 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb - - You can have the kernel running under the control of gdb from the - beginning by putting 'debug' on the command line. You will get an - xterm with gdb running inside it. The kernel will send some commands - to gdb which will leave it stopped at the beginning of start_kernel. - At this point, you can get things going with 'next', 'step', or - 'cont'. - - - There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an - interrupt handler. - 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess - - Not every bug is evident in the currently running process. Sometimes, - processes hang in the kernel when they shouldn't because they've - deadlocked on a semaphore or something similar. In this case, when - you ^C gdb and get a backtrace, you will see the idle thread, which - isn't very relevant. - - - What you want is the stack of whatever process is sleeping when it - shouldn't be. You need to figure out which process that is, which is - generally fairly easy. Then you need to get its host process id, - which you can do either by looking at ps on the host or at - task.thread.extern_pid in gdb. - - - Now what you do is this: - - +o detach from the current thread - - - (UML gdb) det - - - - - - +o attach to the thread you are interested in - - - (UML gdb) att - - - - - - +o look at its stack and anything else of interest - - - (UML gdb) bt - - - - - Note that you can't do anything at this point that requires that a - process execute, e.g. calling a function - - +o when you're done looking at that process, reattach to the current - thread and continue it - - - (UML gdb) - att 1 - - - - - - - (UML gdb) - c - - - - - Here, specifying any pid which is not the process id of a UML thread - will cause gdb to reattach to the current thread. I commonly use 1, - but any other invalid pid would work. - - - - 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL - - ddd works on UML, but requires a special kludge. The process goes - like this: - - +o Start ddd - - - host% ddd linux - - - - - - +o With ps, get the pid of the gdb that ddd started. You can ask the - gdb to tell you, but for some reason that confuses things and - causes a hang. - - +o run UML with 'debug=parent gdb-pid=' added to the command line - - it will just sit there after you hit return - - +o type 'att 1' to the ddd gdb and you will see something like - - - 0xa013dc51 in __kill () - - - (gdb) - - - - - - +o At this point, type 'c', UML will boot up, and you can use ddd just - as you do on any other process. - - - - 1111..44.. DDeebbuuggggiinngg mmoodduulleess - - gdb has support for debugging code which is dynamically loaded into - the process. This support is what is needed to debug kernel modules - under UML. - - - Using that support is somewhat complicated. You have to tell gdb what - object file you just loaded into UML and where in memory it is. Then, - it can read the symbol table, and figure out where all the symbols are - from the load address that you provided. It gets more interesting - when you load the module again (i.e. after an rmmod). You have to - tell gdb to forget about all its symbols, including the main UML ones - for some reason, then load then all back in again. - - - There's an easy way and a hard way to do this. The easy way is to use - the umlgdb expect script written by Chandan Kudige. It basically - automates the process for you. - - - First, you must tell it where your modules are. There is a list in - the script that looks like this: - set MODULE_PATHS { - "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" - "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" - "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" - } - - - - - You change that to list the names and paths of the modules that you - are going to debug. Then you run it from the toplevel directory of - your UML pool and it basically tells you what to do: - - - - - ******** GDB pid is 21903 ******** - Start UML as: ./linux debug gdb-pid=21903 - - - - GNU gdb 5.0rh-5 Red Hat Linux 7.1 - Copyright 2001 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) b sys_init_module - Breakpoint 1 at 0xa0011923: file module.c, line 349. - (gdb) att 1 - - - - - After you run UML and it sits there doing nothing, you hit return at - the 'att 1' and continue it: - - - Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 - 0xa00f4221 in __kill () - (UML gdb) c - Continuing. - - - - - At this point, you debug normally. When you insmod something, the - expect magic will kick in and you'll see something like: - - - - - - - - - - - - - - - - - - *** Module hostfs loaded *** - Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 349 char *name, *n_name, *name_tmp = NULL; - (UML gdb) finish - Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 - 411 else res = EXECUTE_SYSCALL(syscall, regs); - Value returned is $1 = 0 - (UML gdb) - p/x (int)module_list + module_list->size_of_struct - - $2 = 0xa9021054 - (UML gdb) symbol-file ./linux - Load new symbol table from "./linux"? (y or n) y - Reading symbols from ./linux... - done. - (UML gdb) - add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 - - add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at - .text_addr = 0xa9021054 - (y or n) y - - Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... - done. - (UML gdb) p *module_list - $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", - size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, - nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, - init = 0xa90221f0 , cleanup = 0xa902222c , - ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, - persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, - kallsyms_end = 0x0, - archdata_start = 0x1b855
, - archdata_end = 0xe5890000
, - kernel_data = 0xf689c35d
} - >> Finished loading symbols for hostfs ... - - - - - That's the easy way. It's highly recommended. The hard way is - described below in case you're interested in what's going on. - - - Boot the kernel under the debugger and load the module with insmod or - modprobe. With gdb, do: - - - (UML gdb) p module_list - - - - - This is a list of modules that have been loaded into the kernel, with - the most recently loaded module first. Normally, the module you want - is at module_list. If it's not, walk down the next links, looking at - the name fields until find the module you want to debug. Take the - address of that structure, and add module.size_of_struct (which in - 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition - for you :-): - - - - (UML gdb) - printf "%#x\n", (int)module_list module_list->size_of_struct - - - - - The offset from the module start occasionally changes (before 2.4.0, - it was module.size_of_struct + 4), so it's a good idea to check the - init and cleanup addresses once in a while, as describe below. Now - do: - - - (UML gdb) - add-symbol-file /path/to/module/on/host that_address - - - - - Tell gdb you really want to do it, and you're in business. - - - If there's any doubt that you got the offset right, like breakpoints - appear not to work, or they're appearing in the wrong place, you can - check it by looking at the module structure. The init and cleanup - fields should look like: - - - init = 0x588066b0 , cleanup = 0x588066c0 - - - - - with no offsets on the symbol names. If the names are right, but they - are offset, then the offset tells you how much you need to add to the - address you gave to add-symbol-file. - - - When you want to load in a new version of the module, you need to get - gdb to forget about the old one. The only way I've found to do that - is to tell gdb to forget about all symbols that it knows about: - - - (UML gdb) symbol-file - - - - - Then reload the symbols from the kernel binary: - - - (UML gdb) symbol-file /path/to/kernel - - - - - and repeat the process above. You'll also need to re-enable break- - points. They were disabled when you dumped all the symbols because - gdb couldn't figure out where they should go. - - - - 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell - - If you don't have the kernel running under gdb, you can attach gdb to - it later by sending the tracing thread a SIGUSR1. The first line of - the console output identifies its pid: - tracing thread pid = 20093 - - - - - When you send it the signal: - - - host% kill -USR1 20093 - - - - - you will get an xterm with gdb running in it. - - - If you have the mconsole compiled into UML, then the mconsole client - can be used to start gdb: - - - (mconsole) (mconsole) config gdb=xterm - - - - - will fire up an xterm with gdb running in it. - - - - 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss - - UML has support for attaching to an already running debugger rather - than starting gdb itself. This is present in CVS as of 17 Apr 2001. - I sent it to Alan for inclusion in the ac tree, and it will be in my - 2.4.4 release. - - - This is useful when gdb is a subprocess of some UI, such as emacs or - ddd. It can also be used to run debuggers other than gdb on UML. - Below is an example of using strace as an alternate debugger. - - - To do this, you need to get the pid of the debugger and pass it in - with the - - - If you are using gdb under some UI, then tell it to 'att 1', and - you'll find yourself attached to UML. - - - If you are using something other than gdb as your debugger, then - you'll need to get it to do the equivalent of 'att 1' if it doesn't do - it automatically. - - - An example of an alternate debugger is strace. You can strace the - actual kernel as follows: - - +o Run the following in a shell - - - host% - sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' - - - - +o Run UML with 'debug' and 'gdb-pid=' with the pid printed out - by the previous command - - +o Hit return in the shell, and UML will start running, and strace - output will start accumulating in the output file. - - Note that this is different from running - - - host% strace ./linux - - - - - That will strace only the main UML thread, the tracing thread, which - doesn't do any of the actual kernel work. It just oversees the vir- - tual machine. In contrast, using strace as described above will show - you the low-level activity of the virtual machine. - - - - - - 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess - - 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk - - When booting up the kernel, fsck failed, and dropped me into a shell - to fix things up. I ran fsck -y, which hung: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 was not cleanly unmounted, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Inode 19780, i_blocks is 1548, should be 540. Fix? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - - - - - The standard drill in this sort of situation is to fire up gdb on the - signal thread, which, in this case, was pid 1935. In another window, - I run gdb and attach pid 1935. - - - - - ~/linux/2.3.26/um 1016: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - - (gdb) att 1935 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 - 0x100756d9 in __wait4 () - - - - - - - Let's see what's currently running: - - - - (gdb) p current_task.pid - $1 = 0 - - - - - - It's the idle thread, which means that fsck went to sleep for some - reason and never woke up. - - - Let's guess that the last process in the process list is fsck: - - - - (gdb) p current_task.prev_task.comm - $13 = "fsck.ext2\000\000\000\000\000\000" - - - - - - It is, so let's see what it thinks it's up to: - - - - (gdb) p current_task.prev_task.thread - $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, - kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { - 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, - request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { - regs = {1350467584, 2952789424, 0 }, sigstack = 0, - pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, - arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { - op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} - - - - - - The interesting things here are the fact that its .thread.syscall.id - is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or - the defines in include/asm-um/arch/unistd.h), and that it never - returned. Also, its .request.op is OP_SWITCH (see - arch/um/include/user_util.h). These mean that it went into a write, - and, for some reason, called schedule(). - - - The fact that it never returned from write means that its stack should - be fairly interesting. Its pid is 1980 (.thread.extern_pid). That - process is being ptraced by the signal thread, so it must be detached - before gdb can attach it: - - - - - - - - - - - (gdb) call detach(1980) - - Program received signal SIGSEGV, Segmentation fault. - - The program being debugged stopped while in a function called from GDB. - When the function (detach) is done executing, GDB will silently - stop (instead of continuing to evaluate the expression containing - the function call). - (gdb) call detach(1980) - $15 = 0 - - - - - - The first detach segfaults for some reason, and the second one - succeeds. - - - Now I detach from the signal thread, attach to the fsck thread, and - look at its stack: - - - (gdb) det - Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 - (gdb) att 1980 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 - 0x10070451 in __kill () - (gdb) bt - #0 0x10070451 in __kill () - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - #4 0x10001d12 in schedule () at sched.c:777 - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #9 - #10 0x10155404 in errno () - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #14 - #15 0xc0fd in ?? () - #16 0x10016647 in sys_write (fd=3, - buf=0x80b8800
, count=1024) - at read_write.c:159 - #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #19 - #20 0x400dc8b0 in ?? () - - - - - - The interesting things here are : - - +o There are two segfaults on this stack (frames 9 and 14) - - +o The first faulting address (frame 11) is 0x50000800 - - (gdb) p (void *)1342179328 - $16 = (void *) 0x50000800 - - - - - - The initial faulting address is interesting because it is on the idle - thread's stack. I had been seeing the idle thread segfault for no - apparent reason, and the cause looked like stack corruption. In hopes - of catching the culprit in the act, I had turned off all protections - to that stack while the idle thread wasn't running. This apparently - tripped that trap. - - - However, the more immediate problem is that second segfault and I'm - going to concentrate on that. First, I want to see where the fault - happened, so I have to go look at the sigcontent struct in frame 8: - - - - (gdb) up - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - 30 kill(pid, SIGUSR1); - (gdb) - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - 156 usr1_pid(getpid()); - (gdb) - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - 161 _switch_to(prev, next); - (gdb) - #4 0x10001d12 in schedule () at sched.c:777 - 777 switch_to(prev, next, prev); - (gdb) - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - 71 schedule(); - (gdb) - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - 157 } - (gdb) - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - 182 segv_handler(sc); - (gdb) p *sc - Cannot access memory at address 0x0. - - - - - That's not very useful, so I'll try a more manual method: - - - (gdb) p *((struct sigcontext *) (&sig + 1)) - $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, - esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, - trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, - esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1280} - - - - The ip is in handle_mm_fault: - - - (gdb) p (void *)268480945 - $20 = (void *) 0x1000b1b1 - (gdb) i sym $20 - handle_mm_fault + 57 in section .text - - - - - - Specifically, it's in pte_alloc: - - - (gdb) i line *$20 - Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b1 - and ends at 0x1000b1b7 . - - - - - - To find where in handle_mm_fault this is, I'll jump forward in the - code until I see an address in that procedure: - - - - (gdb) i line *0x1000b1c0 - Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b7 - and ends at 0x1000b1c3 . - (gdb) i line *0x1000b1d0 - Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1d0 - and ends at 0x1000b1da . - (gdb) i line *0x1000b1e0 - Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1da - and ends at 0x1000b1e1 . - (gdb) i line *0x1000b1f0 - Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1f0 - and ends at 0x1000b200 . - (gdb) i line *0x1000b200 - Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b200 - and ends at 0x1000b208 . - (gdb) i line *0x1000b210 - Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b210 - and ends at 0x1000b219 . - (gdb) i line *0x1000b220 - Line 1168 of "memory.c" starts at address 0x1000b21e - and ends at 0x1000b222 . - - - - - - Something is apparently wrong with the page tables or vma_structs, so - lets go back to frame 11 and have a look at them: - - - - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - 50 handle_mm_fault(current, vma, address, is_write); - (gdb) call pgd_offset_proc(vma->vm_mm, address) - $22 = (pgd_t *) 0x80a548c - - - - - - That's pretty bogus. Page tables aren't supposed to be in process - text or data areas. Let's see what's in the vma: - - - (gdb) p *vma - $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, - vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, - vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, - vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, - vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, - vm_private_data = 0x62} - (gdb) p *vma.vm_mm - $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, - pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, - map_count = 134909076, mmap_sem = {count = {counter = 135073792}, - sleepers = -1342177872, wait = {lock = , - task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, - __magic = -1342177670, __creator = -1342177300}, __magic = 98}, - page_table_lock = {}, context = 138, start_code = 0, end_code = 0, - start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, - arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, - total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, - swap_address = 0, segments = 0x0} - - - - - - This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx - addresses, this is looking like a stack was plonked down on top of - these structures. Maybe it's a stack overflow from the next page: - - - - (gdb) p vma - $25 = (struct vm_area_struct *) 0x507d2434 - - - - - - That's towards the lower quarter of the page, so that would have to - have been pretty heavy stack overflow: - - - - - - - - - - - - - - - (gdb) x/100x $25 - 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c - 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 - 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a - 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 - 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 - 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 - 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 - - - - - - It's not stack overflow. The only "stack-like" piece of this data is - the vma_struct itself. - - - At this point, I don't see any avenues to pursue, so I just have to - admit that I have no idea what's going on. What I will do, though, is - stick a trap on the segfault handler which will stop if it sees any - writes to the idle thread's stack. That was the thing that happened - first, and it may be that if I can catch it immediately, what's going - on will be somewhat clearer. - - - 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk - - After setting a trap in the SEGV handler for accesses to the signal - thread's stack, I reran the kernel. - - - fsck hung again, this time by hitting the trap: - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 contains a file system with errors, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - Untested (4127) [100fe44c]: trap_kern.c line 31 - - - - - - I need to get the signal thread to detach from pid 4127 so that I can - attach to it with gdb. This is done by sending it a SIGUSR1, which is - caught by the signal thread, which detaches the process: - - - kill -USR1 4127 - - - - - - Now I can run gdb on it: - - - - - - - - - - - - - - ~/linux/2.3.26/um 1034: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) att 4127 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 - 0x10075891 in __libc_nanosleep () - - - - - - The backtrace shows that it was in a write and that the fault address - (address in frame 3) is 0x50000800, which is right in the middle of - the signal thread's stack page: - - - (gdb) bt - #0 0x10075891 in __libc_nanosleep () - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - #2 0x1006ce9a in stop () at user_util.c:191 - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 - #6 - #7 0xc0fd in ?? () - #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) - at read_write.c:159 - #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #11 - #12 0x400dc8b0 in ?? () - #13 - #14 0x400dc8b0 in ?? () - #15 0x80545fd in ?? () - #16 0x804daae in ?? () - #17 0x8054334 in ?? () - #18 0x804d23e in ?? () - #19 0x8049632 in ?? () - #20 0x80491d2 in ?? () - #21 0x80596b5 in ?? () - (gdb) p (void *)1342179328 - $3 = (void *) 0x50000800 - - - - - - Going up the stack to the segv_handler frame and looking at where in - the code the access happened shows that it happened near line 110 of - block_dev.c: - - - - - - - - - - (gdb) up - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. - (gdb) - #2 0x1006ce9a in stop () at user_util.c:191 - 191 while(1) sleep(1000000); - (gdb) - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - 31 KERN_UNTESTED(); - (gdb) - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) p *sc - $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, - esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, - err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, - esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1342179328} - (gdb) p (void *)268550834 - $2 = (void *) 0x1001c2b2 - (gdb) i sym $2 - block_write + 1090 in section .text - (gdb) i line *$2 - Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" - starts at address 0x1001c2a1 - and ends at 0x1001c2bf . - (gdb) i line *0x1001c2c0 - Line 110 of "block_dev.c" starts at address 0x1001c2bf - and ends at 0x1001c2e3 . - - - - - - Looking at the source shows that the fault happened during a call to - copy_to_user to copy the data into the kernel: - - - 107 count -= chars; - 108 copy_from_user(p,buf,chars); - 109 p += chars; - 110 buf += chars; - - - - - - p is the pointer which must contain 0x50000800, since buf contains - 0x80b8800 (frame 8 above). It is defined as: - - - p = offset + bh->b_data; - - - - - - I need to figure out what bh is, and it just so happens that bh is - passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a - few lines later, so I do a little disassembly: - - - - - (gdb) disas 0x1001c2bf 0x1001c2e0 - Dump of assembler code from 0x1001c2bf to 0x1001c2d0: - 0x1001c2bf : addl %eax,0xc(%ebp) - 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx - 0x1001c2c8 : btsl $0x0,0x18(%edx) - 0x1001c2cd : btsl $0x1,0x18(%edx) - 0x1001c2d2 : sbbl %ecx,%ecx - 0x1001c2d4 : testl %ecx,%ecx - 0x1001c2d6 : jne 0x1001c2e3 - 0x1001c2d8 : pushl $0x0 - 0x1001c2da : pushl %edx - 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> - End of assembler dump. - - - - - - At that point, bh is in %edx (address 0x1001c2da), which is calculated - at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, - taking %ebp from the sigcontext_struct above: - - - (gdb) p (void *)1342631484 - $5 = (void *) 0x5006ee3c - (gdb) p 0x5006ee3c+0xfffffdd4 - $6 = 1342630928 - (gdb) p (void *)$6 - $7 = (void *) 0x5006ec10 - (gdb) p *((void **)$7) - $8 = (void *) 0x50100200 - - - - - - Now, I look at the structure to see what's in it, and particularly, - what its b_data field contains: - - - (gdb) p *((struct buffer_head *)0x50100200) - $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, - b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, - b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, - b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, - b_data = 0x50000800 "", b_page = 0x50004000, - b_end_io = 0x10017f60 , b_dev_id = 0x0, - b_rsector = 98810, b_wait = {lock = , - task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, - __creator = 0}, b_kiobuf = 0x0} - - - - - - The b_data field is indeed 0x50000800, so the question becomes how - that happened. The rest of the structure looks fine, so this probably - is not a case of data corruption. It happened on purpose somehow. - - - The b_page field is a pointer to the page_struct representing the - 0x50000000 page. Looking at it shows the kernel's idea of the state - of that page: - - - - (gdb) p *$13.b_page - $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, - index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { - next = 0x50008460, prev = 0x50019350}, wait = { - lock = , task_list = {next = 0x50004024, - prev = 0x50004024}, __magic = 1342193708, __creator = 0}, - pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, - zone = 0x100c5160} - - - - - - Some sanity-checking: the virtual field shows the "virtual" address of - this page, which in this kernel is the same as its "physical" address, - and the page_struct itself should be mem_map[0], since it represents - the first page of memory: - - - - (gdb) p (void *)1342177280 - $18 = (void *) 0x50000000 - (gdb) p mem_map - $19 = (mem_map_t *) 0x50004000 - - - - - - These check out fine. - - - Now to check out the page_struct itself. In particular, the flags - field shows whether the page is considered free or not: - - - (gdb) p (void *)132 - $21 = (void *) 0x84 - - - - - - The "reserved" bit is the high bit, which is definitely not set, so - the kernel considers the signal stack page to be free and available to - be used. - - - At this point, I jump to conclusions and start looking at my early - boot code, because that's where that page is supposed to be reserved. - - - In my setup_arch procedure, I have the following code which looks just - fine: - - - - bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); - free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); - - - - - - Two stack pages have already been allocated, and low_physmem points to - the third page, which is the beginning of free memory. - The init_bootmem call declares the entire memory to the boot memory - manager, which marks it all reserved. The free_bootmem call frees up - all of it, except for the first two pages. This looks correct to me. - - - So, I decide to see init_bootmem run and make sure that it is marking - those first two pages as reserved. I never get that far. - - - Stepping into init_bootmem, and looking at bootmem_map before looking - at what it contains shows the following: - - - - (gdb) p bootmem_map - $3 = (void *) 0x50000000 - - - - - - Aha! The light dawns. That first page is doing double duty as a - stack and as the boot memory map. The last thing that the boot memory - manager does is to free the pages used by its memory map, so this page - is getting freed even its marked as reserved. - - - The fix was to initialize the boot memory manager before allocating - those two stack pages, and then allocate them through the boot memory - manager. After doing this, and fixing a couple of subsequent buglets, - the stack corruption problem disappeared. - - - - - - 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk - - - - - 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee - - As of test11, it is necessary to have "ARCH=um" in the environment or - on the make command line for all steps in building UML, including - clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, - and linux. If you forget for any of them, the i386 build seems to - contaminate the UML build. If this happens, start from scratch with - - - host% - make mrproper ARCH=um - - - - - and repeat the build process with ARCH=um on all the steps. - - - See ``Compiling the kernel and modules'' for more details. - - - Another cause of strange compilation errors is building UML in - /usr/src/linux. If you do this, the first thing you need to do is - clean up the mess you made. The /usr/src/linux/asm link will now - point to /usr/src/linux/asm-um. Make it point back to - /usr/src/linux/asm-i386. Then, move your UML pool someplace else and - build it there. Also see below, where a more specific set of symptoms - is described. - - - - 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss-- - tteemm - - I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. - Panics preceded by - - - Detaching pid nnnn - - - - are diagnostic of this problem. This is a reiserfs bug which causes a - thread to occasionally read stale data from a mmapped page shared with - another thread. The fix is to upgrade the filesystem or to have /tmp - be an ext2 filesystem. - - - - 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr - ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd'' - - This happens when you build in /usr/src/linux. The UML build makes - the include/asm link point to include/asm-um. /usr/include/asm points - to /usr/src/linux/include/asm, so when that link gets moved, files - which need to include the asm-i386 versions of headers get the - incompatible asm-um versions. The fix is to move the include/asm link - back to include/asm-i386 and to do UML builds someplace else. - - - - 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm - - This seems to be a similar situation with the ReiserFS problem above. - Some versions of NFS seems not to handle mmap correctly, which UML - depends on. The workaround is have /tmp be a non-NFS directory. - - - 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt - - If you build UML with gprof support and, early in the boot, it does - this - - - kernel BUG at page_alloc.c:100! - - - - - you have a buggy gcc. You can work around the problem by removing - UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up - another bug, but that one is fairly hard to reproduce. - - - - 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp - - The exact boot error depends on the distribution that you're booting, - but Debian produces this: - - - /etc/rc2.d/S10sysklogd: line 49: 93 Terminated - start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD - - - - - This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. - - - - 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt - - There are a couple of problems which were - name="pointed - out"> by Tim Robinson - - +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. - The fix is to upgrade to something more recent and then read the - next item. - - +o If you see - - - File descriptor in bad state - - - - when you bring up the device inside UML, you have a header mismatch - between the original kernel and the upgraded one. Make /usr/src/linux - point at the new headers. This will only be a problem if you build - uml_net yourself. - - - - 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee - nneett - - If you can connect to the host, and the host can connect to UML, but - you cannot connect to any other machines, then you may need to enable - IP Masquerading on the host. Usually this is only experienced when - using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML - networking, rather than the public address space that your host is - connected to. UML does not enable IP Masquerading, so you will need - to create a static rule to enable it: - - - host% - iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - - - - - Replace eth0 with the interface that you use to talk to the rest of - the world. - - - Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org . - - - If you can reach the local net, but not the outside Internet, then - that is usually a routing problem. The UML needs a default route: - - - UML# - route add default gw gateway IP - - - - - The gateway IP can be any machine on the local net that knows how to - reach the outside world. Usually, this is the host or the local net- - work's gateway. - - - Occasionally, we hear from someone who can reach some machines, but - not others on the same net, or who can reach some ports on other - machines, but not others. These are usually caused by strange - firewalling somewhere between the UML and the other box. You track - this down by running tcpdump on every interface the packets travel - over and see where they disappear. When you find a machine that takes - the packets in, but does not send them onward, that's the culprit. - - - - 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm - - Thanks to Birgit Wahlich for telling me about this strange one. It - turns out that there's a limit of six environment variables on the - kernel command line. When that limit is reached or exceeded, argument - processing stops, which means that the 'root=' argument that UML - usually adds is not seen. So, the filesystem has no idea what the - root device is, so it panics. - - - The fix is to put less stuff on the command line. Glomming all your - setup variables into one is probably the best way to go. - - - - 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh - - On some older systems, /usr/include/asm/ptrace.h and - /usr/include/sys/ucontext.h define the same names. So, when they're - included together, the defines from one completely mess up the parsing - of the other, producing errors like: - /usr/include/sys/ucontext.h:47: parse error before - `10' - - - - - plus a pile of warnings. - - - This is a libc botch, which has since been fixed, and I don't see any - way around it besides upgrading. - - - - 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss - - On i386 kernels, there are two ways of running the loop that is used - to calculate the BogoMips rating, using the TSC if it's there or using - a one-instruction loop. The TSC produces twice the BogoMips as the - loop. UML uses the loop, since it has nothing resembling a TSC, and - will get almost exactly the same BogoMips as a host using the loop. - However, on a host with a TSC, its BogoMips will be double the loop - BogoMips, and therefore double the UML BogoMips. - - - - 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss - - If the host is configured with the 2G/2G address space split, that's - why. See ``UML on 2G/2G hosts'' for the details on getting UML to - run on your host. - - - - 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr - - If you're running an up to date kernel with an old release of - uml_utilities, the port-helper program will not work properly, so - xterms will exit straight after they appear. The solution is to - upgrade to the latest release of uml_utilities. Usually this problem - occurs when you have installed a packaged release of UML then compiled - your own development kernel without upgrading the uml_utilities from - the source distribution. - - - - 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr - - If you're seeing truly strange behavior, such as hangs or panics that - happen in random places, or you try running the debugger to see what's - happening and it acts strangely, then it could be a problem in the - host kernel. If you're not running a stock Linus or -ac kernel, then - try that. An early version of the preemption patch and a 2.4.10 SuSE - kernel have caused very strange problems in UML. - - - Otherwise, let me know about it. Send a message to one of the UML - mailing lists - either the developer list - user-mode-linux-devel at - lists dot sourceforge dot net (subscription info) or the user list - - user-mode-linux-user at lists dot sourceforge do net (subscription - info), whichever you prefer. Don't assume that everyone knows about - it and that a fix is imminent. - - - If you want to be super-helpful, read ``Diagnosing Problems'' and - follow the instructions contained therein. - 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss - - - If you get UML to crash, hang, or otherwise misbehave, you should - report this on one of the project mailing lists, either the developer - list - user-mode-linux-devel at lists dot sourceforge dot net - (subscription info) or the user list - user-mode-linux-user at lists - dot sourceforge dot net (subscription info). When you do, it is - likely that I will want more information. So, it would be helpful to - read the stuff below, do whatever is applicable in your case, and - report the results to the list. - - - For any diagnosis, you're going to need to build a debugging kernel. - The binaries from this site aren't debuggable. If you haven't done - this before, read about ``Compiling the kernel and modules'' and - ``Kernel debugging'' UML first. - - - 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss - - The most common case is for a normal thread to panic. To debug this, - you will need to run it under the debugger (add 'debug' to the command - line). An xterm will start up with gdb running inside it. Continue - it when it stops in start_kernel and make it crash. Now ^C gdb and - - - If the panic was a "Kernel mode fault", then there will be a segv - frame on the stack and I'm going to want some more information. The - stack might look something like this: - - - (UML gdb) backtrace - #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) - at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 - #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 - #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 - #3 0x1009bf38 in __restore () - at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 - #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) - at trap_kern.c:66 - #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 - #6 0x1009bf38 in __restore () - - - - - I'm going to want to see the symbol and line information for the value - of ip in the segv frame. In this case, you would do the following: - - - (UML gdb) i sym 268849158 - - - - - and - - - (UML gdb) i line *268849158 - - - - - The reason for this is the __restore frame right above the segv_han- - dler frame is hiding the frame that actually segfaulted. So, I have - to get that information from the faulting ip. - - - 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss - - The less common and more painful case is when the tracing thread - panics. In this case, the kernel debugger will be useless because it - needs a healthy tracing thread in order to work. The first thing to - do is get a backtrace from the tracing thread. This is done by - figuring out what its pid is, firing up gdb, and attaching it to that - pid. You can figure out the tracing thread pid by looking at the - first line of the console output, which will look like this: - - - tracing thread pid = 15851 - - - - - or by running ps on the host and finding the line that looks like - this: - - - jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] - - - - - If the panic was 'segfault in signals', then follow the instructions - above for collecting information about the location of the seg fault. - - - If the tracing thread flaked out all by itself, then send that - backtrace in and wait for our crack debugging team to fix the problem. - - - 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss - - However, there are cases where the misbehavior of another thread - caused the problem. The most common panic of this type is: - - - wait_for_stop failed to wait for to stop with - - - - - In this case, you'll need to get a backtrace from the process men- - tioned in the panic, which is complicated by the fact that the kernel - debugger is defunct and without some fancy footwork, another gdb can't - attach to it. So, this is how the fancy footwork goes: - - In a shell: - - - host% kill -STOP pid - - - - - Run gdb on the tracing thread as described in case 2 and do: - - - (host gdb) call detach(pid) - - - If you get a segfault, do it again. It always works the second time. - - Detach from the tracing thread and attach to that other thread: - - - (host gdb) detach - - - - - - - (host gdb) attach pid - - - - - If gdb hangs when attaching to that process, go back to a shell and - do: - - - host% - kill -CONT pid - - - - - And then get the backtrace: - - - (host gdb) backtrace - - - - - - 1144..44.. CCaassee 44 :: HHaannggss - - Hangs seem to be fairly rare, but they sometimes happen. When a hang - happens, we need a backtrace from the offending process. Run the - kernel debugger as described in case 1 and get a backtrace. If the - current process is not the idle thread, then send in the backtrace. - You can tell that it's the idle thread if the stack looks like this: - - - #0 0x100b1401 in __libc_nanosleep () - #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 - #2 0x100a546f in do_idle () at process_kern.c:445 - #3 0x100a5508 in cpu_idle () at process_kern.c:471 - #4 0x100ec18f in start_kernel () at init/main.c:592 - #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 - #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 - - - - - If this is the case, then some other process is at fault, and went to - sleep when it shouldn't have. Run ps on the host and figure out which - process should not have gone to sleep and stayed asleep. Then attach - to it with gdb and get a backtrace as described in case 3. - - - - - - - 1155.. TThhaannkkss - - - A number of people have helped this project in various ways, and this - page gives recognition where recognition is due. - - - If you're listed here and you would prefer a real link on your name, - or no link at all, instead of the despammed email address pseudo-link, - let me know. - - - If you're not listed here and you think maybe you should be, please - let me know that as well. I try to get everyone, but sometimes my - bookkeeping lapses and I forget about contributions. - - - 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn - - Rusty Russell - - - +o wrote the HOWTO - - +o prodded me into making this project official and putting it on - SourceForge - - +o came up with the way cool UML logo - - +o redid the config process - - - Peter Moulder - Fixed my config and build - processes, and added some useful code to the block driver - - - Bill Stearns - - - +o HOWTO updates - - +o lots of bug reports - - +o lots of testing - - +o dedicated a box (uml.ists.dartmouth.edu) to support UML development - - +o wrote the mkrootfs script, which allows bootable filesystems of - RPM-based distributions to be cranked out - - +o cranked out a large number of filesystems with said script - - - Jim Leu - Wrote the virtual ethernet driver - and associated usermode tools - - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging - - - Andrea Arcangeli - Redid some of the early boot - code so that it would work on machines with Large File Support - - - Chris Emerson - Did - the first UML port to Linux/ppc - - - Harald Welte - Wrote the multicast - transport for the network driver - - - Jorgen Cederlof - Added special file support to hostfs - - - Greg Lonnon - Changed the ubd driver - to allow it to layer a COW file on a shared read-only filesystem and - wrote the iomem emulation support - - - Henrik Nordstrom - Provided a variety - of patches, fixes, and clues - - - Lennert Buytenhek - Contributed various patches, a rewrite of the - network driver, the first implementation of the mconsole driver, and - did the bulk of the work needed to get SMP working again. - - - Yon Uriarte - Fixed the TUN/TAP network backend while I slept. - - - Adam Heath - Made a bunch of nice cleanups to the initialization code, - plus various other small patches. - - - Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and - is doing a real nice job of it. He also noticed and fixed a number of - actually and potentially exploitable security holes in uml_net. Plus - the occasional patch. I like patches. - - - James McMechan - James seems to have taken over maintenance of the ubd - driver and is doing a nice job of it. - - - Chandan Kudige - wrote the umlgdb script which automates the reloading - of module symbols. - - - Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, - enabling UML processes to access audio devices on the host. He also - submitted patches for the slip transport and lots of other things. - - - David Coulson - - - +o Set up the usermodelinux.org site, - which is a great way of keeping the UML user community on top of - UML goings-on. - - +o Site documentation and updates - - +o Nifty little UML management daemon UMLd - - - +o Lots of testing and bug reports - - - - - 1155..22.. FFlluusshhiinngg oouutt bbuuggss - - - - +o Yuri Pudgorodsky - - +o Gerald Britton - - +o Ian Wehrman - - +o Gord Lamb - - +o Eugene Koontz - - +o John H. Hartman - - +o Anders Karlsson - - +o Daniel Phillips - - +o John Fremlin - - +o Rainer Burgstaller - - +o James Stevenson - - +o Matt Clay - - +o Cliff Jefferies - - +o Geoff Hoff - - +o Lennert Buytenhek - - +o Al Viro - - +o Frank Klingenhoefer - - +o Livio Baldini Soares - - +o Jon Burgess - - +o Petru Paler - - +o Paul - - +o Chris Reahard - - +o Sverker Nilsson - - +o Gong Su - - +o johan verrept - - +o Bjorn Eriksson - - +o Lorenzo Allegrucci - - +o Muli Ben-Yehuda - - +o David Mansfield - - +o Howard Goff - - +o Mike Anderson - - +o John Byrne - - +o Sapan J. Batia - - +o Iris Huang - - +o Jan Hudec - - +o Voluspa - - - - - 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss - - - - +o Dave Zarzycki - - +o Adam Lazur - - +o Boria Feigin - - +o Brian J. Murrell - - +o JS - - +o Roman Zippel - - +o Wil Cooley - - +o Ayelet Shemesh - - +o Will Dyson - - +o Sverker Nilsson - - +o dvorak - - +o v.naga srinivas - - +o Shlomi Fish - - +o Roger Binns - - +o johan verrept - - +o MrChuoi - - +o Peter Cleve - - +o Vincent Guffens - - +o Nathan Scott - - +o Patrick Caulfield - - +o jbearce - - +o Catalin Marinas - - +o Shane Spencer - - +o Zou Min - - - +o Ryan Boder - - +o Lorenzo Colitti - - +o Gwendal Grignou - - +o Andre' Breiler - - +o Tsutomu Yasuda - - - - 1155..44.. CCaassee SSttuuddiieess - - - +o Jon Wright - - +o William McEwan - - +o Michael Richardson - - - - 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss - - - Bill Carr made the Red Hat mkrootfs script - work with RH 6.2. - - Michael Jennings sent in some material which - is now gracing the top of the index page of this site. - - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to - produce most of the filesystems available on the project download - page. - - Laurent Bonnaud took the old grotty - Debian filesystem that I've been distributing and updated it to 2.2. - It is now available by itself here. - - Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make - releases even when Sourceforge is broken. - - Rodrigo de Castro looked at my broken pte code and told me what was - wrong with it, letting me fix a long-standing (several weeks) and - serious set of bugs. - - Chris Reahard built a specialized root filesystem for running a DNS - server jailed inside UML. It's available from the download - page in the Jail - Filesystems section. - - - - - - - - - - - - diff --git a/Documentation/usb/callbacks.txt b/Documentation/usb/callbacks.txt index bfb36b3..9e85846 100644 --- a/Documentation/usb/callbacks.txt +++ b/Documentation/usb/callbacks.txt @@ -95,9 +95,11 @@ pre_reset int (*pre_reset)(struct usb_interface *intf); -Another driver or user space is triggering a reset on the device which -contains the interface passed as an argument. Cease IO and save any -device state you need to restore. +A driver or user space is triggering a reset on the device which +contains the interface passed as an argument. Cease IO, wait for all +outstanding URBs to complete, and save any device state you need to +restore. No more URBs may be submitted until the post_reset method +is called. If you need to allocate memory here, use GFP_NOIO or GFP_ATOMIC, if you are in atomic context. diff --git a/Documentation/usb/hiddev.txt b/Documentation/usb/hiddev.txt deleted file mode 100644 index 6e8c9f1..0000000 --- a/Documentation/usb/hiddev.txt +++ /dev/null @@ -1,205 +0,0 @@ -Care and feeding of your Human Interface Devices - -INTRODUCTION - -In addition to the normal input type HID devices, USB also uses the -human interface device protocols for things that are not really human -interfaces, but have similar sorts of communication needs. The two big -examples for this are power devices (especially uninterruptable power -supplies) and monitor control on higher end monitors. - -To support these disparate requirements, the Linux USB system provides -HID events to two separate interfaces: -* the input subsystem, which converts HID events into normal input -device interfaces (such as keyboard, mouse and joystick) and a -normalised event interface - see Documentation/input/input.txt -* the hiddev interface, which provides fairly raw HID events - -The data flow for a HID event produced by a device is something like -the following : - - usb.c ---> hid-core.c ----> hid-input.c ----> [keyboard/mouse/joystick/event] - | - | - --> hiddev.c ----> POWER / MONITOR CONTROL - -In addition, other subsystems (apart from USB) can potentially feed -events into the input subsystem, but these have no effect on the hid -device interface. - -USING THE HID DEVICE INTERFACE - -The hiddev interface is a char interface using the normal USB major, -with the minor numbers starting at 96 and finishing at 111. Therefore, -you need the following commands: -mknod /dev/usb/hiddev0 c 180 96 -mknod /dev/usb/hiddev1 c 180 97 -mknod /dev/usb/hiddev2 c 180 98 -mknod /dev/usb/hiddev3 c 180 99 -mknod /dev/usb/hiddev4 c 180 100 -mknod /dev/usb/hiddev5 c 180 101 -mknod /dev/usb/hiddev6 c 180 102 -mknod /dev/usb/hiddev7 c 180 103 -mknod /dev/usb/hiddev8 c 180 104 -mknod /dev/usb/hiddev9 c 180 105 -mknod /dev/usb/hiddev10 c 180 106 -mknod /dev/usb/hiddev11 c 180 107 -mknod /dev/usb/hiddev12 c 180 108 -mknod /dev/usb/hiddev13 c 180 109 -mknod /dev/usb/hiddev14 c 180 110 -mknod /dev/usb/hiddev15 c 180 111 - -So you point your hiddev compliant user-space program at the correct -interface for your device, and it all just works. - -Assuming that you have a hiddev compliant user-space program, of -course. If you need to write one, read on. - - -THE HIDDEV API -This description should be read in conjunction with the HID -specification, freely available from http://www.usb.org, and -conveniently linked of http://www.linux-usb.org. - -The hiddev API uses a read() interface, and a set of ioctl() calls. - -HID devices exchange data with the host computer using data -bundles called "reports". Each report is divided into "fields", -each of which can have one or more "usages". In the hid-core, -each one of these usages has a single signed 32 bit value. - -read(): -This is the event interface. When the HID device's state changes, -it performs an interrupt transfer containing a report which contains -the changed value. The hid-core.c module parses the report, and -returns to hiddev.c the individual usages that have changed within -the report. In its basic mode, the hiddev will make these individual -usage changes available to the reader using a struct hiddev_event: - - struct hiddev_event { - unsigned hid; - signed int value; - }; - -containing the HID usage identifier for the status that changed, and -the value that it was changed to. Note that the structure is defined -within , along with some other useful #defines and -structures. The HID usage identifier is a composite of the HID usage -page shifted to the 16 high order bits ORed with the usage code. The -behavior of the read() function can be modified using the HIDIOCSFLAG -ioctl() described below. - - -ioctl(): -This is the control interface. There are a number of controls: - -HIDIOCGVERSION - int (read) -Gets the version code out of the hiddev driver. - -HIDIOCAPPLICATION - (none) -This ioctl call returns the HID application usage associated with the -hid device. The third argument to ioctl() specifies which application -index to get. This is useful when the device has more than one -application collection. If the index is invalid (greater or equal to -the number of application collections this device has) the ioctl -returns -1. You can find out beforehand how many application -collections the device has from the num_applications field from the -hiddev_devinfo structure. - -HIDIOCGCOLLECTIONINFO - struct hiddev_collection_info (read/write) -This returns a superset of the information above, providing not only -application collections, but all the collections the device has. It -also returns the level the collection lives in the hierarchy. -The user passes in a hiddev_collection_info struct with the index -field set to the index that should be returned. The ioctl fills in -the other fields. If the index is larger than the last collection -index, the ioctl returns -1 and sets errno to -EINVAL. - -HIDIOCGDEVINFO - struct hiddev_devinfo (read) -Gets a hiddev_devinfo structure which describes the device. - -HIDIOCGSTRING - struct hiddev_string_descriptor (read/write) -Gets a string descriptor from the device. The caller must fill in the -"index" field to indicate which descriptor should be returned. - -HIDIOCINITREPORT - (none) -Instructs the kernel to retrieve all input and feature report values -from the device. At this point, all the usage structures will contain -current values for the device, and will maintain it as the device -changes. Note that the use of this ioctl is unnecessary in general, -since later kernels automatically initialize the reports from the -device at attach time. - -HIDIOCGNAME - string (variable length) -Gets the device name - -HIDIOCGREPORT - struct hiddev_report_info (write) -Instructs the kernel to get a feature or input report from the device, -in order to selectively update the usage structures (in contrast to -INITREPORT). - -HIDIOCSREPORT - struct hiddev_report_info (write) -Instructs the kernel to send a report to the device. This report can -be filled in by the user through HIDIOCSUSAGE calls (below) to fill in -individual usage values in the report before sending the report in full -to the device. - -HIDIOCGREPORTINFO - struct hiddev_report_info (read/write) -Fills in a hiddev_report_info structure for the user. The report is -looked up by type (input, output or feature) and id, so these fields -must be filled in by the user. The ID can be absolute -- the actual -report id as reported by the device -- or relative -- -HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT | -report_id) for the next report after report_id. Without a-priori -information about report ids, the right way to use this ioctl is to -use the relative IDs above to enumerate the valid IDs. The ioctl -returns non-zero when there is no more next ID. The real report ID is -filled into the returned hiddev_report_info structure. - -HIDIOCGFIELDINFO - struct hiddev_field_info (read/write) -Returns the field information associated with a report in a -hiddev_field_info structure. The user must fill in report_id and -report_type in this structure, as above. The field_index should also -be filled in, which should be a number from 0 and maxfield-1, as -returned from a previous HIDIOCGREPORTINFO call. - -HIDIOCGUCODE - struct hiddev_usage_ref (read/write) -Returns the usage_code in a hiddev_usage_ref structure, given that -given its report type, report id, field index, and index within the -field have already been filled into the structure. - -HIDIOCGUSAGE - struct hiddev_usage_ref (read/write) -Returns the value of a usage in a hiddev_usage_ref structure. The -usage to be retrieved can be specified as above, or the user can -choose to fill in the report_type field and specify the report_id as -HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be -filled in with the report and field information associated with this -usage if it is found. - -HIDIOCSUSAGE - struct hiddev_usage_ref (write) -Sets the value of a usage in an output report. The user fills in -the hiddev_usage_ref structure as above, but additionally fills in -the value field. - -HIDIOGCOLLECTIONINDEX - struct hiddev_usage_ref (write) -Returns the collection index associated with this usage. This -indicates where in the collection hierarchy this usage sits. - -HIDIOCGFLAG - int (read) -HIDIOCSFLAG - int (write) -These operations respectively inspect and replace the mode flags -that influence the read() call above. The flags are as follows: - - HIDDEV_FLAG_UREF - read() calls will now return - struct hiddev_usage_ref instead of struct hiddev_event. - This is a larger structure, but in situations where the - device has more than one usage in its reports with the - same usage code, this mode serves to resolve such - ambiguity. - - HIDDEV_FLAG_REPORT - This flag can only be used in conjunction - with HIDDEV_FLAG_UREF. With this flag set, when the device - sends a report, a struct hiddev_usage_ref will be returned - to read() filled in with the report_type and report_id, but - with field_index set to FIELD_INDEX_NONE. This serves as - additional notification when the device has sent a report. diff --git a/Documentation/usb/linux-cdc-acm.inf b/Documentation/usb/linux-cdc-acm.inf index 612e722..37a02ce 100644 --- a/Documentation/usb/linux-cdc-acm.inf +++ b/Documentation/usb/linux-cdc-acm.inf @@ -90,10 +90,10 @@ ServiceBinary=%12%\USBSER.sys [SourceDisksFiles] [SourceDisksNames] [DeviceList] -%DESCRIPTION%=DriverInstall, USB\VID_0525&PID_A4A7, USB\VID_0525&PID_A4AB&MI_02 +%DESCRIPTION%=DriverInstall, USB\VID_0525&PID_A4A7, USB\VID_1D6B&PID_0104&MI_02 [DeviceList.NTamd64] -%DESCRIPTION%=DriverInstall, USB\VID_0525&PID_A4A7, USB\VID_0525&PID_A4AB&MI_02 +%DESCRIPTION%=DriverInstall, USB\VID_0525&PID_A4A7, USB\VID_1D6B&PID_0104&MI_02 ;------------------------------------------------------------------------------ diff --git a/Documentation/usb/linux.inf b/Documentation/usb/linux.inf index 4dee958..4ffa715b0 100644 --- a/Documentation/usb/linux.inf +++ b/Documentation/usb/linux.inf @@ -18,15 +18,15 @@ DriverVer = 06/21/2006,6.0.6000.16384 ; Decoration for x86 architecture [LinuxDevices.NTx86] -%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_0525&PID_a4ab&MI_00 +%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_1d6b&PID_0104&MI_00 ; Decoration for x64 architecture [LinuxDevices.NTamd64] -%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_0525&PID_a4ab&MI_00 +%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_1d6b&PID_0104&MI_00 ; Decoration for ia64 architecture [LinuxDevices.NTia64] -%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_0525&PID_a4ab&MI_00 +%LinuxDevice% = RNDIS.NT.5.1, USB\VID_0525&PID_a4a2, USB\VID_1d6b&PID_0104&MI_00 ;@@@ This is the common setting for setup [ControlFlags] diff --git a/Documentation/vgaarbiter.txt b/Documentation/vgaarbiter.txt index 43a9b06..b7d401e 100644 --- a/Documentation/vgaarbiter.txt +++ b/Documentation/vgaarbiter.txt @@ -14,11 +14,10 @@ the legacy VGA arbitration task (besides other bus management tasks) when more than one legacy device co-exists on the same machine. But the problem happens when these devices are trying to be accessed by different userspace clients (e.g. two server in parallel). Their address assignments conflict. Moreover, -ideally, being an userspace application, it is not the role of the the X -server to control bus resources. Therefore an arbitration scheme outside of -the X server is needed to control the sharing of these resources. This -document introduces the operation of the VGA arbiter implemented for Linux -kernel. +ideally, being a userspace application, it is not the role of the X server to +control bus resources. Therefore an arbitration scheme outside of the X server +is needed to control the sharing of these resources. This document introduces +the operation of the VGA arbiter implemented for the Linux kernel. ---------------------------------------------------------------------------- @@ -39,7 +38,7 @@ I.1 vgaarb The vgaarb is a module of the Linux Kernel. When it is initially loaded, it scans all PCI devices and adds the VGA ones inside the arbitration. The arbiter then enables/disables the decoding on different devices of the VGA -legacy instructions. Device which do not want/need to use the arbiter may +legacy instructions. Devices which do not want/need to use the arbiter may explicitly tell it by calling vga_set_legacy_decoding(). The kernel exports a char device interface (/dev/vga_arbiter) to the clients, @@ -95,8 +94,8 @@ In the case of devices hot-{un,}plugged, there is a hook - pci_notify() - to notify them being added/removed in the system and automatically added/removed in the arbiter. -There's also a in-kernel API of the arbiter in the case of DRM, vgacon and -others which may use the arbiter. +There is also an in-kernel API of the arbiter in case DRM, vgacon, or other +drivers want to use it. I.2 libpciaccess @@ -117,9 +116,8 @@ Besides it, in pci_system were added: struct pci_device *vga_default_dev; -The vga_count is usually need to keep informed how many cards are being -arbitrated, so for instance if there's only one then it can totally escape the -scheme. +The vga_count is used to track how many cards are being arbitrated, so for +instance, if there is only one card, then it can completely escape arbitration. These functions below acquire VGA resources for the given card and mark those diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx index 31b4857..9aae449 100644 --- a/Documentation/video4linux/CARDLIST.em28xx +++ b/Documentation/video4linux/CARDLIST.em28xx @@ -54,7 +54,7 @@ 53 -> Pinnacle Hybrid Pro (em2881) 54 -> Kworld VS-DVB-T 323UR (em2882) [eb1a:e323] 55 -> Terratec Cinnergy Hybrid T USB XS (em2882) (em2882) [0ccd:005e,0ccd:0042] - 56 -> Pinnacle Hybrid Pro (2) (em2882) [2304:0226] + 56 -> Pinnacle Hybrid Pro (330e) (em2882) [2304:0226] 57 -> Kworld PlusTV HD Hybrid 330 (em2883) [eb1a:a316] 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041] 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f] diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran index c40e3ba..9ed629d 100644 --- a/Documentation/video4linux/Zoran +++ b/Documentation/video4linux/Zoran @@ -130,7 +130,6 @@ Card number: 4 Note: No module for the mse3000 is available yet Note: No module for the vpx3224 is available yet -Note: use encoder=X or decoder=X for non-default i2c chips =========================== diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt index 5c542e6..5bfa9a7 100644 --- a/Documentation/video4linux/gspca.txt +++ b/Documentation/video4linux/gspca.txt @@ -275,6 +275,7 @@ pac7302 093a:2629 Genious iSlim 300 pac7302 093a:262a Webcam 300k pac7302 093a:262c Philips SPC 230 NC jeilinj 0979:0280 Sakar 57379 +jeilinj 0979:0280 Sportscam DV15 zc3xx 0ac8:0302 Z-star Vimicro zc0302 vc032x 0ac8:0321 Vimicro generic vc0321 vc032x 0ac8:0323 Vimicro Vc0323 diff --git a/Documentation/video4linux/uvcvideo.txt b/Documentation/video4linux/uvcvideo.txt new file mode 100644 index 0000000..848d620 --- /dev/null +++ b/Documentation/video4linux/uvcvideo.txt @@ -0,0 +1,239 @@ +Linux USB Video Class (UVC) driver +================================== + +This file documents some driver-specific aspects of the UVC driver, such as +driver-specific ioctls and implementation notes. + +Questions and remarks can be sent to the Linux UVC development mailing list at +linux-uvc-devel@lists.berlios.de. + + +Extension Unit (XU) support +--------------------------- + +1. Introduction + +The UVC specification allows for vendor-specific extensions through extension +units (XUs). The Linux UVC driver supports extension unit controls (XU controls) +through two separate mechanisms: + + - through mappings of XU controls to V4L2 controls + - through a driver-specific ioctl interface + +The first one allows generic V4L2 applications to use XU controls by mapping +certain XU controls onto V4L2 controls, which then show up during ordinary +control enumeration. + +The second mechanism requires uvcvideo-specific knowledge for the application to +access XU controls but exposes the entire UVC XU concept to user space for +maximum flexibility. + +Both mechanisms complement each other and are described in more detail below. + + +2. Control mappings + +The UVC driver provides an API for user space applications to define so-called +control mappings at runtime. These allow for individual XU controls or byte +ranges thereof to be mapped to new V4L2 controls. Such controls appear and +function exactly like normal V4L2 controls (i.e. the stock controls, such as +brightness, contrast, etc.). However, reading or writing of such a V4L2 controls +triggers a read or write of the associated XU control. + +The ioctl used to create these control mappings is called UVCIOC_CTRL_MAP. +Previous driver versions (before 0.2.0) required another ioctl to be used +beforehand (UVCIOC_CTRL_ADD) to pass XU control information to the UVC driver. +This is no longer necessary as newer uvcvideo versions query the information +directly from the device. + +For details on the UVCIOC_CTRL_MAP ioctl please refer to the section titled +"IOCTL reference" below. + + +3. Driver specific XU control interface + +For applications that need to access XU controls directly, e.g. for testing +purposes, firmware upload, or accessing binary controls, a second mechanism to +access XU controls is provided in the form of a driver-specific ioctl, namely +UVCIOC_CTRL_QUERY. + +A call to this ioctl allows applications to send queries to the UVC driver that +directly map to the low-level UVC control requests. + +In order to make such a request the UVC unit ID of the control's extension unit +and the control selector need to be known. This information either needs to be +hardcoded in the application or queried using other ways such as by parsing the +UVC descriptor or, if available, using the media controller API to enumerate a +device's entities. + +Unless the control size is already known it is necessary to first make a +UVC_GET_LEN requests in order to be able to allocate a sufficiently large buffer +and set the buffer size to the correct value. Similarly, to find out whether +UVC_GET_CUR or UVC_SET_CUR are valid requests for a given control, a +UVC_GET_INFO request should be made. The bits 0 (GET supported) and 1 (SET +supported) of the resulting byte indicate which requests are valid. + +With the addition of the UVCIOC_CTRL_QUERY ioctl the UVCIOC_CTRL_GET and +UVCIOC_CTRL_SET ioctls have become obsolete since their functionality is a +subset of the former ioctl. For the time being they are still supported but +application developers are encouraged to use UVCIOC_CTRL_QUERY instead. + +For details on the UVCIOC_CTRL_QUERY ioctl please refer to the section titled +"IOCTL reference" below. + + +4. Security + +The API doesn't currently provide a fine-grained access control facility. The +UVCIOC_CTRL_ADD and UVCIOC_CTRL_MAP ioctls require super user permissions. + +Suggestions on how to improve this are welcome. + + +5. Debugging + +In order to debug problems related to XU controls or controls in general it is +recommended to enable the UVC_TRACE_CONTROL bit in the module parameter 'trace'. +This causes extra output to be written into the system log. + + +6. IOCTL reference + +---- UVCIOC_CTRL_MAP - Map a UVC control to a V4L2 control ---- + +Argument: struct uvc_xu_control_mapping + +Description: + This ioctl creates a mapping between a UVC control or part of a UVC + control and a V4L2 control. Once mappings are defined, userspace + applications can access vendor-defined UVC control through the V4L2 + control API. + + To create a mapping, applications fill the uvc_xu_control_mapping + structure with information about an existing UVC control defined with + UVCIOC_CTRL_ADD and a new V4L2 control. + + A UVC control can be mapped to several V4L2 controls. For instance, + a UVC pan/tilt control could be mapped to separate pan and tilt V4L2 + controls. The UVC control is divided into non overlapping fields using + the 'size' and 'offset' fields and are then independantly mapped to + V4L2 control. + + For signed integer V4L2 controls the data_type field should be set to + UVC_CTRL_DATA_TYPE_SIGNED. Other values are currently ignored. + +Return value: + On success 0 is returned. On error -1 is returned and errno is set + appropriately. + + ENOMEM + Not enough memory to perform the operation. + EPERM + Insufficient privileges (super user privileges are required). + EINVAL + No such UVC control. + EOVERFLOW + The requested offset and size would overflow the UVC control. + EEXIST + Mapping already exists. + +Data types: + * struct uvc_xu_control_mapping + + __u32 id V4L2 control identifier + __u8 name[32] V4L2 control name + __u8 entity[16] UVC extension unit GUID + __u8 selector UVC control selector + __u8 size V4L2 control size (in bits) + __u8 offset V4L2 control offset (in bits) + enum v4l2_ctrl_type + v4l2_type V4L2 control type + enum uvc_control_data_type + data_type UVC control data type + struct uvc_menu_info + *menu_info Array of menu entries (for menu controls only) + __u32 menu_count Number of menu entries (for menu controls only) + + * struct uvc_menu_info + + __u32 value Menu entry value used by the device + __u8 name[32] Menu entry name + + + * enum uvc_control_data_type + + UVC_CTRL_DATA_TYPE_RAW Raw control (byte array) + UVC_CTRL_DATA_TYPE_SIGNED Signed integer + UVC_CTRL_DATA_TYPE_UNSIGNED Unsigned integer + UVC_CTRL_DATA_TYPE_BOOLEAN Boolean + UVC_CTRL_DATA_TYPE_ENUM Enumeration + UVC_CTRL_DATA_TYPE_BITMASK Bitmask + + +---- UVCIOC_CTRL_QUERY - Query a UVC XU control ---- + +Argument: struct uvc_xu_control_query + +Description: + This ioctl queries a UVC XU control identified by its extension unit ID + and control selector. + + There are a number of different queries available that closely + correspond to the low-level control requests described in the UVC + specification. These requests are: + + UVC_GET_CUR + Obtain the current value of the control. + UVC_GET_MIN + Obtain the minimum value of the control. + UVC_GET_MAX + Obtain the maximum value of the control. + UVC_GET_DEF + Obtain the default value of the control. + UVC_GET_RES + Query the resolution of the control, i.e. the step size of the + allowed control values. + UVC_GET_LEN + Query the size of the control in bytes. + UVC_GET_INFO + Query the control information bitmap, which indicates whether + get/set requests are supported. + UVC_SET_CUR + Update the value of the control. + + Applications must set the 'size' field to the correct length for the + control. Exceptions are the UVC_GET_LEN and UVC_GET_INFO queries, for + which the size must be set to 2 and 1, respectively. The 'data' field + must point to a valid writable buffer big enough to hold the indicated + number of data bytes. + + Data is copied directly from the device without any driver-side + processing. Applications are responsible for data buffer formatting, + including little-endian/big-endian conversion. This is particularly + important for the result of the UVC_GET_LEN requests, which is always + returned as a little-endian 16-bit integer by the device. + +Return value: + On success 0 is returned. On error -1 is returned and errno is set + appropriately. + + ENOENT + The device does not support the given control or the specified + extension unit could not be found. + ENOBUFS + The specified buffer size is incorrect (too big or too small). + EINVAL + An invalid request code was passed. + EBADRQC + The given request is not supported by the given control. + EFAULT + The data pointer references an inaccessible memory area. + +Data types: + * struct uvc_xu_control_query + + __u8 unit Extension unit ID + __u8 selector Control selector + __u8 query Request code to send to the device + __u16 size Control data size (in bytes) + __u8 *data Control value diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX new file mode 100644 index 0000000..fe0251c --- /dev/null +++ b/Documentation/virtual/00-INDEX @@ -0,0 +1,10 @@ +Virtualization support in the Linux kernel. + +00-INDEX + - this file. +kvm/ + - Kernel Virtual Machine. See also http://linux-kvm.org +lguest/ + - Extremely simple hypervisor for experimental/educational use. +uml/ + - User Mode Linux, builds/runs Linux kernel as a userspace program. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt new file mode 100644 index 0000000..42542eb --- /dev/null +++ b/Documentation/virtual/kvm/api.txt @@ -0,0 +1,1479 @@ +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to three classes + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus). + + Only run VM ioctls from the same process (address space) that was used + to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + Only run vcpu ioctls from the same thread that was used to create the + vcpu. + +2. File descriptors + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descriptor which can be used to issue VM +ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu +and return a file descriptor pointing to it. Finally, ioctls on a vcpu +fd can be used to control the vcpu, including the important task of +actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. The only supported use is one virtual machine per process, +and one vcpu per thread. + +3. Extensions + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible change are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on on the Linux version number. +Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + +4. API description + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: which KVM extension provides this ioctl. Can be 'basic', + which means that is will be provided by any kernel that supports + API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4). + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: system, vm, or vcpu. + + Parameters: what parameters are accepted by the ioctl. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + +4.1 KVM_GET_API_VERSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + +4.2 KVM_CREATE_VM + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory. An mmap() of a VM fd +will access the virtual machine's physical address space; offset zero +corresponds to guest physical address zero. Use of mmap() on a VM fd +is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is +available. + +4.3 KVM_GET_MSR_INDEX_LIST + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_msr_list (in/out) +Returns: 0 on success; -1 on error +Errors: + E2BIG: the msr index list is to be to fit in the array specified by + the user. + +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +This ioctl returns the guest msrs that are supported. The list varies +by kvm version and host processor, but does not change otherwise. The +user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in +the indices array with their numbers. + +Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are +not returned in the MSR list, as different vcpus can have a different number +of banks, as set via the KVM_X86_SETUP_MCE ioctl. + +4.4 KVM_CHECK_EXTENSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: extension identifier (KVM_CAP_*) +Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value. + +4.5 KVM_GET_VCPU_MMAP_SIZE + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + +4.6 KVM_SET_MEMORY_REGION + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_memory_region (in) +Returns: 0 on success, -1 on error + +This ioctl is obsolete and has been removed. + +4.7 KVM_CREATE_VCPU + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: vcpu id (apic id on x86) +Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. The vcpu id is a small integer +in the range [0, max_vcpus). You can use KVM_CAP_NR_VCPUS of the +KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. +If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 +cpus max. + +4.8 KVM_GET_DIRTY_LOG (vm ioctl) + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in/out) +Returns: 0 on success, -1 on error + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +4.9 KVM_SET_MEMORY_ALIAS + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_memory_alias (in) +Returns: 0 (success), -1 (error) + +This ioctl is obsolete and has been removed. + +4.10 KVM_RUN + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error +Errors: + EINTR: an unmasked signal is pending + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + +4.11 KVM_GET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (out) +Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +/* x86 */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +4.12 KVM_SET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (in) +Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + +4.13 KVM_GET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (out) +Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. + +/* x86 */ +struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* ppc -- see arch/powerpc/include/asm/kvm.h */ + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + +4.14 KVM_SET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (in) +Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + +4.15 KVM_TRANSLATE + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_translation (in/out) +Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +4.16 KVM_INTERRUPT + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_interrupt (in) +Returns: 0 on success, -1 on error + +Queues a hardware interrupt vector to be injected. This is only +useful if in-kernel local APIC or equivalent is not used. + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +X86: + +Note 'irq' is an interrupt vector, not an interrupt pin or line. + +PPC: + +Queues an external interrupt to be injected. This ioctl is overleaded +with 3 different irq values: + +a) KVM_INTERRUPT_SET + + This injects an edge type external interrupt into the guest once it's ready + to receive interrupts. When injected, the interrupt is done. + +b) KVM_INTERRUPT_UNSET + + This unsets any pending interrupt. + + Only available with KVM_CAP_PPC_UNSET_IRQ. + +c) KVM_INTERRUPT_SET_LEVEL + + This injects a level type external interrupt into the guest context. The + interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET + is triggered. + + Only available with KVM_CAP_PPC_IRQ_LEVEL. + +Note that any value for 'irq' other than the ones stated above is invalid +and incurs unexpected behavior. + +4.17 KVM_DEBUG_GUEST + +Capability: basic +Architectures: none +Type: vcpu ioctl +Parameters: none) +Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + +4.18 KVM_GET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in/out) +Returns: 0 on success, -1 on error + +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST. + +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + +4.19 KVM_SET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in) +Returns: 0 on success, -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + +4.20 KVM_SET_CPUID + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid (in) +Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +4.21 KVM_SET_SIGNAL_MASK + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_signal_mask (in) +Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the threads signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask. + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +4.22 KVM_GET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (out) +Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +4.23 KVM_SET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (in) +Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +4.24 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. On x86, creates a virtual +ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a +local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 +only go to the IOAPIC. On ia64, a IOSAPIC is created. + +4.25 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +Requires that an interrupt controller model has been previously created with +KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level +to be set to 1 and then back to 0. + +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + +4.26 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.27 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.28 KVM_XEN_HVM_CONFIG + +Capability: KVM_CAP_XEN_HVM +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_xen_hvm_config (in) +Returns: 0 on success, -1 on error + +Sets the MSR that the Xen HVM guest uses to initialize its hypercall +page, and provides the starting address and size of the hypercall +blobs in userspace. When the guest writes the MSR, kvm copies one +page of a blob (32- or 64-bit, depending on the vcpu mode) to guest +memory. + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +4.29 KVM_GET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (out) +Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + +4.30 KVM_SET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (in) +Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter. +In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + +4.31 KVM_GET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_vcpu_event (out) +Returns: 0 on success, -1 on error + +Gets currently pending exceptions, interrupts, and NMIs as well as related +states of the vcpu. + +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; +}; + +KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that +interrupt.shadow contains a valid state. Otherwise, this field is undefined. + +4.32 KVM_SET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_vcpu_event (in) +Returns: 0 on success, -1 on error + +Set pending exceptions, interrupts, and NMIs as well as related states of the +vcpu. + +See KVM_GET_VCPU_EVENTS for the data structure. + +Fields that may be modified asynchronously by running VCPUs can be excluded +from the update. These fields are nmi.pending and sipi_vector. Keep the +corresponding bits in the flags field cleared to suppress overwriting the +current in-kernel state. The bits are: + +KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel +KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector + +If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in +the flags field to signal that interrupt.shadow contains a valid state and +shall be written into the VCPU. + +4.33 KVM_GET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (out) +Returns: 0 on success, -1 on error + +Reads debug registers from the vcpu. + +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + +4.34 KVM_SET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (in) +Returns: 0 on success, -1 on error + +Writes debug registers into the vcpu. + +See KVM_GET_DEBUGREGS for the data structure. The flags field is unused +yet and must be cleared on entry. + +4.35 KVM_SET_USER_MEMORY_REGION + +Capability: KVM_CAP_USER_MEM +Architectures: all +Type: vm ioctl +Parameters: struct kvm_userspace_memory_region (in) +Returns: 0 on success, -1 on error + +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + +This ioctl allows the user to create or modify a guest physical memory +slot. When changing an existing slot, it may be moved in the guest +physical memory space, or its flags may be modified. It may not be +resized. Slots may not overlap in guest physical address space. + +Memory for the region is taken starting at the address denoted by the +field userspace_addr, which must point at user addressable memory for +the entire memory slot size. Any object may back this memory, including +anonymous memory, ordinary files, and hugetlbfs. + +It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr +be identical. This allows large pages in the guest to be backed by large +pages in the host. + +The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which +instructs kvm to keep track of writes to memory within the slot. See +the KVM_GET_DIRTY_LOG ioctl. + +When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory +region are automatically reflected into the guest. For example, an mmap() +that affects the region will be made visible immediately. Another example +is madvise(MADV_DROP). + +It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. +The KVM_SET_MEMORY_REGION does not allow fine grained control over memory +allocation and is deprecated. + +4.36 KVM_SET_TSS_ADDR + +Capability: KVM_CAP_SET_TSS_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long tss_address (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a three-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +4.37 KVM_ENABLE_CAP + +Capability: KVM_CAP_ENABLE_CAP +Architectures: ppc +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + ++Not all extensions are enabled by default. Using this ioctl the application +can enable an extension, making it available to the guest. + +On systems that do not support this ioctl, it always fails. On systems that +do support it, it only works for extensions that are supported for enablement. + +To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should +be used. + +struct kvm_enable_cap { + /* in */ + __u32 cap; + +The capability that is supposed to get enabled. + + __u32 flags; + +A bitfield indicating future enhancements. Has to be 0 for now. + + __u64 args[4]; + +Arguments for enabling a feature. If a feature needs initial values to +function properly, this is the place to put them. + + __u8 pad[64]; +}; + +4.38 KVM_GET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, ia64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (out) +Returns: 0 on success; -1 on error + +struct kvm_mp_state { + __u32 mp_state; +}; + +Returns the vcpu's current "multiprocessing state" (though also valid on +uniprocessor guests). + +Possible values are: + + - KVM_MP_STATE_RUNNABLE: the vcpu is currently running + - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) + which has not yet received an INIT signal + - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is + now ready for a SIPI + - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and + is waiting for an interrupt + - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector + accessible via KVM_GET_VCPU_EVENTS) + +This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel +irqchip, the multiprocessing state must be maintained by userspace. + +4.39 KVM_SET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, ia64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (in) +Returns: 0 on success; -1 on error + +Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for +arguments. + +This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel +irqchip, the multiprocessing state must be maintained by userspace. + +4.40 KVM_SET_IDENTITY_MAP_ADDR + +Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long identity (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a one-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +4.41 KVM_SET_BOOT_CPU_ID + +Capability: KVM_CAP_SET_BOOT_CPU_ID +Architectures: x86, ia64 +Type: vm ioctl +Parameters: unsigned long vcpu_id +Returns: 0 on success, -1 on error + +Define which vcpu is the Bootstrap Processor (BSP). Values are the same +as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default +is vcpu 0. + +4.42 KVM_GET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (out) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy current vcpu's xsave struct to the userspace. + +4.43 KVM_SET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (in) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy userspace's xsave struct to the kernel. + +4.44 KVM_GET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (out) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would copy current vcpu's xcrs to the userspace. + +4.45 KVM_SET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (in) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would set vcpu's xcr to the value userspace specified. + +4.46 KVM_GET_SUPPORTED_CPUID + +Capability: KVM_CAP_EXT_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are supported by both the hardware +and kvm. Userspace can use the information returned by this ioctl to +construct cpuid information (for KVM_SET_CPUID2) that is consistent with +hardware, kernel, and userspace capabilities, and with user requirements (for +example, the user may wish to constrain cpuid to emulate older hardware, +or for feature consistency across a cluster). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe the cpu +capabilities, an error (E2BIG) is returned. If the number is too high, +the 'nent' field is adjusted and an error (ENOMEM) is returned. If the +number is just right, the 'nent' field is adjusted to the number of valid +entries in the 'entries' array, which is then filled. + +The entries returned are the host cpuid as returned by the cpuid instruction, +with unknown or unsupported features masked out. Some features (for example, +x2apic), may not be present in the host cpu, but are exposed by kvm if it can +emulate them efficiently. The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +4.47 KVM_PPC_GET_PVINFO + +Capability: KVM_CAP_PPC_GET_PVINFO +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_pvinfo (out) +Returns: 0 on success, !0 on error + +struct kvm_ppc_pvinfo { + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +This ioctl fetches PV specific information that need to be passed to the guest +using the device tree or other means from vm context. + +For now the only implemented piece of information distributed here is an array +of 4 instructions that make up a hypercall. + +If any additional field gets added to this structure later on, a bit for that +additional piece of information will be set in the flags bitmap. + +4.48 KVM_ASSIGN_PCI_DEVICE + +Capability: KVM_CAP_DEVICE_ASSIGNMENT +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Assigns a host PCI device to the VM. + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; + __u32 segnr; + union { + __u32 reserved[11]; + }; +}; + +The PCI device is specified by the triple segnr, busnr, and devfn. +Identification in succeeding service requests is done via assigned_dev_id. The +following flags are specified: + +/* Depends on KVM_CAP_IOMMU */ +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + +4.49 KVM_DEASSIGN_PCI_DEVICE + +Capability: KVM_CAP_DEVICE_DEASSIGNMENT +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Ends PCI device assignment, releasing all associated resources. + +See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is +used in kvm_assigned_pci_dev to identify the device. + +4.50 KVM_ASSIGN_DEV_IRQ + +Capability: KVM_CAP_ASSIGN_DEV_IRQ +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_irq (in) +Returns: 0 on success, -1 on error + +Assigns an IRQ to a passed-through device. + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; + union { + struct { + __u32 addr_lo; + __u32 addr_hi; + __u32 data; + } guest_msi; + __u32 reserved[12]; + }; +}; + +The following flags are defined: + +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +It is not valid to specify multiple types per host or guest IRQ. However, the +IRQ type of host and guest can differ or can even be null. + +4.51 KVM_DEASSIGN_DEV_IRQ + +Capability: KVM_CAP_ASSIGN_DEV_IRQ +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_irq (in) +Returns: 0 on success, -1 on error + +Ends an IRQ assignment to a passed-through device. + +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified +by assigned_dev_id, flags must correspond to the IRQ type specified on +KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. + +4.52 KVM_SET_GSI_ROUTING + +Capability: KVM_CAP_IRQ_ROUTING +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_irq_routing (in) +Returns: 0 on success, -1 on error + +Sets the GSI routing table entries, overwriting any previously set entries. + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + __u32 pad[8]; + } u; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + +4.53 KVM_ASSIGN_SET_MSIX_NR + +Capability: KVM_CAP_DEVICE_MSIX +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_msix_nr (in) +Returns: 0 on success, -1 on error + +Set the number of MSI-X interrupts for an assigned device. This service can +only be called once in the lifetime of an assigned device. + +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 256 + +4.54 KVM_ASSIGN_SET_MSIX_ENTRY + +Capability: KVM_CAP_DEVICE_MSIX +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_msix_entry (in) +Returns: 0 on success, -1 on error + +Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting +the GSI vector to zero means disabling the interrupt. + +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + +4.54 KVM_SET_TSC_KHZ + +Capability: KVM_CAP_TSC_CONTROL +Architectures: x86 +Type: vcpu ioctl +Parameters: virtual tsc_khz +Returns: 0 on success, -1 on error + +Specifies the tsc frequency for the virtual machine. The unit of the +frequency is KHz. + +4.55 KVM_GET_TSC_KHZ + +Capability: KVM_CAP_GET_TSC_KHZ +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: virtual tsc-khz on success, negative value on error + +Returns the tsc frequency of the guest. The unit of the return value is +KHz. If the host has unstable tsc this ioctl returns -EIO instead as an +error. + +5. The kvm_run structure + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + + __u8 padding1[7]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + + __u8 padding2[2]; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. + + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + + struct { + struct kvm_debug_exit_arch arch; + } debug; + +Unused. + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding +operations are complete (and guest state is consistent) only after userspace +has re-entered the kernel with KVM_RUN. The kernel side will first finish +incomplete operations and then check for pending signals. Userspace +can re-enter the guest with an unmasked signal pending to complete +pending operations. + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. This was once used for 'hypercall to userspace'. To implement +such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). +Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +powerpc specific. + + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + +MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch +hypercalls and exit with this exit struct that contains all the guest gprs. + +If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. +Userspace can now handle the hypercall and when it's done modify the gprs as +necessary. Upon guest entry all guest GPRs will then be replaced by the values +in this struct. + + /* Fix the size of the union. */ + char padding[256]; + }; +}; diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt new file mode 100644 index 0000000..8820685 --- /dev/null +++ b/Documentation/virtual/kvm/cpuid.txt @@ -0,0 +1,45 @@ +KVM CPUID bits +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +A guest running on a kvm host, can check some of its features using +cpuid. This is not always guaranteed to work, since userspace can +mask-out some, or even all KVM-related cpuid features before launching +a guest. + +KVM cpuid functions are: + +function: KVM_CPUID_SIGNATURE (0x40000000) +returns : eax = 0, + ebx = 0x4b4d564b, + ecx = 0x564b4d56, + edx = 0x4d. +Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +This function queries the presence of KVM cpuid leafs. + + +function: define KVM_CPUID_FEATURES (0x40000001) +returns : ebx, ecx, edx = 0 + eax = and OR'ed group of (1 << flag), where each flags is: + + +flag || value || meaning +============================================================================= +KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs + || || 0x11 and 0x12. +------------------------------------------------------------------------------ +KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays + || || on PIO operations. +------------------------------------------------------------------------------ +KVM_FEATURE_MMU_OP || 2 || deprecated. +------------------------------------------------------------------------------ +KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs + || || 0x4b564d00 and 0x4b564d01 +------------------------------------------------------------------------------ +KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by + || || writing to msr 0x4b564d02 +------------------------------------------------------------------------------ +KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side + || || per-cpu warps are expected in + || || kvmclock. +------------------------------------------------------------------------------ diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt new file mode 100644 index 0000000..3b4cd3b --- /dev/null +++ b/Documentation/virtual/kvm/locking.txt @@ -0,0 +1,25 @@ +KVM Lock Overview +================= + +1. Acquisition Orders +--------------------- + +(to be written) + +2. Reference +------------ + +Name: kvm_lock +Type: raw_spinlock +Arch: any +Protects: - vm_list + - hardware virtualization enable/disable +Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +Name: kvm_arch::tsc_write_lock +Type: raw_spinlock +Arch: x86 +Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +Comment: 'raw' because updating the tsc offsets must not be preempted. diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt new file mode 100644 index 0000000..f46aa58 --- /dev/null +++ b/Documentation/virtual/kvm/mmu.txt @@ -0,0 +1,348 @@ +The x86 kvm shadow mmu +====================== + +The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible +for presenting a standard x86 mmu to the guest, while translating guest +physical addresses to host physical addresses. + +The mmu code attempts to satisfy the following requirements: + +- correctness: the guest should not be able to determine that it is running + on an emulated mmu except for timing (we attempt to comply + with the specification, not emulate the characteristics of + a particular implementation such as tlb size) +- security: the guest must not be able to touch host memory not assigned + to it +- performance: minimize the performance penalty imposed by the mmu +- scaling: need to scale to large memory and large vcpu guests +- hardware: support the full range of x86 virtualization hardware +- integration: Linux memory management code must be in control of guest memory + so that swapping, page migration, page merging, transparent + hugepages, and similar features work without change +- dirty tracking: report writes to guest memory to enable live migration + and framebuffer-based displays +- footprint: keep the amount of pinned kernel memory low (most memory + should be shrinkable) +- reliability: avoid multipage or GFP_ATOMIC allocations + +Acronyms +======== + +pfn host page frame number +hpa host physical address +hva host virtual address +gfn guest frame number +gpa guest physical address +gva guest virtual address +ngpa nested guest physical address +ngva nested guest virtual address +pte page table entry (used also to refer generically to paging structure + entries) +gpte guest pte (referring to gfns) +spte shadow pte (referring to pfns) +tdp two dimensional paging (vendor neutral term for NPT and EPT) + +Virtual and real hardware supported +=================================== + +The mmu supports first-generation mmu hardware, which allows an atomic switch +of the current paging mode and cr3 during guest entry, as well as +two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware +it exposes is the traditional 2/3/4 level x86 mmu, with support for global +pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support +exposing NPT capable hardware on NPT capable hosts. + +Translation +=========== + +The primary job of the mmu is to program the processor's mmu to translate +addresses for the guest. Different translations are required at different +times: + +- when guest paging is disabled, we translate guest physical addresses to + host physical addresses (gpa->hpa) +- when guest paging is enabled, we translate guest virtual addresses, to + guest physical addresses, to host physical addresses (gva->gpa->hpa) +- when the guest launches a guest of its own, we translate nested guest + virtual addresses, to nested guest physical addresses, to guest physical + addresses, to host physical addresses (ngva->ngpa->gpa->hpa) + +The primary challenge is to encode between 1 and 3 translations into hardware +that support only 1 (traditional) and 2 (tdp) translations. When the +number of required translations matches the hardware, the mmu operates in +direct mode; otherwise it operates in shadow mode (see below). + +Memory +====== + +Guest memory (gpa) is part of the user address space of the process that is +using kvm. Userspace defines the translation between guest addresses and user +addresses (gpa->hva); note that two gpas may alias to the same hva, but not +vice versa. + +These hvas may be backed using any method available to the host: anonymous +memory, file backed memory, and device memory. Memory might be paged by the +host at any time. + +Events +====== + +The mmu is driven by events, some from the guest, some from the host. + +Guest generated events: +- writes to control registers (especially cr3) +- invlpg/invlpga instruction execution +- access to missing or protected translations + +Host generated events: +- changes in the gpa->hpa translation (either through gpa->hva changes or + through hva->hpa changes) +- memory pressure (the shrinker) + +Shadow pages +============ + +The principal data structure is the shadow page, 'struct kvm_mmu_page'. A +shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A +shadow page may contain a mix of leaf and nonleaf sptes. + +A nonleaf spte allows the hardware mmu to reach the leaf pages and +is not related to a translation directly. It points to other shadow pages. + +A leaf spte corresponds to either one or two translations encoded into +one paging structure entry. These are always the lowest level of the +translation stack, with optional higher level translations left to NPT/EPT. +Leaf ptes point at guest pages. + +The following table shows translations encoded by leaf ptes, with higher-level +translations in parentheses: + + Non-nested guests: + nonpaging: gpa->hpa + paging: gva->gpa->hpa + paging, tdp: (gva->)gpa->hpa + Nested guests: + non-tdp: ngva->gpa->hpa (*) + tdp: (ngva->)ngpa->gpa->hpa + +(*) the guest hypervisor will encode the ngva->gpa translation into its page + tables if npt is not present + +Shadow pages contain the following information: + role.level: + The level in the shadow paging hierarchy that this shadow page belongs to. + 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. + role.direct: + If set, leaf sptes reachable from this page are for a linear range. + Examples include real mode translation, large guest pages backed by small + host pages, and gpa->hpa translations when NPT or EPT is active. + The linear range starts at (gfn << PAGE_SHIFT) and its size is determined + by role.level (2MB for first level, 1GB for second level, 0.5TB for third + level, 256TB for fourth level) + If clear, this page corresponds to a guest page table denoted by the gfn + field. + role.quadrant: + When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit + sptes. That means a guest page table contains more ptes than the host, + so multiple shadow pages are needed to shadow one guest page. + For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the + first or second 512-gpte block in the guest page table. For second-level + page tables, each 32-bit gpte is converted to two 64-bit sptes + (since each first-level guest page is shadowed by two first-level + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: + Inherited guest access permissions in the form uwx. Note execute + permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is + unpinned it will be destroyed. + role.cr4_pae: + Contains the value of cr4.pae for which the page is valid (e.g. whether + 32-bit or 64-bit gptes are in use). + role.nxe: + Contains the value of efer.nxe for which the page is valid. + role.cr0_wp: + Contains the value of cr0.wp for which the page is valid. + gfn: + Either the guest page table containing the translations shadowed by this + page, or the base page frame for linear translations. See role.direct. + spt: + A pageful of 64-bit sptes containing the translations for this page. + Accessed by both kvm and hardware. + The page pointed to by spt will have its page->private pointing back + at the shadow page structure. + sptes in spt point either at guest pages, or at lower-level shadow pages. + Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point + at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. + The spt array forms a DAG structure with the shadow page as a node, and + guest pages as leaves. + gfns: + An array of 512 guest frame numbers, one for each present pte. Used to + perform a reverse map from a pte to a gfn. When role.direct is set, any + element of this array can be calculated from the gfn field when used, in + this case, the array of gfns is not allocated. See role.direct and gfn. + slot_bitmap: + A bitmap containing one bit per memory slot. If the page contains a pte + mapping a page from memory slot n, then bit n of slot_bitmap will be set + (if a page is aliased among several slots, then it is not guaranteed that + all slots will be marked). + Used during dirty logging to avoid scanning a shadow page if none if its + pages need tracking. + root_count: + A counter keeping track of how many hardware registers (guest cr3 or + pdptrs) are now pointing at the page. While this counter is nonzero, the + page cannot be destroyed. See role.invalid. + multimapped: + Whether there exist multiple sptes pointing at this page. + parent_pte/parent_ptes: + If multimapped is zero, parent_pte points at the single spte that points at + this page's spt. Otherwise, parent_ptes points at a data structure + with a list of parent_ptes. + unsync: + If true, then the translations in this page may not match the guest's + translation. This is equivalent to the state of the tlb when a pte is + changed but before the tlb entry is flushed. Accordingly, unsync ptes + are synchronized when the guest executes invlpg or flushes its tlb by + other means. Valid for leaf pages. + unsync_children: + How many sptes in the page point at pages that are unsync (or have + unsynchronized children). + unsync_child_bitmap: + A bitmap indicating which sptes in spt point (directly or indirectly) at + pages that may be unsynchronized. Used to quickly locate all unsychronized + pages reachable from a given page. + +Reverse map +=========== + +The mmu maintains a reverse mapping whereby all ptes mapping a page can be +reached given its gfn. This is used, for example, when swapping out a page. + +Synchronized and unsynchronized pages +===================================== + +The guest uses two events to synchronize its tlb and page tables: tlb flushes +and page invalidations (invlpg). + +A tlb flush means that we need to synchronize all sptes reachable from the +guest's cr3. This is expensive, so we keep all guest page tables write +protected, and synchronize sptes to gptes when a gpte is written. + +A special case is when a guest page table is reachable from the current +guest cr3. In this case, the guest is obliged to issue an invlpg instruction +before using the translation. We take advantage of that by removing write +protection from the guest page, and allowing the guest to modify it freely. +We synchronize modified gptes when the guest invokes invlpg. This reduces +the amount of emulation we have to do when the guest modifies multiple gptes, +or when the a guest page is no longer used as a page table and is used for +random guest data. + +As a side effect we have to resynchronize all reachable unsynchronized shadow +pages on a tlb flush. + + +Reaction to events +================== + +- guest page fault (or npt page fault, or ept violation) + +This is the most complicated event. The cause of a page fault can be: + + - a true guest fault (the guest translation won't allow the access) (*) + - access to a missing translation + - access to a protected translation + - when logging dirty pages, memory is write protected + - synchronized shadow pages are write protected (*) + - access to untranslatable memory (mmio) + + (*) not applicable in direct mode + +Handling a page fault is performed as follows: + + - if needed, walk the guest page tables to determine the guest translation + (gva->gpa or ngpa->gpa) + - if permissions are insufficient, reflect the fault back to the guest + - determine the host page + - if this is an mmio request, there is no host page; call the emulator + to emulate the instruction instead + - walk the shadow page table to find the spte for the translation, + instantiating missing intermediate page tables as necessary + - try to unsynchronize the page + - if successful, we can let the guest continue and modify the gpte + - emulate the instruction + - if failed, unshadow the page and let the guest continue + - update any translations that were modified by the instruction + +invlpg handling: + + - walk the shadow page hierarchy and drop affected translations + - try to reinstantiate the indicated translation in the hope that the + guest will use it in the near future + +Guest control register updates: + +- mov to cr3 + - look up new shadow roots + - synchronize newly reachable shadow pages + +- mov to cr0/cr4/efer + - set up mmu context for new paging mode + - look up new shadow roots + - synchronize newly reachable shadow pages + +Host translation updates: + + - mmu notifier called with updated hva + - look up affected sptes through reverse map + - drop (or update) translations + +Emulating cr0.wp +================ + +If tdp is not enabled, the host must keep cr0.wp=1 so page write protection +works for the guest kernel, not guest guest userspace. When the guest +cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, +we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the +semantics require allowing any guest kernel access plus user read access). + +We handle this by mapping the permissions to two possible sptes, depending +on fault type: + +- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, + disallows user access) +- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel + write access) + +(user write faults generate a #PF) + +Large pages +=========== + +The mmu supports all combinations of large and small guest and host pages. +Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as +two separate 2M pages, on both guest and host, since the mmu always uses PAE +paging. + +To instantiate a large spte, four constraints must be satisfied: + +- the spte must point to a large host page +- the guest pte must be a large pte of at least equivalent size (if tdp is + enabled, there is no guest pte and this condition is satisified) +- if the spte will be writeable, the large page frame may not overlap any + write-protected pages +- the guest page must be wholly contained by a single memory slot + +To check the last two conditions, the mmu maintains a ->write_count set of +arrays for each memory slot and large page size. Every write protected page +causes its write_count to be incremented, thus preventing instantiation of +a large spte. The frames at the end of an unaligned memory slot have +artificically inflated ->write_counts so they can never be instantiated. + +Further reading +=============== + +- NPT presentation from KVM Forum 2008 + http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf + diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt new file mode 100644 index 0000000..d079aed --- /dev/null +++ b/Documentation/virtual/kvm/msr.txt @@ -0,0 +1,187 @@ +KVM-specific MSRs. +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +KVM makes use of some custom MSRs to service some requests. + +Custom MSRs have a range reserved for them, that goes from +0x4b564d00 to 0x4b564dff. There are MSRs outside this area, +but they are deprecated and their use is discouraged. + +Custom MSR list +-------- + +The current supported Custom MSR list is: + +MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 + + data: 4-byte alignment physical address of a memory area which must be + in guest RAM. This memory is expected to hold a copy of the following + structure: + + struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; + } __attribute__((__packed__)); + + whose data will be filled in by the hypervisor. The hypervisor is only + guaranteed to update this data at the moment of MSR write. + Users that want to reliably query this information more than once have + to write more than once to this MSR. Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + sec: number of seconds for wallclock. + + nsec: number of nanoseconds for wallclock. + + Note that although MSRs are per-CPU entities, the effect of this + particular MSR is global. + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 + + data: 4-byte aligned physical address of a memory area which must be in + guest RAM, plus an enable bit in bit 0. This memory is expected to hold + a copy of the following structure: + + struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; + } __attribute__((__packed__)); /* 32 bytes */ + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. + + Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + tsc_timestamp: the tsc value at the current VCPU at the time + of the update of this structure. Guests can subtract this value + from current tsc to derive a notion of elapsed time since the + structure update. + + system_time: a host notion of monotonic time, including sleep + time at the time this structure was last updated. Unit is + nanoseconds. + + tsc_to_system_mul: a function of the tsc frequency. One has + to multiply any tsc-related quantity by this value to get + a value in nanoseconds, besides dividing by 2^tsc_shift + + tsc_shift: cycle to nanosecond divider, as a power of two, to + allow for shift rights. One has to shift right any tsc-related + quantity by this value to get a value in nanoseconds, besides + multiplying by tsc_to_system_mul. + + With this information, guests can derive per-CPU time by + doing: + + time = (current_tsc - tsc_timestamp) + time = (time * tsc_to_system_mul) >> tsc_shift + time = time + system_time + + flags: bits in this field indicate extended capabilities + coordinated between the guest and the hypervisor. Availability + of specific flags has to be checked in 0x40000001 cpuid leaf. + Current flags are: + + flag bit | cpuid bit | meaning + ------------------------------------------------------------- + | | time measures taken across + 0 | 24 | multiple cpus are guaranteed to + | | be monotonic + ------------------------------------------------------------- + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + + +MSR_KVM_WALL_CLOCK: 0x11 + + data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME: 0x12 + + data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + + The suggested algorithm for detecting kvmclock presence is then: + + if (!kvm_para_available()) /* refer to cpuid.txt */ + return NON_PRESENT; + + flags = cpuid_eax(0x40000001); + if (flags & 3) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + return PRESENT; + } else if (flags & 0) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + return PRESENT; + } else + return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. Bit 2 is 1 if asynchronous page faults can be injected + when vcpu is in cpl == 0. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt new file mode 100644 index 0000000..3ab969c --- /dev/null +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -0,0 +1,196 @@ +The PPC KVM paravirtual interface +================================= + +The basic execution principle by which KVM on PowerPC works is to run all kernel +space code in PR=1 which is user space. This way we trap all privileged +instructions and can emulate them accordingly. + +Unfortunately that is also the downfall. There are quite some privileged +instructions that needlessly return us to the hypervisor even though they +could be handled differently. + +This is what the PPC PV interface helps with. It takes privileged instructions +and transforms them into unprivileged ones with some help from the hypervisor. +This cuts down virtualization costs by about 50% on some of my benchmarks. + +The code for that interface can be found in arch/powerpc/kernel/kvm* + +Querying for existence +====================== + +To find out if we're running on KVM or not, we leverage the device tree. When +Linux is running on KVM, a node /hypervisor exists. That node contains a +compatible property with the value "linux,kvm". + +Once you determined you're running under a PV capable KVM, you can now use +hypercalls as described below. + +KVM hypercalls +============== + +Inside the device tree's /hypervisor node there's a property called +'hypercall-instructions'. This property contains at most 4 opcodes that make +up the hypercall. To call a hypercall, just call these instructions. + +The parameters are as follows: + + Register IN OUT + + r0 - volatile + r3 1st parameter Return code + r4 2nd parameter 1st output value + r5 3rd parameter 2nd output value + r6 4th parameter 3rd output value + r7 5th parameter 4th output value + r8 6th parameter 5th output value + r9 7th parameter 6th output value + r10 8th parameter 7th output value + r11 hypercall number 8th output value + r12 - volatile + +Hypercall definitions are shared in generic code, so the same hypercall numbers +apply for x86 and powerpc alike with the exception that each KVM hypercall +also needs to be ORed with the KVM vendor code which is (42 << 16). + +Return codes can be as follows: + + Code Meaning + + 0 Success + 12 Hypercall not implemented + <0 Error + +The magic page +============== + +To enable communication between the hypervisor and guest there is a new shared +page that contains parts of supervisor visible register state. The guest can +map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. + +With this hypercall issued the guest always gets the magic page mapped at the +desired location in effective and physical address space. For now, we always +map the page to -4096. This way we can access it using absolute load and store +functions. The following instruction reads the first field of the magic page: + + ld rX, -4096(0) + +The interface is designed to be extensible should there be need later to add +additional registers to the magic page. If you add fields to the magic page, +also define a new hypercall feature to indicate that the host can give you more +registers. Only if the host supports the additional features, make use of them. + +The magic page has the following layout as described in +arch/powerpc/include/asm/kvm_para.h: + +struct kvm_vcpu_arch_shared { + __u64 scratch1; + __u64 scratch2; + __u64 scratch3; + __u64 critical; /* Guest may not get interrupts if == r1 */ + __u64 sprg0; + __u64 sprg1; + __u64 sprg2; + __u64 sprg3; + __u64 srr0; + __u64 srr1; + __u64 dar; + __u64 msr; + __u32 dsisr; + __u32 int_pending; /* Tells the guest if we have an interrupt */ +}; + +Additions to the page must only occur at the end. Struct fields are always 32 +or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. + +Magic page features +=================== + +When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, +a second return value is passed to the guest. This second return value contains +a bitmap of available features inside the magic page. + +The following enhancements to the magic page are currently available: + + KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page + +For enhanced features in the magic page, please check for the existence of the +feature before using them! + +MSR bits +======== + +The MSR contains bits that require hypervisor intervention and bits that do +not require direct hypervisor intervention because they only get interpreted +when entering the guest or don't have any impact on the hypervisor's behavior. + +The following bits are safe to be set inside the guest: + + MSR_EE + MSR_RI + MSR_CR + MSR_ME + +If any other bit changes in the MSR, please still use mtmsr(d). + +Patched instructions +==================== + +The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions +respectively on 32 bit systems with an added offset of 4 to accommodate for big +endianness. + +The following is a list of mapping the Linux kernel performs when running as +guest. Implementing any of those mappings is optional, as the instruction traps +also act on the shared page. So calling privileged instructions still works as +before. + +From To +==== == + +mfmsr rX ld rX, magic_page->msr +mfsprg rX, 0 ld rX, magic_page->sprg0 +mfsprg rX, 1 ld rX, magic_page->sprg1 +mfsprg rX, 2 ld rX, magic_page->sprg2 +mfsprg rX, 3 ld rX, magic_page->sprg3 +mfsrr0 rX ld rX, magic_page->srr0 +mfsrr1 rX ld rX, magic_page->srr1 +mfdar rX ld rX, magic_page->dar +mfdsisr rX lwz rX, magic_page->dsisr + +mtmsr rX std rX, magic_page->msr +mtsprg 0, rX std rX, magic_page->sprg0 +mtsprg 1, rX std rX, magic_page->sprg1 +mtsprg 2, rX std rX, magic_page->sprg2 +mtsprg 3, rX std rX, magic_page->sprg3 +mtsrr0 rX std rX, magic_page->srr0 +mtsrr1 rX std rX, magic_page->srr1 +mtdar rX std rX, magic_page->dar +mtdsisr rX stw rX, magic_page->dsisr + +tlbsync nop + +mtmsrd rX, 0 b +mtmsr rX b + +mtmsrd rX, 1 b + +[Book3S only] +mtsrin rX, rY b + +[BookE only] +wrteei [0|1] b + + +Some instructions require more logic to determine what's going on than a load +or store instruction can deliver. To enable patching of those, we keep some +RAM around where we can live translate instructions to. What happens is the +following: + + 1) copy emulation code to memory + 2) patch that code to fit the emulated instruction + 3) patch that code to return to the original pc + 4 + 4) patch the original instruction to branch to the new code + +That way we can inject an arbitrary amount of code as replacement for a single +instruction. This allows us to check for pending interrupts when setting EE=1 +for example. diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt new file mode 100644 index 0000000..a850986 --- /dev/null +++ b/Documentation/virtual/kvm/review-checklist.txt @@ -0,0 +1,38 @@ +Review checklist for kvm patches +================================ + +1. The patch must follow Documentation/CodingStyle and + Documentation/SubmittingPatches. + +2. Patches should be against kvm.git master branch. + +3. If the patch introduces or modifies a new userspace API: + - the API must be documented in Documentation/virtual/kvm/api.txt + - the API must be discoverable using KVM_CHECK_EXTENSION + +4. New state must include support for save/restore. + +5. New features must default to off (userspace should explicitly request them). + Performance improvements can and should default to on. + +6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 + +7. Emulator changes should be accompanied by unit tests for qemu-kvm.git + kvm/test directory. + +8. Changes should be vendor neutral when possible. Changes to common code + are better than duplicating changes to vendor code. + +9. Similarly, prefer changes to arch independent code than to arch dependent + code. + +10. User/kernel interfaces and guest/host interfaces must be 64-bit clean + (all variables and sizes naturally aligned on 64-bit; use specific types + only - u64 rather than ulong). + +11. New guest visible features must either be documented in a hardware manual + or be accompanied by documentation. + +12. Features must be robust against reset and kexec - for example, shared + host/guest memory must be unshared to prevent the host from writing to + guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virtual/kvm/timekeeping.txt new file mode 100644 index 0000000..df89463 --- /dev/null +++ b/Documentation/virtual/kvm/timekeeping.txt @@ -0,0 +1,612 @@ + + Timekeeping Virtualization for X86-Based Architectures + + Zachary Amsden + Copyright (c) 2010, Red Hat. All rights reserved. + +1) Overview +2) Timing Devices +3) TSC Hardware +4) Virtualization Problems + +========================================================================= + +1) Overview + +One of the most complicated parts of the X86 platform, and specifically, +the virtualization of this platform is the plethora of timing devices available +and the complexity of emulating those devices. In addition, virtualization of +time introduces a new set of challenges because it introduces a multiplexed +division of time beyond the control of the guest CPU. + +First, we will describe the various timekeeping hardware available, then +present some of the problems which arise and solutions available, giving +specific recommendations for certain classes of KVM guests. + +The purpose of this document is to collect data and information relevant to +timekeeping which may be difficult to find elsewhere, specifically, +information relevant to KVM and hardware-based virtualization. + +========================================================================= + +2) Timing Devices + +First we discuss the basic hardware devices available. TSC and the related +KVM clock are special enough to warrant a full exposition and are described in +the following section. + +2.1) i8254 - PIT + +One of the first timer devices available is the programmable interrupt timer, +or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three +channels which can be programmed to deliver periodic or one-shot interrupts. +These three channels can be configured in different modes and have individual +counters. Channel 1 and 2 were not available for general use in the original +IBM PC, and historically were connected to control RAM refresh and the PC +speaker. Now the PIT is typically integrated as part of an emulated chipset +and a separate physical PIT is not used. + +The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done +using single or multiple byte access to the I/O ports. There are 6 modes +available, but not all modes are available to all timers, as only timer 2 +has a connected gate input, required for modes 1 and 5. The gate line is +controlled by port 61h, bit 0, as illustrated in the following diagram. + + -------------- ---------------- +| | | | +| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 +| Clock | | | | + -------------- | +->| GATE TIMER 0 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM + | | | (aka /dev/null) + | +->| GATE TIMER 1 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> Port 61h, bit 5 + | | | +Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ + ---------------- _| )--|LPF|---Speaker + / *---- \___/ +Port 61h, bit 1 -----------------------------------/ + +The timer modes are now described. + +Mode 0: Single Timeout. This is a one-shot software timeout that counts down + when the gate is high (always true for timers 0 and 1). When the count + reaches zero, the output goes high. + +Mode 1: Triggered One-shot. The output is initially set high. When the gate + line is set high, a countdown is initiated (which does not stop if the gate is + lowered), during which the output is set low. When the count reaches zero, + the output goes high. + +Mode 2: Rate Generator. The output is initially set high. When the countdown + reaches 1, the output goes low for one count and then returns high. The value + is reloaded and the countdown automatically resumes. If the gate line goes + low, the count is halted. If the output is low when the gate is lowered, the + output automatically goes high (this only affects timer 2). + +Mode 3: Square Wave. This generates a high / low square wave. The count + determines the length of the pulse, which alternates between high and low + when zero is reached. The count only proceeds when gate is high and is + automatically reloaded on reaching zero. The count is decremented twice at + each clock to generate a full high / low cycle at the full periodic rate. + If the count is even, the clock remains high for N/2 counts and low for N/2 + counts; if the clock is odd, the clock is high for (N+1)/2 counts and low + for (N-1)/2 counts. Only even values are latched by the counter, so odd + values are not observed when reading. This is the intended mode for timer 2, + which generates sine-like tones by low-pass filtering the square wave output. + +Mode 4: Software Strobe. After programming this mode and loading the counter, + the output remains high until the counter reaches zero. Then the output + goes low for 1 clock cycle and returns high. The counter is not reloaded. + Counting only occurs when gate is high. + +Mode 5: Hardware Strobe. After programming and loading the counter, the + output remains high. When the gate is raised, a countdown is initiated + (which does not stop if the gate is lowered). When the counter reaches zero, + the output goes low for 1 clock cycle and then returns high. The counter is + not reloaded. + +In addition to normal binary counting, the PIT supports BCD counting. The +command port, 0x43 is used to set the counter and mode for each of the three +timers. + +PIT commands, issued to port 0x43, using the following bit encoding: + +Bit 7-4: Command (See table below) +Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) +Bit 0 : Binary (0) / BCD (1) + +Command table: + +0000 - Latch Timer 0 count for port 0x40 + sample and hold the count to be read in port 0x40; + additional commands ignored until counter is read; + mode bits ignored. + +0001 - Set Timer 0 LSB mode for port 0x40 + set timer to read LSB only and force MSB to zero; + mode bits set timer mode + +0010 - Set Timer 0 MSB mode for port 0x40 + set timer to read MSB only and force LSB to zero; + mode bits set timer mode + +0011 - Set Timer 0 16-bit mode for port 0x40 + set timer to read / write LSB first, then MSB; + mode bits set timer mode + +0100 - Latch Timer 1 count for port 0x41 - as described above +0101 - Set Timer 1 LSB mode for port 0x41 - as described above +0110 - Set Timer 1 MSB mode for port 0x41 - as described above +0111 - Set Timer 1 16-bit mode for port 0x41 - as described above + +1000 - Latch Timer 2 count for port 0x42 - as described above +1001 - Set Timer 2 LSB mode for port 0x42 - as described above +1010 - Set Timer 2 MSB mode for port 0x42 - as described above +1011 - Set Timer 2 16-bit mode for port 0x42 as described above + +1101 - General counter latch + Latch combination of counters into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + Bit 0 = Unused + +1110 - Latch timer status + Latch combination of counter mode into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + + The output of ports 0x40-0x42 following this command will be: + + Bit 7 = Output pin + Bit 6 = Count loaded (0 if timer has expired) + Bit 5-4 = Read / Write mode + 01 = MSB only + 10 = LSB only + 11 = LSB / MSB (16-bit) + Bit 3-1 = Mode + Bit 0 = Binary (0) / BCD mode (1) + +2.2) RTC + +The second device which was available in the original PC was the MC146818 real +time clock. The original device is now obsolete, and usually emulated by the +system chipset, sometimes by an HPET and some frankenstein IRQ routing. + +The RTC is accessed through CMOS variables, which uses an index register to +control which bytes are read. Since there is only one index register, read +of the CMOS and read of the RTC require lock protection (in addition, it is +dangerous to allow userspace utilities such as hwclock to have direct RTC +access, as they could corrupt kernel reads and writes of CMOS memory). + +The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt +can function as a periodic timer, an additional once a day alarm, and can issue +interrupts after an update of the CMOS registers by the MC146818 is complete. +The type of interrupt is signalled in the RTC status registers. + +The RTC will update the current time fields by battery power even while the +system is off. The current time fields should not be read while an update is +in progress, as indicated in the status register. + +The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be +programmed to a 32kHz divider if the RTC is to count seconds. + +This is the RAM map originally used for the RTC/CMOS: + +Location Size Description +------------------------------------------ +00h byte Current second (BCD) +01h byte Seconds alarm (BCD) +02h byte Current minute (BCD) +03h byte Minutes alarm (BCD) +04h byte Current hour (BCD) +05h byte Hours alarm (BCD) +06h byte Current day of week (BCD) +07h byte Current day of month (BCD) +08h byte Current month (BCD) +09h byte Current year (BCD) +0Ah byte Register A + bit 7 = Update in progress + bit 6-4 = Divider for clock + 000 = 4.194 MHz + 001 = 1.049 MHz + 010 = 32 kHz + 10X = test modes + 110 = reset / disable + 111 = reset / disable + bit 3-0 = Rate selection for periodic interrupt + 000 = periodic timer disabled + 001 = 3.90625 uS + 010 = 7.8125 uS + 011 = .122070 mS + 100 = .244141 mS + ... + 1101 = 125 mS + 1110 = 250 mS + 1111 = 500 mS +0Bh byte Register B + bit 7 = Run (0) / Halt (1) + bit 6 = Periodic interrupt enable + bit 5 = Alarm interrupt enable + bit 4 = Update-ended interrupt enable + bit 3 = Square wave interrupt enable + bit 2 = BCD calendar (0) / Binary (1) + bit 1 = 12-hour mode (0) / 24-hour mode (1) + bit 0 = 0 (DST off) / 1 (DST enabled) +OCh byte Register C (read only) + bit 7 = interrupt request flag (IRQF) + bit 6 = periodic interrupt flag (PF) + bit 5 = alarm interrupt flag (AF) + bit 4 = update interrupt flag (UF) + bit 3-0 = reserved +ODh byte Register D (read only) + bit 7 = RTC has power + bit 6-0 = reserved +32h byte Current century BCD (*) + (*) location vendor specific and now determined from ACPI global tables + +2.3) APIC + +On Pentium and later processors, an on-board timer is available to each CPU +as part of the Advanced Programmable Interrupt Controller. The APIC is +accessed through memory-mapped registers and provides interrupt service to each +CPU, used for IPIs and local timer interrupts. + +Although in theory the APIC is a safe and stable source for local interrupts, +in practice, many bugs and glitches have occurred due to the special nature of +the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect +the use of the APIC and that workarounds may be required. In addition, some of +these workarounds pose unique constraints for virtualization - requiring either +extra overhead incurred from extra reads of memory-mapped I/O or additional +functionality that may be more computationally expensive to implement. + +Since the APIC is documented quite well in the Intel and AMD manuals, we will +avoid repetition of the detail here. It should be pointed out that the APIC +timer is programmed through the LVT (local vector timer) register, is capable +of one-shot or periodic operation, and is based on the bus clock divided down +by the programmable divider register. + +2.4) HPET + +HPET is quite complex, and was originally intended to replace the PIT / RTC +support of the X86 PC. It remains to be seen whether that will be the case, as +the de facto standard of PC hardware is to emulate these older devices. Some +systems designated as legacy free may support only the HPET as a hardware timer +device. + +The HPET spec is rather loose and vague, requiring at least 3 hardware timers, +but allowing implementation freedom to support many more. It also imposes no +fixed rate on the timer frequency, but does impose some extremal values on +frequency, error and slew. + +In general, the HPET is recommended as a high precision (compared to PIT /RTC) +time source which is independent of local variation (as there is only one HPET +in any given system). The HPET is also memory-mapped, and its presence is +indicated through ACPI tables by the BIOS. + +Detailed specification of the HPET is beyond the current scope of this +document, as it is also very well documented elsewhere. + +2.5) Offboard Timers + +Several cards, both proprietary (watchdog boards) and commonplace (e1000) have +timing chips built into the cards which may have registers which are accessible +to kernel or user drivers. To the author's knowledge, using these to generate +a clocksource for a Linux or other kernel has not yet been attempted and is in +general frowned upon as not playing by the agreed rules of the game. Such a +timer device would require additional support to be virtualized properly and is +not considered important at this time as no known operating system does this. + +========================================================================= + +3) TSC Hardware + +The TSC or time stamp counter is relatively simple in theory; it counts +instruction cycles issued by the processor, which can be used as a measure of +time. In practice, due to a number of problems, it is the most complicated +timekeeping device to use. + +The TSC is represented internally as a 64-bit MSR which can be read with the +RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware +limitations made it possible to write the TSC, but generally on old hardware it +was only possible to write the low 32-bits of the 64-bit counter, and the upper +32-bits of the counter were cleared. Now, however, on Intel processors family +0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction +has been lifted and all 64-bits are writable. On AMD systems, the ability to +write the TSC MSR is not an architectural guarantee. + +The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by +means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. + +Some vendors have implemented an additional instruction, RDTSCP, which returns +atomically not just the TSC, but an indicator which corresponds to the +processor number. This can be used to index into an array of TSC variables to +determine offset information in SMP systems where TSCs are not synchronized. +The presence of this instruction must be determined by consulting CPUID feature +bits. + +Both VMX and SVM provide extension fields in the virtualization hardware which +allows the guest visible TSC to be offset by a constant. Newer implementations +promise to allow the TSC to additionally be scaled, but this hardware is not +yet widely available. + +3.1) TSC synchronization + +The TSC is a CPU-local clock in most implementations. This means, on SMP +platforms, the TSCs of different CPUs may start at different times depending +on when the CPUs are powered on. Generally, CPUs on the same die will share +the same clock, however, this is not always the case. + +The BIOS may attempt to resynchronize the TSCs during the poweron process and +the operating system or other system software may attempt to do this as well. +Several hardware limitations make the problem worse - if it is not possible to +write the full 64-bits of the TSC, it may be impossible to match the TSC in +newly arriving CPUs to that of the rest of the system, resulting in +unsynchronized TSCs. This may be done by BIOS or system software, but in +practice, getting a perfectly synchronized TSC will not be possible unless all +values are read from the same clock, which generally only is possible on single +socket systems or those with special hardware support. + +3.2) TSC and CPU hotplug + +As touched on already, CPUs which arrive later than the boot time of the system +may not have a TSC value that is synchronized with the rest of the system. +Either system software, BIOS, or SMM code may actually try to establish the TSC +to a value matching the rest of the system, but a perfect match is usually not +a guarantee. This can have the effect of bringing a system from a state where +TSC is synchronized back to a state where TSC synchronization flaws, however +small, may be exposed to the OS and any virtualization environment. + +3.3) TSC and multi-socket / NUMA + +Multi-socket systems, especially large multi-socket systems are likely to have +individual clocksources rather than a single, universally distributed clock. +Since these clocks are driven by different crystals, they will not have +perfectly matched frequency, and temperature and electrical variations will +cause the CPU clocks, and thus the TSCs to drift over time. Depending on the +exact clock and bus design, the drift may or may not be fixed in absolute +error, and may accumulate over time. + +In addition, very large systems may deliberately slew the clocks of individual +cores. This technique, known as spread-spectrum clocking, reduces EMI at the +clock frequency and harmonics of it, which may be required to pass FCC +standards for telecommunications and computer equipment. + +It is recommended not to trust the TSCs to remain synchronized on NUMA or +multiple socket systems for these reasons. + +3.4) TSC and C-states + +C-states, or idling states of the processor, especially C1E and deeper sleep +states may be problematic for TSC as well. The TSC may stop advancing in such +a state, resulting in a TSC which is behind that of other CPUs when execution +is resumed. Such CPUs must be detected and flagged by the operating system +based on CPU and chipset identifications. + +The TSC in such a case may be corrected by catching it up to a known external +clocksource. + +3.5) TSC frequency change / P-states + +To make things slightly more interesting, some CPUs may change frequency. They +may or may not run the TSC at the same rate, and because the frequency change +may be staggered or slewed, at some points in time, the TSC rate may not be +known other than falling within a range of values. In this case, the TSC will +not be a stable time source, and must be calibrated against a known, stable, +external clock to be a usable source of time. + +Whether the TSC runs at a constant rate or scales with the P-state is model +dependent and must be determined by inspecting CPUID, chipset or vendor +specific MSR fields. + +In addition, some vendors have known bugs where the P-state is actually +compensated for properly during normal operation, but when the processor is +inactive, the P-state may be raised temporarily to service cache misses from +other processors. In such cases, the TSC on halted CPUs could advance faster +than that of non-halted processors. AMD Turion processors are known to have +this problem. + +3.6) TSC and STPCLK / T-states + +External signals given to the processor may also have the effect of stopping +the TSC. This is typically done for thermal emergency power control to prevent +an overheating condition, and typically, there is no way to detect that this +condition has happened. + +3.7) TSC virtualization - VMX + +VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET +field specified in the VMCS. Special instructions must be used to read and +write the VMCS field. + +3.8) TSC virtualization - SVM + +SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, SVM allows passing through the host TSC plus an additional offset +field specified in the SVM control block. + +3.9) TSC feature bits in Linux + +In summary, there is no way to guarantee the TSC remains in perfect +synchronization unless it is explicitly guaranteed by the architecture. Even +if so, the TSCs in multi-sockets or NUMA systems may still run independently +despite being locally consistent. + +The following feature bits are used by Linux to signal various TSC attributes, +but they can only be taken to be meaningful for UP or single node systems. + +X86_FEATURE_TSC : The TSC is available in hardware +X86_FEATURE_RDTSCP : The RDTSCP instruction is available +X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states +X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states +X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) + +4) Virtualization Problems + +Timekeeping is especially problematic for virtualization because a number of +challenges arise. The most obvious problem is that time is now shared between +the host and, potentially, a number of virtual machines. Thus the virtual +operating system does not run with 100% usage of the CPU, despite the fact that +it may very well make that assumption. It may expect it to remain true to very +exacting bounds when interrupt sources are disabled, but in reality only its +virtual interrupt sources are disabled, and the machine may still be preempted +at any time. This causes problems as the passage of real time, the injection +of machine interrupts and the associated clock sources are no longer completely +synchronized with real time. + +This same problem can occur on native harware to a degree, as SMM mode may +steal cycles from the naturally on X86 systems when SMM mode is used by the +BIOS, but not in such an extreme fashion. However, the fact that SMM mode may +cause similar problems to virtualization makes it a good justification for +solving many of these problems on bare metal. + +4.1) Interrupt clocking + +One of the most immediate problems that occurs with legacy operating systems +is that the system timekeeping routines are often designed to keep track of +time by counting periodic interrupts. These interrupts may come from the PIT +or the RTC, but the problem is the same: the host virtualization engine may not +be able to deliver the proper number of interrupts per second, and so guest +time may fall behind. This is especially problematic if a high interrupt rate +is selected, such as 1000 HZ, which is unfortunately the default for many Linux +guests. + +There are three approaches to solving this problem; first, it may be possible +to simply ignore it. Guests which have a separate time source for tracking +'wall clock' or 'real time' may not need any adjustment of their interrupts to +maintain proper time. If this is not sufficient, it may be necessary to inject +additional interrupts into the guest in order to increase the effective +interrupt rate. This approach leads to complications in extreme conditions, +where host load or guest lag is too much to compensate for, and thus another +solution to the problem has risen: the guest may need to become aware of lost +ticks and compensate for them internally. Although promising in theory, the +implementation of this policy in Linux has been extremely error prone, and a +number of buggy variants of lost tick compensation are distributed across +commonly used Linux systems. + +Windows uses periodic RTC clocking as a means of keeping time internally, and +thus requires interrupt slewing to keep proper time. It does use a low enough +rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in +practice. + +4.2) TSC sampling and serialization + +As the highest precision time source available, the cycle counter of the CPU +has aroused much interest from developers. As explained above, this timer has +many problems unique to its nature as a local, potentially unstable and +potentially unsynchronized source. One issue which is not unique to the TSC, +but is highlighted because of its very precise nature is sampling delay. By +definition, the counter, once read is already old. However, it is also +possible for the counter to be read ahead of the actual use of the result. +This is a consequence of the superscalar execution of the instruction stream, +which may execute instructions out of order. Such execution is called +non-serialized. Forcing serialized execution is necessary for precise +measurement with the TSC, and requires a serializing instruction, such as CPUID +or an MSR read. + +Since CPUID may actually be virtualized by a trap and emulate mechanism, this +serialization can pose a performance issue for hardware virtualization. An +accurate time stamp counter reading may therefore not always be available, and +it may be necessary for an implementation to guard against "backwards" reads of +the TSC as seen from other CPUs, even in an otherwise perfectly synchronized +system. + +4.3) Timespec aliasing + +Additionally, this lack of serialization from the TSC poses another challenge +when using results of the TSC when measured against another time source. As +the TSC is much higher precision, many possible values of the TSC may be read +while another clock is still expressing the same value. + +That is, you may read (T,T+10) while external clock C maintains the same value. +Due to non-serialized reads, you may actually end up with a range which +fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but +calibrated against an external value may have a range of valid values. +Re-calibrating this computation may actually cause time, as computed after the +calibration, to go backwards, compared with time computed before the +calibration. + +This problem is particularly pronounced with an internal time source in Linux, +the kernel time, which is expressed in the theoretically high resolution +timespec - but which advances in much larger granularity intervals, sometimes +at the rate of jiffies, and possibly in catchup modes, at a much larger step. + +This aliasing requires care in the computation and recalibration of kvmclock +and any other values derived from TSC computation (such as TSC virtualization +itself). + +4.4) Migration + +Migration of a virtual machine raises problems for timekeeping in two ways. +First, the migration itself may take time, during which interrupts cannot be +delivered, and after which, the guest time may need to be caught up. NTP may +be able to help to some degree here, as the clock correction required is +typically small enough to fall in the NTP-correctable window. + +An additional concern is that timers based off the TSC (or HPET, if the raw bus +clock is exposed) may now be running at different rates, requiring compensation +in some way in the hypervisor by virtualizing these timers. In addition, +migrating to a faster machine may preclude the use of a passthrough TSC, as a +faster clock cannot be made visible to a guest without the potential of time +advancing faster than usual. A slower clock is less of a problem, as it can +always be caught up to the original rate. KVM clock avoids these problems by +simply storing multipliers and offsets against the TSC for the guest to convert +back into nanosecond resolution values. + +4.5) Scheduling + +Since scheduling may be based on precise timing and firing of interrupts, the +scheduling algorithms of an operating system may be adversely affected by +virtualization. In theory, the effect is random and should be universally +distributed, but in contrived as well as real scenarios (guest device access, +causes of virtualization exits, possible context switch), this may not always +be the case. The effect of this has not been well studied. + +In an attempt to work around this, several implementations have provided a +paravirtualized scheduler clock, which reveals the true amount of CPU time for +which a virtual machine has been running. + +4.6) Watchdogs + +Watchdog timers, such as the lock detector in Linux may fire accidentally when +running under hardware virtualization due to timer interrupts being delayed or +misinterpretation of the passage of real time. Usually, these warnings are +spurious and can be ignored, but in some circumstances it may be necessary to +disable such detection. + +4.7) Delays and precision timing + +Precise timing and delays may not be possible in a virtualized system. This +can happen if the system is controlling physical hardware, or issues delays to +compensate for slower I/O to and from devices. The first issue is not solvable +in general for a virtualized system; hardware control software can't be +adequately virtualized without a full real-time operating system, which would +require an RT aware virtualization platform. + +The second issue may cause performance problems, but this is unlikely to be a +significant issue. In many cases these delays may be eliminated through +configuration or paravirtualization. + +4.8) Covert channels and leaks + +In addition to the above problems, time information will inevitably leak to the +guest about the host in anything but a perfect implementation of virtualized +time. This may allow the guest to infer the presence of a hypervisor (as in a +red-pill type detection), and it may allow information to leak between guests +by using CPU utilization itself as a signalling channel. Preventing such +problems would require completely isolated virtual time which may not track +real time any longer. This may be useful in certain security or QA contexts, +but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/virtual/lguest/.gitignore b/Documentation/virtual/lguest/.gitignore new file mode 100644 index 0000000..115587f --- /dev/null +++ b/Documentation/virtual/lguest/.gitignore @@ -0,0 +1 @@ +lguest diff --git a/Documentation/virtual/lguest/Makefile b/Documentation/virtual/lguest/Makefile new file mode 100644 index 0000000..bebac6b --- /dev/null +++ b/Documentation/virtual/lguest/Makefile @@ -0,0 +1,8 @@ +# This creates the demonstration utility "lguest" which runs a Linux guest. +# Missing headers? Add "-I../../include -I../../arch/x86/include" +CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE + +all: lguest + +clean: + rm -f lguest diff --git a/Documentation/virtual/lguest/extract b/Documentation/virtual/lguest/extract new file mode 100644 index 0000000..7730bb6 --- /dev/null +++ b/Documentation/virtual/lguest/extract @@ -0,0 +1,58 @@ +#! /bin/sh + +set -e + +PREFIX=$1 +shift + +trap 'rm -r $TMPDIR' 0 +TMPDIR=`mktemp -d` + +exec 3>/dev/null +for f; do + while IFS=" +" read -r LINE; do + case "$LINE" in + *$PREFIX:[0-9]*:\**) + NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` + if [ -f $TMPDIR/$NUM ]; then + echo "$TMPDIR/$NUM already exits prior to $f" + exit 1 + fi + exec 3>>$TMPDIR/$NUM + echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM + /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 + ;; + *$PREFIX:[0-9]*) + NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` + if [ -f $TMPDIR/$NUM ]; then + echo "$TMPDIR/$NUM already exits prior to $f" + exit 1 + fi + exec 3>>$TMPDIR/$NUM + echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM + /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 + ;; + *:\**) + /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 + echo >&3 + exec 3>/dev/null + ;; + *) + /bin/echo "$LINE" >&3 + ;; + esac + done < $f + echo >&3 + exec 3>/dev/null +done + +LASTFILE="" +for f in $TMPDIR/*; do + if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then + LASTFILE=$(cat $TMPDIR/.$(basename $f) ) + echo "[ $LASTFILE ]" + fi + cat $f +done + diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c new file mode 100644 index 0000000..d9da7e1 --- /dev/null +++ b/Documentation/virtual/lguest/lguest.c @@ -0,0 +1,2095 @@ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "../../include/linux/lguest_launcher.h" +/*L:110 + * We can ignore the 42 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. + * + * As Linus said, "C is a Spartan language, and so should your naming be." I + * like these abbreviations, so we define them here. Note that u64 is always + * unsigned long long, which works on all Linux systems: this means that we can + * use %llu in printf for any u64. + */ +typedef unsigned long long u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +/*:*/ + +#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ +#define BRIDGE_PFX "bridge:" +#ifndef SIOCBRADDIF +#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ +#endif +/* We can have up to 256 pages for devices. */ +#define DEVICE_PAGES 256 +/* This will occupy 3 pages: it must be a power of 2. */ +#define VIRTQUEUE_NUM 256 + +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ +static bool verbose; +#define verbose(args...) \ + do { if (verbose) printf(args); } while(0) +/*:*/ + +/* The pointer to the start of guest memory. */ +static void *guest_base; +/* The maximum guest physical address allowed, and maximum possible. */ +static unsigned long guest_limit, guest_max; +/* The /dev/lguest file descriptor. */ +static int lguest_fd; + +/* a per-cpu variable indicating whose vcpu is currently running */ +static unsigned int __thread cpu_id; + +/* This is our list of devices. */ +struct device_list { + /* Counter to assign interrupt numbers. */ + unsigned int next_irq; + + /* Counter to print out convenient device numbers. */ + unsigned int device_num; + + /* The descriptor page for the devices. */ + u8 *descpage; + + /* A single linked list of devices. */ + struct device *dev; + /* And a pointer to the last device for easy append. */ + struct device *lastdev; +}; + +/* The list of Guest devices, based on command line arguments. */ +static struct device_list devices; + +/* The device structure describes a single device. */ +struct device { + /* The linked-list pointer. */ + struct device *next; + + /* The device's descriptor, as mapped into the Guest. */ + struct lguest_device_desc *desc; + + /* We can't trust desc values once Guest has booted: we use these. */ + unsigned int feature_len; + unsigned int num_vq; + + /* The name of this device, for --verbose. */ + const char *name; + + /* Any queues attached to this device */ + struct virtqueue *vq; + + /* Is it operational */ + bool running; + + /* Does Guest want an intrrupt on empty? */ + bool irq_on_empty; + + /* Device-specific data. */ + void *priv; +}; + +/* The virtqueue structure describes a queue attached to a device. */ +struct virtqueue { + struct virtqueue *next; + + /* Which device owns me. */ + struct device *dev; + + /* The configuration for this queue. */ + struct lguest_vqconfig config; + + /* The actual ring of buffers. */ + struct vring vring; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* How many are used since we sent last irq? */ + unsigned int pending_used; + + /* Eventfd where Guest notifications arrive. */ + int eventfd; + + /* Function for the thread which is servicing this virtqueue. */ + void (*service)(struct virtqueue *vq); + pid_t thread; +}; + +/* Remember the arguments to the program so we can "reboot" */ +static char **main_args; + +/* The original tty settings to restore on exit. */ +static struct termios orig_term; + +/* + * We have to be careful with barriers: our devices are all run in separate + * threads and so we need to make sure that changes visible to the Guest happen + * in precise order. + */ +#define wmb() __asm__ __volatile__("" : : : "memory") +#define mb() __asm__ __volatile__("" : : : "memory") + +/* + * Convert an iovec element to the given type. + * + * This is a fairly ugly trick: we need to know the size of the type and + * alignment requirement to check the pointer is kosher. It's also nice to + * have the name of the type in case we report failure. + * + * Typing those three things all the time is cumbersome and error prone, so we + * have a macro which sets them all up and passes to the real function. + */ +#define convert(iov, type) \ + ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) + +static void *_convert(struct iovec *iov, size_t size, size_t align, + const char *name) +{ + if (iov->iov_len != size) + errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); + if ((unsigned long)iov->iov_base % align != 0) + errx(1, "Bad alignment %p for %s", iov->iov_base, name); + return iov->iov_base; +} + +/* Wrapper for the last available index. Makes it easier to change. */ +#define lg_last_avail(vq) ((vq)->last_avail_idx) + +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ +#define cpu_to_le16(v16) (v16) +#define cpu_to_le32(v32) (v32) +#define cpu_to_le64(v64) (v64) +#define le16_to_cpu(v16) (v16) +#define le32_to_cpu(v32) (v32) +#define le64_to_cpu(v64) (v64) + +/* Is this iovec empty? */ +static bool iov_empty(const struct iovec iov[], unsigned int num_iov) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) + if (iov[i].iov_len) + return false; + return true; +} + +/* Take len bytes from the front of this iovec. */ +static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) { + unsigned int used; + + used = iov[i].iov_len < len ? iov[i].iov_len : len; + iov[i].iov_base += used; + iov[i].iov_len -= used; + len -= used; + } + assert(len == 0); +} + +/* The device virtqueue descriptors are followed by feature bitmasks. */ +static u8 *get_feature_bits(struct device *dev) +{ + return (u8 *)(dev->desc + 1) + + dev->num_vq * sizeof(struct lguest_vqconfig); +} + +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. + * + * The Launcher sets up a big chunk of memory to be the Guest's "physical" + * memory and stores it in "guest_base". In other words, Guest physical == + * Launcher virtual with an offset. + * + * This can be tough to get your head around, but usually it just means that we + * use these trivial conversion functions when the Guest gives us its + * "physical" addresses: + */ +static void *from_guest_phys(unsigned long addr) +{ + return guest_base + addr; +} + +static unsigned long to_guest_phys(const void *addr) +{ + return (addr - guest_base); +} + +/*L:130 + * Loading the Kernel. + * + * We start with couple of simple helper routines. open_or_die() avoids + * error-checking code cluttering the callers: + */ +static int open_or_die(const char *name, int flags) +{ + int fd = open(name, flags); + if (fd < 0) + err(1, "Failed to open %s", name); + return fd; +} + +/* map_zeroed_pages() takes a number of pages. */ +static void *map_zeroed_pages(unsigned int num) +{ + int fd = open_or_die("/dev/zero", O_RDONLY); + void *addr; + + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). We allocate an extra two pages PROT_NONE to act as guard + * pages against read/write attempts that exceed allocated space. + */ + addr = mmap(NULL, getpagesize() * (num+2), + PROT_NONE, MAP_PRIVATE, fd, 0); + + if (addr == MAP_FAILED) + err(1, "Mmapping %u pages of /dev/zero", num); + + if (mprotect(addr + getpagesize(), getpagesize() * num, + PROT_READ|PROT_WRITE) == -1) + err(1, "mprotect rw %u pages failed", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ + close(fd); + + /* Return address after PROT_NONE page */ + return addr + getpagesize(); +} + +/* Get some more pages for a device. */ +static void *get_pages(unsigned int num) +{ + void *addr = from_guest_phys(guest_limit); + + guest_limit += num * getpagesize(); + if (guest_limit > guest_max) + errx(1, "Not enough memory for devices"); + return addr; +} + +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if + * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), + * it falls back to reading the memory in. + */ +static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) +{ + ssize_t r; + + /* + * We map writable even though for some segments are marked read-only. + * The kernel really wants to be writable: it patches its own + * instructions. + * + * MAP_PRIVATE means that the page won't be copied until a write is + * done to it. This allows us to share untouched memory between + * Guests. + */ + if (mmap(addr, len, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) + return; + + /* pread does a seek and a read in one shot: saves a few lines. */ + r = pread(fd, addr, len, offset); + if (r != len) + err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); +} + +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into + * the Guest memory. ELF = Embedded Linking Format, which is the format used + * by all modern binaries on Linux including the kernel. + * + * The ELF headers give *two* addresses: a physical address, and a virtual + * address. We use the physical address; the Guest will map itself to the + * virtual address. + * + * We return the starting address. + */ +static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) +{ + Elf32_Phdr phdr[ehdr->e_phnum]; + unsigned int i; + + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. + */ + if (ehdr->e_type != ET_EXEC + || ehdr->e_machine != EM_386 + || ehdr->e_phentsize != sizeof(Elf32_Phdr) + || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) + errx(1, "Malformed elf header"); + + /* + * An ELF executable contains an ELF header and a number of "program" + * headers which indicate which parts ("segments") of the program to + * load where. + */ + + /* We read in all the program headers at once: */ + if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) + err(1, "Seeking to program headers"); + if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) + err(1, "Reading program headers"); + + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ + for (i = 0; i < ehdr->e_phnum; i++) { + /* If this isn't a loadable segment, we ignore it */ + if (phdr[i].p_type != PT_LOAD) + continue; + + verbose("Section %i: size %i addr %p\n", + i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); + + /* We map this section of the file at its physical address. */ + map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), + phdr[i].p_offset, phdr[i].p_filesz); + } + + /* The entry point is given in the ELF header. */ + return ehdr->e_entry; +} + +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. + * + * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote + * a small patch to jump over the tricky bits in the Guest, so now we just read + * the funky header so we know where in the file to load, and away we go! + */ +static unsigned long load_bzimage(int fd) +{ + struct boot_params boot; + int r; + /* Modern bzImages get loaded at 1M. */ + void *p = from_guest_phys(0x100000); + + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ + lseek(fd, 0, SEEK_SET); + read(fd, &boot, sizeof(boot)); + + /* Inside the setup_hdr, we expect the magic "HdrS" */ + if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) + errx(1, "This doesn't look like a bzImage to me"); + + /* Skip over the extra sectors of the header. */ + lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); + + /* Now read everything into memory. in nice big chunks. */ + while ((r = read(fd, p, 65536)) > 0) + p += r; + + /* Finally, code32_start tells us where to enter the kernel. */ + return boot.hdr.code32_start; +} + +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels + * come wrapped up in the self-decompressing "bzImage" format. With a little + * work, we can load those, too. + */ +static unsigned long load_kernel(int fd) +{ + Elf32_Ehdr hdr; + + /* Read in the first few bytes. */ + if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + err(1, "Reading kernel"); + + /* If it's an ELF file, it starts with "\177ELF" */ + if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) + return map_elf(fd, &hdr); + + /* Otherwise we assume it's a bzImage, and try to load it. */ + return load_bzimage(fd); +} + +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because + * it calls getpagesize() twice: "it's dumb code." + * + * Kernel guys get really het up about optimization, even when it's not + * necessary. I leave this code as a reaction against that. + */ +static inline unsigned long page_align(unsigned long addr) +{ + /* Add upwards and truncate downwards. */ + return ((addr + getpagesize()-1) & ~(getpagesize()-1)); +} + +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. + * + * Importantly, James Morris works for RedHat, and Fedora uses initrds for its + * kernels. He sent me this (and tells me when I break it). + */ +static unsigned long load_initrd(const char *name, unsigned long mem) +{ + int ifd; + struct stat st; + unsigned long len; + + ifd = open_or_die(name, O_RDONLY); + /* fstat() is needed to get the file size. */ + if (fstat(ifd, &st) < 0) + err(1, "fstat() on initrd '%s'", name); + + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ + len = page_align(st.st_size); + map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ + close(ifd); + verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); + + /* We return the initrd size. */ + return len; +} +/*:*/ + +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ +static void concat(char *dst, char *args[]) +{ + unsigned int i, len = 0; + + for (i = 0; args[i]; i++) { + if (i) { + strcat(dst+len, " "); + len++; + } + strcpy(dst+len, args[i]); + len += strlen(args[i]); + } + /* In case it's empty. */ + dst[len] = '\0'; +} + +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We + * saw the arguments it expects when we looked at initialize() in lguest_user.c: + * the base of Guest "physical" memory, the top physical page to allow and the + * entry point for the Guest. + */ +static void tell_kernel(unsigned long start) +{ + unsigned long args[] = { LHREQ_INITIALIZE, + (unsigned long)guest_base, + guest_limit / getpagesize(), start }; + verbose("Guest: %p - %p (%#lx)\n", + guest_base, guest_base + guest_limit, guest_limit); + lguest_fd = open_or_die("/dev/lguest", O_RDWR); + if (write(lguest_fd, args, sizeof(args)) < 0) + err(1, "Writing to /dev/lguest"); +} +/*:*/ + +/*L:200 + * Device Handling. + * + * When the Guest gives us a buffer, it sends an array of addresses and sizes. + * We need to make sure it's not trying to reach into the Launcher itself, so + * we have a convenient routine which checks it and exits with an error message + * if something funny is going on: + */ +static void *_check_pointer(unsigned long addr, unsigned int size, + unsigned int line) +{ + /* + * Check if the requested address and size exceeds the allocated memory, + * or addr + size wraps around. + */ + if ((addr + size) > guest_limit || (addr + size) < addr) + errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. + */ + return from_guest_phys(addr); +} +/* A macro which transparently hands the line number to the real function. */ +#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) + +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->vring.num if we're + * at the end. + */ +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; + + /* Check they're not leading us off end of descriptors. */ + next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); + + if (next >= max) + errx(1, "Desc next is %u", next); + + return next; +} + +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ +static void trigger_irq(struct virtqueue *vq) +{ + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + /* Don't inform them if nothing used. */ + if (!vq->pending_used) + return; + vq->pending_used = 0; + + /* If they don't want an interrupt, don't send one... */ + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + /* ... unless they've asked us to force one on empty. */ + if (!vq->dev->irq_on_empty + || lg_last_avail(vq) != vq->vring.avail->idx) + return; + } + + /* Send the Guest an interrupt tell them we used something up. */ + if (write(lguest_fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", vq->config.irq); +} + +/* + * This looks in the virtqueue for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function waits if necessary, and returns the descriptor number found. + */ +static unsigned wait_for_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + unsigned int i, head, max; + struct vring_desc *desc; + u16 last_avail = lg_last_avail(vq); + + /* There's nothing available? */ + while (last_avail == vq->vring.avail->idx) { + u64 event; + + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ + trigger_irq(vq); + + /* OK, now we need to know about added descriptors. */ + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ + mb(); + if (last_avail != vq->vring.avail->idx) { + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + break; + } + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) + errx(1, "Event read failed?"); + + /* We don't need to be notified again. */ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + } + + /* Check it isn't doing very strange things with descriptor numbers. */ + if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) + errx(1, "Guest moved used index from %u to %u", + last_avail, vq->vring.avail->idx); + + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + head = vq->vring.avail->ring[last_avail % vq->vring.num]; + lg_last_avail(vq)++; + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) + errx(1, "Guest says index %u is available", head); + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + max = vq->vring.num; + desc = vq->vring.desc; + i = head; + + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) + errx(1, "Invalid size for indirect buffer table"); + + max = desc[i].len / sizeof(struct vring_desc); + desc = check_pointer(desc[i].addr, desc[i].len); + i = 0; + } + + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out_num + *in_num].iov_len = desc[i].len; + iov[*out_num + *in_num].iov_base + = check_pointer(desc[i].addr, desc[i].len); + /* If this is an input descriptor, increment that count. */ + if (desc[i].flags & VRING_DESC_F_WRITE) + (*in_num)++; + else { + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ + if (*in_num) + errx(1, "Descriptor has out after in"); + (*out_num)++; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (*out_num + *in_num > max) + errx(1, "Looped descriptor"); + } while ((i = next_desc(desc, i, max)) != max); + + return head; +} + +/* + * After we've used one of their buffers, we tell the Guest about it. Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. + */ +static void add_used(struct virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; + used->id = head; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vq->vring.used->idx++; + vq->pending_used++; +} + +/* And here's the combo meal deal. Supersize me! */ +static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) +{ + add_used(vq, head, len); + trigger_irq(vq); +} + +/* + * The Console + * + * We associate some data with the console for our exit hack. + */ +struct console_abort { + /* How many times have they hit ^C? */ + int count; + /* When did they start? */ + struct timeval start; +}; + +/* This is the routine which handles console input (ie. stdin). */ +static void console_input(struct virtqueue *vq) +{ + int len; + unsigned int head, in_num, out_num; + struct console_abort *abort = vq->dev->priv; + struct iovec iov[vq->vring.num]; + + /* Make sure there's a descriptor available. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + if (out_num) + errx(1, "Output buffers in console in queue?"); + + /* Read into it. This is where we usually wait. */ + len = readv(STDIN_FILENO, iov, in_num); + if (len <= 0) { + /* Ran out of input? */ + warnx("Failed to get console input, ignoring console."); + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ + for (;;) + pause(); + } + + /* Tell the Guest we used a buffer. */ + add_used_and_trigger(vq, head, len); + + /* + * Three ^C within one second? Exit. + * + * This is such a hack, but works surprisingly well. Each ^C has to + * be in a buffer by itself, so they can't be too fast. But we check + * that we get three within about a second, so they can't be too + * slow. + */ + if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { + abort->count = 0; + return; + } + + abort->count++; + if (abort->count == 1) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + /* Kill all Launcher processes with SIGINT, like normal ^C */ + if (now.tv_sec <= abort->start.tv_sec+1) + kill(0, SIGINT); + abort->count = 0; + } +} + +/* This is the routine which handles console output (ie. stdout). */ +static void console_output(struct virtqueue *vq) +{ + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + + /* We usually wait in here, for the Guest to give us something. */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ + while (!iov_empty(iov, out)) { + int len = writev(STDOUT_FILENO, iov, out); + if (len <= 0) + err(1, "Write to stdout gave %i", len); + iov_consume(iov, out, len); + } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ + add_used(vq, head, 0); +} + +/* + * The Network + * + * Handling output for network is also simple: we get all the output buffers + * and write them to /dev/net/tun. + */ +struct net_info { + int tunfd; +}; + +static void net_output(struct virtqueue *vq) +{ + struct net_info *net_info = vq->dev->priv; + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + + /* We usually wait in here for the Guest to give us a packet. */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ + if (writev(net_info->tunfd, iov, out) < 0) + errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ + add_used(vq, head, 0); +} + +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ +static bool will_block(int fd) +{ + fd_set fdset; + struct timeval zero = { 0, 0 }; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + return select(fd+1, &fdset, NULL, NULL, &zero) != 1; +} + +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ +static void net_input(struct virtqueue *vq) +{ + int len; + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + struct net_info *net_info = vq->dev->priv; + + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (out) + errx(1, "Output buffers in net input queue?"); + + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. + */ + if (vq->pending_used && will_block(net_info->tunfd)) + trigger_irq(vq); + + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ + len = readv(net_info->tunfd, iov, in); + if (len <= 0) + err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ + add_used(vq, head, len); +} +/*:*/ + +/* This is the helper to create threads: run the service routine in a loop. */ +static int do_thread(void *_vq) +{ + struct virtqueue *vq = _vq; + + for (;;) + vq->service(vq); + return 0; +} + +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ +static void kill_launcher(int signal) +{ + kill(0, SIGTERM); +} + +static void reset_device(struct device *dev) +{ + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); + + /* We're going to be explicitly killing threads, so ignore them. */ + signal(SIGCHLD, SIG_IGN); + + /* Zero out the virtqueues, get rid of their threads */ + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->thread != (pid_t)-1) { + kill(vq->thread, SIGTERM); + waitpid(vq->thread, NULL, 0); + vq->thread = (pid_t)-1; + } + memset(vq->vring.desc, 0, + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); + lg_last_avail(vq) = 0; + } + dev->running = false; + + /* Now we care if threads die. */ + signal(SIGCHLD, (void *)kill_launcher); +} + +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ +static void create_thread(struct virtqueue *vq) +{ + /* + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. + */ + char *stack = malloc(32768); + unsigned long args[] = { LHREQ_EVENTFD, + vq->config.pfn*getpagesize(), 0 }; + + /* Create a zero-initialized eventfd. */ + vq->eventfd = eventfd(0, 0); + if (vq->eventfd < 0) + err(1, "Creating eventfd"); + args[2] = vq->eventfd; + + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ + if (write(lguest_fd, &args, sizeof(args)) != 0) + err(1, "Attaching eventfd"); + + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ + vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); + if (vq->thread == (pid_t)-1) + err(1, "Creating clone"); + + /* We close our local copy now the child has it. */ + close(vq->eventfd); +} + +static bool accepted_feature(struct device *dev, unsigned int bit) +{ + const u8 *features = get_feature_bits(dev) + dev->feature_len; + + if (dev->feature_len < bit / CHAR_BIT) + return false; + return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); +} + +static void start_device(struct device *dev) +{ + unsigned int i; + struct virtqueue *vq; + + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev) + [dev->feature_len+i]); + + dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->service) + create_thread(vq); + } + dev->running = true; +} + +static void cleanup_devices(void) +{ + struct device *dev; + + for (dev = devices.dev; dev; dev = dev->next) + reset_device(dev); + + /* If we saved off the original terminal settings, restore them now. */ + if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} + +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) +{ + /* A zero status is a reset, otherwise it's a set of flags. */ + if (dev->desc->status == 0) + reset_device(dev); + else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { + warnx("Device %s configuration FAILED", dev->name); + if (dev->running) + reset_device(dev); + } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (!dev->running) + start_device(dev); + } +} + +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ +static void handle_output(unsigned long addr) +{ + struct device *i; + + /* Check each device. */ + for (i = devices.dev; i; i = i->next) { + struct virtqueue *vq; + + /* + * Notifications to device descriptors mean they updated the + * device status. + */ + if (from_guest_phys(addr) == i->desc) { + update_device_status(i); + return; + } + + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. + */ + for (vq = i->vq; vq; vq = vq->next) { + if (addr != vq->config.pfn*getpagesize()) + continue; + if (i->running) + errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ + start_device(i); + return; + } + } + + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ + if (addr >= guest_limit) + errx(1, "Bad NOTIFY %#lx", addr); + + write(STDOUT_FILENO, from_guest_phys(addr), + strnlen(from_guest_phys(addr), guest_limit - addr)); +} + +/*L:190 + * Device Setup + * + * All devices need a descriptor so the Guest knows it exists, and a "struct + * device" so the Launcher can keep track of it. We have common helper + * routines to allocate and manage them. + */ + +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a + * number of virtqueue descriptors, then two sets of feature bits, then an + * array of configuration bytes. This routine returns the configuration + * pointer. + */ +static u8 *device_config(const struct device *dev) +{ + return (void *)(dev->desc + 1) + + dev->num_vq * sizeof(struct lguest_vqconfig) + + dev->feature_len * 2; +} + +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor + * table page just above the Guest's normal memory. It returns a pointer to + * that descriptor. + */ +static struct lguest_device_desc *new_dev_desc(u16 type) +{ + struct lguest_device_desc d = { .type = type }; + void *p; + + /* Figure out where the next device config is, based on the last one. */ + if (devices.lastdev) + p = device_config(devices.lastdev) + + devices.lastdev->desc->config_len; + else + p = devices.descpage; + + /* We only have one page for all the descriptors. */ + if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* p might not be aligned, so we memcpy in. */ + return memcpy(p, &d, sizeof(d)); +} + +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ +static void add_virtqueue(struct device *dev, unsigned int num_descs, + void (*service)(struct virtqueue *)) +{ + unsigned int pages; + struct virtqueue **i, *vq = malloc(sizeof(*vq)); + void *p; + + /* First we need some memory for this virtqueue. */ + pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) + / getpagesize(); + p = get_pages(pages); + + /* Initialize the virtqueue */ + vq->next = NULL; + vq->last_avail_idx = 0; + vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ + vq->service = service; + vq->thread = (pid_t)-1; + + /* Initialize the configuration. */ + vq->config.num = num_descs; + vq->config.irq = devices.next_irq++; + vq->config.pfn = to_guest_phys(p) / getpagesize(); + + /* Initialize the vring. */ + vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); + + /* + * Append virtqueue to this device's descriptor. We use + * device_config() to get the end of the device's current virtqueues; + * we check that we haven't added any config or feature information + * yet, otherwise we'd be overwriting them. + */ + assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); + memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->num_vq++; + dev->desc->num_vq++; + + verbose("Virtqueue page %#lx\n", to_guest_phys(p)); + + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ + for (i = &dev->vq; *i; i = &(*i)->next); + *i = vq; +} + +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ +static void add_feature(struct device *dev, unsigned bit) +{ + u8 *features = get_feature_bits(dev); + + /* We can't extend the feature bits once we've added config bytes */ + if (dev->desc->feature_len <= bit / CHAR_BIT) { + assert(dev->desc->config_len == 0); + dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; + } + + features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); +} + +/* + * This routine sets the configuration fields for an existing device's + * descriptor. It only works for the last device, but that's OK because that's + * how we use it. + */ +static void set_config(struct device *dev, unsigned len, const void *conf) +{ + /* Check we haven't overflowed our single page. */ + if (device_config(dev) + len > devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the config information, and store the length. */ + memcpy(device_config(dev), conf, len); + dev->desc->config_len = len; + + /* Size must fit in config_len field (8 bits)! */ + assert(dev->desc->config_len == len); +} + +/* + * This routine does all the creation and setup of a new device, including + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. + * + * See what I mean about userspace being boring? + */ +static struct device *new_device(const char *name, u16 type) +{ + struct device *dev = malloc(sizeof(*dev)); + + /* Now we populate the fields one at a time. */ + dev->desc = new_dev_desc(type); + dev->name = name; + dev->vq = NULL; + dev->feature_len = 0; + dev->num_vq = 0; + dev->running = false; + + /* + * Append to device list. Prepending to a single-linked list is + * easier, but the user expects the devices to be arranged on the bus + * in command-line order. The first network device on the command line + * is eth0, the first block device /dev/vda, etc. + */ + if (devices.lastdev) + devices.lastdev->next = dev; + else + devices.dev = dev; + devices.lastdev = dev; + + return dev; +} + +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ +static void setup_console(void) +{ + struct device *dev; + + /* If we can save the initial standard input settings... */ + if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { + struct termios term = orig_term; + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ + term.c_lflag &= ~(ISIG|ICANON|ECHO); + tcsetattr(STDIN_FILENO, TCSANOW, &term); + } + + dev = new_device("console", VIRTIO_ID_CONSOLE); + + /* We store the console state in dev->priv, and initialize it. */ + dev->priv = malloc(sizeof(struct console_abort)); + ((struct console_abort *)dev->priv)->count = 0; + + /* + * The console needs two virtqueues: the input then the output. When + * they put something the input queue, we make sure we're listening to + * stdin. When they put something in the output queue, we write it to + * stdout. + */ + add_virtqueue(dev, VIRTQUEUE_NUM, console_input); + add_virtqueue(dev, VIRTQUEUE_NUM, console_output); + + verbose("device %u: console\n", ++devices.device_num); +} +/*:*/ + +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a + * --sharenet= option which opens or creates a named pipe. This can be + * used to send packets to another guest in a 1:1 manner. + * + * More sopisticated is to use one of the tools developed for project like UML + * to do networking. + * + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be + * completely generic ("here's my vring, attach to your vring") and would work + * for any traffic. Of course, namespace and permissions issues need to be + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide + * multiple inter-guest channels behind one interface, although it would + * require some manner of hotplugging new virtio channels. + * + * Finally, we could implement a virtio network switch in the kernel. +:*/ + +static u32 str2ip(const char *ipaddr) +{ + unsigned int b[4]; + + if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) + errx(1, "Failed to parse IP address '%s'", ipaddr); + return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; +} + +static void str2mac(const char *macaddr, unsigned char mac[6]) +{ + unsigned int m[6]; + if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", + &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) + errx(1, "Failed to parse mac address '%s'", macaddr); + mac[0] = m[0]; + mac[1] = m[1]; + mac[2] = m[2]; + mac[3] = m[3]; + mac[4] = m[4]; + mac[5] = m[5]; +} + +/* + * This code is "adapted" from libbridge: it attaches the Host end of the + * network device to the bridge device specified by the command line. + * + * This is yet another James Morris contribution (I'm an IP-level guy, so I + * dislike bridging), and I just try not to break it. + */ +static void add_to_bridge(int fd, const char *if_name, const char *br_name) +{ + int ifidx; + struct ifreq ifr; + + if (!*br_name) + errx(1, "must specify bridge name"); + + ifidx = if_nametoindex(if_name); + if (!ifidx) + errx(1, "interface %s does not exist!", if_name); + + strncpy(ifr.ifr_name, br_name, IFNAMSIZ); + ifr.ifr_name[IFNAMSIZ-1] = '\0'; + ifr.ifr_ifindex = ifidx; + if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) + err(1, "can't add %s to bridge %s", if_name, br_name); +} + +/* + * This sets up the Host end of the network device with an IP address, brings + * it up so packets will flow, the copies the MAC address into the hwaddr + * pointer. + */ +static void configure_device(int fd, const char *tapif, u32 ipaddr) +{ + struct ifreq ifr; + struct sockaddr_in sin; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, tapif); + + /* Don't read these incantations. Just cut & paste them like I did! */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(ipaddr); + memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); + if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) + err(1, "Setting %s interface address", tapif); + ifr.ifr_flags = IFF_UP; + if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) + err(1, "Bringing interface %s up", tapif); +} + +static int get_tun_device(char tapif[IFNAMSIZ]) +{ + struct ifreq ifr; + int netfd; + + /* Start with this zeroed. Messy but sure. */ + memset(&ifr, 0, sizeof(ifr)); + + /* + * We open the /dev/net/tun device and tell it we want a tap device. A + * tap device is like a tun device, only somehow different. To tell + * the truth, I completely blundered my way through this code, but it + * works now! + */ + netfd = open_or_die("/dev/net/tun", O_RDWR); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + strcpy(ifr.ifr_name, "tap%d"); + if (ioctl(netfd, TUNSETIFF, &ifr) != 0) + err(1, "configuring /dev/net/tun"); + + if (ioctl(netfd, TUNSETOFFLOAD, + TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) + err(1, "Could not set features for tun device"); + + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ + ioctl(netfd, TUNSETNOCSUM, 1); + + memcpy(tapif, ifr.ifr_name, IFNAMSIZ); + return netfd; +} + +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or + * routing, but the principle is the same: it uses the "tun" device to inject + * packets into the Host as if they came in from a normal network card. We + * just shunt packets between the Guest and the tun device. + */ +static void setup_tun_net(char *arg) +{ + struct device *dev; + struct net_info *net_info = malloc(sizeof(*net_info)); + int ipfd; + u32 ip = INADDR_ANY; + bool bridging = false; + char tapif[IFNAMSIZ], *p; + struct virtio_net_config conf; + + net_info->tunfd = get_tun_device(tapif); + + /* First we create a new network device. */ + dev = new_device("net", VIRTIO_ID_NET); + dev->priv = net_info; + + /* Network devices need a recv and a send queue, just like console. */ + add_virtqueue(dev, VIRTQUEUE_NUM, net_input); + add_virtqueue(dev, VIRTQUEUE_NUM, net_output); + + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! + */ + ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (ipfd < 0) + err(1, "opening IP socket"); + + /* If the command line was --tunnet=bridge: do bridging. */ + if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { + arg += strlen(BRIDGE_PFX); + bridging = true; + } + + /* A mac address may follow the bridge name or IP address */ + p = strchr(arg, ':'); + if (p) { + str2mac(p+1, conf.mac); + add_feature(dev, VIRTIO_NET_F_MAC); + *p = '\0'; + } + + /* arg is now either an IP address or a bridge name */ + if (bridging) + add_to_bridge(ipfd, tapif, arg); + else + ip = str2ip(arg); + + /* Set up the tun device. */ + configure_device(ipfd, tapif, ip); + + add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + /* Expect Guest to handle everything except UFO */ + add_feature(dev, VIRTIO_NET_F_CSUM); + add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); + add_feature(dev, VIRTIO_NET_F_GUEST_ECN); + add_feature(dev, VIRTIO_NET_F_HOST_TSO4); + add_feature(dev, VIRTIO_NET_F_HOST_TSO6); + add_feature(dev, VIRTIO_NET_F_HOST_ECN); + /* We handle indirect ring entries */ + add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); + set_config(dev, sizeof(conf), &conf); + + /* We don't need the socket any more; setup is done. */ + close(ipfd); + + devices.device_num++; + + if (bridging) + verbose("device %u: tun %s attached to bridge: %s\n", + devices.device_num, tapif, arg); + else + verbose("device %u: tun %s: %s\n", + devices.device_num, tapif, arg); +} +/*:*/ + +/* This hangs off device->priv. */ +struct vblk_info { + /* The size of the file. */ + off64_t len; + + /* The file descriptor for the file. */ + int fd; + +}; + +/*L:210 + * The Disk + * + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. + */ +static void blk_request(struct virtqueue *vq) +{ + struct vblk_info *vblk = vq->dev->priv; + unsigned int head, out_num, in_num, wlen; + int ret; + u8 *in; + struct virtio_blk_outhdr *out; + struct iovec iov[vq->vring.num]; + off64_t off; + + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + + /* + * Every block request should contain at least one output buffer + * (detailing the location on disk and the type of request) and one + * input buffer (to hold the result). + */ + if (out_num == 0 || in_num == 0) + errx(1, "Bad virtblk cmd %u out=%u in=%u", + head, out_num, in_num); + + out = convert(&iov[0], struct virtio_blk_outhdr); + in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ + off = out->sector * 512; + + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { + fprintf(stderr, "Scsi commands unsupported\n"); + *in = VIRTIO_BLK_S_UNSUPP; + wlen = sizeof(*in); + } else if (out->type & VIRTIO_BLK_T_OUT) { + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = writev(vblk->fd, iov+1, out_num-1); + verbose("WRITE to sector %llu: %i\n", out->sector, ret); + + /* + * Grr... Now we know how long the descriptor they sent was, we + * make sure they didn't try to write over the end of the block + * file (possibly extending it). + */ + if (ret > 0 && off + ret > vblk->len) { + /* Trim it back to the correct length */ + ftruncate64(vblk->fd, vblk->len); + /* Die, bad Guest, die. */ + errx(1, "Write past end %llu+%u", off, ret); + } + + wlen = sizeof(*in); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else if (out->type & VIRTIO_BLK_T_FLUSH) { + /* Flush */ + ret = fdatasync(vblk->fd); + verbose("FLUSH fdatasync: %i\n", ret); + wlen = sizeof(*in); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else { + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = readv(vblk->fd, iov+1, in_num-1); + verbose("READ from sector %llu: %i\n", out->sector, ret); + if (ret >= 0) { + wlen = sizeof(*in) + ret; + *in = VIRTIO_BLK_S_OK; + } else { + wlen = sizeof(*in); + *in = VIRTIO_BLK_S_IOERR; + } + } + + /* Finished that request. */ + add_used(vq, head, wlen); +} + +/*L:198 This actually sets up a virtual block device. */ +static void setup_block_file(const char *filename) +{ + struct device *dev; + struct vblk_info *vblk; + struct virtio_blk_config conf; + + /* Creat the device. */ + dev = new_device("block", VIRTIO_ID_BLOCK); + + /* The device has one virtqueue, where the Guest places requests. */ + add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); + + /* Allocate the room for our own bookkeeping */ + vblk = dev->priv = malloc(sizeof(*vblk)); + + /* First we open the file and store the length. */ + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); + vblk->len = lseek64(vblk->fd, 0, SEEK_END); + + /* We support FLUSH. */ + add_feature(dev, VIRTIO_BLK_F_FLUSH); + + /* Tell Guest how many sectors this device has. */ + conf.capacity = cpu_to_le64(vblk->len / 512); + + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ + add_feature(dev, VIRTIO_BLK_F_SEG_MAX); + conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); + + /* Don't try to put whole struct: we have 8 bit limit. */ + set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); + + verbose("device %u: virtblock %llu sectors\n", + ++devices.device_num, le64_to_cpu(conf.capacity)); +} + +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's + * input buffers. The usual case is that the Guest doesn't want random numbers + * and so has no buffers although /dev/random is still readable, whereas + * console is the reverse. + * + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + +static void rng_input(struct virtqueue *vq) +{ + int len; + unsigned int head, in_num, out_num, totlen = 0; + struct rng_info *rng_info = vq->dev->priv; + struct iovec iov[vq->vring.num]; + + /* First we need a buffer from the Guests's virtqueue. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + if (out_num) + errx(1, "Output buffers in rng?"); + + /* + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. + */ + while (!iov_empty(iov, in_num)) { + len = readv(rng_info->rfd, iov, in_num); + if (len <= 0) + err(1, "Read from /dev/random gave %i", len); + iov_consume(iov, in_num, len); + totlen += len; + } + + /* Tell the Guest about the new input. */ + add_used(vq, head, totlen); +} + +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ +static void setup_rng(void) +{ + struct device *dev; + struct rng_info *rng_info = malloc(sizeof(*rng_info)); + + /* Our device's privat info simply contains the /dev/random fd. */ + rng_info->rfd = open_or_die("/dev/random", O_RDONLY); + + /* Create the new device. */ + dev = new_device("rng", VIRTIO_ID_RNG); + dev->priv = rng_info; + + /* The device has one virtqueue, where the Guest places inbufs. */ + add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); + + verbose("device %u: rng\n", devices.device_num++); +} +/* That's the end of device setup. */ + +/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ +static void __attribute__((noreturn)) restart_guest(void) +{ + unsigned int i; + + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ + for (i = 3; i < FD_SETSIZE; i++) + close(i); + + /* Reset all the devices (kills all threads). */ + cleanup_devices(); + + execv(main_args[0], main_args); + err(1, "Could not exec %s", main_args[0]); +} + +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ +static void __attribute__((noreturn)) run_guest(void) +{ + for (;;) { + unsigned long notify_addr; + int readval; + + /* We read from the /dev/lguest device to run the Guest. */ + readval = pread(lguest_fd, ¬ify_addr, + sizeof(notify_addr), cpu_id); + + /* One unsigned long means the Guest did HCALL_NOTIFY */ + if (readval == sizeof(notify_addr)) { + verbose("Notify on address %#lx\n", notify_addr); + handle_output(notify_addr); + /* ENOENT means the Guest died. Reading tells us why. */ + } else if (errno == ENOENT) { + char reason[1024] = { 0 }; + pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); + errx(1, "%s", reason); + /* ERESTART means that we need to reboot the guest */ + } else if (errno == ERESTART) { + restart_guest(); + /* Anything else means a bug or incompatible change. */ + } else + err(1, "Running guest failed"); + } +} +/*L:240 + * This is the end of the Launcher. The good news: we are over halfway + * through! The bad news: the most fiendish part of the code still lies ahead + * of us. + * + * Are you ready? Take a deep breath and join me in the core of the Host, in + * "make Host". +:*/ + +static struct option opts[] = { + { "verbose", 0, NULL, 'v' }, + { "tunnet", 1, NULL, 't' }, + { "block", 1, NULL, 'b' }, + { "rng", 0, NULL, 'r' }, + { "initrd", 1, NULL, 'i' }, + { "username", 1, NULL, 'u' }, + { "chroot", 1, NULL, 'c' }, + { NULL }, +}; +static void usage(void) +{ + errx(1, "Usage: lguest [--verbose] " + "[--tunnet=(:|bridge::)\n" + "|--block=|--initrd=]...\n" + " vmlinux [args...]"); +} + +/*L:105 The main routine is where the real work begins: */ +int main(int argc, char *argv[]) +{ + /* Memory, code startpoint and size of the (optional) initrd. */ + unsigned long mem = 0, start, initrd_size = 0; + /* Two temporaries. */ + int i, c; + /* The boot information for the Guest. */ + struct boot_params *boot; + /* If they specify an initrd file to load. */ + const char *initrd_name = NULL; + + /* Password structure for initgroups/setres[gu]id */ + struct passwd *user_details = NULL; + + /* Directory to chroot to */ + char *chroot_path = NULL; + + /* Save the args: we "reboot" by execing ourselves again. */ + main_args = argv; + + /* + * First we initialize the device list. We keep a pointer to the last + * device, and the next interrupt number to use for devices (1: + * remember that 0 is used by the timer). + */ + devices.lastdev = NULL; + devices.next_irq = 1; + + /* We're CPU 0. In fact, that's the only CPU possible right now. */ + cpu_id = 0; + + /* + * We need to know how much memory so we can set up the device + * descriptor and memory pages for the devices as we parse the command + * line. So we quickly look through the arguments to find the amount + * of memory now. + */ + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-') { + mem = atoi(argv[i]) * 1024 * 1024; + /* + * We start by mapping anonymous pages over all of + * guest-physical memory range. This fills it with 0, + * and ensures that the Guest won't be killed when it + * tries to access it. + */ + guest_base = map_zeroed_pages(mem / getpagesize() + + DEVICE_PAGES); + guest_limit = mem; + guest_max = mem + DEVICE_PAGES*getpagesize(); + devices.descpage = get_pages(1); + break; + } + } + + /* The options are fairly straight-forward */ + while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { + switch (c) { + case 'v': + verbose = true; + break; + case 't': + setup_tun_net(optarg); + break; + case 'b': + setup_block_file(optarg); + break; + case 'r': + setup_rng(); + break; + case 'i': + initrd_name = optarg; + break; + case 'u': + user_details = getpwnam(optarg); + if (!user_details) + err(1, "getpwnam failed, incorrect username?"); + break; + case 'c': + chroot_path = optarg; + break; + default: + warnx("Unknown argument %s", argv[optind]); + usage(); + } + } + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. + */ + if (optind + 2 > argc) + usage(); + + verbose("Guest base is at %p\n", guest_base); + + /* We always have a console device */ + setup_console(); + + /* Now we load the kernel */ + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); + + /* Boot information is stashed at physical address 0 */ + boot = from_guest_phys(0); + + /* Map the initrd image if requested (at top of physical memory) */ + if (initrd_name) { + initrd_size = load_initrd(initrd_name, mem); + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ + boot->hdr.ramdisk_image = mem - initrd_size; + boot->hdr.ramdisk_size = initrd_size; + /* The bootloader type 0xFF means "unknown"; that's OK. */ + boot->hdr.type_of_loader = 0xFF; + } + + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ + boot->e820_entries = 1; + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); + /* We use a simple helper to copy the arguments separated by spaces. */ + concat((char *)(boot + 1), argv+optind+2); + + /* Boot protocol version: 2.07 supports the fields for lguest. */ + boot->hdr.version = 0x207; + + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = 1; + + /* Tell the entry path not to try to reload segment registers. */ + boot->hdr.loadflags |= KEEP_SEGMENTS; + + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ + tell_kernel(start); + + /* Ensure that we terminate if a device-servicing child dies. */ + signal(SIGCHLD, kill_launcher); + + /* If we exit via err(), this kills all the threads, restores tty. */ + atexit(cleanup_devices); + + /* If requested, chroot to a directory */ + if (chroot_path) { + if (chroot(chroot_path) != 0) + err(1, "chroot(\"%s\") failed", chroot_path); + + if (chdir("/") != 0) + err(1, "chdir(\"/\") failed"); + + verbose("chroot done\n"); + } + + /* If requested, drop privileges */ + if (user_details) { + uid_t u; + gid_t g; + + u = user_details->pw_uid; + g = user_details->pw_gid; + + if (initgroups(user_details->pw_name, g) != 0) + err(1, "initgroups failed"); + + if (setresgid(g, g, g) != 0) + err(1, "setresgid failed"); + + if (setresuid(u, u, u) != 0) + err(1, "setresuid failed"); + + verbose("Dropping privileges completed\n"); + } + + /* Finally, run the Guest. This doesn't return. */ + run_guest(); +} +/*:*/ + +/*M:999 + * Mastery is done: you now know everything I do. + * + * But surely you have seen code, features and bugs in your wanderings which + * you now yearn to attack? That is the real game, and I look forward to you + * patching and forking lguest into the Your-Name-Here-visor. + * + * Farewell, and good coding! + * Rusty Russell. + */ diff --git a/Documentation/virtual/lguest/lguest.txt b/Documentation/virtual/lguest/lguest.txt new file mode 100644 index 0000000..bff0c55 --- /dev/null +++ b/Documentation/virtual/lguest/lguest.txt @@ -0,0 +1,129 @@ + __ + (___()'`; Rusty's Remarkably Unreliable Guide to Lguest + /, /` - or, A Young Coder's Illustrated Hypervisor + \\"--\\ http://lguest.ozlabs.org + +Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, +for Linux developers and users to experiment with virtualization with the +minimum of complexity. Nonetheless, it should have sufficient features to +make it useful for specific tasks, and, of course, you are encouraged to fork +and enhance it (see drivers/lguest/README). + +Features: + +- Kernel module which runs in a normal kernel. +- Simple I/O model for communication. +- Simple program to create new guests. +- Logo contains cute puppies: http://lguest.ozlabs.org + +Developer features: + +- Fun to hack on. +- No ABI: being tied to a specific kernel anyway, you can change anything. +- Many opportunities for improvement or feature implementation. + +Running Lguest: + +- The easiest way to run lguest is to use same kernel as guest and host. + You can configure them differently, but usually it's easiest not to. + + You will need to configure your kernel with the following options: + + "General setup": + "Prompt for development and/or incomplete code/drivers" = Y + (CONFIG_EXPERIMENTAL=y) + + "Processor type and features": + "Paravirtualized guest support" = Y + "Lguest guest support" = Y + "High Memory Support" = off/4GB + "Alignment value to which kernel should be aligned" = 0x100000 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and + CONFIG_PHYSICAL_ALIGN=0x100000) + + "Device Drivers": + "Block devices" + "Virtio block driver (EXPERIMENTAL)" = M/Y + "Network device support" + "Universal TUN/TAP device driver support" = M/Y + "Virtio network driver (EXPERIMENTAL)" = M/Y + (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) + + "Virtualization" + "Linux hypervisor example code" = M/Y + (CONFIG_LGUEST=m) + +- A tool called "lguest" is available in this directory: type "make" + to build it. If you didn't build your kernel in-tree, use "make + O=". + +- Create or find a root disk image. There are several useful ones + around, such as the xm-test tiny root image at + http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img + + For more serious work, I usually use a distribution ISO image and + install it under qemu, then make multiple copies: + + dd if=/dev/zero of=rootfile bs=1M count=2048 + qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d + + Make sure that you install a getty on /dev/hvc0 if you want to log in on the + console! + +- "modprobe lg" if you built it as a module. + +- Run an lguest as root: + + Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + --block=rootfile root=/dev/vda + + Explanation: + 64: the amount of memory to use, in MB. + + vmlinux: the kernel image found in the top of your build directory. You + can also use a standard bzImage. + + --tunnet=192.168.19.1: configures a "tap" device for networking with this + IP address. + + --block=rootfile: a file or block device which becomes /dev/vda + inside the guest. + + root=/dev/vda: this (and anything else on the command line) are + kernel boot parameters. + +- Configuring networking. I usually have the host masquerade, using + "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > + /proc/sys/net/ipv4/ip_forward". In this example, I would configure + eth0 inside the guest at 192.168.19.2. + + Another method is to bridge the tap device to an external interface + using --tunnet=bridge:, and perhaps run dhcp on the guest + to obtain an IP address. The bridge needs to be configured first: + this option simply adds the tap interface to it. + + A simple example on my system: + + ifconfig eth0 0.0.0.0 + brctl addbr lg0 + ifconfig lg0 up + brctl addif lg0 eth0 + dhclient lg0 + + Then use --tunnet=bridge:lg0 when launching the guest. + + See: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge + + for general information on how to get bridging to work. + +- Random number generation. Using the --rng option will provide a + /dev/hwrng in the guest that will read from the host's /dev/random. + Use this option in conjunction with rng-tools (see ../hw_random.txt) + to provide entropy to the guest kernel's /dev/random. + +There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest + +Good luck! +Rusty Russell rusty@rustcorp.com.au. diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt new file mode 100644 index 0000000..5d0fc8b --- /dev/null +++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt @@ -0,0 +1,4589 @@ + User Mode Linux HOWTO + User Mode Linux Core Team + Mon Nov 18 14:16:16 EST 2002 + + This document describes the use and abuse of Jeff Dike's User Mode + Linux: a port of the Linux kernel as a normal Intel Linux process. + ______________________________________________________________________ + + Table of Contents + + 1. Introduction + + 1.1 How is User Mode Linux Different? + 1.2 Why Would I Want User Mode Linux? + + 2. Compiling the kernel and modules + + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. Sharing Filesystems between Virtual Machines + + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 uml_moo : Merging a COW file with its backing file + + 8. Creating filesystems + + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + + 9.1 Using hostfs + 9.2 hostfs as the root filesystem + 9.3 Building hostfs + + 10. The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + + 11. Kernel debugging + + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. What to do when UML doesn't work + + 13.1 Strange compilation errors when you build from source + 13.2 (obsolete) + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + + ______________________________________________________________________ + + 11.. IInnttrroodduuccttiioonn + + Welcome to User Mode Linux. It's going to be fun. + + + + 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt?? + + Normally, the Linux Kernel talks straight to your hardware (video + card, keyboard, hard drives, etc), and any programs which run ask the + kernel to operate the hardware, like so: + + + + +-----------+-----------+----+ + | Process 1 | Process 2 | ...| + +-----------+-----------+----+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + The User Mode Linux Kernel is different; instead of talking to the + hardware, it talks to a `real' Linux kernel (called the `host kernel' + from now on), like any other program. Programs can then run inside + User-Mode Linux as if they were running under a normal kernel, like + so: + + + + +----------------+ + | Process 2 | ...| + +-----------+----------------+ + | Process 1 | User-Mode Linux| + +----------------------------+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + + 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx?? + + + 1. If User Mode Linux crashes, your host kernel is still fine. + + 2. You can run a usermode kernel as a non-root user. + + 3. You can debug the User Mode Linux like any normal process. + + 4. You can run gprof (profiling) and gcov (coverage testing). + + 5. You can play with your kernel without breaking things. + + 6. You can use it as a sandbox for testing new apps. + + 7. You can try new development kernels safely. + + 8. You can run different distributions simultaneously. + + 9. It's extremely fun. + + + + + + 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess + + + + + 22..11.. CCoommppiilliinngg tthhee kkeerrnneell + + + Compiling the user mode kernel is just like compiling any other + kernel. Let's go through the steps, using 2.4.0-prerelease (current + as of this writing) as an example: + + + 1. Download the latest UML patch from + + the download page + . + + + 3. Make a directory and unpack the kernel into it. + + + + host% + mkdir ~/uml + + + + + + + host% + cd ~/uml + + + + + + + host% + tar -xzvf linux-2.4.0-prerelease.tar.bz2 + + + + + + + 4. Apply the patch using + + + + host% + cd ~/uml/linux + + + + host% + bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 + + + + + + + 5. Run your favorite config; `make xconfig ARCH=um' is the most + convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' + will work as well. The defaults will give you a useful kernel. If + you want to change something, go ahead, it probably won't hurt + anything. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries + will not run. They will immediately segfault. See ``UML on 2G/2G + hosts'' for the scoop on running UML on your system. + + + + 6. Finish with `make linux ARCH=um': the result is a file called + `linux' in the top directory of your source tree. + + Make sure that you don't build this kernel in /usr/src/linux. On some + distributions, /usr/include/asm is a link into this pool. The user- + mode build changes the other end of that link, and things that include + stop compiling. + + The sources are also available from cvs at the project's cvs page, + which has directions on getting the sources. You can also browse the + CVS pool from there. + + If you get the CVS sources, you will have to check them out into an + empty directory. You will then have to copy each file into the + corresponding directory in the appropriate kernel pool. + + If you don't have the latest kernel pool, you can get the + corresponding user-mode sources with + + + host% cvs co -r v_2_3_x linux + + + + + where 'x' is the version in your pool. Note that you will not get the + bug fixes and enhancements that have gone into subsequent releases. + + + 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess + + UML modules are built in the same way as the native kernel (with the + exception of the 'ARCH=um' that you always need for UML): + + + host% make modules ARCH=um + + + + + Any modules that you want to load into this kernel need to be built in + the user-mode pool. Modules from the native kernel won't work. + + You can install them by using ftp or something to copy them into the + virtual machine and dropping them into /lib/modules/`uname -r`. + + You can also get the kernel build process to install them as follows: + + 1. with the kernel not booted, mount the root filesystem in the top + level of the kernel pool: + + + host% mount root_fs mnt -o loop + + + + + + + 2. run + + + host% + make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um + + + + + + + 3. unmount the filesystem + + + host% umount mnt + + + + + + + 4. boot the kernel on it + + + When the system is booted, you can use insmod as usual to get the + modules into the kernel. A number of things have been loaded into UML + as modules, especially filesystems and network protocols and filters, + so most symbols which need to be exported probably already are. + However, if you do find symbols that need exporting, let us + know, and + they'll be "taken care of". + + + + 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess + + Many features of the UML kernel require a user-space helper program, + so a uml_utilities package is distributed separately from the kernel + patch which provides these helpers. Included within this is: + + +o port-helper - Used by consoles which connect to xterms or ports + + +o tunctl - Configuration tool to create and delete tap devices + + +o uml_net - Setuid binary for automatic tap device configuration + + +o uml_switch - User-space virtual switch required for daemon + transport + + The uml_utilities tree is compiled with: + + + host# + make && make install + + + + + Note that UML kernel patches may require a specific version of the + uml_utilities distribution. If you don't keep up with the mailing + lists, ensure that you have the latest release of uml_utilities if you + are experiencing problems with your UML kernel, particularly when + dealing with consoles or command-line switches to the helper programs + + + + + + + + + 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn + + + + 33..11.. RRuunnnniinngg UUMMLL + + It runs on 2.2.15 or later, and all 2.4 kernels. + + + Booting UML is straightforward. Simply run 'linux': it will try to + mount the file `root_fs' in the current directory. You do not need to + run it as root. If your root filesystem is not named `root_fs', then + you need to put a `ubd0=root_fs_whatever' switch on the linux command + line. + + + You will need a filesystem to boot UML from. There are a number + available for download from here . There are also several tools + which can be + used to generate UML-compatible filesystem images from media. + The kernel will boot up and present you with a login prompt. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries will + not run. They will immediately segfault. See ``UML on 2G/2G hosts'' + for the scoop on running UML on your system. + + + + 33..22.. LLooggggiinngg iinn + + + + The prepackaged filesystems have a root account with password 'root' + and a user account with password 'user'. The login banner will + generally tell you how to log in. So, you log in and you will find + yourself inside a little virtual machine. Our filesystems have a + variety of commands and utilities installed (and it is fairly easy to + add more), so you will have a lot of tools with which to poke around + the system. + + There are a couple of other ways to log in: + + +o On a virtual console + + + + Each virtual console that is configured (i.e. the device exists in + /dev and /etc/inittab runs a getty on it) will come up in its own + xterm. If you get tired of the xterms, read ``Setting up serial + lines and consoles'' to see how to attach the consoles to + something else, like host ptys. + + + + +o Over the serial line + + + In the boot output, find a line that looks like: + + + + serial line 0 assigned pty /dev/ptyp1 + + + + + Attach your favorite terminal program to the corresponding tty. I.e. + for minicom, the command would be + + + host% minicom -o -p /dev/ttyp1 + + + + + + + +o Over the net + + + If the network is running, then you can telnet to the virtual + machine and log in to it. See ``Setting up the network'' to learn + about setting up a virtual network. + + When you're done using it, run halt, and the kernel will bring itself + down and the process will exit. + + + 33..33.. EExxaammpplleess + + Here are some examples of UML in action: + + +o A login session + + +o A virtual network + + + + + + + + 44.. UUMMLL oonn 22GG//22GG hhoossttss + + + + + 44..11.. IInnttrroodduuccttiioonn + + + Most Linux machines are configured so that the kernel occupies the + upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and + processes use the lower 3G (0x00000000 - 0xbfffffff). However, some + machine are configured with a 2G/2G split, with the kernel occupying + the upper 2G (0x80000000 - 0xffffffff) and processes using the lower + 2G (0x00000000 - 0x7fffffff). + + + + + 44..22.. TThhee pprroobblleemm + + + The prebuilt UML binaries on this site will not run on 2G/2G hosts + because UML occupies the upper .5G of the 3G process address space + (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right + in the middle of the kernel address space, so UML won't even load - it + will immediately segfault. + + + + + 44..33.. TThhee ssoolluuttiioonn + + + The fix for this is to rebuild UML from source after enabling + CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to + load itself in the top .5G of that smaller process address space, + where it will run fine. See ``Compiling the kernel and modules'' if + you need help building UML from source. + + + + + + + + + + + 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess + + + It is possible to attach UML serial lines and consoles to many types + of host I/O channels by specifying them on the command line. + + + You can attach them to host ptys, ttys, file descriptors, and ports. + This allows you to do things like + + +o have a UML console appear on an unused host console, + + +o hook two virtual machines together by having one attach to a pty + and having the other attach to the corresponding tty + + +o make a virtual machine accessible from the net by attaching a + console to a port on the host. + + + The general format of the command line option is device=channel. + + + + 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee + + Devices are specified with "con" or "ssl" (console or serial line, + respectively), optionally with a device number if you are talking + about a specific device. + + + Using just "con" or "ssl" describes all of the consoles or serial + lines. If you want to talk about console #3 or serial line #10, they + would be "con3" and "ssl10", respectively. + + + A specific device name will override a less general "con=" or "ssl=". + So, for example, you can assign a pty to each of the serial lines + except for the first two like this: + + + ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 + + + + + The specificity of the device name is all that matters; order on the + command line is irrelevant. + + + + 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell + + There are a number of different types of channels to attach a UML + device to, each with a different way of specifying exactly what to + attach to. + + +o pseudo-terminals - device=pty pts terminals - device=pts + + + This will cause UML to allocate a free host pseudo-terminal for the + device. The terminal that it got will be announced in the boot + log. You access it by attaching a terminal program to the + corresponding tty: + + +o screen /dev/pts/n + + +o screen /dev/ttyxx + + +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + devices + + +o kermit - start it up, 'open' the device, then 'connect' + + + + + + +o terminals - device=tty:tty device file + + + This will make UML attach the device to the specified tty (i.e + + + con1=tty:/dev/tty3 + + + + + will attach UML's console 1 to the host's /dev/tty3). If the tty that + you specify is the slave end of a tty/pty pair, something else must + have already opened the corresponding pty in order for this to work. + + + + + + +o xterms - device=xterm + + + UML will run an xterm and the device will be attached to it. + + + + + + +o Port - device=port:port number + + + This will attach the UML devices to the specified host port. + Attaching console 1 to the host's port 9000 would be done like + this: + + + con1=port:9000 + + + + + Attaching all the serial lines to that port would be done similarly: + + + ssl=port:9000 + + + + + You access these devices by telnetting to that port. Each active tel- + net session gets a different device. If there are more telnets to a + port than UML devices attached to it, then the extra telnet sessions + will block until an existing telnet detaches, or until another device + becomes active (i.e. by being activated in /etc/inittab). + + This channel has the advantage that you can both attach multiple UML + devices to it and know how to access them without reading the UML boot + log. It is also unique in allowing access to a UML from remote + machines without requiring that the UML be networked. This could be + useful in allowing public access to UMLs because they would be + accessible from the net, but wouldn't need any kind of network + filtering or access control because they would have no network access. + + + If you attach the main console to a portal, then the UML boot will + appear to hang. In reality, it's waiting for a telnet to connect, at + which point the boot will proceed. + + + + + + +o already-existing file descriptors - device=file descriptor + + + If you set up a file descriptor on the UML command line, you can + attach a UML device to it. This is most commonly used to put the + main console back on stdin and stdout after assigning all the other + consoles to something else: + + + con0=fd:0,fd:1 con=pts + + + + + + + + + +o Nothing - device=null + + + This allows the device to be opened, in contrast to 'none', but + reads will block, and writes will succeed and the data will be + thrown out. + + + + + + +o None - device=none + + + This causes the device to disappear. + + + + You can also specify different input and output channels for a device + by putting a comma between them: + + + ssl3=tty:/dev/tty2,xterm + + + + + will cause serial line 3 to accept input on the host's /dev/tty3 and + display output on an xterm. That's a silly example - the most common + use of this syntax is to reattach the main console to stdin and stdout + as shown above. + + + If you decide to move the main console away from stdin/stdout, the + initial boot output will appear in the terminal that you're running + UML in. However, once the console driver has been officially + initialized, then the boot output will start appearing wherever you + specified that console 0 should be. That device will receive all + subsequent output. + + + + 55..33.. EExxaammpplleess + + There are a number of interesting things you can do with this + capability. + + + First, this is how you get rid of those bleeding console xterms by + attaching them to host ptys: + + + con=pty con0=fd:0,fd:1 + + + + + This will make a UML console take over an unused host virtual console, + so that when you switch to it, you will see the UML login prompt + rather than the host login prompt: + + + con1=tty:/dev/tty6 + + + + + You can attach two virtual machines together with what amounts to a + serial line as follows: + + Run one UML with a serial line attached to a pty - + + + ssl1=pty + + + + + Look at the boot log to see what pty it got (this example will assume + that it got /dev/ptyp1). + + Boot the other UML with a serial line attached to the corresponding + tty - + + + ssl1=tty:/dev/ttyp1 + + + + + Log in, make sure that it has no getty on that serial line, attach a + terminal program like minicom to it, and you should see the login + prompt of the other virtual machine. + + + 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk + + + + This page describes how to set up the various transports and to + provide a UML instance with network access to the host, other machines + on the local net, and the rest of the net. + + + As of 2.4.5, UML networking has been completely redone to make it much + easier to set up, fix bugs, and add new features. + + + There is a new helper, uml_net, which does the host setup that + requires root privileges. + + + There are currently five transport types available for a UML virtual + machine to exchange packets with other hosts: + + +o ethertap + + +o TUN/TAP + + +o Multicast + + +o a switch daemon + + +o slip + + +o slirp + + +o pcap + + The TUN/TAP, ethertap, slip, and slirp transports allow a UML + instance to exchange packets with the host. They may be directed + to the host or the host may just act as a router to provide access + to other physical or virtual machines. + + + The pcap transport is a synthetic read-only interface, using the + libpcap binary to collect packets from interfaces on the host and + filter them. This is useful for building preconfigured traffic + monitors or sniffers. + + + The daemon and multicast transports provide a completely virtual + network to other virtual machines. This network is completely + disconnected from the physical network unless one of the virtual + machines on it is acting as a gateway. + + + With so many host transports, which one should you use? Here's when + you should use each one: + + +o ethertap - if you want access to the host networking and it is + running 2.2 + + +o TUN/TAP - if you want access to the host networking and it is + running 2.4. Also, the TUN/TAP transport is able to use a + preconfigured device, allowing it to avoid using the setuid uml_net + helper, which is a security advantage. + + +o Multicast - if you want a purely virtual network and you don't want + to set up anything but the UML + + +o a switch daemon - if you want a purely virtual network and you + don't mind running the daemon in order to get somewhat better + performance + + +o slip - there is no particular reason to run the slip backend unless + ethertap and TUN/TAP are just not available for some reason + + +o slirp - if you don't have root access on the host to setup + networking, or if you don't want to allocate an IP to your UML + + +o pcap - not much use for actual network connectivity, but great for + monitoring traffic on the host + + Ethertap is available on 2.4 and works fine. TUN/TAP is preferred + to it because it has better performance and ethertap is officially + considered obsolete in 2.4. Also, the root helper only needs to + run occasionally for TUN/TAP, rather than handling every packet, as + it does with ethertap. This is a slight security advantage since + it provides fewer opportunities for a nasty UML user to somehow + exploit the helper's root privileges. + + + 66..11.. GGeenneerraall sseettuupp + + First, you must have the virtual network enabled in your UML. If are + running a prebuilt kernel from this site, everything is already + enabled. If you build the kernel yourself, under the "Network device + support" menu, enable "Network device support", and then the three + transports. + + + The next step is to provide a network device to the virtual machine. + This is done by describing it on the kernel command line. + + The general format is + + + eth = , + + + + + For example, a virtual ethernet device may be attached to a host + ethertap device as follows: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + This sets up eth0 inside the virtual machine to attach itself to the + host /dev/tap0, assigns it an ethernet address, and assigns the host + tap0 interface an IP address. + + + + Note that the IP address you assign to the host end of the tap device + must be different than the IP you assign to the eth device inside UML. + If you are short on IPs and don't want to consume two per UML, then + you can reuse the host's eth IP address for the host ends of the tap + devices. Internally, the UMLs must still get unique IPs for their eth + devices. You can also give the UMLs non-routable IPs (192.168.x.x or + 10.x.x.x) and have the host masquerade them. This will let outgoing + connections work, but incoming connections won't without more work, + such as port forwarding from the host. + Also note that when you configure the host side of an interface, it is + only acting as a gateway. It will respond to pings sent to it + locally, but is not useful to do that since it's a host interface. + You are not talking to the UML when you ping that interface and get a + response. + + + You can also add devices to a UML and remove them at runtime. See the + ``The Management Console'' page for details. + + + The sections below describe this in more detail. + + + Once you've decided how you're going to set up the devices, you boot + UML, log in, configure the UML side of the devices, and set up routes + to the outside world. At that point, you will be able to talk to any + other machines, physical or virtual, on the net. + + + If ifconfig inside UML fails and the network refuses to come up, run + tell you what went wrong. + + + + 66..22.. UUsseerrssppaaccee ddaaeemmoonnss + + You will likely need the setuid helper, or the switch daemon, or both. + They are both installed with the RPM and deb, so if you've installed + either, you can skip the rest of this section. + + + If not, then you need to check them out of CVS, build them, and + install them. The helper is uml_net, in CVS /tools/uml_net, and the + daemon is uml_switch, in CVS /tools/uml_router. They are both built + with a plain 'make'. Both need to be installed in a directory that's + in your path - /usr/bin is recommend. On top of that, uml_net needs + to be setuid root. + + + + 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess + + Below, you will see that the TUN/TAP, ethertap, and daemon interfaces + allow you to specify hardware addresses for the virtual ethernet + devices. This is generally not necessary. If you don't have a + specific reason to do it, you probably shouldn't. If one is not + specified on the command line, the driver will assign one based on the + device IP address. It will provide the address fe:fd:nn:nn:nn:nn + where nn.nn.nn.nn is the device IP address. This is nearly always + sufficient to guarantee a unique hardware address for the device. A + couple of exceptions are: + + +o Another set of virtual ethernet devices are on the same network and + they are assigned hardware addresses using a different scheme which + may conflict with the UML IP address-based scheme + + +o You aren't going to use the device for IP networking, so you don't + assign the device an IP address + + If you let the driver provide the hardware address, you should make + sure that the device IP address is known before the interface is + brought up. So, inside UML, this will guarantee that: + + + + UML# + ifconfig eth0 192.168.0.250 up + + + + + If you decide to assign the hardware address yourself, make sure that + the first byte of the address is even. Addresses with an odd first + byte are broadcast addresses, which you don't want assigned to a + device. + + + + 66..44.. UUMMLL iinntteerrffaaccee sseettuupp + + Once the network devices have been described on the command line, you + should boot UML and log in. + + + The first thing to do is bring the interface up: + + + UML# ifconfig ethn ip-address up + + + + + You should be able to ping the host at this point. + + + To reach the rest of the world, you should set a default route to the + host: + + + UML# route add default gw host ip + + + + + Again, with host ip of 192.168.0.4: + + + UML# route add default gw 192.168.0.4 + + + + + This page used to recommend setting a network route to your local net. + This is wrong, because it will cause UML to try to figure out hardware + addresses of the local machines by arping on the interface to the + host. Since that interface is basically a single strand of ethernet + with two nodes on it (UML and the host) and arp requests don't cross + networks, they will fail to elicit any responses. So, what you want + is for UML to just blindly throw all packets at the host and let it + figure out what to do with them, which is what leaving out the network + route and adding the default route does. + + + Note: If you can't communicate with other hosts on your physical + ethernet, it's probably because of a network route that's + automatically set up. If you run 'route -n' and see a route that + looks like this: + + + + + Destination Gateway Genmask Flags Metric Ref Use Iface + 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 + + + + + with a mask that's not 255.255.255.255, then replace it with a route + to your host: + + + UML# + route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 + + + + + + + UML# + route add -host 192.168.0.4 dev eth0 + + + + + This, plus the default route to the host, will allow UML to exchange + packets with any machine on your ethernet. + + + + 66..55.. MMuullttiiccaasstt + + The simplest way to set up a virtual network between multiple UMLs is + to use the mcast transport. This was written by Harald Welte and is + present in UML version 2.4.5-5um and later. Your system must have + multicast enabled in the kernel and there must be a multicast-capable + network device on the host. Normally, this is eth0, but if there is + no ethernet card on the host, then you will likely get strange error + messages when you bring the device up inside UML. + + + To use it, run two UMLs with + + + eth0=mcast + + + + + on their command lines. Log in, configure the ethernet device in each + machine with different IP addresses: + + + UML1# ifconfig eth0 192.168.0.254 + + + + + + + UML2# ifconfig eth0 192.168.0.253 + + + + + and they should be able to talk to each other. + + The full set of command line options for this transport are + + + + ethn=mcast,ethernet address,multicast + address,multicast port,ttl + + + + + Harald's original README is here and explains these in detail, as well as + some other issues. + + There is also a related point-to-point only "ucast" transport. + This is useful when your network does not support multicast, and + all network connections are simple point to point links. + + The full set of command line options for this transport are + + + ethn=ucast,ethernet address,remote address,listen port,remote port + + + + + 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr + + TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the + host. The TUN/TAP backend has been in UML since 2.4.9-3um. + + + The easiest way to get up and running is to let the setuid uml_net + helper do the host setup for you. This involves insmod-ing the tun.o + module if necessary, configuring the device, and setting up IP + forwarding, routing, and proxy arp. If you are new to UML networking, + do this first. If you're concerned about the security implications of + the setuid helper, use it to get up and running, then read the next + section to see how to have UML use a preconfigured tap device, which + avoids the use of uml_net. + + + If you specify an IP address for the host side of the device, the + uml_net helper will do all necessary setup on the host - the only + requirement is that TUN/TAP be available, either built in to the host + kernel or as the tun.o module. + + The format of the command line switch to attach a device to a TUN/TAP + device is + + + eth =tuntap,,, + + + + + For example, this argument will attach the UML's eth0 to the next + available tap device and assign an ethernet address to it based on its + IP address + + + eth0=tuntap,,,192.168.0.254 + + + + + + + Note that the IP address that must be used for the eth device inside + UML is fixed by the routing and proxy arp that is set up on the + TUN/TAP device on the host. You can use a different one, but it won't + work because reply packets won't reach the UML. This is a feature. + It prevents a nasty UML user from doing things like setting the UML IP + to the same as the network's nameserver or mail server. + + + There are a couple potential problems with running the TUN/TAP + transport on a 2.4 host kernel + + +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + kernel or use the ethertap transport. + + +o With an upgraded kernel, TUN/TAP may fail with + + + File descriptor in bad state + + + + + This is due to a header mismatch between the upgraded kernel and the + kernel that was originally installed on the machine. The fix is to + make sure that /usr/src/linux points to the headers for the running + kernel. + + These were pointed out by Tim Robinson in + name="this uml- + user post"> . + + + + 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee + + If you prefer not to have UML use uml_net (which is somewhat + insecure), with UML 2.4.17-11, you can set up a TUN/TAP device + beforehand. The setup needs to be done as root, but once that's done, + there is no need for root assistance. Setting up the device is done + as follows: + + +o Create the device with tunctl (available from the UML utilities + tarball) + + + + + host# tunctl -u uid + + + + + where uid is the user id or username that UML will be run as. This + will tell you what device was created. + + +o Configure the device IP (change IP addresses and device name to + suit) + + + + + host# ifconfig tap0 192.168.0.254 up + + + + + + +o Set up routing and arping if desired - this is my recipe, there are + other ways of doing the same thing + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' + + host# + route add -host 192.168.0.253 dev tap0 + + + + + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' + + + + + + + host# + arp -Ds 192.168.0.253 eth0 pub + + + + + Note that this must be done every time the host boots - this configu- + ration is not stored across host reboots. So, it's probably a good + idea to stick it in an rc file. An even better idea would be a little + utility which reads the information from a config file and sets up + devices at boot time. + + +o Rather than using up two IPs and ARPing for one of them, you can + also provide direct access to your LAN by the UML by using a + bridge. + + + host# + brctl addbr br0 + + + + + + + host# + ifconfig eth0 0.0.0.0 promisc up + + + + + + + host# + ifconfig tap0 0.0.0.0 promisc up + + + + + + + host# + ifconfig br0 192.168.0.1 netmask 255.255.255.0 up + + + + + + + + host# + brctl stp br0 off + + + + + + + host# + brctl setfd br0 1 + + + + + + + host# + brctl sethello br0 1 + + + + + + + host# + brctl addif br0 eth0 + + + + + + + host# + brctl addif br0 tap0 + + + + + Note that 'br0' should be setup using ifconfig with the existing IP + address of eth0, as eth0 no longer has its own IP. + + +o + + + Also, the /dev/net/tun device must be writable by the user running + UML in order for the UML to use the device that's been configured + for it. The simplest thing to do is + + + host# chmod 666 /dev/net/tun + + + + + Making it world-writable looks bad, but it seems not to be + exploitable as a security hole. However, it does allow anyone to cre- + ate useless tap devices (useless because they can't configure them), + which is a DOS attack. A somewhat more secure alternative would to be + to create a group containing all the users who have preconfigured tap + devices and chgrp /dev/net/tun to that group with mode 664 or 660. + + + +o Once the device is set up, run UML with 'eth0=tuntap,device name' + (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the + mconsole config command). + + +o Bring the eth device up in UML and you're in business. + + If you don't want that tap device any more, you can make it non- + persistent with + + + host# tunctl -d tap device + + + + + Finally, tunctl has a -b (for brief mode) switch which causes it to + output only the name of the tap device it created. This makes it + suitable for capture by a script: + + + host# TAP=`tunctl -u 1000 -b` + + + + + + + 66..88.. EEtthheerrttaapp + + Ethertap is the general mechanism on 2.2 for userspace processes to + exchange packets with the kernel. + + + + To use this transport, you need to describe the virtual network device + on the UML command line. The general format for this is + + + eth =ethertap, , , + + + + + So, the previous example + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + attaches the UML eth0 device to the host /dev/tap0, assigns it the + ethernet address fe:fd:0:0:0:1, and assigns the IP address + 192.168.0.254 to the tap device. + + + + The tap device is mandatory, but the others are optional. If the + ethernet address is omitted, one will be assigned to it. + + + The presence of the tap IP address will cause the helper to run and do + whatever host setup is needed to allow the virtual machine to + communicate with the outside world. If you're not sure you know what + you're doing, this is the way to go. + + + If it is absent, then you must configure the tap device and whatever + arping and routing you will need on the host. However, even in this + case, the uml_net helper still needs to be in your path and it must be + setuid root if you're not running UML as root. This is because the + tap device doesn't support SIGIO, which UML needs in order to use + something as a source of input. So, the helper is used as a + convenient asynchronous IO thread. + + If you're using the uml_net helper, you can ignore the following host + setup - uml_net will do it for you. You just need to make sure you + have ethertap available, either built in to the host kernel or + available as a module. + + + If you want to set things up yourself, you need to make sure that the + appropriate /dev entry exists. If it doesn't, become root and create + it as follows: + + + mknod /dev/tap c 36 + 16 + + + + + For example, this is how to create /dev/tap0: + + + mknod /dev/tap0 c 36 0 + 16 + + + + + You also need to make sure that the host kernel has ethertap support. + If ethertap is enabled as a module, you apparently need to insmod + ethertap once for each ethertap device you want to enable. So, + + + host# + insmod ethertap + + + + + will give you the tap0 interface. To get the tap1 interface, you need + to run + + + host# + insmod ethertap unit=1 -o ethertap1 + + + + + + + + 66..99.. TThhee sswwiittcchh ddaaeemmoonn + + NNoottee: This is the daemon formerly known as uml_router, but which was + renamed so the network weenies of the world would stop growling at me. + + + The switch daemon, uml_switch, provides a mechanism for creating a + totally virtual network. By default, it provides no connection to the + host network (but see -tap, below). + + + The first thing you need to do is run the daemon. Running it with no + arguments will make it listen on a default pair of unix domain + sockets. + + + If you want it to listen on a different pair of sockets, use + + + -unix control socket data socket + + + + + + If you want it to act as a hub rather than a switch, use + + + -hub + + + + + + If you want the switch to be connected to host networking (allowing + the umls to get access to the outside world through the host), use + + + -tap tap0 + + + + + + Note that the tap device must be preconfigured (see "TUN/TAP with a + preconfigured tap device", above). If you're using a different tap + device than tap0, specify that instead of tap0. + + + uml_switch can be backgrounded as follows + + + host% + uml_switch [ options ] < /dev/null > /dev/null + + + + + The reason it doesn't background by default is that it listens to + stdin for EOF. When it sees that, it exits. + + + The general format of the kernel command line switch is + + + + ethn=daemon,ethernet address,socket + type,control socket,data socket + + + + + You can leave off everything except the 'daemon'. You only need to + specify the ethernet address if the one that will be assigned to it + isn't acceptable for some reason. The rest of the arguments describe + how to communicate with the daemon. You should only specify them if + you told the daemon to use different sockets than the default. So, if + you ran the daemon with no arguments, running the UML on the same + machine with + eth0=daemon + + + + + will cause the eth0 driver to attach itself to the daemon correctly. + + + + 66..1100.. SSlliipp + + Slip is another, less general, mechanism for a process to communicate + with the host networking. In contrast to the ethertap interface, + which exchanges ethernet frames with the host and can be used to + transport any higher-level protocol, it can only be used to transport + IP. + + + The general format of the command line switch is + + + + ethn=slip,slip IP + + + + + The slip IP argument is the IP address that will be assigned to the + host end of the slip device. If it is specified, the helper will run + and will set up the host so that the virtual machine can reach it and + the rest of the network. + + + There are some oddities with this interface that you should be aware + of. You should only specify one slip device on a given virtual + machine, and its name inside UML will be 'umn', not 'eth0' or whatever + you specified on the command line. These problems will be fixed at + some point. + + + + 66..1111.. SSlliirrpp + + slirp uses an external program, usually /usr/bin/slirp, to provide IP + only networking connectivity through the host. This is similar to IP + masquerading with a firewall, although the translation is performed in + user-space, rather than by the kernel. As slirp does not set up any + interfaces on the host, or changes routing, slirp does not require + root access or setuid binaries on the host. + + + The general format of the command line switch for slirp is: + + + + ethn=slirp,ethernet address,slirp path + + + + + The ethernet address is optional, as UML will set up the interface + with an ethernet address based upon the initial IP address of the + interface. The slirp path is generally /usr/bin/slirp, although it + will depend on distribution. + + + The slirp program can have a number of options passed to the command + line and we can't add them to the UML command line, as they will be + parsed incorrectly. Instead, a wrapper shell script can be written or + the options inserted into the /.slirprc file. More information on + all of the slirp options can be found in its man pages. + + + The eth0 interface on UML should be set up with the IP 10.2.0.15, + although you can use anything as long as it is not used by a network + you will be connecting to. The default route on UML should be set to + use + + + UML# + route add default dev eth0 + + + + + slirp provides a number of useful IP addresses which can be used by + UML, such as 10.0.2.3 which is an alias for the DNS server specified + in /etc/resolv.conf on the host or the IP given in the 'dns' option + for slirp. + + + Even with a baudrate setting higher than 115200, the slirp connection + is limited to 115200. If you need it to go faster, the slirp binary + needs to be compiled with FULL_BOLT defined in config.h. + + + + 66..1122.. ppccaapp + + The pcap transport is attached to a UML ethernet device on the command + line or with uml_mconsole with the following syntax: + + + + ethn=pcap,host interface,filter + expression,option1,option2 + + + + + The expression and options are optional. + + + The interface is whatever network device on the host you want to + sniff. The expression is a pcap filter expression, which is also what + tcpdump uses, so if you know how to specify tcpdump filters, you will + use the same expressions here. The options are up to two of + 'promisc', control whether pcap puts the host interface into + promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap + expression optimizer is used. + + + Example: + + + + eth0=pcap,eth0,tcp + + eth1=pcap,eth0,!tcp + + + + will cause the UML eth0 to emit all tcp packets on the host eth0 and + the UML eth1 to emit all non-tcp packets on the host eth0. + + + + 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff + + If you don't specify an address for the host side of the ethertap or + slip device, UML won't do any setup on the host. So this is what is + needed to get things working (the examples use a host-side IP of + 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your + own network): + + +o The device needs to be configured with its IP address. Tap devices + are also configured with an mtu of 1484. Slip devices are + configured with a point-to-point address pointing at the UML ip + address. + + + host# ifconfig tap0 arp mtu 1484 192.168.0.251 up + + + + + + + host# + ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up + + + + + + +o If a tap device is being set up, a route is set to the UML IP. + + + UML# route add -host 192.168.0.250 gw 192.168.0.251 + + + + + + +o To allow other hosts on your network to see the virtual machine, + proxy arp is set up for it. + + + host# arp -Ds 192.168.0.250 eth0 pub + + + + + + +o Finally, the host is set up to route packets. + + + host# echo 1 > /proc/sys/net/ipv4/ip_forward + + + + + + + + + + + 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess + + + + + 77..11.. AA wwaarrnniinngg + + Don't attempt to share filesystems simply by booting two UMLs from the + same file. That's the same thing as booting two physical machines + from a shared disk. It will result in filesystem corruption. + + + + 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess + + The way to share a filesystem between two virtual machines is to use + the copy-on-write (COW) layering capability of the ubd block driver. + As of 2.4.6-2um, the driver supports layering a read-write private + device over a read-only shared device. A machine's writes are stored + in the private device, while reads come from either device - the + private one if the requested block is valid in it, the shared one if + not. Using this scheme, the majority of data which is unchanged is + shared between an arbitrary number of virtual machines, each of which + has a much smaller file containing the changes that it has made. With + a large number of UMLs booting from a large root filesystem, this + leads to a huge disk space saving. It will also help performance, + since the host will be able to cache the shared data using a much + smaller amount of memory, so UML disk requests will be served from the + host's memory rather than its disks. + + + + + To add a copy-on-write layer to an existing block device file, simply + add the name of the COW file to the appropriate ubd switch: + + + ubd0=root_fs_cow,root_fs_debian_22 + + + + + where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is + the existing shared filesystem. The COW file need not exist. If it + doesn't, the driver will create and initialize it. Once the COW file + has been initialized, it can be used on its own on the command line: + + + ubd0=root_fs_cow + + + + + The name of the backing file is stored in the COW file header, so it + would be redundant to continue specifying it on the command line. + + + + 77..33.. NNoottee!! + + When checking the size of the COW file in order to see the gobs of + space that you're saving, make sure you use 'ls -ls' to see the actual + disk consumption rather than the length of the file. The COW file is + sparse, so the length will be very different from the disk usage. + Here is a 'ls -l' of a COW file and backing file from one boot and + shutdown: + host% ls -l cow.debian debian2.2 + -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Doesn't look like much saved space, does it? Well, here's 'ls -ls': + + + host% ls -ls cow.debian debian2.2 + 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Now, you can see that the COW file has less than a meg of disk, rather + than 492 meg. + + + + 77..44.. AAnnootthheerr wwaarrnniinngg + + Once a filesystem is being used as a readonly backing file for a COW + file, do not boot directly from it or modify it in any way. Doing so + will invalidate any COW files that are using it. The mtime and size + of the backing file are stored in the COW file header at its creation, + and they must continue to match. If they don't, the driver will + refuse to use the COW file. + + + + + If you attempt to evade this restriction by changing either the + backing file or the COW header by hand, you will get a corrupted + filesystem. + + + + + Among other things, this means that upgrading the distribution in a + backing file and expecting that all of the COW files using it will see + the upgrade will not work. + + + + + 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee + + Depending on how you use UML and COW devices, it may be advisable to + merge the changes in the COW file into the backing file every once in + a while. + + + + + The utility that does this is uml_moo. Its usage is + + + host% uml_moo COW file new backing file + + + + + There's no need to specify the backing file since that information is + already in the COW file header. If you're paranoid, boot the new + merged file, and if you're happy with it, move it over the old backing + file. + + + + + uml_moo creates a new backing file by default as a safety measure. It + also has a destructive merge option which will merge the COW file + directly into its current backing file. This is really only usable + when the backing file only has one COW file associated with it. If + there are multiple COWs associated with a backing file, a -d merge of + one of them will invalidate all of the others. However, it is + convenient if you're short of disk space, and it should also be + noticeably faster than a non-destructive merge. + + + + + uml_moo is installed with the UML deb and RPM. If you didn't install + UML from one of those packages, you can also get it from the UML + utilities tar file in tools/moo. + + + + + + + + + 88.. CCrreeaattiinngg ffiilleessyysstteemmss + + + You may want to create and mount new UML filesystems, either because + your root filesystem isn't large enough or because you want to use a + filesystem other than ext2. + + + This was written on the occasion of reiserfs being included in the + 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will + talk about reiserfs. This information is generic, and the examples + should be easy to translate to the filesystem of your choice. + + + 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee + + dd is your friend. All you need to do is tell dd to create an empty + file of the appropriate size. I usually make it sparse to save time + and to avoid allocating disk space until it's actually used. For + example, the following command will create a sparse 100 meg file full + of zeroes. + + + host% + dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M + + + + + + + 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee + + Add an argument like the following to the UML command line: + + ubd4=new_filesystem + + + + + making sure that you use an unassigned ubd device number. + + + + 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm + + Make sure that the filesystem is available, either by being built into + the kernel, or available as a module, then boot up UML and log in. If + the root filesystem doesn't have the filesystem utilities (mkfs, fsck, + etc), then get them into UML by way of the net or hostfs. + + + Make the new filesystem on the device assigned to the new file: + + + host# mkreiserfs /dev/ubd/4 + + + <----------- MKREISERFSv2 -----------> + + ReiserFS version 3.6.25 + Block size 4096 bytes + Block count 25856 + Used blocks 8212 + Journal - 8192 blocks (18-8209), journal header is in block 8210 + Bitmaps: 17 + Root block 8211 + Hash function "r5" + ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y + journal size 8192 (from 18) + Initializing journal - 0%....20%....40%....60%....80%....100% + Syncing..done. + + + + + Now, mount it: + + + UML# + mount /dev/ubd/4 /mnt + + + + + and you're in business. + + + + + + + + + + 99.. HHoosstt ffiillee aacccceessss + + + If you want to access files on the host machine from inside UML, you + can treat it as a separate machine and either nfs mount directories + from the host or copy files into the virtual machine with scp or rcp. + However, since UML is running on the host, it can access those + files just like any other process and make them available inside the + virtual machine without needing to use the network. + + + This is now possible with the hostfs virtual filesystem. With it, you + can mount a host directory into the UML filesystem and access the + files contained in it just as you would on the host. + + + 99..11.. UUssiinngg hhoossttffss + + To begin with, make sure that hostfs is available inside the virtual + machine with + + + UML# cat /proc/filesystems + + + + . hostfs should be listed. If it's not, either rebuild the kernel + with hostfs configured into it or make sure that hostfs is built as a + module and available inside the virtual machine, and insmod it. + + + Now all you need to do is run mount: + + + UML# mount none /mnt/host -t hostfs + + + + + will mount the host's / on the virtual machine's /mnt/host. + + + If you don't want to mount the host root directory, then you can + specify a subdirectory to mount with the -o switch to mount: + + + UML# mount none /mnt/home -t hostfs -o /home + + + + + will mount the hosts's /home on the virtual machine's /mnt/home. + + + + 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm + + It's possible to boot from a directory hierarchy on the host using + hostfs rather than using the standard filesystem in a file. + + To start, you need that hierarchy. The easiest way is to loop mount + an existing root_fs file: + + + host# mount root_fs uml_root_dir -o loop + + + + + You need to change the filesystem type of / in etc/fstab to be + 'hostfs', so that line looks like this: + + /dev/ubd/0 / hostfs defaults 1 1 + + + + + Then you need to chown to yourself all the files in that directory + that are owned by root. This worked for me: + + + host# find . -uid 0 -exec chown jdike {} \; + + + + + Next, make sure that your UML kernel has hostfs compiled in, not as a + module. Then run UML with the boot device pointing at that directory: + + + ubd0=/path/to/uml/root/directory + + + + + UML should then boot as it does normally. + + + 99..33.. BBuuiillddiinngg hhoossttffss + + If you need to build hostfs because it's not in your kernel, you have + two choices: + + + + +o Compiling hostfs into the kernel: + + + Reconfigure the kernel and set the 'Host filesystem' option under + + + +o Compiling hostfs as a module: + + + Reconfigure the kernel and set the 'Host filesystem' option under + be in arch/um/fs/hostfs/hostfs.o. Install that in + /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and + + + UML# insmod hostfs + + + + + + + + + + + + + 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee + + + + The UML management console is a low-level interface to the kernel, + somewhat like the i386 SysRq interface. Since there is a full-blown + operating system under UML, there is much greater flexibility possible + than with the SysRq mechanism. + + + There are a number of things you can do with the mconsole interface: + + +o get the kernel version + + +o add and remove devices + + +o halt or reboot the machine + + +o Send SysRq commands + + +o Pause and resume the UML + + + You need the mconsole client (uml_mconsole) which is present in CVS + (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in + 2.4.6. + + + You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. + When you boot UML, you'll see a line like: + + + mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole + + + + + If you specify a unique machine id one the UML command line, i.e. + + + umid=debian + + + + + you'll see this + + + mconsole initialized on /home/jdike/.uml/debian/mconsole + + + + + That file is the socket that uml_mconsole will use to communicate with + UML. Run it with either the umid or the full path as its argument: + + + host% uml_mconsole debian + + + + + or + + + host% uml_mconsole /home/jdike/.uml/debian/mconsole + + + + + You'll get a prompt, at which you can run one of these commands: + + +o version + + +o halt + + +o reboot + + +o config + + +o remove + + +o sysrq + + +o help + + +o cad + + +o stop + + +o go + + + 1100..11.. vveerrssiioonn + + This takes no arguments. It prints the UML version. + + + (mconsole) version + OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 + + + + + There are a couple actual uses for this. It's a simple no-op which + can be used to check that a UML is running. It's also a way of + sending an interrupt to the UML. This is sometimes useful on SMP + hosts, where there's a bug which causes signals to UML to be lost, + often causing it to appear to hang. Sending such a UML the mconsole + version command is a good way to 'wake it up' before networking has + been enabled, as it does not do anything to the function of the UML. + + + + 1100..22.. hhaalltt aanndd rreebboooott + + These take no arguments. They shut the machine down immediately, with + no syncing of disks and no clean shutdown of userspace. So, they are + pretty close to crashing the machine. + + + (mconsole) halt + OK + + + + + + + 1100..33.. ccoonnffiigg + + "config" adds a new device to the virtual machine. Currently the ubd + and network drivers support this. It takes one argument, which is the + device to add, with the same syntax as the kernel command line. + + + + + (mconsole) + config ubd3=/home/jdike/incoming/roots/root_fs_debian22 + + OK + (mconsole) config eth1=mcast + OK + + + + + + + 1100..44.. rreemmoovvee + + "remove" deletes a device from the system. Its argument is just the + name of the device to be removed. The device must be idle in whatever + sense the driver considers necessary. In the case of the ubd driver, + the removed block device must not be mounted, swapped on, or otherwise + open, and in the case of the network driver, the device must be down. + + + (mconsole) remove ubd3 + OK + (mconsole) remove eth1 + OK + + + + + + + 1100..55.. ssyyssrrqq + + This takes one argument, which is a single letter. It calls the + generic kernel's SysRq driver, which does whatever is called for by + that argument. See the SysRq documentation in Documentation/sysrq.txt + in your favorite kernel tree to see what letters are valid and what + they do. + + + + 1100..66.. hheellpp + + "help" returns a string listing the valid commands and what each one + does. + + + + 1100..77.. ccaadd + + This invokes the Ctl-Alt-Del action on init. What exactly this ends + up doing is up to /etc/inittab. Normally, it reboots the machine. + With UML, this is usually not desired, so if a halt would be better, + then find the section of inittab that looks like this + + + # What to do when CTRL-ALT-DEL is pressed. + ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now + + + + + and change the command to halt. + + + + 1100..88.. ssttoopp + + This puts the UML in a loop reading mconsole requests until a 'go' + mconsole command is received. This is very useful for making backups + of UML filesystems, as the UML can be stopped, then synced via 'sysrq + s', so that everything is written to the filesystem. You can then copy + the filesystem and then send the UML 'go' via mconsole. + + + Note that a UML running with more than one CPU will have problems + after you send the 'stop' command, as only one CPU will be held in a + mconsole loop and all others will continue as normal. This is a bug, + and will be fixed. + + + + 1100..99.. ggoo + + This resumes a UML after being paused by a 'stop' command. Note that + when the UML has resumed, TCP connections may have timed out and if + the UML is paused for a long period of time, crond might go a little + crazy, running all the jobs it didn't do earlier. + + + + + + + + + 1111.. KKeerrnneell ddeebbuuggggiinngg + + + NNoottee:: The interface that makes debugging, as described here, possible + is present in 2.4.0-test6 kernels and later. + + + Since the user-mode kernel runs as a normal Linux process, it is + possible to debug it with gdb almost like any other process. It is + slightly different because the kernel's threads are already being + ptraced for system call interception, so gdb can't ptrace them. + However, a mechanism has been added to work around that problem. + + + In order to debug the kernel, you need build it from source. See + ``Compiling the kernel and modules'' for information on doing that. + Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during + the config. These will compile the kernel with -g, and enable the + ptrace proxy so that gdb works with UML, respectively. + + + + + 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb + + You can have the kernel running under the control of gdb from the + beginning by putting 'debug' on the command line. You will get an + xterm with gdb running inside it. The kernel will send some commands + to gdb which will leave it stopped at the beginning of start_kernel. + At this point, you can get things going with 'next', 'step', or + 'cont'. + + + There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an + interrupt handler. + 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess + + Not every bug is evident in the currently running process. Sometimes, + processes hang in the kernel when they shouldn't because they've + deadlocked on a semaphore or something similar. In this case, when + you ^C gdb and get a backtrace, you will see the idle thread, which + isn't very relevant. + + + What you want is the stack of whatever process is sleeping when it + shouldn't be. You need to figure out which process that is, which is + generally fairly easy. Then you need to get its host process id, + which you can do either by looking at ps on the host or at + task.thread.extern_pid in gdb. + + + Now what you do is this: + + +o detach from the current thread + + + (UML gdb) det + + + + + + +o attach to the thread you are interested in + + + (UML gdb) att + + + + + + +o look at its stack and anything else of interest + + + (UML gdb) bt + + + + + Note that you can't do anything at this point that requires that a + process execute, e.g. calling a function + + +o when you're done looking at that process, reattach to the current + thread and continue it + + + (UML gdb) + att 1 + + + + + + + (UML gdb) + c + + + + + Here, specifying any pid which is not the process id of a UML thread + will cause gdb to reattach to the current thread. I commonly use 1, + but any other invalid pid would work. + + + + 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL + + ddd works on UML, but requires a special kludge. The process goes + like this: + + +o Start ddd + + + host% ddd linux + + + + + + +o With ps, get the pid of the gdb that ddd started. You can ask the + gdb to tell you, but for some reason that confuses things and + causes a hang. + + +o run UML with 'debug=parent gdb-pid=' added to the command line + - it will just sit there after you hit return + + +o type 'att 1' to the ddd gdb and you will see something like + + + 0xa013dc51 in __kill () + + + (gdb) + + + + + + +o At this point, type 'c', UML will boot up, and you can use ddd just + as you do on any other process. + + + + 1111..44.. DDeebbuuggggiinngg mmoodduulleess + + gdb has support for debugging code which is dynamically loaded into + the process. This support is what is needed to debug kernel modules + under UML. + + + Using that support is somewhat complicated. You have to tell gdb what + object file you just loaded into UML and where in memory it is. Then, + it can read the symbol table, and figure out where all the symbols are + from the load address that you provided. It gets more interesting + when you load the module again (i.e. after an rmmod). You have to + tell gdb to forget about all its symbols, including the main UML ones + for some reason, then load then all back in again. + + + There's an easy way and a hard way to do this. The easy way is to use + the umlgdb expect script written by Chandan Kudige. It basically + automates the process for you. + + + First, you must tell it where your modules are. There is a list in + the script that looks like this: + set MODULE_PATHS { + "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" + "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" + "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" + } + + + + + You change that to list the names and paths of the modules that you + are going to debug. Then you run it from the toplevel directory of + your UML pool and it basically tells you what to do: + + + + + ******** GDB pid is 21903 ******** + Start UML as: ./linux debug gdb-pid=21903 + + + + GNU gdb 5.0rh-5 Red Hat Linux 7.1 + Copyright 2001 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) b sys_init_module + Breakpoint 1 at 0xa0011923: file module.c, line 349. + (gdb) att 1 + + + + + After you run UML and it sits there doing nothing, you hit return at + the 'att 1' and continue it: + + + Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 + 0xa00f4221 in __kill () + (UML gdb) c + Continuing. + + + + + At this point, you debug normally. When you insmod something, the + expect magic will kick in and you'll see something like: + + + + + + + + + + + + + + + + + + *** Module hostfs loaded *** + Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 349 char *name, *n_name, *name_tmp = NULL; + (UML gdb) finish + Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 + 411 else res = EXECUTE_SYSCALL(syscall, regs); + Value returned is $1 = 0 + (UML gdb) + p/x (int)module_list + module_list->size_of_struct + + $2 = 0xa9021054 + (UML gdb) symbol-file ./linux + Load new symbol table from "./linux"? (y or n) y + Reading symbols from ./linux... + done. + (UML gdb) + add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 + + add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at + .text_addr = 0xa9021054 + (y or n) y + + Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... + done. + (UML gdb) p *module_list + $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", + size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, + nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, + init = 0xa90221f0 , cleanup = 0xa902222c , + ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, + persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, + kallsyms_end = 0x0, + archdata_start = 0x1b855
, + archdata_end = 0xe5890000
, + kernel_data = 0xf689c35d
} + >> Finished loading symbols for hostfs ... + + + + + That's the easy way. It's highly recommended. The hard way is + described below in case you're interested in what's going on. + + + Boot the kernel under the debugger and load the module with insmod or + modprobe. With gdb, do: + + + (UML gdb) p module_list + + + + + This is a list of modules that have been loaded into the kernel, with + the most recently loaded module first. Normally, the module you want + is at module_list. If it's not, walk down the next links, looking at + the name fields until find the module you want to debug. Take the + address of that structure, and add module.size_of_struct (which in + 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition + for you :-): + + + + (UML gdb) + printf "%#x\n", (int)module_list module_list->size_of_struct + + + + + The offset from the module start occasionally changes (before 2.4.0, + it was module.size_of_struct + 4), so it's a good idea to check the + init and cleanup addresses once in a while, as describe below. Now + do: + + + (UML gdb) + add-symbol-file /path/to/module/on/host that_address + + + + + Tell gdb you really want to do it, and you're in business. + + + If there's any doubt that you got the offset right, like breakpoints + appear not to work, or they're appearing in the wrong place, you can + check it by looking at the module structure. The init and cleanup + fields should look like: + + + init = 0x588066b0 , cleanup = 0x588066c0 + + + + + with no offsets on the symbol names. If the names are right, but they + are offset, then the offset tells you how much you need to add to the + address you gave to add-symbol-file. + + + When you want to load in a new version of the module, you need to get + gdb to forget about the old one. The only way I've found to do that + is to tell gdb to forget about all symbols that it knows about: + + + (UML gdb) symbol-file + + + + + Then reload the symbols from the kernel binary: + + + (UML gdb) symbol-file /path/to/kernel + + + + + and repeat the process above. You'll also need to re-enable break- + points. They were disabled when you dumped all the symbols because + gdb couldn't figure out where they should go. + + + + 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell + + If you don't have the kernel running under gdb, you can attach gdb to + it later by sending the tracing thread a SIGUSR1. The first line of + the console output identifies its pid: + tracing thread pid = 20093 + + + + + When you send it the signal: + + + host% kill -USR1 20093 + + + + + you will get an xterm with gdb running in it. + + + If you have the mconsole compiled into UML, then the mconsole client + can be used to start gdb: + + + (mconsole) (mconsole) config gdb=xterm + + + + + will fire up an xterm with gdb running in it. + + + + 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss + + UML has support for attaching to an already running debugger rather + than starting gdb itself. This is present in CVS as of 17 Apr 2001. + I sent it to Alan for inclusion in the ac tree, and it will be in my + 2.4.4 release. + + + This is useful when gdb is a subprocess of some UI, such as emacs or + ddd. It can also be used to run debuggers other than gdb on UML. + Below is an example of using strace as an alternate debugger. + + + To do this, you need to get the pid of the debugger and pass it in + with the + + + If you are using gdb under some UI, then tell it to 'att 1', and + you'll find yourself attached to UML. + + + If you are using something other than gdb as your debugger, then + you'll need to get it to do the equivalent of 'att 1' if it doesn't do + it automatically. + + + An example of an alternate debugger is strace. You can strace the + actual kernel as follows: + + +o Run the following in a shell + + + host% + sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' + + + + +o Run UML with 'debug' and 'gdb-pid=' with the pid printed out + by the previous command + + +o Hit return in the shell, and UML will start running, and strace + output will start accumulating in the output file. + + Note that this is different from running + + + host% strace ./linux + + + + + That will strace only the main UML thread, the tracing thread, which + doesn't do any of the actual kernel work. It just oversees the vir- + tual machine. In contrast, using strace as described above will show + you the low-level activity of the virtual machine. + + + + + + 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess + + 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk + + When booting up the kernel, fsck failed, and dropped me into a shell + to fix things up. I ran fsck -y, which hung: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 was not cleanly unmounted, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Inode 19780, i_blocks is 1548, should be 540. Fix? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + + + + + The standard drill in this sort of situation is to fire up gdb on the + signal thread, which, in this case, was pid 1935. In another window, + I run gdb and attach pid 1935. + + + + + ~/linux/2.3.26/um 1016: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + + (gdb) att 1935 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 + 0x100756d9 in __wait4 () + + + + + + + Let's see what's currently running: + + + + (gdb) p current_task.pid + $1 = 0 + + + + + + It's the idle thread, which means that fsck went to sleep for some + reason and never woke up. + + + Let's guess that the last process in the process list is fsck: + + + + (gdb) p current_task.prev_task.comm + $13 = "fsck.ext2\000\000\000\000\000\000" + + + + + + It is, so let's see what it thinks it's up to: + + + + (gdb) p current_task.prev_task.thread + $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, + kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { + 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, + request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { + regs = {1350467584, 2952789424, 0 }, sigstack = 0, + pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, + arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { + op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} + + + + + + The interesting things here are the fact that its .thread.syscall.id + is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or + the defines in include/asm-um/arch/unistd.h), and that it never + returned. Also, its .request.op is OP_SWITCH (see + arch/um/include/user_util.h). These mean that it went into a write, + and, for some reason, called schedule(). + + + The fact that it never returned from write means that its stack should + be fairly interesting. Its pid is 1980 (.thread.extern_pid). That + process is being ptraced by the signal thread, so it must be detached + before gdb can attach it: + + + + + + + + + + + (gdb) call detach(1980) + + Program received signal SIGSEGV, Segmentation fault. + + The program being debugged stopped while in a function called from GDB. + When the function (detach) is done executing, GDB will silently + stop (instead of continuing to evaluate the expression containing + the function call). + (gdb) call detach(1980) + $15 = 0 + + + + + + The first detach segfaults for some reason, and the second one + succeeds. + + + Now I detach from the signal thread, attach to the fsck thread, and + look at its stack: + + + (gdb) det + Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 + (gdb) att 1980 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 + 0x10070451 in __kill () + (gdb) bt + #0 0x10070451 in __kill () + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + #4 0x10001d12 in schedule () at sched.c:777 + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #9 + #10 0x10155404 in errno () + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #14 + #15 0xc0fd in ?? () + #16 0x10016647 in sys_write (fd=3, + buf=0x80b8800
, count=1024) + at read_write.c:159 + #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #19 + #20 0x400dc8b0 in ?? () + + + + + + The interesting things here are : + + +o There are two segfaults on this stack (frames 9 and 14) + + +o The first faulting address (frame 11) is 0x50000800 + + (gdb) p (void *)1342179328 + $16 = (void *) 0x50000800 + + + + + + The initial faulting address is interesting because it is on the idle + thread's stack. I had been seeing the idle thread segfault for no + apparent reason, and the cause looked like stack corruption. In hopes + of catching the culprit in the act, I had turned off all protections + to that stack while the idle thread wasn't running. This apparently + tripped that trap. + + + However, the more immediate problem is that second segfault and I'm + going to concentrate on that. First, I want to see where the fault + happened, so I have to go look at the sigcontent struct in frame 8: + + + + (gdb) up + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + 30 kill(pid, SIGUSR1); + (gdb) + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + 156 usr1_pid(getpid()); + (gdb) + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + 161 _switch_to(prev, next); + (gdb) + #4 0x10001d12 in schedule () at sched.c:777 + 777 switch_to(prev, next, prev); + (gdb) + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + 71 schedule(); + (gdb) + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + 157 } + (gdb) + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + 182 segv_handler(sc); + (gdb) p *sc + Cannot access memory at address 0x0. + + + + + That's not very useful, so I'll try a more manual method: + + + (gdb) p *((struct sigcontext *) (&sig + 1)) + $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, + esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, + trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, + esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1280} + + + + The ip is in handle_mm_fault: + + + (gdb) p (void *)268480945 + $20 = (void *) 0x1000b1b1 + (gdb) i sym $20 + handle_mm_fault + 57 in section .text + + + + + + Specifically, it's in pte_alloc: + + + (gdb) i line *$20 + Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b1 + and ends at 0x1000b1b7 . + + + + + + To find where in handle_mm_fault this is, I'll jump forward in the + code until I see an address in that procedure: + + + + (gdb) i line *0x1000b1c0 + Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b7 + and ends at 0x1000b1c3 . + (gdb) i line *0x1000b1d0 + Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1d0 + and ends at 0x1000b1da . + (gdb) i line *0x1000b1e0 + Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1da + and ends at 0x1000b1e1 . + (gdb) i line *0x1000b1f0 + Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1f0 + and ends at 0x1000b200 . + (gdb) i line *0x1000b200 + Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b200 + and ends at 0x1000b208 . + (gdb) i line *0x1000b210 + Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b210 + and ends at 0x1000b219 . + (gdb) i line *0x1000b220 + Line 1168 of "memory.c" starts at address 0x1000b21e + and ends at 0x1000b222 . + + + + + + Something is apparently wrong with the page tables or vma_structs, so + lets go back to frame 11 and have a look at them: + + + + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + 50 handle_mm_fault(current, vma, address, is_write); + (gdb) call pgd_offset_proc(vma->vm_mm, address) + $22 = (pgd_t *) 0x80a548c + + + + + + That's pretty bogus. Page tables aren't supposed to be in process + text or data areas. Let's see what's in the vma: + + + (gdb) p *vma + $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, + vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, + vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, + vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, + vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, + vm_private_data = 0x62} + (gdb) p *vma.vm_mm + $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, + pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, + map_count = 134909076, mmap_sem = {count = {counter = 135073792}, + sleepers = -1342177872, wait = {lock = , + task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, + __magic = -1342177670, __creator = -1342177300}, __magic = 98}, + page_table_lock = {}, context = 138, start_code = 0, end_code = 0, + start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, + arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, + total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, + swap_address = 0, segments = 0x0} + + + + + + This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx + addresses, this is looking like a stack was plonked down on top of + these structures. Maybe it's a stack overflow from the next page: + + + + (gdb) p vma + $25 = (struct vm_area_struct *) 0x507d2434 + + + + + + That's towards the lower quarter of the page, so that would have to + have been pretty heavy stack overflow: + + + + + + + + + + + + + + + (gdb) x/100x $25 + 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c + 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 + 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a + 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 + 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 + 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 + 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 + + + + + + It's not stack overflow. The only "stack-like" piece of this data is + the vma_struct itself. + + + At this point, I don't see any avenues to pursue, so I just have to + admit that I have no idea what's going on. What I will do, though, is + stick a trap on the segfault handler which will stop if it sees any + writes to the idle thread's stack. That was the thing that happened + first, and it may be that if I can catch it immediately, what's going + on will be somewhat clearer. + + + 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk + + After setting a trap in the SEGV handler for accesses to the signal + thread's stack, I reran the kernel. + + + fsck hung again, this time by hitting the trap: + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 contains a file system with errors, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + Untested (4127) [100fe44c]: trap_kern.c line 31 + + + + + + I need to get the signal thread to detach from pid 4127 so that I can + attach to it with gdb. This is done by sending it a SIGUSR1, which is + caught by the signal thread, which detaches the process: + + + kill -USR1 4127 + + + + + + Now I can run gdb on it: + + + + + + + + + + + + + + ~/linux/2.3.26/um 1034: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) att 4127 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 + 0x10075891 in __libc_nanosleep () + + + + + + The backtrace shows that it was in a write and that the fault address + (address in frame 3) is 0x50000800, which is right in the middle of + the signal thread's stack page: + + + (gdb) bt + #0 0x10075891 in __libc_nanosleep () + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + #2 0x1006ce9a in stop () at user_util.c:191 + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 + #6 + #7 0xc0fd in ?? () + #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) + at read_write.c:159 + #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #11 + #12 0x400dc8b0 in ?? () + #13 + #14 0x400dc8b0 in ?? () + #15 0x80545fd in ?? () + #16 0x804daae in ?? () + #17 0x8054334 in ?? () + #18 0x804d23e in ?? () + #19 0x8049632 in ?? () + #20 0x80491d2 in ?? () + #21 0x80596b5 in ?? () + (gdb) p (void *)1342179328 + $3 = (void *) 0x50000800 + + + + + + Going up the stack to the segv_handler frame and looking at where in + the code the access happened shows that it happened near line 110 of + block_dev.c: + + + + + + + + + + (gdb) up + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. + (gdb) + #2 0x1006ce9a in stop () at user_util.c:191 + 191 while(1) sleep(1000000); + (gdb) + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + 31 KERN_UNTESTED(); + (gdb) + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) p *sc + $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, + esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, + err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, + esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1342179328} + (gdb) p (void *)268550834 + $2 = (void *) 0x1001c2b2 + (gdb) i sym $2 + block_write + 1090 in section .text + (gdb) i line *$2 + Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" + starts at address 0x1001c2a1 + and ends at 0x1001c2bf . + (gdb) i line *0x1001c2c0 + Line 110 of "block_dev.c" starts at address 0x1001c2bf + and ends at 0x1001c2e3 . + + + + + + Looking at the source shows that the fault happened during a call to + copy_to_user to copy the data into the kernel: + + + 107 count -= chars; + 108 copy_from_user(p,buf,chars); + 109 p += chars; + 110 buf += chars; + + + + + + p is the pointer which must contain 0x50000800, since buf contains + 0x80b8800 (frame 8 above). It is defined as: + + + p = offset + bh->b_data; + + + + + + I need to figure out what bh is, and it just so happens that bh is + passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a + few lines later, so I do a little disassembly: + + + + + (gdb) disas 0x1001c2bf 0x1001c2e0 + Dump of assembler code from 0x1001c2bf to 0x1001c2d0: + 0x1001c2bf : addl %eax,0xc(%ebp) + 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx + 0x1001c2c8 : btsl $0x0,0x18(%edx) + 0x1001c2cd : btsl $0x1,0x18(%edx) + 0x1001c2d2 : sbbl %ecx,%ecx + 0x1001c2d4 : testl %ecx,%ecx + 0x1001c2d6 : jne 0x1001c2e3 + 0x1001c2d8 : pushl $0x0 + 0x1001c2da : pushl %edx + 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> + End of assembler dump. + + + + + + At that point, bh is in %edx (address 0x1001c2da), which is calculated + at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, + taking %ebp from the sigcontext_struct above: + + + (gdb) p (void *)1342631484 + $5 = (void *) 0x5006ee3c + (gdb) p 0x5006ee3c+0xfffffdd4 + $6 = 1342630928 + (gdb) p (void *)$6 + $7 = (void *) 0x5006ec10 + (gdb) p *((void **)$7) + $8 = (void *) 0x50100200 + + + + + + Now, I look at the structure to see what's in it, and particularly, + what its b_data field contains: + + + (gdb) p *((struct buffer_head *)0x50100200) + $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, + b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, + b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, + b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, + b_data = 0x50000800 "", b_page = 0x50004000, + b_end_io = 0x10017f60 , b_dev_id = 0x0, + b_rsector = 98810, b_wait = {lock = , + task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, + __creator = 0}, b_kiobuf = 0x0} + + + + + + The b_data field is indeed 0x50000800, so the question becomes how + that happened. The rest of the structure looks fine, so this probably + is not a case of data corruption. It happened on purpose somehow. + + + The b_page field is a pointer to the page_struct representing the + 0x50000000 page. Looking at it shows the kernel's idea of the state + of that page: + + + + (gdb) p *$13.b_page + $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, + index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { + next = 0x50008460, prev = 0x50019350}, wait = { + lock = , task_list = {next = 0x50004024, + prev = 0x50004024}, __magic = 1342193708, __creator = 0}, + pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, + zone = 0x100c5160} + + + + + + Some sanity-checking: the virtual field shows the "virtual" address of + this page, which in this kernel is the same as its "physical" address, + and the page_struct itself should be mem_map[0], since it represents + the first page of memory: + + + + (gdb) p (void *)1342177280 + $18 = (void *) 0x50000000 + (gdb) p mem_map + $19 = (mem_map_t *) 0x50004000 + + + + + + These check out fine. + + + Now to check out the page_struct itself. In particular, the flags + field shows whether the page is considered free or not: + + + (gdb) p (void *)132 + $21 = (void *) 0x84 + + + + + + The "reserved" bit is the high bit, which is definitely not set, so + the kernel considers the signal stack page to be free and available to + be used. + + + At this point, I jump to conclusions and start looking at my early + boot code, because that's where that page is supposed to be reserved. + + + In my setup_arch procedure, I have the following code which looks just + fine: + + + + bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); + free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); + + + + + + Two stack pages have already been allocated, and low_physmem points to + the third page, which is the beginning of free memory. + The init_bootmem call declares the entire memory to the boot memory + manager, which marks it all reserved. The free_bootmem call frees up + all of it, except for the first two pages. This looks correct to me. + + + So, I decide to see init_bootmem run and make sure that it is marking + those first two pages as reserved. I never get that far. + + + Stepping into init_bootmem, and looking at bootmem_map before looking + at what it contains shows the following: + + + + (gdb) p bootmem_map + $3 = (void *) 0x50000000 + + + + + + Aha! The light dawns. That first page is doing double duty as a + stack and as the boot memory map. The last thing that the boot memory + manager does is to free the pages used by its memory map, so this page + is getting freed even its marked as reserved. + + + The fix was to initialize the boot memory manager before allocating + those two stack pages, and then allocate them through the boot memory + manager. After doing this, and fixing a couple of subsequent buglets, + the stack corruption problem disappeared. + + + + + + 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk + + + + + 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee + + As of test11, it is necessary to have "ARCH=um" in the environment or + on the make command line for all steps in building UML, including + clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, + and linux. If you forget for any of them, the i386 build seems to + contaminate the UML build. If this happens, start from scratch with + + + host% + make mrproper ARCH=um + + + + + and repeat the build process with ARCH=um on all the steps. + + + See ``Compiling the kernel and modules'' for more details. + + + Another cause of strange compilation errors is building UML in + /usr/src/linux. If you do this, the first thing you need to do is + clean up the mess you made. The /usr/src/linux/asm link will now + point to /usr/src/linux/asm-um. Make it point back to + /usr/src/linux/asm-i386. Then, move your UML pool someplace else and + build it there. Also see below, where a more specific set of symptoms + is described. + + + + 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss-- + tteemm + + I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. + Panics preceded by + + + Detaching pid nnnn + + + + are diagnostic of this problem. This is a reiserfs bug which causes a + thread to occasionally read stale data from a mmapped page shared with + another thread. The fix is to upgrade the filesystem or to have /tmp + be an ext2 filesystem. + + + + 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr + ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd'' + + This happens when you build in /usr/src/linux. The UML build makes + the include/asm link point to include/asm-um. /usr/include/asm points + to /usr/src/linux/include/asm, so when that link gets moved, files + which need to include the asm-i386 versions of headers get the + incompatible asm-um versions. The fix is to move the include/asm link + back to include/asm-i386 and to do UML builds someplace else. + + + + 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm + + This seems to be a similar situation with the ReiserFS problem above. + Some versions of NFS seems not to handle mmap correctly, which UML + depends on. The workaround is have /tmp be a non-NFS directory. + + + 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt + + If you build UML with gprof support and, early in the boot, it does + this + + + kernel BUG at page_alloc.c:100! + + + + + you have a buggy gcc. You can work around the problem by removing + UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up + another bug, but that one is fairly hard to reproduce. + + + + 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp + + The exact boot error depends on the distribution that you're booting, + but Debian produces this: + + + /etc/rc2.d/S10sysklogd: line 49: 93 Terminated + start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD + + + + + This is a syslogd bug. There's a race between a parent process + installing a signal handler and its child sending the signal. See + this uml-devel post for the details. + + + + 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt + + There are a couple of problems which were + name="pointed + out"> by Tim Robinson + + +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + The fix is to upgrade to something more recent and then read the + next item. + + +o If you see + + + File descriptor in bad state + + + + when you bring up the device inside UML, you have a header mismatch + between the original kernel and the upgraded one. Make /usr/src/linux + point at the new headers. This will only be a problem if you build + uml_net yourself. + + + + 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee + nneett + + If you can connect to the host, and the host can connect to UML, but + you cannot connect to any other machines, then you may need to enable + IP Masquerading on the host. Usually this is only experienced when + using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML + networking, rather than the public address space that your host is + connected to. UML does not enable IP Masquerading, so you will need + to create a static rule to enable it: + + + host% + iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + + + + + Replace eth0 with the interface that you use to talk to the rest of + the world. + + + Documentation on IP Masquerading, and SNAT, can be found at + www.netfilter.org . + + + If you can reach the local net, but not the outside Internet, then + that is usually a routing problem. The UML needs a default route: + + + UML# + route add default gw gateway IP + + + + + The gateway IP can be any machine on the local net that knows how to + reach the outside world. Usually, this is the host or the local net- + work's gateway. + + + Occasionally, we hear from someone who can reach some machines, but + not others on the same net, or who can reach some ports on other + machines, but not others. These are usually caused by strange + firewalling somewhere between the UML and the other box. You track + this down by running tcpdump on every interface the packets travel + over and see where they disappear. When you find a machine that takes + the packets in, but does not send them onward, that's the culprit. + + + + 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm + + Thanks to Birgit Wahlich for telling me about this strange one. It + turns out that there's a limit of six environment variables on the + kernel command line. When that limit is reached or exceeded, argument + processing stops, which means that the 'root=' argument that UML + usually adds is not seen. So, the filesystem has no idea what the + root device is, so it panics. + + + The fix is to put less stuff on the command line. Glomming all your + setup variables into one is probably the best way to go. + + + + 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh + + On some older systems, /usr/include/asm/ptrace.h and + /usr/include/sys/ucontext.h define the same names. So, when they're + included together, the defines from one completely mess up the parsing + of the other, producing errors like: + /usr/include/sys/ucontext.h:47: parse error before + `10' + + + + + plus a pile of warnings. + + + This is a libc botch, which has since been fixed, and I don't see any + way around it besides upgrading. + + + + 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss + + On i386 kernels, there are two ways of running the loop that is used + to calculate the BogoMips rating, using the TSC if it's there or using + a one-instruction loop. The TSC produces twice the BogoMips as the + loop. UML uses the loop, since it has nothing resembling a TSC, and + will get almost exactly the same BogoMips as a host using the loop. + However, on a host with a TSC, its BogoMips will be double the loop + BogoMips, and therefore double the UML BogoMips. + + + + 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss + + If the host is configured with the 2G/2G address space split, that's + why. See ``UML on 2G/2G hosts'' for the details on getting UML to + run on your host. + + + + 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr + + If you're running an up to date kernel with an old release of + uml_utilities, the port-helper program will not work properly, so + xterms will exit straight after they appear. The solution is to + upgrade to the latest release of uml_utilities. Usually this problem + occurs when you have installed a packaged release of UML then compiled + your own development kernel without upgrading the uml_utilities from + the source distribution. + + + + 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr + + If you're seeing truly strange behavior, such as hangs or panics that + happen in random places, or you try running the debugger to see what's + happening and it acts strangely, then it could be a problem in the + host kernel. If you're not running a stock Linus or -ac kernel, then + try that. An early version of the preemption patch and a 2.4.10 SuSE + kernel have caused very strange problems in UML. + + + Otherwise, let me know about it. Send a message to one of the UML + mailing lists - either the developer list - user-mode-linux-devel at + lists dot sourceforge dot net (subscription info) or the user list - + user-mode-linux-user at lists dot sourceforge do net (subscription + info), whichever you prefer. Don't assume that everyone knows about + it and that a fix is imminent. + + + If you want to be super-helpful, read ``Diagnosing Problems'' and + follow the instructions contained therein. + 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss + + + If you get UML to crash, hang, or otherwise misbehave, you should + report this on one of the project mailing lists, either the developer + list - user-mode-linux-devel at lists dot sourceforge dot net + (subscription info) or the user list - user-mode-linux-user at lists + dot sourceforge dot net (subscription info). When you do, it is + likely that I will want more information. So, it would be helpful to + read the stuff below, do whatever is applicable in your case, and + report the results to the list. + + + For any diagnosis, you're going to need to build a debugging kernel. + The binaries from this site aren't debuggable. If you haven't done + this before, read about ``Compiling the kernel and modules'' and + ``Kernel debugging'' UML first. + + + 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss + + The most common case is for a normal thread to panic. To debug this, + you will need to run it under the debugger (add 'debug' to the command + line). An xterm will start up with gdb running inside it. Continue + it when it stops in start_kernel and make it crash. Now ^C gdb and + + + If the panic was a "Kernel mode fault", then there will be a segv + frame on the stack and I'm going to want some more information. The + stack might look something like this: + + + (UML gdb) backtrace + #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) + at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 + #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 + #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 + #3 0x1009bf38 in __restore () + at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 + #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) + at trap_kern.c:66 + #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 + #6 0x1009bf38 in __restore () + + + + + I'm going to want to see the symbol and line information for the value + of ip in the segv frame. In this case, you would do the following: + + + (UML gdb) i sym 268849158 + + + + + and + + + (UML gdb) i line *268849158 + + + + + The reason for this is the __restore frame right above the segv_han- + dler frame is hiding the frame that actually segfaulted. So, I have + to get that information from the faulting ip. + + + 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss + + The less common and more painful case is when the tracing thread + panics. In this case, the kernel debugger will be useless because it + needs a healthy tracing thread in order to work. The first thing to + do is get a backtrace from the tracing thread. This is done by + figuring out what its pid is, firing up gdb, and attaching it to that + pid. You can figure out the tracing thread pid by looking at the + first line of the console output, which will look like this: + + + tracing thread pid = 15851 + + + + + or by running ps on the host and finding the line that looks like + this: + + + jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] + + + + + If the panic was 'segfault in signals', then follow the instructions + above for collecting information about the location of the seg fault. + + + If the tracing thread flaked out all by itself, then send that + backtrace in and wait for our crack debugging team to fix the problem. + + + 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss + + However, there are cases where the misbehavior of another thread + caused the problem. The most common panic of this type is: + + + wait_for_stop failed to wait for to stop with + + + + + In this case, you'll need to get a backtrace from the process men- + tioned in the panic, which is complicated by the fact that the kernel + debugger is defunct and without some fancy footwork, another gdb can't + attach to it. So, this is how the fancy footwork goes: + + In a shell: + + + host% kill -STOP pid + + + + + Run gdb on the tracing thread as described in case 2 and do: + + + (host gdb) call detach(pid) + + + If you get a segfault, do it again. It always works the second time. + + Detach from the tracing thread and attach to that other thread: + + + (host gdb) detach + + + + + + + (host gdb) attach pid + + + + + If gdb hangs when attaching to that process, go back to a shell and + do: + + + host% + kill -CONT pid + + + + + And then get the backtrace: + + + (host gdb) backtrace + + + + + + 1144..44.. CCaassee 44 :: HHaannggss + + Hangs seem to be fairly rare, but they sometimes happen. When a hang + happens, we need a backtrace from the offending process. Run the + kernel debugger as described in case 1 and get a backtrace. If the + current process is not the idle thread, then send in the backtrace. + You can tell that it's the idle thread if the stack looks like this: + + + #0 0x100b1401 in __libc_nanosleep () + #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 + #2 0x100a546f in do_idle () at process_kern.c:445 + #3 0x100a5508 in cpu_idle () at process_kern.c:471 + #4 0x100ec18f in start_kernel () at init/main.c:592 + #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 + #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 + + + + + If this is the case, then some other process is at fault, and went to + sleep when it shouldn't have. Run ps on the host and figure out which + process should not have gone to sleep and stayed asleep. Then attach + to it with gdb and get a backtrace as described in case 3. + + + + + + + 1155.. TThhaannkkss + + + A number of people have helped this project in various ways, and this + page gives recognition where recognition is due. + + + If you're listed here and you would prefer a real link on your name, + or no link at all, instead of the despammed email address pseudo-link, + let me know. + + + If you're not listed here and you think maybe you should be, please + let me know that as well. I try to get everyone, but sometimes my + bookkeeping lapses and I forget about contributions. + + + 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn + + Rusty Russell - + + +o wrote the HOWTO + + +o prodded me into making this project official and putting it on + SourceForge + + +o came up with the way cool UML logo + + +o redid the config process + + + Peter Moulder - Fixed my config and build + processes, and added some useful code to the block driver + + + Bill Stearns - + + +o HOWTO updates + + +o lots of bug reports + + +o lots of testing + + +o dedicated a box (uml.ists.dartmouth.edu) to support UML development + + +o wrote the mkrootfs script, which allows bootable filesystems of + RPM-based distributions to be cranked out + + +o cranked out a large number of filesystems with said script + + + Jim Leu - Wrote the virtual ethernet driver + and associated usermode tools + + Lars Brinkhoff - Contributed the ptrace + proxy from his own project to allow easier + kernel debugging + + + Andrea Arcangeli - Redid some of the early boot + code so that it would work on machines with Large File Support + + + Chris Emerson - Did + the first UML port to Linux/ppc + + + Harald Welte - Wrote the multicast + transport for the network driver + + + Jorgen Cederlof - Added special file support to hostfs + + + Greg Lonnon - Changed the ubd driver + to allow it to layer a COW file on a shared read-only filesystem and + wrote the iomem emulation support + + + Henrik Nordstrom - Provided a variety + of patches, fixes, and clues + + + Lennert Buytenhek - Contributed various patches, a rewrite of the + network driver, the first implementation of the mconsole driver, and + did the bulk of the work needed to get SMP working again. + + + Yon Uriarte - Fixed the TUN/TAP network backend while I slept. + + + Adam Heath - Made a bunch of nice cleanups to the initialization code, + plus various other small patches. + + + Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and + is doing a real nice job of it. He also noticed and fixed a number of + actually and potentially exploitable security holes in uml_net. Plus + the occasional patch. I like patches. + + + James McMechan - James seems to have taken over maintenance of the ubd + driver and is doing a nice job of it. + + + Chandan Kudige - wrote the umlgdb script which automates the reloading + of module symbols. + + + Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, + enabling UML processes to access audio devices on the host. He also + submitted patches for the slip transport and lots of other things. + + + David Coulson - + + +o Set up the usermodelinux.org site, + which is a great way of keeping the UML user community on top of + UML goings-on. + + +o Site documentation and updates + + +o Nifty little UML management daemon UMLd + + + +o Lots of testing and bug reports + + + + + 1155..22.. FFlluusshhiinngg oouutt bbuuggss + + + + +o Yuri Pudgorodsky + + +o Gerald Britton + + +o Ian Wehrman + + +o Gord Lamb + + +o Eugene Koontz + + +o John H. Hartman + + +o Anders Karlsson + + +o Daniel Phillips + + +o John Fremlin + + +o Rainer Burgstaller + + +o James Stevenson + + +o Matt Clay + + +o Cliff Jefferies + + +o Geoff Hoff + + +o Lennert Buytenhek + + +o Al Viro + + +o Frank Klingenhoefer + + +o Livio Baldini Soares + + +o Jon Burgess + + +o Petru Paler + + +o Paul + + +o Chris Reahard + + +o Sverker Nilsson + + +o Gong Su + + +o johan verrept + + +o Bjorn Eriksson + + +o Lorenzo Allegrucci + + +o Muli Ben-Yehuda + + +o David Mansfield + + +o Howard Goff + + +o Mike Anderson + + +o John Byrne + + +o Sapan J. Batia + + +o Iris Huang + + +o Jan Hudec + + +o Voluspa + + + + + 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss + + + + +o Dave Zarzycki + + +o Adam Lazur + + +o Boria Feigin + + +o Brian J. Murrell + + +o JS + + +o Roman Zippel + + +o Wil Cooley + + +o Ayelet Shemesh + + +o Will Dyson + + +o Sverker Nilsson + + +o dvorak + + +o v.naga srinivas + + +o Shlomi Fish + + +o Roger Binns + + +o johan verrept + + +o MrChuoi + + +o Peter Cleve + + +o Vincent Guffens + + +o Nathan Scott + + +o Patrick Caulfield + + +o jbearce + + +o Catalin Marinas + + +o Shane Spencer + + +o Zou Min + + + +o Ryan Boder + + +o Lorenzo Colitti + + +o Gwendal Grignou + + +o Andre' Breiler + + +o Tsutomu Yasuda + + + + 1155..44.. CCaassee SSttuuddiieess + + + +o Jon Wright + + +o William McEwan + + +o Michael Richardson + + + + 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss + + + Bill Carr made the Red Hat mkrootfs script + work with RH 6.2. + + Michael Jennings sent in some material which + is now gracing the top of the index page of this site. + + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com + . The bandwidth there made it possible to + produce most of the filesystems available on the project download + page. + + Laurent Bonnaud took the old grotty + Debian filesystem that I've been distributing and updated it to 2.2. + It is now available by itself here. + + Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make + releases even when Sourceforge is broken. + + Rodrigo de Castro looked at my broken pte code and told me what was + wrong with it, letting me fix a long-standing (several weeks) and + serious set of bugs. + + Chris Reahard built a specialized root filesystem for running a DNS + server jailed inside UML. It's available from the download + page in the Jail + Filesystems section. + + + + + + + + + + + + diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt new file mode 100644 index 0000000..36c367c --- /dev/null +++ b/Documentation/vm/cleancache.txt @@ -0,0 +1,278 @@ +MOTIVATION + +Cleancache is a new optional feature provided by the VFS layer that +potentially dramatically increases page cache effectiveness for +many workloads in many environments at a negligible cost. + +Cleancache can be thought of as a page-granularity victim cache for clean +pages that the kernel's pageframe replacement algorithm (PFRA) would like +to keep around, but can't since there isn't enough memory. So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +FAQs are included below. + +IMPLEMENTATION OVERVIEW + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +Note that cleancache_register_ops returns the previous settings so that +chaining can be performed if desired. The functions provided must conform to +certain semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +A "flush_page" will ensure the page no longer is present in cleancache; +a "flush_inode" will flush all pages associated with the specified file; +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in +all files specified by the given pool id and also surrender the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is flushed (thus +making cleancache an "exclusive" cache). On a shared pool, the page +is NOT flushed on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache flush operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and flushing a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +CLEANCACHE PERFORMANCE METRICS + +Cleancache monitoring is done by sysfs files in the +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +succ_gets - number of gets that were successful +failed_gets - number of gets that failed +puts - number of puts attempted (all "succeed") +flushes - number of flushes attempted + +A backend implementatation may provide additional metrics. + +FAQ + +1) Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. + +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. And +the proposed "RAMster" driver shares RAM across multiple physical +systems. + +2) Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_flush operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +3) Why not make cleancache asynchronous and batched so it can + more easily interface with real devices with DMA instead + of copying each individual page? (Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +4) Why is non-shared cleancache "exclusive"? And where is the + page "flushed" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_flush calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_flush" call. + +The flush is done by the cleancache backend implementation. + +5) What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. Basically +cleancache replaces I/O with memory-copy-CPU-overhead; on older +single-core systems with slow memory-copy speeds, cleancache +has little value, but in newer multicore machines, especially +consolidated/virtualized machines, it has great value. + +6) How do I add cleancache support for filesystem X? (Boaz Harrash) + +Filesystems that are well-behaved and conform to certain +restrictions can utilize cleancache simply by making a call to +cleancache_init_fs at mount time. Unusual, misbehaving, or +poorly layered filesystems must either add additional hooks +and/or undergo extensive additional testing... or should just +not enable the optional cleancache. + +Some points for a filesystem to consider: + +- The FS should be block-device-based (e.g. a ram-based FS such + as tmpfs should not enable cleancache) +- To ensure coherency/correctness, the FS must ensure that all + file removal or truncation operations either go through VFS or + add hooks to do the equivalent cleancache "flush" operations +- To ensure coherency/correctness, either inode numbers must + be unique across the lifetime of the on-disk file OR the + FS must provide an "encode_fh" function. +- The FS must call the VFS superblock alloc and deactivate routines + or add hooks to do the equivalent cleancache calls done there. +- To maximize performance, all pages fetched from the FS should + go through the do_mpag_readpage routine or the FS should add + hooks to do the equivalent (cf. btrfs) +- Currently, the FS blocksize must be the same as PAGESIZE. This + is not an architectural restriction, but no backends currently + support anything different. +- A clustered FS should invoke the "shared_init_fs" cleancache + hook to get best performance for some backends. + +7) Why not use the KVA of the inode as the key? (Christoph Hellwig) + +If cleancache would use the inode virtual address instead of +inode/filehandle, the pool id could be eliminated. But, this +won't work because cleancache retains pagecache data pages +persistently even when the inode has been pruned from the +inode unused list, and only flushes the data page if the file +gets removed/truncated. So if cleancache used the inode kva, +there would be potential coherency issues if/when the inode +kva is reused for a different file. Alternately, if cleancache +flushed the pages when the inode kva was freed, much of the value +of cleancache would be lost because the cache of pages in cleanache +is potentially much larger than the kernel pagecache and is most +useful if the pages survive inode cache removal. + +8) Why is a global variable required? + +The cleancache_enabled flag is checked in all of the frequently-used +cleancache hooks. The alternative is a function call to check a static +variable. Since cleancache is enabled dynamically at runtime, systems +that don't enable cleancache would suffer thousands (possibly +tens-of-thousands) of unnecessary function calls per second. So the +global variable allows cleancache to be enabled by default at compile +time, but have insignificant performance impact when cleancache remains +disabled at runtime. + +9) Does cleanache work with KVM? + +The memory model of KVM is sufficiently different that a cleancache +backend may have less value for KVM. This remains to be tested, +especially in an overcommitted system. + +10) Does cleancache work in userspace? It sounds useful for + memory hungry caches like web browsers. (Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/locking b/Documentation/vm/locking index 25fadb4..f61228b 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_mmap_lock and the kmem cache +The page_table_lock nests with the inode i_mmap_mutex and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 092e596..c54b4f5 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -206,7 +206,7 @@ IOMMU (input/output memory management unit) (e.g. because you have < 3 GB memory). Kernel boot message: "PCI-DMA: Disabling IOMMU" - 2. : AMD GART based hardware IOMMU. + 2. : AMD GART based hardware IOMMU. Kernel boot message: "PCI-DMA: using GART IOMMU" 3. : Software IOMMU implementation. Used diff --git a/Documentation/zh_CN/email-clients.txt b/Documentation/zh_CN/email-clients.txt new file mode 100644 index 0000000..5d65e32 --- /dev/null +++ b/Documentation/zh_CN/email-clients.txt @@ -0,0 +1,210 @@ +锘?Chinese translated version of Documentation/email-clients.txt + +If you have any comment or update to the content, please contact the +original document maintainer directly. However, if you have a problem +communicating in English you can also ask the Chinese maintainer for +help. Contact the Chinese maintainer if this translation is outdated +or if there is a problem with the translation. + +Chinese maintainer: Harry Wei +--------------------------------------------------------------------- +Documentation/email-clients.txt ???涓????缈昏?? + +濡??????宠??璁烘????存?版???????????瀹癸??璇风?存?ヨ??绯诲?????妗g??缁存?よ?????濡????浣?浣跨?ㄨ?辨?? +浜ゆ???????伴?剧??璇?锛?涔????浠ュ??涓???????缁存?よ??姹???┿??濡???????缈昏????存?颁???????舵?????缈? +璇?瀛???ㄩ??棰?锛?璇疯??绯讳腑??????缁存?よ????? + +涓???????缁存?よ??锛? 璐惧??濞? Harry Wei +涓???????缈昏?????锛? 璐惧??濞? Harry Wei +涓?????????¤?????锛? Yinglin Luan + Xiaochen Wang + yaxinsn + +浠ヤ??涓烘?f?? +--------------------------------------------------------------------- + +Linux???浠跺?㈡?风?????缃?淇℃?? +====================================================================== + +?????????缃? +---------------------------------------------------------------------- +Linux?????歌ˉ涓???????杩????浠惰?????浜ょ??锛????濂芥??琛ヤ??浣?涓洪??浠朵????????宓?????????????浜?缁存?よ?? +??ユ?堕??浠讹??浣???????浠剁?????瀹规?煎??搴?璇ユ??"text/plain"?????惰??锛????浠朵????????涓?璧???????锛? +???涓鸿??浼?浣胯ˉ涓????寮???ㄩ?ㄥ????ㄨ??璁鸿??绋?涓???????寰???伴?俱?? + +??ㄦ?ュ?????Linux?????歌ˉ涓???????浠跺?㈡?风????ㄥ?????琛ヤ????跺??璇ュ??浜?????????????濮???舵?????渚?濡?锛? +浠?浠?涓???芥?瑰?????????????ゅ?惰〃绗???????绌烘?硷???????虫????ㄦ??涓?琛????寮?澶存?????缁?灏俱?? + +涓?瑕????杩?"format=flowed"妯″????????琛ヤ?????杩???蜂??寮?璧蜂?????棰????浠ュ?????瀹崇?????琛???? + +涓?瑕?璁╀????????浠跺?㈡?风??杩?琛??????ㄦ?㈣?????杩???蜂??浼???村??浣????琛ヤ????? + +???浠跺?㈡?风??涓???芥?瑰???????????瀛?绗????缂??????瑰?????瑕??????????琛ヤ???????芥??ASCII??????UTF-8缂??????瑰??锛? +濡????浣?浣跨??UTF-8缂??????瑰???????????浠讹????d??浣?灏?浼???垮??涓?浜??????藉????????瀛?绗???????棰???? + +???浠跺?㈡?风??搴?璇ュ舰???骞朵??淇???? References: ?????? In-Reply-To: ???棰?锛???d?? +???浠惰??棰?灏变??浼?涓??????? + +澶???剁??甯?(?????????璐寸??甯?)???甯镐????界?ㄤ??琛ヤ??锛????涓哄?惰〃绗?浼?杞????涓虹┖??笺??浣跨??xclipboard, xclip +??????xcutsel涔?璁稿??浠ワ??浣???????濂芥??璇?涓?涓?????????垮??浣跨?ㄥ????剁??甯???? + +涓?瑕???ㄤ娇???PGP/GPG缃插????????浠朵腑??????琛ヤ?????杩???蜂??浣垮??寰?澶???????涓???借?诲??????????ㄤ??浣????琛ヤ????? +锛?杩?涓????棰?搴?璇ユ?????浠ヤ慨澶????锛? + +??ㄧ???????搁??浠跺??琛ㄥ?????琛ヤ??涔????锛?缁????宸卞?????涓?涓?琛ヤ?????涓?涓???????涓绘??锛?淇?瀛???ユ?跺?扮?? +???浠讹??灏?琛ヤ?????'patch'??戒护???涓?锛?濡??????????浜?锛????缁??????搁??浠跺??琛ㄥ???????? + + +涓?浜????浠跺?㈡?风?????绀? +---------------------------------------------------------------------- +杩????缁???轰??浜?璇?缁????MUA???缃????绀猴?????浠ョ?ㄤ??缁?Linux?????稿?????琛ヤ?????杩?浜?骞朵???????虫?? +?????????杞?浠跺?????缃???荤????? + +璇存??锛? +TUI = 浠ユ?????涓哄?虹???????ㄦ?锋?ュ?? +GUI = ??惧舰?????㈢?ㄦ?锋?ュ?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Alpine (TUI) + +???缃????椤癸?? +???"Sending Preferences"??ㄥ??锛? + +- "Do Not Send Flowed Text"蹇?椤诲????? +- "Strip Whitespace Before Sending"蹇?椤诲?抽?? + +褰???????浠舵?讹????????搴?璇ユ?惧?ㄨˉ涓?浼???虹?扮????版?癸????跺?????涓?CTRL-R缁???????锛?浣挎??瀹???? +琛ヤ?????浠跺????ュ?伴??浠朵腑??? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Evolution (GUI) + +涓?浜?寮????????????????浣跨?ㄥ????????琛ヤ?? + +褰??????╅??浠堕??椤癸??Preformat + 浠?Format->Heading->Preformatted (Ctrl-7)??????宸ュ?锋?? + +??跺??浣跨??锛? + Insert->Text File... (Alt-n x)?????ヨˉ涓????浠躲?? + +浣?杩????浠?"diff -Nru old.c new.c | xclip"锛???????Preformat锛???跺??浣跨?ㄤ腑??撮??杩?琛?绮?甯???? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Kmail (GUI) + +涓?浜?寮????????????????浣跨?ㄥ????????琛ヤ????? + +榛?璁よ?剧疆涓?涓?HTML??煎??????????????锛?涓?瑕??????ㄥ????? + +褰?涔????涓?灏????浠剁????跺??锛???ㄩ??椤逛?????涓?瑕??????╄????ㄦ?㈣????????涓????缂虹?瑰氨???浣???ㄩ??浠朵腑杈???ョ??浠讳???????? +??戒??浼?琚??????ㄦ?㈣??锛????姝や??蹇?椤诲?ㄥ?????琛ヤ??涔?????????ㄦ?㈣????????绠?????????规??灏辨???????ㄨ????ㄦ?㈣????ヤ功??????浠讹?? +??跺?????瀹?淇?瀛?涓鸿??绋裤??涓????浣???ㄨ??绋夸腑???娆℃??寮?瀹?锛?瀹?宸茬????ㄩ?ㄨ????ㄦ?㈣??浜?锛???d??浣???????浠惰?界?舵病??? +?????╄????ㄦ?㈣??锛?浣????杩?涓?浼?澶卞?诲凡???????????ㄦ?㈣????? + +??ㄩ??浠剁??搴????锛??????ヨˉ涓?涔????锛???句??甯哥?ㄧ??琛ヤ??瀹????绗?锛?涓?涓?杩?瀛????(---)??? + +??跺?????"Message"????????$??锛??????╂????ユ??浠讹????ョ????????浣????琛ヤ?????浠躲??杩????涓?涓?棰?澶???????椤癸??浣????浠? +???杩?瀹????缃?浣???????浠跺缓绔?宸ュ?锋????????锛?杩????浠ュ甫涓?"insert file"??炬????? + +浣????浠ュ????ㄥ?伴??杩?GPG???璁伴??浠讹??浣???????宓?琛ヤ?????濂戒??瑕?浣跨??GPG???璁板??浠????浣?涓哄??宓??????????绛惧??琛ヤ??锛? +褰?浠?GPG涓???????7浣?缂??????朵??浣夸??浠?????????村??澶??????? + +濡????浣????瑕?浠ラ??浠剁??褰㈠????????琛ヤ??锛???d??灏卞?抽????瑰?婚??浠讹????跺?????涓?灞???э??绐????"Suggest automatic +display"锛?杩???峰??宓????浠舵?村?规??璁╄?昏???????般?? + +褰?浣?瑕?淇?瀛?灏?瑕?????????????宓???????琛ヤ??锛?浣????浠ヤ??娑???????琛ㄧ????奸????╁?????琛ヤ????????浠讹????跺????冲?婚????? +"save as"???浣????浠ヤ娇??ㄤ??涓?娌℃????存?圭????????琛ヤ????????浠讹??濡????瀹????浠ユ?g‘???褰㈠??缁???????褰?浣?姝g????ㄥ?? +???宸辩??绐???d??涓?瀵????锛???f?舵病??????椤瑰??浠ヤ??瀛????浠?--宸茬?????涓?涓?杩???风??bug琚?姹???ュ?颁??kmail???bugzilla +骞朵??甯????杩?灏?浼?琚?澶??????????浠舵??浠ュ?????瀵规??涓???ㄦ?峰??璇诲???????????琚?淇?瀛????锛????浠ュ?????浣???虫?????浠跺????跺?板?朵????版?癸?? +浣?涓?寰?涓????浠?浠????????????逛负缁?????????翠?????璇汇?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Lotus Notes (GUI) + +涓?瑕?浣跨?ㄥ????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Mutt (TUI) + +寰?澶?Linux寮????浜哄??浣跨??mutt瀹㈡?风??锛????浠ヨ?????瀹????瀹?宸ヤ????????甯告??浜???? + +Mutt涓????甯?缂?杈????锛????浠ヤ??绠′??浣跨?ㄤ??涔?缂?杈???ㄩ?戒??搴?璇ュ甫????????ㄦ??琛????澶у????扮??杈???ㄩ?藉甫??? +涓?涓?"insert file"???椤癸??瀹????浠ラ??杩?涓???瑰?????浠跺??瀹圭????瑰???????ユ??浠躲?? + +'vim'浣?涓?mutt???缂?杈????锛? + set editor="vi" + + 濡????浣跨??xclip锛???插?ヤ互涓???戒护 + :set paste + ???涓????涔??????????shift-insert??????浣跨?? + :r filename + +濡??????宠?????琛ヤ??浣?涓哄??宓?????????? +(a)ttach宸ヤ?????寰?濂斤??涓?甯????"set paste"??? + +???缃????椤癸?? +瀹?搴?璇ヤ互榛?璁よ?剧疆???褰㈠??宸ヤ????? +??惰??锛????"send_charset"璁剧疆涓?"us-ascii::utf-8"涔????涓?涓?涓???????涓绘????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pine (TUI) + +Pine杩???绘??涓?浜?绌烘?煎????????棰?锛?浣????杩?浜???板?ㄥ??璇ラ?借??淇?澶?浜???? + +濡???????浠ワ??璇蜂娇???alpine(pine???缁ф?胯??) + +???缃????椤癸?? +- ???杩?????????????瑕?娑???ゆ??绋??????? +- "no-strip-whitespace-before-send"???椤逛????????瑕??????? + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Sylpheed (GUI) + +- ???宓??????????浠ュ??濂界??宸ヤ??锛???????浣跨?ㄩ??浠讹????? +- ???璁镐娇??ㄥ????ㄧ??缂?杈???ㄣ?? +- 瀵逛?????褰?杈?澶???堕??甯告????? +- 濡???????杩?non-SSL杩???ワ?????娉?浣跨??TLS SMTP????????? +- ??ㄧ?????绐???d腑???涓?涓?寰??????ㄧ??ruler bar??? +- 缁???板?????涓?娣诲????板??灏变??浼?姝g‘???浜?瑙f?剧ず?????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Thunderbird (GUI) + +榛?璁ゆ????典??锛?thunderbird寰?瀹规??????????????锛?浣????杩????涓?浜???规?????浠ュ己??跺?????寰???村ソ??? + +- ??ㄧ?ㄦ?峰????疯?剧疆???锛?缁???????瀵诲??锛?涓?瑕???????"Compose messages in HTML format"??? + +- 缂?杈?浣????Thunderbird???缃?璁剧疆??ヤ娇瀹?涓?瑕????琛?浣跨??锛?user_pref("mailnews.wraplength", 0); + +- 缂?杈?浣????Thunderbird???缃?璁剧疆锛?浣垮??涓?瑕?浣跨??"format=flowed"??煎??锛?user_pref("mailnews. + send_plaintext_flowed", false); + +- 浣????瑕?浣?Thunderbird???涓洪???????煎????瑰??锛? + 濡????榛?璁ゆ????典??浣?涔??????????HTML??煎??锛???d?????寰???俱??浠?浠?浠????棰???????涓????妗?涓???????"Preformat"??煎????? + 濡????榛?璁ゆ????典??浣?涔??????????????????煎??锛?浣?涓?寰????瀹???逛负HTML??煎??锛?浠?浠?浣?涓轰??娆℃?х??锛???ヤ功?????扮??娑????锛? + ??跺??寮哄?朵娇瀹??????版???????煎??锛???????瀹?灏变?????琛????瑕?瀹???板??锛???ㄥ??淇$????炬??涓?浣跨??shift?????ヤ娇瀹????涓?HTML + ??煎??锛???跺?????棰???????涓????妗?涓???????"Preformat"??煎????? + +- ???璁镐娇??ㄥ????ㄧ??缂?杈????锛? + ???瀵?Thunderbird???琛ヤ?????绠?????????规??灏辨??浣跨?ㄤ??涓?"external editor"??╁??锛???跺??浣跨?ㄤ????????娆㈢?? + $EDITOR??ヨ?诲???????????骞惰ˉ涓???版?????涓????瑕?瀹???板??锛????浠ヤ??杞藉苟涓?瀹?瑁?杩?涓???╁??锛???跺??娣诲??涓?涓?浣跨?ㄥ????? + ??????View->Toolbars->Customize...??????褰?浣?涔????淇℃???????跺??浠?浠???瑰?诲??灏卞??浠ヤ????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +TkRat (GUI) + +???浠ヤ娇??ㄥ?????浣跨??"Insert file..."??????澶???ㄧ??缂?杈???ㄣ?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Gmail (Web GUI) + +涓?瑕?浣跨?ㄥ????????琛ヤ????? + +Gmail缃?椤靛?㈡?风???????ㄥ?版????惰〃绗?杞????涓虹┖??笺?? + +??界?跺?惰〃绗?杞????涓虹┖??奸??棰????浠ヨ??澶???ㄧ??杈???ㄨВ??筹???????跺??杩?浼?浣跨?ㄥ??杞???㈣?????姣?琛???????涓?78涓?瀛?绗???? + +???涓?涓????棰????Gmail杩?浼????浠讳??涓????ASCII???瀛?绗????淇℃????逛负base64缂???????瀹????涓?瑗垮????????娆ф床浜虹?????瀛???? + + ### diff --git a/MAINTAINERS b/MAINTAINERS index 69f19f1..d54d551 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -287,35 +287,35 @@ F: sound/pci/ad1889.* AD525X ANALOG DEVICES DIGITAL POTENTIOMETERS DRIVER M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/AD5254 S: Supported F: drivers/misc/ad525x_dpot.c AD5398 CURRENT REGULATOR DRIVER (AD5398/AD5821) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/AD5398 S: Supported F: drivers/regulator/ad5398.c AD714X CAPACITANCE TOUCH SENSOR DRIVER (AD7142/3/7/8/7A) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/AD7142 S: Supported F: drivers/input/misc/ad714x.c AD7877 TOUCHSCREEN DRIVER M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/AD7877 S: Supported F: drivers/input/touchscreen/ad7877.c AD7879 TOUCHSCREEN DRIVER (AD7879/AD7889) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/AD7879 S: Supported F: drivers/input/touchscreen/ad7879.c @@ -341,7 +341,7 @@ F: drivers/net/wireless/adm8211.* ADP5520 BACKLIGHT DRIVER WITH IO EXPANDER (ADP5520/ADP5501) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADP5520 S: Supported F: drivers/mfd/adp5520.c @@ -352,7 +352,7 @@ F: drivers/input/keyboard/adp5520-keys.c ADP5588 QWERTY KEYPAD AND IO EXPANDER DRIVER (ADP5588/ADP5587) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADP5588 S: Supported F: drivers/input/keyboard/adp5588-keys.c @@ -360,7 +360,7 @@ F: drivers/gpio/adp5588-gpio.c ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADP8860 S: Supported F: drivers/video/backlight/adp8860_bl.c @@ -387,7 +387,7 @@ F: drivers/hwmon/adt7475.c ADXL34X THREE-AXIS DIGITAL ACCELEROMETER DRIVER (ADXL345/ADXL346) M: Michael Hennerich -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADXL345 S: Supported F: drivers/input/misc/adxl34x.c @@ -405,8 +405,8 @@ S: Maintained F: sound/oss/aedsp16.c AFFS FILE SYSTEM -M: Roman Zippel -S: Maintained +L: linux-fsdevel@vger.kernel.org +S: Orphan F: Documentation/filesystems/affs.txt F: fs/affs/ @@ -483,6 +483,13 @@ F: drivers/tty/serial/altera_jtaguart.c F: include/linux/altera_uart.h F: include/linux/altera_jtaguart.h +AMD FAM15H PROCESSOR POWER MONITORING DRIVER +M: Andreas Herrmann +L: lm-sensors@lm-sensors.org +S: Maintained +F: Documentation/hwmon/fam15h_power +F: drivers/hwmon/fam15h_power.c + AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER M: Thomas Dahlmann L: linux-geode@lists.infradead.org (moderated for non-subscribers) @@ -526,7 +533,7 @@ S: Maintained F: drivers/infiniband/hw/amso1100/ ANALOG DEVICES INC ASOC CODEC DRIVERS -L: device-driver-devel@blackfin.uclinux.org +L: device-drivers-devel@blackfin.uclinux.org L: alsa-devel@alsa-project.org (moderated for non-subscribers) W: http://wiki.analog.com/ S: Supported @@ -548,10 +555,11 @@ S: Maintained F: sound/aoa/ APM DRIVER -L: linux-laptop@vger.kernel.org -S: Orphan +M: Jiri Kosina +S: Odd fixes F: arch/x86/kernel/apm_32.c F: include/linux/apm_bios.h +F: drivers/char/apm-emulation.c APPLE BCM5974 MULTITOUCH DRIVER M: Henrik Rydberg @@ -729,7 +737,7 @@ ARM/EZX SMARTPHONES (A780, A910, A1200, E680, ROKR E2 and ROKR E6) M: Daniel Ribeiro M: Stefan Schmidt M: Harald Welte -L: openezx-devel@lists.openezx.org (subscribers-only) +L: openezx-devel@lists.openezx.org (moderated for non-subscribers) W: http://www.openezx.org/ S: Maintained T: topgit git://git.openezx.org/openezx.git @@ -1232,13 +1240,6 @@ W: http://wireless.kernel.org/en/users/Drivers/ath9k S: Supported F: drivers/net/wireless/ath/ath9k/ -ATHEROS AR9170 WIRELESS DRIVER -M: Christian Lamparter -L: linux-wireless@vger.kernel.org -W: http://wireless.kernel.org/en/users/Drivers/ar9170 -S: Obsolete -F: drivers/net/wireless/ath/ar9170/ - CARL9170 LINUX COMMUNITY WIRELESS DRIVER M: Christian Lamparter L: linux-wireless@vger.kernel.org @@ -2040,9 +2041,8 @@ F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER -M: Tobias Ringstrom L: netdev@vger.kernel.org -S: Maintained +S: Orphan F: Documentation/networking/dmfe.txt F: drivers/net/tulip/dmfe.c @@ -2251,10 +2251,10 @@ F: drivers/gpu/drm/ F: include/drm/ INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets) -M: Chris Wilson +M: Keith Packard L: intel-gfx@lists.freedesktop.org (subscribers-only) L: dri-devel@lists.freedesktop.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ickle/drm-intel.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/keithp/linux-2.6.git S: Supported F: drivers/gpu/drm/i915 F: include/drm/i915* @@ -2946,8 +2946,8 @@ F: drivers/block/cciss* F: include/linux/cciss_ioctl.h HFS FILESYSTEM -M: Roman Zippel -S: Maintained +L: linux-fsdevel@vger.kernel.org +S: Orphan F: Documentation/filesystems/hfs.txt F: fs/hfs/ @@ -3363,6 +3363,12 @@ F: Documentation/wimax/README.i2400m F: drivers/net/wimax/i2400m/ F: include/linux/wimax/i2400m.h +INTEL WIRELESS 3945ABG/BG, 4965AGN (iwlegacy) +M: Stanislaw Gruszka +L: linux-wireless@vger.kernel.org +S: Supported +F: drivers/net/wireless/iwlegacy/ + INTEL WIRELESS WIFI LINK (iwlwifi) M: Wey-Yi Guy M: Intel Linux Wireless @@ -3566,9 +3572,16 @@ M: Andrew Morton M: Jan Kara L: linux-ext4@vger.kernel.org S: Maintained -F: fs/jbd*/ -F: include/linux/ext*jbd*.h -F: include/linux/jbd*.h +F: fs/jbd/ +F: include/linux/ext3_jbd.h +F: include/linux/jbd.h + +JOURNALLING LAYER FOR BLOCK DEVICES (JBD2) +M: "Theodore Ts'o" +L: linux-ext4@vger.kernel.org +S: Maintained +F: fs/jbd2/ +F: include/linux/jbd2.h JSM Neo PCI based serial card M: Breno Leitao @@ -3591,10 +3604,9 @@ F: Documentation/hwmon/k8temp F: drivers/hwmon/k8temp.c KCONFIG -M: Roman Zippel +M: Michal Marek L: linux-kbuild@vger.kernel.org -Q: http://patchwork.kernel.org/project/linux-kbuild/list/ -S: Maintained +S: Odd Fixes F: Documentation/kbuild/kconfig-language.txt F: scripts/kconfig/ @@ -3814,7 +3826,7 @@ M: Rusty Russell L: lguest@lists.ozlabs.org W: http://lguest.ozlabs.org/ S: Odd Fixes -F: Documentation/lguest/ +F: Documentation/virtual/lguest/ F: arch/x86/lguest/ F: drivers/lguest/ F: include/linux/lguest*.h @@ -3898,7 +3910,6 @@ F: drivers/*/*/*pasemi* LINUX SECURITY MODULE (LSM) FRAMEWORK M: Chris Wright L: linux-security-module@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/chrisw/lsm-2.6.git S: Supported LIS3LV02D ACCELEROMETER DRIVER @@ -4001,7 +4012,6 @@ F: arch/m32r/ M68K ARCHITECTURE M: Geert Uytterhoeven -M: Roman Zippel L: linux-m68k@lists.linux-m68k.org W: http://www.linux-m68k.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git @@ -4254,7 +4264,7 @@ F: include/linux/isicom.h MUSB MULTIPOINT HIGH SPEED DUAL-ROLE CONTROLLER M: Felipe Balbi L: linux-usb@vger.kernel.org -T: git git://gitorious.org/usb/usb.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/musb/ @@ -4271,6 +4281,13 @@ M: Tim Hockin S: Maintained F: drivers/net/natsemi.c +NATIVE INSTRUMENTS USB SOUND INTERFACE DRIVER +M: Daniel Mack +S: Maintained +L: alsa-devel@alsa-project.org +W: http://www.native-instruments.com +F: sound/usb/caiaq/ + NCP FILESYSTEM M: Petr Vandrovec S: Odd Fixes @@ -4386,6 +4403,7 @@ S: Maintained F: net/ipv4/ F: net/ipv6/ F: include/net/ip* +F: arch/x86/net/* NETWORKING [LABELED] (NetLabel, CIPSO, Labeled IPsec, SECMARK) M: Paul Moore @@ -4581,6 +4599,7 @@ M: Felipe Balbi M: David Brownell L: linux-usb@vger.kernel.org L: linux-omap@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/*/*omap* F: arch/arm/*omap*/usb* @@ -5432,6 +5451,7 @@ F: include/linux/timex.h F: kernel/time/clocksource.c F: kernel/time/time*.c F: kernel/time/ntp.c +F: drivers/clocksource TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie @@ -5583,10 +5603,11 @@ M: James Morris M: Eric Paris L: selinux@tycho.nsa.gov (subscribers-only, general discussion) W: http://selinuxproject.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git +T: git git://git.infradead.org/users/eparis/selinux.git S: Supported F: include/linux/selinux* F: security/selinux/ +F: scripts/selinux/ APPARMOR SECURITY MODULE M: John Johansen @@ -5612,9 +5633,9 @@ F: include/linux/ata.h F: include/linux/libata.h SERVER ENGINES 10Gbps iSCSI - BladeEngine 2 DRIVER -M: Jayamohan Kallickal +M: Jayamohan Kallickal L: linux-scsi@vger.kernel.org -W: http://www.serverengines.com +W: http://www.emulex.com S: Supported F: drivers/scsi/be2iscsi/ @@ -5832,6 +5853,13 @@ S: Maintained F: drivers/ssb/ F: include/linux/ssb/ +BROADCOM SPECIFIC AMBA DRIVER (BCMA) +M: Rafał Miłecki +L: linux-wireless@vger.kernel.org +S: Maintained +F: drivers/bcma/ +F: include/linux/bcma/ + SONY VAIO CONTROL DEVICE DRIVER M: Mattia Dongili L: platform-driver-x86@vger.kernel.org @@ -5861,7 +5889,7 @@ F: include/sound/ F: sound/ SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC) -M: Liam Girdwood +M: Liam Girdwood M: Mark Brown T: git git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound-2.6.git L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@ -6112,7 +6140,7 @@ F: drivers/mmc/host/tifm_sd.c F: include/linux/tifm.h TI TWL4030 SERIES SOC CODEC DRIVER -M: Peter Ujfalusi +M: Peter Ujfalusi L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Maintained F: sound/soc/codecs/twl4030* @@ -6215,6 +6243,7 @@ TRIVIAL PATCHES M: Jiri Kosina T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git S: Maintained +K: ^Subject:.*(?i)trivial TTY LAYER M: Greg Kroah-Hartman @@ -6632,7 +6661,7 @@ L: user-mode-linux-devel@lists.sourceforge.net L: user-mode-linux-user@lists.sourceforge.net W: http://user-mode-linux.sourceforge.net S: Maintained -F: Documentation/uml/ +F: Documentation/virtual/uml/ F: arch/um/ F: fs/hostfs/ F: fs/hppfs/ @@ -6756,7 +6785,7 @@ F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VOLTAGE AND CURRENT REGULATOR FRAMEWORK -M: Liam Girdwood +M: Liam Girdwood M: Mark Brown W: http://opensource.wolfsonmicro.com/node/15 W: http://www.slimlogic.co.uk/?p=48 @@ -6778,6 +6807,13 @@ L: lm-sensors@lm-sensors.org S: Maintained F: drivers/hwmon/vt8231.c +VUB300 USB to SDIO/SD/MMC bridge chip +M: Tony Olech +L: linux-mmc@vger.kernel.org +L: linux-usb@vger.kernel.org +S: Supported +F: drivers/mmc/host/vub300.c + W1 DALLAS'S 1-WIRE BUS M: Evgeniy Polyakov S: Maintained diff --git a/Makefile b/Makefile index 123d858..529d93f 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,7 @@ ifeq ("$(origin O)", "command line") endif ifeq ("$(origin W)", "command line") - export KBUILD_ENABLE_EXTRA_GCC_CHECKS := 1 + export KBUILD_ENABLE_EXTRA_GCC_CHECKS := $(W) endif # That's our default target when none is given on the command line @@ -220,6 +220,14 @@ ifeq ($(ARCH),sh64) SRCARCH := sh endif +# Additional ARCH settings for tile +ifeq ($(ARCH),tilepro) + SRCARCH := tile +endif +ifeq ($(ARCH),tilegx) + SRCARCH := tile +endif + # Where to locate arch specific headers hdr-arch := $(SRCARCH) @@ -349,7 +357,8 @@ CFLAGS_GCOV = -fprofile-arcs -ftest-coverage # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option -LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include -Iinclude \ +LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include \ + -Iarch/$(hdr-arch)/include/generated -Iinclude \ $(if $(KBUILD_SRC), -I$(srctree)/include) \ -include include/generated/autoconf.h @@ -382,6 +391,7 @@ export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL +export KBUILD_ARFLAGS # When compiling out-of-tree modules, put MODVERDIR in the module # tree rather than in the kernel tree. The kernel tree might @@ -416,6 +426,12 @@ ifneq ($(KBUILD_SRC),) $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL) endif +# Support for using generic headers in asm-generic +PHONY += asm-generic +asm-generic: + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.asm-generic \ + obj=arch/$(SRCARCH)/include/generated/asm + # To make sure we do not include .config for any of the *config targets # catch them early, and hand them over to scripts/kconfig/Makefile # It is allowed to specify more targets when calling make, including @@ -559,6 +575,10 @@ ifndef CONFIG_CC_STACKPROTECTOR KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) endif +# This warning generated too much noise in a regular build. +# Use make W=1 to enable this warning (see scripts/Makefile.build) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) + ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else @@ -604,7 +624,7 @@ CHECKFLAGS += $(NOSTDINC_FLAGS) KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,) # disable pointer signed / unsigned warnings in gcc 4.0 -KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,) +KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) @@ -612,6 +632,9 @@ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) # conserve stack if available KBUILD_CFLAGS += $(call cc-option,-fconserve-stack) +# use the deterministic mode of AR if available +KBUILD_ARFLAGS := $(call ar-option,D) + # check for 'asm goto' ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y) KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO @@ -797,15 +820,17 @@ ifdef CONFIG_KALLSYMS # o The correct .tmp_kallsyms2.o is linked into the final vmlinux. # o Verify that the System.map from vmlinux matches the map from # .tmp_vmlinux2, just in case we did not generate kallsyms correctly. -# o If CONFIG_KALLSYMS_EXTRA_PASS is set, do an extra pass using +# o If 'make KALLSYMS_EXTRA_PASS=1" was used, do an extra pass using # .tmp_vmlinux3 and .tmp_kallsyms3.o. This is only meant as a # temporary bypass to allow the kernel to be built while the # maintainers work out what went wrong with kallsyms. -ifdef CONFIG_KALLSYMS_EXTRA_PASS -last_kallsyms := 3 -else last_kallsyms := 2 + +ifdef KALLSYMS_EXTRA_PASS +ifneq ($(KALLSYMS_EXTRA_PASS),0) +last_kallsyms := 3 +endif endif kallsyms.o := .tmp_kallsyms$(last_kallsyms).o @@ -816,7 +841,8 @@ define verify_kallsyms $(cmd_sysmap) .tmp_vmlinux$(last_kallsyms) .tmp_System.map $(Q)cmp -s System.map .tmp_System.map || \ (echo Inconsistent kallsyms data; \ - echo Try setting CONFIG_KALLSYMS_EXTRA_PASS; \ + echo This is a bug - please report about it; \ + echo Try "make KALLSYMS_EXTRA_PASS=1" as a workaround; \ rm .tmp_kallsyms* ; /bin/false ) endef @@ -947,7 +973,7 @@ ifneq ($(KBUILD_SRC),) endif # prepare2 creates a makefile if using a separate output directory -prepare2: prepare3 outputmakefile +prepare2: prepare3 outputmakefile asm-generic prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \ include/config/auto.conf @@ -991,7 +1017,8 @@ include/generated/utsrelease.h: include/config/kernel.release FORCE PHONY += headerdep headerdep: - $(Q)find include/ -name '*.h' | xargs --max-args 1 scripts/headerdep.pl + $(Q)find $(srctree)/include/ -name '*.h' | xargs --max-args 1 \ + $(srctree)/scripts/headerdep.pl -I$(srctree)/include # --------------------------------------------------------------------------- @@ -1021,7 +1048,7 @@ hdr-inst := -rR -f $(srctree)/scripts/Makefile.headersinst obj hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm) PHONY += __headers -__headers: include/linux/version.h scripts_basic FORCE +__headers: include/linux/version.h scripts_basic asm-generic FORCE $(Q)$(MAKE) $(build)=scripts build_unifdef PHONY += headers_install_all @@ -1136,7 +1163,8 @@ CLEAN_FILES += vmlinux System.map \ .tmp_kallsyms* .tmp_version .tmp_vmlinux* .tmp_System.map # Directories & files removed with 'make mrproper' -MRPROPER_DIRS += include/config usr/include include/generated +MRPROPER_DIRS += include/config usr/include include/generated \ + arch/*/include/generated MRPROPER_FILES += .config .config.old .version .old_version \ include/linux/version.h \ Module.symvers tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS @@ -1267,7 +1295,12 @@ help: @echo ' make O=dir [targets] Locate all output files in "dir", including .config' @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)' @echo ' make C=2 [targets] Force check of all c source with $$CHECK' - @echo ' make W=1 [targets] Enable extra gcc checks' + @echo ' make W=n [targets] Enable extra gcc checks, n=1,2,3 where' + @echo ' 1: warnings which may be relevant and do not occur too often' + @echo ' 2: warnings which occur quite often but may still be relevant' + @echo ' 3: more obscure warnings, can most likely be ignored' + @echo ' Multiple levels can be combined with W=12 or W=123' + @echo ' make RECORDMCOUNT_WARN=1 [targets] Warn about ignored mcount sections' @echo '' @echo 'Execute "make" or "make all" to build all targets marked with [*] ' @echo 'For further info see the ./README file' @@ -1290,6 +1323,7 @@ $(help-board-dirs): help-%: # Documentation targets # --------------------------------------------------------------------------- %docs: scripts_basic FORCE + $(Q)$(MAKE) $(build)=scripts build_docproc $(Q)$(MAKE) $(build)=Documentation/DocBook $@ else # KBUILD_EXTMOD @@ -1374,7 +1408,7 @@ endif # KBUILD_EXTMOD clean: $(clean-dirs) $(call cmd,rmdirs) $(call cmd,rmfiles) - @find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ + @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ @@ -1392,13 +1426,15 @@ tags TAGS cscope gtags: FORCE # Scripts to check various things for consistency # --------------------------------------------------------------------------- +PHONY += includecheck versioncheck coccicheck namespacecheck export_report + includecheck: - find * $(RCS_FIND_IGNORE) \ + find $(srctree)/* $(RCS_FIND_IGNORE) \ -name '*.[hcS]' -type f -print | sort \ | xargs $(PERL) -w $(srctree)/scripts/checkincludes.pl versioncheck: - find * $(RCS_FIND_IGNORE) \ + find $(srctree)/* $(RCS_FIND_IGNORE) \ -name '*.[hcS]' -type f -print | sort \ | xargs $(PERL) -w $(srctree)/scripts/checkversion.pl diff --git a/arch/Kconfig b/arch/Kconfig index f78c2be..26b0e23 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -144,9 +144,6 @@ config HAVE_CLK config HAVE_DMA_API_DEBUG bool -config HAVE_DEFAULT_NO_SPIN_MUTEXES - bool - config HAVE_HW_BREAKPOINT bool depends on PERF_EVENTS @@ -178,4 +175,7 @@ config HAVE_ARCH_JUMP_LABEL config HAVE_ARCH_MUTEX_CPU_RELAX bool +config HAVE_RCU_TABLE_FREE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9808998..e3a8277 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -12,6 +12,7 @@ config ALPHA select GENERIC_IRQ_PROBE select AUTO_IRQ_AFFINITY if SMP select GENERIC_IRQ_SHOW + select ARCH_WANT_OPTIONAL_GPIOLIB help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, @@ -51,6 +52,9 @@ config GENERIC_CALIBRATE_DELAY config GENERIC_CMOS_UPDATE def_bool y +config GENERIC_GPIO + def_bool y + config ZONE_DMA bool default y diff --git a/arch/alpha/include/asm/gpio.h b/arch/alpha/include/asm/gpio.h new file mode 100644 index 0000000..7dc6a63 --- /dev/null +++ b/arch/alpha/include/asm/gpio.h @@ -0,0 +1,55 @@ +/* + * Generic GPIO API implementation for Alpha. + * + * A stright copy of that for PowerPC which was: + * + * Copyright (c) 2007-2008 MontaVista Software, Inc. + * + * Author: Anton Vorontsov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _ASM_ALPHA_GPIO_H +#define _ASM_ALPHA_GPIO_H + +#include +#include + +#ifdef CONFIG_GPIOLIB + +/* + * We don't (yet) implement inlined/rapid versions for on-chip gpios. + * Just call gpiolib. + */ +static inline int gpio_get_value(unsigned int gpio) +{ + return __gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned int gpio, int value) +{ + __gpio_set_value(gpio, value); +} + +static inline int gpio_cansleep(unsigned int gpio) +{ + return __gpio_cansleep(gpio); +} + +static inline int gpio_to_irq(unsigned int gpio) +{ + return __gpio_to_irq(gpio); +} + +static inline int irq_to_gpio(unsigned int irq) +{ + return -EINVAL; +} + +#endif /* CONFIG_GPIOLIB */ + +#endif /* _ASM_ALPHA_GPIO_H */ diff --git a/arch/alpha/include/asm/smp.h b/arch/alpha/include/asm/smp.h index 3f390e8..c46e714 100644 --- a/arch/alpha/include/asm/smp.h +++ b/arch/alpha/include/asm/smp.h @@ -39,8 +39,6 @@ struct cpuinfo_alpha { extern struct cpuinfo_alpha cpu_data[NR_CPUS]; -#define PROC_CHANGE_PENALTY 20 - #define hard_smp_processor_id() __hard_smp_processor_id() #define raw_smp_processor_id() (current_thread_info()->cpu) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 3ec3506..838eac1 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -121,7 +121,7 @@ common_shutdown_1(void *generic_ptr) /* Wait for the secondaries to halt. */ set_cpu_present(boot_cpuid, false); set_cpu_possible(boot_cpuid, false); - while (cpus_weight(cpu_present_map)) + while (cpumask_weight(cpu_present_mask)) barrier(); #endif diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index edbddcb..cc0fd86 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1257,7 +1257,7 @@ show_cpuinfo(struct seq_file *f, void *slot) #ifdef CONFIG_SMP seq_printf(f, "cpus active\t\t: %u\n" "cpu active mask\t\t: %016lx\n", - num_online_cpus(), cpus_addr(cpu_possible_map)[0]); + num_online_cpus(), cpumask_bits(cpu_possible_mask)[0]); #endif show_cache_size (f, "L1 Icache", alpha_l1i_cacheshape); diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078..d739703 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -451,7 +451,7 @@ setup_smp(void) } printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_map = %lx\n", - smp_num_probed, cpu_present_map.bits[0]); + smp_num_probed, cpumask_bits(cpu_present_mask)[0]); } /* @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: @@ -630,8 +629,9 @@ smp_send_reschedule(int cpu) void smp_send_stop(void) { - cpumask_t to_whom = cpu_possible_map; - cpu_clear(smp_processor_id(), to_whom); + cpumask_t to_whom; + cpumask_copy(&to_whom, cpu_possible_mask); + cpumask_clear_cpu(smp_processor_id(), &to_whom); #ifdef DEBUG_IPI_MSG if (hard_smp_processor_id() != boot_cpu_id) printk(KERN_WARNING "smp_send_stop: Not on boot cpu.\n"); diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 5ac00fd..f885682 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -140,7 +140,7 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) for (cpu = 0; cpu < 4; cpu++) { unsigned long aff = cpu_irq_affinity[cpu]; - if (cpu_isset(cpu, affinity)) + if (cpumask_test_cpu(cpu, &affinity)) aff |= 1UL << irq; else aff &= ~(1UL << irq); diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index fea0e46..6994407 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -65,10 +65,11 @@ titan_update_irq_hw(unsigned long mask) register int bcpu = boot_cpuid; #ifdef CONFIG_SMP - cpumask_t cpm = cpu_present_map; + cpumask_t cpm; volatile unsigned long *dim0, *dim1, *dim2, *dim3; unsigned long mask0, mask1, mask2, mask3, dummy; + cpumask_copy(&cpm, cpu_present_mask); mask &= ~isa_enable; mask0 = mask & titan_cpu_irq_affinity[0]; mask1 = mask & titan_cpu_irq_affinity[1]; @@ -84,10 +85,10 @@ titan_update_irq_hw(unsigned long mask) dim1 = &cchip->dim1.csr; dim2 = &cchip->dim2.csr; dim3 = &cchip->dim3.csr; - if (!cpu_isset(0, cpm)) dim0 = &dummy; - if (!cpu_isset(1, cpm)) dim1 = &dummy; - if (!cpu_isset(2, cpm)) dim2 = &dummy; - if (!cpu_isset(3, cpm)) dim3 = &dummy; + if (!cpumask_test_cpu(0, &cpm)) dim0 = &dummy; + if (!cpumask_test_cpu(1, &cpm)) dim1 = &dummy; + if (!cpumask_test_cpu(2, &cpm)) dim2 = &dummy; + if (!cpumask_test_cpu(3, &cpm)) dim3 = &dummy; *dim0 = mask0; *dim1 = mask1; @@ -137,7 +138,7 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) int cpu; for (cpu = 0; cpu < 4; cpu++) { - if (cpu_isset(cpu, affinity)) + if (cpumask_test_cpu(cpu, &affinity)) titan_cpu_irq_affinity[cpu] |= 1UL << irq; else titan_cpu_irq_affinity[cpu] &= ~(1UL << irq); diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 433be2a..f937ad1 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -39,13 +39,14 @@ SECTIONS __init_begin = ALIGN(PAGE_SIZE); INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) - PERCPU(L1_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) /* Align to THREAD_SIZE rather than PAGE_SIZE here so any padding page needed for the THREAD_SIZE aligned init_task gets freed after init */ . = ALIGN(THREAD_SIZE); __init_end = .; /* Freed after init ends here */ + _sdata = .; /* Start of rw data section */ _data = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 86425ab..69d0c57 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -32,8 +32,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - extern void die_if_kernel(char *,struct pt_regs *,long); static struct pcb_struct original_pcb; diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 7b2c56d..3973ae3 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -313,6 +313,7 @@ void __init paging_init(void) zones_size[ZONE_DMA] = dma_local_pfn; zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; } + node_set_state(nid, N_NORMAL_MEMORY); free_area_init_node(nid, zones_size, start_pfn, NULL); } diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 377a7a5..7275009 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -197,15 +197,21 @@ config ARM_PATCH_PHYS_VIRT depends on !XIP_KERNEL && MMU depends on !ARCH_REALVIEW || !SPARSEMEM help - Patch phys-to-virt translation functions at runtime according to - the position of the kernel in system memory. + Patch phys-to-virt and virt-to-phys translation functions at + boot and module load time according to the position of the + kernel in system memory. - This can only be used with non-XIP with MMU kernels where - the base of physical memory is at a 16MB boundary. + This can only be used with non-XIP MMU kernels where the base + of physical memory is at a 16MB boundary, or theoretically 64K + for the MSM machine class. config ARM_PATCH_PHYS_VIRT_16BIT def_bool y depends on ARM_PATCH_PHYS_VIRT && ARCH_MSM + help + This option extends the physical to virtual translation patching + to allow physical memory down to a theoretical minimum of 64K + boundaries. source "init/Kconfig" @@ -297,6 +303,7 @@ config ARCH_BCMRING depends on MMU select CPU_V6 select ARM_AMBA + select ARM_TIMER_SP804 select CLKDEV_LOOKUP select GENERIC_CLOCKEVENTS select ARCH_WANT_OPTIONAL_GPIOLIB @@ -366,6 +373,7 @@ config ARCH_MXC select GENERIC_CLOCKEVENTS select ARCH_REQUIRE_GPIOLIB select CLKDEV_LOOKUP + select CLKSRC_MMIO select HAVE_SCHED_CLOCK help Support for Freescale MXC/iMX-based family of processors @@ -375,21 +383,13 @@ config ARCH_MXS select GENERIC_CLOCKEVENTS select ARCH_REQUIRE_GPIOLIB select CLKDEV_LOOKUP + select CLKSRC_MMIO help Support for Freescale MXS-based family of processors -config ARCH_STMP3XXX - bool "Freescale STMP3xxx" - select CPU_ARM926T - select CLKDEV_LOOKUP - select ARCH_REQUIRE_GPIOLIB - select GENERIC_CLOCKEVENTS - select USB_ARCH_HAS_EHCI - help - Support for systems based on the Freescale 3xxx CPUs. - config ARCH_NETX bool "Hilscher NetX based" + select CLKSRC_MMIO select CPU_ARM926T select ARM_VIC select GENERIC_CLOCKEVENTS @@ -457,6 +457,7 @@ config ARCH_IXP2000 config ARCH_IXP4XX bool "IXP4xx-based" depends on MMU + select CLKSRC_MMIO select CPU_XSCALE select GENERIC_GPIO select GENERIC_CLOCKEVENTS @@ -468,7 +469,7 @@ config ARCH_IXP4XX config ARCH_DOVE bool "Marvell Dove" - select CPU_V6K + select CPU_V7 select PCI select ARCH_REQUIRE_GPIOLIB select GENERIC_CLOCKEVENTS @@ -497,6 +498,7 @@ config ARCH_LOKI config ARCH_LPC32XX bool "NXP LPC32XX" + select CLKSRC_MMIO select CPU_ARM926T select ARCH_REQUIRE_GPIOLIB select HAVE_IDE @@ -554,23 +556,12 @@ config ARCH_KS8695 Support for Micrel/Kendin KS8695 "Centaur" (ARM922T) based System-on-Chip devices. -config ARCH_NS9XXX - bool "NetSilicon NS9xxx" - select CPU_ARM926T - select GENERIC_GPIO - select GENERIC_CLOCKEVENTS - select HAVE_CLK - help - Say Y here if you intend to run this kernel on a NetSilicon NS9xxx - System. - - - config ARCH_W90X900 bool "Nuvoton W90X900 CPU" select CPU_ARM926T select ARCH_REQUIRE_GPIOLIB select CLKDEV_LOOKUP + select CLKSRC_MMIO select GENERIC_CLOCKEVENTS help Support for Nuvoton (Winbond logic dept.) ARM9 processor, @@ -592,6 +583,7 @@ config ARCH_NUC93X config ARCH_TEGRA bool "NVIDIA Tegra" select CLKDEV_LOOKUP + select CLKSRC_MMIO select GENERIC_TIME select GENERIC_CLOCKEVENTS select GENERIC_GPIO @@ -617,6 +609,7 @@ config ARCH_PXA select ARCH_MTD_XIP select ARCH_HAS_CPUFREQ select CLKDEV_LOOKUP + select CLKSRC_MMIO select ARCH_REQUIRE_GPIOLIB select GENERIC_CLOCKEVENTS select HAVE_SCHED_CLOCK @@ -667,6 +660,7 @@ config ARCH_RPC config ARCH_SA1100 bool "SA1100-based" + select CLKSRC_MMIO select CPU_SA1100 select ISA select ARCH_SPARSEMEM_ENABLE @@ -803,6 +797,7 @@ config ARCH_SHARK config ARCH_TCC_926 bool "Telechips TCC ARM926-based systems" + select CLKSRC_MMIO select CPU_ARM926T select HAVE_CLK select CLKDEV_LOOKUP @@ -813,6 +808,7 @@ config ARCH_TCC_926 config ARCH_U300 bool "ST-Ericsson U300 Series" depends on MMU + select CLKSRC_MMIO select CPU_ARM926T select HAVE_SCHED_CLOCK select HAVE_TCM @@ -854,6 +850,7 @@ config ARCH_DAVINCI select HAVE_IDE select CLKDEV_LOOKUP select GENERIC_ALLOCATOR + select GENERIC_IRQ_CHIP select ARCH_HAS_HOLES_MEMORYMODEL help Support for TI's DaVinci platform. @@ -874,6 +871,7 @@ config PLAT_SPEAR select ARM_AMBA select ARCH_REQUIRE_GPIOLIB select CLKDEV_LOOKUP + select CLKSRC_MMIO select GENERIC_CLOCKEVENTS select HAVE_CLK help @@ -951,8 +949,6 @@ source "arch/arm/mach-netx/Kconfig" source "arch/arm/mach-nomadik/Kconfig" source "arch/arm/plat-nomadik/Kconfig" -source "arch/arm/mach-ns9xxx/Kconfig" - source "arch/arm/mach-nuc93x/Kconfig" source "arch/arm/plat-omap/Kconfig" @@ -1005,8 +1001,6 @@ source "arch/arm/mach-exynos4/Kconfig" source "arch/arm/mach-shmobile/Kconfig" -source "arch/arm/plat-stmp3xxx/Kconfig" - source "arch/arm/mach-tegra/Kconfig" source "arch/arm/mach-u300/Kconfig" @@ -1033,6 +1027,8 @@ config PLAT_IOP config PLAT_ORION bool + select CLKSRC_MMIO + select GENERIC_IRQ_CHIP select HAVE_SCHED_CLOCK config PLAT_PXA @@ -1043,6 +1039,7 @@ config PLAT_VERSATILE config ARM_TIMER_SP804 bool + select CLKSRC_MMIO source arch/arm/mm/Kconfig @@ -1318,8 +1315,7 @@ menu "Kernel Features" source "kernel/time/Kconfig" config SMP - bool "Symmetric Multi-Processing (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Symmetric Multi-Processing" depends on CPU_V6K || CPU_V7 depends on GENERIC_CLOCKEVENTS depends on REALVIEW_EB_ARM11MP || REALVIEW_EB_A9MP || \ @@ -1521,8 +1517,8 @@ config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE config HIGHMEM - bool "High Memory Support (EXPERIMENTAL)" - depends on MMU && EXPERIMENTAL + bool "High Memory Support" + depends on MMU help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address @@ -1742,16 +1738,31 @@ config CMDLINE time by entering them here. As a minimum, you should specify the memory size and the root device (e.g., mem=64M root=/dev/nfs). +choice + prompt "Kernel command line type" if CMDLINE != "" + default CMDLINE_FROM_BOOTLOADER + +config CMDLINE_FROM_BOOTLOADER + bool "Use bootloader kernel arguments if available" + help + Uses the command-line options passed by the boot loader. If + the boot loader doesn't provide any, the default kernel command + string provided in CMDLINE will be used. + +config CMDLINE_EXTEND + bool "Extend bootloader kernel arguments" + help + The command-line arguments provided by the boot loader will be + appended to the default kernel command string. + config CMDLINE_FORCE bool "Always use the default kernel command string" - depends on CMDLINE != "" help Always use the default kernel command string, even if the boot loader passes other arguments to the kernel. This is useful if you cannot or don't want to change the command-line options your boot loader passes to the kernel. - - If unsure, say N. +endchoice config XIP_KERNEL bool "Kernel Execute-In-Place from ROM" @@ -2010,7 +2021,7 @@ menu "Power management options" source "kernel/power/Kconfig" config ARCH_SUSPEND_POSSIBLE - depends on !ARCH_S5P64X0 && !ARCH_S5P6442 + depends on !ARCH_S5P64X0 && !ARCH_S5P6442 && !ARCH_S5PC100 depends on CPU_ARM920T || CPU_ARM926T || CPU_SA1100 || \ CPU_V6 || CPU_V6K || CPU_V7 || CPU_XSC3 || CPU_XSCALE def_bool y diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 03d01d7..81cbe40 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -63,13 +63,6 @@ config DEBUG_USER 8 - SIGSEGV faults 16 - SIGBUS faults -config DEBUG_STACK_USAGE - bool "Enable stack utilization instrumentation" - depends on DEBUG_KERNEL - help - Enables the display of the minimum amount of free stack which each - task has ever had available in the sysrq-T output. - # These options are only for real kernel hackers who want to get their hands dirty. config DEBUG_LL bool "Kernel low-level debugging functions" diff --git a/arch/arm/Makefile b/arch/arm/Makefile index c7d321a..25750bc 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -158,13 +158,11 @@ machine-$(CONFIG_ARCH_MV78XX0) := mv78xx0 machine-$(CONFIG_ARCH_MX1) := imx machine-$(CONFIG_ARCH_MX2) := imx machine-$(CONFIG_ARCH_MX25) := imx -machine-$(CONFIG_ARCH_MX3) := mx3 +machine-$(CONFIG_ARCH_MX3) := imx machine-$(CONFIG_ARCH_MX5) := mx5 -machine-$(CONFIG_ARCH_MXC91231) := mxc91231 machine-$(CONFIG_ARCH_MXS) := mxs machine-$(CONFIG_ARCH_NETX) := netx machine-$(CONFIG_ARCH_NOMADIK) := nomadik -machine-$(CONFIG_ARCH_NS9XXX) := ns9xxx machine-$(CONFIG_ARCH_OMAP1) := omap1 machine-$(CONFIG_ARCH_OMAP2) := omap2 machine-$(CONFIG_ARCH_OMAP3) := omap2 @@ -185,8 +183,6 @@ machine-$(CONFIG_ARCH_EXYNOS4) := exynos4 machine-$(CONFIG_ARCH_SA1100) := sa1100 machine-$(CONFIG_ARCH_SHARK) := shark machine-$(CONFIG_ARCH_SHMOBILE) := shmobile -machine-$(CONFIG_ARCH_STMP378X) := stmp378x -machine-$(CONFIG_ARCH_STMP37XX) := stmp37xx machine-$(CONFIG_ARCH_TCC8K) := tcc8k machine-$(CONFIG_ARCH_TEGRA) := tegra machine-$(CONFIG_ARCH_U300) := u300 @@ -207,7 +203,6 @@ machine-$(CONFIG_MACH_SPEAR600) := spear6xx plat-$(CONFIG_ARCH_MXC) := mxc plat-$(CONFIG_ARCH_OMAP) := omap plat-$(CONFIG_ARCH_S3C64XX) := samsung -plat-$(CONFIG_ARCH_STMP3XXX) := stmp3xxx plat-$(CONFIG_ARCH_TCC_926) := tcc plat-$(CONFIG_PLAT_IOP) := iop plat-$(CONFIG_PLAT_NOMADIK) := nomadik diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 0c6852d..23aad07 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -98,8 +98,6 @@ endif ccflags-y := -fpic -fno-builtin asflags-y := -Wa,-march=all -# Provide size of uncompressed kernel to the decompressor via a linker symbol. -LDFLAGS_vmlinux = --defsym _image_size=$(shell stat -c "%s" $(obj)/../Image) # Supply ZRELADDR to the decompressor via a linker symbol. ifneq ($(CONFIG_AUTO_ZRELADDR),y) LDFLAGS_vmlinux += --defsym zreladdr=$(ZRELADDR) @@ -122,10 +120,23 @@ lib1funcs = $(obj)/lib1funcs.o $(obj)/lib1funcs.S: $(srctree)/arch/$(SRCARCH)/lib/lib1funcs.S FORCE $(call cmd,shipped) +# We need to prevent any GOTOFF relocs being used with references +# to symbols in the .bss section since we cannot relocate them +# independently from the rest at run time. This can be achieved by +# ensuring that no private .bss symbols exist, as global symbols +# always have a GOT entry which is what we need. +# The .data section is already discarded by the linker script so no need +# to bother about it here. +check_for_bad_syms = \ +bad_syms=$$($(CROSS_COMPILE)nm $@ | sed -n 's/^.\{8\} [bc] \(.*\)/\1/p') && \ +[ -z "$$bad_syms" ] || \ + ( echo "following symbols must have non local/private scope:" >&2; \ + echo "$$bad_syms" >&2; rm -f $@; false ) + $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/$(HEAD) $(obj)/piggy.$(suffix_y).o \ $(addprefix $(obj)/, $(OBJS)) $(lib1funcs) FORCE $(call if_changed,ld) - @: + @$(check_for_bad_syms) $(obj)/piggy.$(suffix_y): $(obj)/../Image FORCE $(call if_changed,$(suffix_y)) diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index 4c72a97..07be5a2 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -44,7 +44,7 @@ extern void error(char *); #include "../../../../lib/decompress_unlzma.c" #endif -void do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) +int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) { - decompress(input, len, NULL, NULL, output, NULL, error); + return decompress(input, len, NULL, NULL, output, NULL, error); } diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 49f5b2e..f9da419 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -179,7 +179,7 @@ not_angel: bl cache_on restart: adr r0, LC0 - ldmia r0, {r1, r2, r3, r6, r9, r11, r12} + ldmia r0, {r1, r2, r3, r6, r10, r11, r12} ldr sp, [r0, #28] /* @@ -188,6 +188,20 @@ restart: adr r0, LC0 */ sub r0, r0, r1 @ calculate the delta offset add r6, r6, r0 @ _edata + add r10, r10, r0 @ inflated kernel size location + + /* + * The kernel build system appends the size of the + * decompressed kernel at the end of the compressed data + * in little-endian form. + */ + ldrb r9, [r10, #0] + ldrb lr, [r10, #1] + orr r9, r9, lr, lsl #8 + ldrb lr, [r10, #2] + ldrb r10, [r10, #3] + orr r9, r9, lr, lsl #16 + orr r9, r9, r10, lsl #24 #ifndef CONFIG_ZBOOT_ROM /* malloc space is above the relocated stack (64k max) */ @@ -347,10 +361,10 @@ LC0: .word LC0 @ r1 .word __bss_start @ r2 .word _end @ r3 .word _edata @ r6 - .word _image_size @ r9 + .word input_data_end - 4 @ r10 (inflated size location) .word _got_start @ r11 .word _got_end @ ip - .word user_stack_end @ sp + .word .L_user_stack_end @ sp .size LC0, . - LC0 #ifdef CONFIG_ARCH_RPC @@ -459,7 +473,11 @@ __setup_mmu: sub r3, r4, #16384 @ Page directory size orr r1, r1, #3 << 10 add r2, r3, #16384 1: cmp r1, r9 @ if virt > start of RAM +#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH + orrhs r1, r1, #0x08 @ set cacheable +#else orrhs r1, r1, #0x0c @ set cacheable, bufferable +#endif cmp r1, r10 @ if virt > end of RAM bichs r1, r1, #0x0c @ clear cacheable, bufferable str r1, [r0], #4 @ 1:1 mapping @@ -484,6 +502,12 @@ __setup_mmu: sub r3, r4, #16384 @ Page directory size mov pc, lr ENDPROC(__setup_mmu) +__arm926ejs_mmu_cache_on: +#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH + mov r0, #4 @ put dcache in WT mode + mcr p15, 7, r0, c15, c0, 0 +#endif + __armv4_mmu_cache_on: mov r12, lr #ifdef CONFIG_MMU @@ -665,6 +689,12 @@ proc_types: W(b) __armv4_mpu_cache_off W(b) __armv4_mpu_cache_flush + .word 0x41069260 @ ARM926EJ-S (v5TEJ) + .word 0xff0ffff0 + b __arm926ejs_mmu_cache_on + b __armv4_mmu_cache_off + b __armv5tej_mmu_cache_flush + .word 0x00007000 @ ARM7 IDs .word 0x0000f000 mov pc, lr @@ -747,12 +777,6 @@ proc_types: W(b) __armv4_mmu_cache_off W(b) __armv6_mmu_cache_flush - .word 0x560f5810 @ Marvell PJ4 ARMv6 - .word 0xff0ffff0 - W(b) __armv4_mmu_cache_on - W(b) __armv4_mmu_cache_off - W(b) __armv6_mmu_cache_flush - .word 0x000f0000 @ new CPU Id .word 0x000f0000 W(b) __armv7_mmu_cache_on @@ -1078,5 +1102,5 @@ reloc_code_end: .align .section ".stack", "aw", %nobits -user_stack: .space 4096 -user_stack_end: +.L_user_stack: .space 4096 +.L_user_stack_end: diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 2df3826..832d372 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -26,8 +26,6 @@ unsigned int __machine_arch_type; #include #include -#include - static void putstr(const char *ptr); extern void error(char *x); @@ -139,13 +137,12 @@ void *memcpy(void *__dest, __const void *__src, size_t __n) } /* - * gzip delarations + * gzip declarations */ extern char input_data[]; extern char input_data_end[]; unsigned char *output_data; -unsigned long output_ptr; unsigned long free_mem_ptr; unsigned long free_mem_end_ptr; @@ -170,15 +167,15 @@ asmlinkage void __div0(void) error("Attempting division by 0!"); } -extern void do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)); +extern int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)); -unsigned long +void decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p, unsigned long free_mem_ptr_end_p, int arch_id) { - unsigned char *tmp; + int ret; output_data = (unsigned char *)output_start; free_mem_ptr = free_mem_ptr_p; @@ -187,12 +184,11 @@ decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p, arch_decomp_setup(); - tmp = (unsigned char *) (((unsigned long)input_data_end) - 4); - output_ptr = get_unaligned_le32(tmp); - putstr("Uncompressing Linux..."); - do_decompress(input_data, input_data_end - input_data, - output_data, error); - putstr(" done, booting the kernel.\n"); - return output_ptr; + ret = do_decompress(input_data, input_data_end - input_data, + output_data, error); + if (ret) + error("decompressor returned an error"); + else + putstr(" done, booting the kernel.\n"); } diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index f70ec7d..4ddd0a6 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -49,7 +49,7 @@ struct gic_chip_data { * Default make them NULL. */ struct irq_chip gic_arch_extn = { - .irq_ack = NULL, + .irq_eoi = NULL, .irq_mask = NULL, .irq_unmask = NULL, .irq_retrigger = NULL, @@ -84,21 +84,12 @@ static inline unsigned int gic_irq(struct irq_data *d) /* * Routines to acknowledge, disable and enable interrupts */ -static void gic_ack_irq(struct irq_data *d) -{ - spin_lock(&irq_controller_lock); - if (gic_arch_extn.irq_ack) - gic_arch_extn.irq_ack(d); - writel(gic_irq(d), gic_cpu_base(d) + GIC_CPU_EOI); - spin_unlock(&irq_controller_lock); -} - static void gic_mask_irq(struct irq_data *d) { u32 mask = 1 << (d->irq % 32); spin_lock(&irq_controller_lock); - writel(mask, gic_dist_base(d) + GIC_DIST_ENABLE_CLEAR + (gic_irq(d) / 32) * 4); + writel_relaxed(mask, gic_dist_base(d) + GIC_DIST_ENABLE_CLEAR + (gic_irq(d) / 32) * 4); if (gic_arch_extn.irq_mask) gic_arch_extn.irq_mask(d); spin_unlock(&irq_controller_lock); @@ -111,10 +102,21 @@ static void gic_unmask_irq(struct irq_data *d) spin_lock(&irq_controller_lock); if (gic_arch_extn.irq_unmask) gic_arch_extn.irq_unmask(d); - writel(mask, gic_dist_base(d) + GIC_DIST_ENABLE_SET + (gic_irq(d) / 32) * 4); + writel_relaxed(mask, gic_dist_base(d) + GIC_DIST_ENABLE_SET + (gic_irq(d) / 32) * 4); spin_unlock(&irq_controller_lock); } +static void gic_eoi_irq(struct irq_data *d) +{ + if (gic_arch_extn.irq_eoi) { + spin_lock(&irq_controller_lock); + gic_arch_extn.irq_eoi(d); + spin_unlock(&irq_controller_lock); + } + + writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_EOI); +} + static int gic_set_type(struct irq_data *d, unsigned int type) { void __iomem *base = gic_dist_base(d); @@ -138,7 +140,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) if (gic_arch_extn.irq_set_type) gic_arch_extn.irq_set_type(d, type); - val = readl(base + GIC_DIST_CONFIG + confoff); + val = readl_relaxed(base + GIC_DIST_CONFIG + confoff); if (type == IRQ_TYPE_LEVEL_HIGH) val &= ~confmask; else if (type == IRQ_TYPE_EDGE_RISING) @@ -148,15 +150,15 @@ static int gic_set_type(struct irq_data *d, unsigned int type) * As recommended by the spec, disable the interrupt before changing * the configuration */ - if (readl(base + GIC_DIST_ENABLE_SET + enableoff) & enablemask) { - writel(enablemask, base + GIC_DIST_ENABLE_CLEAR + enableoff); + if (readl_relaxed(base + GIC_DIST_ENABLE_SET + enableoff) & enablemask) { + writel_relaxed(enablemask, base + GIC_DIST_ENABLE_CLEAR + enableoff); enabled = true; } - writel(val, base + GIC_DIST_CONFIG + confoff); + writel_relaxed(val, base + GIC_DIST_CONFIG + confoff); if (enabled) - writel(enablemask, base + GIC_DIST_ENABLE_SET + enableoff); + writel_relaxed(enablemask, base + GIC_DIST_ENABLE_SET + enableoff); spin_unlock(&irq_controller_lock); @@ -188,8 +190,8 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, spin_lock(&irq_controller_lock); d->node = cpu; - val = readl(reg) & ~mask; - writel(val | bit, reg); + val = readl_relaxed(reg) & ~mask; + writel_relaxed(val | bit, reg); spin_unlock(&irq_controller_lock); return 0; @@ -218,11 +220,10 @@ static void gic_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) unsigned int cascade_irq, gic_irq; unsigned long status; - /* primary controller ack'ing */ - chip->irq_ack(&desc->irq_data); + chained_irq_enter(chip, desc); spin_lock(&irq_controller_lock); - status = readl(chip_data->cpu_base + GIC_CPU_INTACK); + status = readl_relaxed(chip_data->cpu_base + GIC_CPU_INTACK); spin_unlock(&irq_controller_lock); gic_irq = (status & 0x3ff); @@ -236,15 +237,14 @@ static void gic_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) generic_handle_irq(cascade_irq); out: - /* primary controller unmasking */ - chip->irq_unmask(&desc->irq_data); + chained_irq_exit(chip, desc); } static struct irq_chip gic_chip = { .name = "GIC", - .irq_ack = gic_ack_irq, .irq_mask = gic_mask_irq, .irq_unmask = gic_unmask_irq, + .irq_eoi = gic_eoi_irq, .irq_set_type = gic_set_type, .irq_retrigger = gic_retrigger, #ifdef CONFIG_SMP @@ -272,13 +272,13 @@ static void __init gic_dist_init(struct gic_chip_data *gic, cpumask |= cpumask << 8; cpumask |= cpumask << 16; - writel(0, base + GIC_DIST_CTRL); + writel_relaxed(0, base + GIC_DIST_CTRL); /* * Find out how many interrupts are supported. * The GIC only supports up to 1020 interrupt sources. */ - gic_irqs = readl(base + GIC_DIST_CTR) & 0x1f; + gic_irqs = readl_relaxed(base + GIC_DIST_CTR) & 0x1f; gic_irqs = (gic_irqs + 1) * 32; if (gic_irqs > 1020) gic_irqs = 1020; @@ -287,26 +287,26 @@ static void __init gic_dist_init(struct gic_chip_data *gic, * Set all global interrupts to be level triggered, active low. */ for (i = 32; i < gic_irqs; i += 16) - writel(0, base + GIC_DIST_CONFIG + i * 4 / 16); + writel_relaxed(0, base + GIC_DIST_CONFIG + i * 4 / 16); /* * Set all global interrupts to this CPU only. */ for (i = 32; i < gic_irqs; i += 4) - writel(cpumask, base + GIC_DIST_TARGET + i * 4 / 4); + writel_relaxed(cpumask, base + GIC_DIST_TARGET + i * 4 / 4); /* * Set priority on all global interrupts. */ for (i = 32; i < gic_irqs; i += 4) - writel(0xa0a0a0a0, base + GIC_DIST_PRI + i * 4 / 4); + writel_relaxed(0xa0a0a0a0, base + GIC_DIST_PRI + i * 4 / 4); /* * Disable all interrupts. Leave the PPI and SGIs alone * as these enables are banked registers. */ for (i = 32; i < gic_irqs; i += 32) - writel(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i * 4 / 32); + writel_relaxed(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i * 4 / 32); /* * Limit number of interrupts registered to the platform maximum @@ -319,12 +319,12 @@ static void __init gic_dist_init(struct gic_chip_data *gic, * Setup the Linux IRQ subsystem. */ for (i = irq_start; i < irq_limit; i++) { - irq_set_chip_and_handler(i, &gic_chip, handle_level_irq); + irq_set_chip_and_handler(i, &gic_chip, handle_fasteoi_irq); irq_set_chip_data(i, gic); set_irq_flags(i, IRQF_VALID | IRQF_PROBE); } - writel(1, base + GIC_DIST_CTRL); + writel_relaxed(1, base + GIC_DIST_CTRL); } static void __cpuinit gic_cpu_init(struct gic_chip_data *gic) @@ -337,17 +337,17 @@ static void __cpuinit gic_cpu_init(struct gic_chip_data *gic) * Deal with the banked PPI and SGI interrupts - disable all * PPI interrupts, ensure all SGI interrupts are enabled. */ - writel(0xffff0000, dist_base + GIC_DIST_ENABLE_CLEAR); - writel(0x0000ffff, dist_base + GIC_DIST_ENABLE_SET); + writel_relaxed(0xffff0000, dist_base + GIC_DIST_ENABLE_CLEAR); + writel_relaxed(0x0000ffff, dist_base + GIC_DIST_ENABLE_SET); /* * Set priority on PPI and SGI interrupts */ for (i = 0; i < 32; i += 4) - writel(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4); + writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4); - writel(0xf0, base + GIC_CPU_PRIMASK); - writel(1, base + GIC_CPU_CTRL); + writel_relaxed(0xf0, base + GIC_CPU_PRIMASK); + writel_relaxed(1, base + GIC_CPU_CTRL); } void __init gic_init(unsigned int gic_nr, unsigned int irq_start, @@ -391,7 +391,13 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) { unsigned long map = *cpus_addr(*mask); + /* + * Ensure that stores to Normal memory are visible to the + * other CPUs before issuing the IPI. + */ + dsb(); + /* this always happens on GIC0 */ - writel(map << 16 | irq, gic_data[0].dist_base + GIC_DIST_SOFTINT); + writel_relaxed(map << 16 | irq, gic_data[0].dist_base + GIC_DIST_SOFTINT); } #endif diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c index a12b33c..9c49a46 100644 --- a/arch/arm/common/sa1111.c +++ b/arch/arm/common/sa1111.c @@ -185,14 +185,6 @@ static struct sa1111_dev_info sa1111_devices[] = { }, }; -void __init sa1111_adjust_zones(unsigned long *size, unsigned long *holes) -{ - unsigned int sz = SZ_1M >> PAGE_SHIFT; - - size[1] = size[0] - sz; - size[0] = sz; -} - /* * SA1111 interrupt support. Since clearing an IRQ while there are * active IRQs causes the interrupt output to pulse, the upper levels diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c index 6ef3342..41df478 100644 --- a/arch/arm/common/timer-sp.c +++ b/arch/arm/common/timer-sp.c @@ -18,53 +18,67 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include +#include #include #include #include #include -/* - * These timers are currently always setup to be clocked at 1MHz. - */ -#define TIMER_FREQ_KHZ (1000) -#define TIMER_RELOAD (TIMER_FREQ_KHZ * 1000 / HZ) +static long __init sp804_get_clock_rate(const char *name) +{ + struct clk *clk; + long rate; + int err; + + clk = clk_get_sys("sp804", name); + if (IS_ERR(clk)) { + pr_err("sp804: %s clock not found: %d\n", name, + (int)PTR_ERR(clk)); + return PTR_ERR(clk); + } -static void __iomem *clksrc_base; + err = clk_enable(clk); + if (err) { + pr_err("sp804: %s clock failed to enable: %d\n", name, err); + clk_put(clk); + return err; + } -static cycle_t sp804_read(struct clocksource *cs) -{ - return ~readl(clksrc_base + TIMER_VALUE); -} + rate = clk_get_rate(clk); + if (rate < 0) { + pr_err("sp804: %s clock failed to get rate: %ld\n", name, rate); + clk_disable(clk); + clk_put(clk); + } -static struct clocksource clocksource_sp804 = { - .name = "timer3", - .rating = 200, - .read = sp804_read, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; + return rate; +} -void __init sp804_clocksource_init(void __iomem *base) +void __init sp804_clocksource_init(void __iomem *base, const char *name) { - struct clocksource *cs = &clocksource_sp804; + long rate = sp804_get_clock_rate(name); - clksrc_base = base; + if (rate < 0) + return; /* setup timer 0 as free-running clocksource */ - writel(0, clksrc_base + TIMER_CTRL); - writel(0xffffffff, clksrc_base + TIMER_LOAD); - writel(0xffffffff, clksrc_base + TIMER_VALUE); + writel(0, base + TIMER_CTRL); + writel(0xffffffff, base + TIMER_LOAD); + writel(0xffffffff, base + TIMER_VALUE); writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC, - clksrc_base + TIMER_CTRL); + base + TIMER_CTRL); - clocksource_register_khz(cs, TIMER_FREQ_KHZ); + clocksource_mmio_init(base + TIMER_VALUE, name, + rate, 200, 32, clocksource_mmio_readl_down); } static void __iomem *clkevt_base; +static unsigned long clkevt_reload; /* * IRQ handler for the timer @@ -90,7 +104,7 @@ static void sp804_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_PERIODIC: - writel(TIMER_RELOAD, clkevt_base + TIMER_LOAD); + writel(clkevt_reload, clkevt_base + TIMER_LOAD); ctrl |= TIMER_CTRL_PERIODIC | TIMER_CTRL_ENABLE; break; @@ -120,7 +134,6 @@ static int sp804_set_next_event(unsigned long next, } static struct clock_event_device sp804_clockevent = { - .name = "timer0", .shift = 32, .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = sp804_set_mode, @@ -136,17 +149,24 @@ static struct irqaction sp804_timer_irq = { .dev_id = &sp804_clockevent, }; -void __init sp804_clockevents_init(void __iomem *base, unsigned int timer_irq) +void __init sp804_clockevents_init(void __iomem *base, unsigned int irq, + const char *name) { struct clock_event_device *evt = &sp804_clockevent; + long rate = sp804_get_clock_rate(name); + + if (rate < 0) + return; clkevt_base = base; + clkevt_reload = DIV_ROUND_CLOSEST(rate, HZ); - evt->irq = timer_irq; - evt->mult = div_sc(TIMER_FREQ_KHZ, NSEC_PER_MSEC, evt->shift); + evt->name = name; + evt->irq = irq; + evt->mult = div_sc(rate, NSEC_PER_SEC, evt->shift); evt->max_delta_ns = clockevent_delta2ns(0xffffffff, evt); evt->min_delta_ns = clockevent_delta2ns(0xf, evt); - setup_irq(timer_irq, &sp804_timer_irq); + setup_irq(irq, &sp804_timer_irq); clockevents_register_device(evt); } diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c index 113085a..7aa4262 100644 --- a/arch/arm/common/vic.c +++ b/arch/arm/common/vic.c @@ -22,17 +22,16 @@ #include #include #include -#include +#include #include #include #include #include -#if defined(CONFIG_PM) +#ifdef CONFIG_PM /** * struct vic_device - VIC PM device - * @sysdev: The system device which is registered. * @irq: The IRQ number for the base of the VIC. * @base: The register base for the VIC. * @resume_sources: A bitmask of interrupts for resume. @@ -43,8 +42,6 @@ * @protect: Save for VIC_PROTECT. */ struct vic_device { - struct sys_device sysdev; - void __iomem *base; int irq; u32 resume_sources; @@ -59,11 +56,6 @@ struct vic_device { static struct vic_device vic_devices[CONFIG_ARM_VIC_NR]; static int vic_id; - -static inline struct vic_device *to_vic(struct sys_device *sys) -{ - return container_of(sys, struct vic_device, sysdev); -} #endif /* CONFIG_PM */ /** @@ -85,10 +77,9 @@ static void vic_init2(void __iomem *base) writel(32, base + VIC_PL190_DEF_VECT_ADDR); } -#if defined(CONFIG_PM) -static int vic_class_resume(struct sys_device *dev) +#ifdef CONFIG_PM +static void resume_one_vic(struct vic_device *vic) { - struct vic_device *vic = to_vic(dev); void __iomem *base = vic->base; printk(KERN_DEBUG "%s: resuming vic at %p\n", __func__, base); @@ -107,13 +98,18 @@ static int vic_class_resume(struct sys_device *dev) writel(vic->soft_int, base + VIC_INT_SOFT); writel(~vic->soft_int, base + VIC_INT_SOFT_CLEAR); +} - return 0; +static void vic_resume(void) +{ + int id; + + for (id = vic_id - 1; id >= 0; id--) + resume_one_vic(vic_devices + id); } -static int vic_class_suspend(struct sys_device *dev, pm_message_t state) +static void suspend_one_vic(struct vic_device *vic) { - struct vic_device *vic = to_vic(dev); void __iomem *base = vic->base; printk(KERN_DEBUG "%s: suspending vic at %p\n", __func__, base); @@ -128,14 +124,21 @@ static int vic_class_suspend(struct sys_device *dev, pm_message_t state) writel(vic->resume_irqs, base + VIC_INT_ENABLE); writel(~vic->resume_irqs, base + VIC_INT_ENABLE_CLEAR); +} + +static int vic_suspend(void) +{ + int id; + + for (id = 0; id < vic_id; id++) + suspend_one_vic(vic_devices + id); return 0; } -struct sysdev_class vic_class = { - .name = "vic", - .suspend = vic_class_suspend, - .resume = vic_class_resume, +struct syscore_ops vic_syscore_ops = { + .suspend = vic_suspend, + .resume = vic_resume, }; /** @@ -147,30 +150,8 @@ struct sysdev_class vic_class = { */ static int __init vic_pm_init(void) { - struct vic_device *dev = vic_devices; - int err; - int id; - - if (vic_id == 0) - return 0; - - err = sysdev_class_register(&vic_class); - if (err) { - printk(KERN_ERR "%s: cannot register class\n", __func__); - return err; - } - - for (id = 0; id < vic_id; id++, dev++) { - dev->sysdev.id = id; - dev->sysdev.cls = &vic_class; - - err = sysdev_register(&dev->sysdev); - if (err) { - printk(KERN_ERR "%s: failed to register device\n", - __func__); - return err; - } - } + if (vic_id > 0) + register_syscore_ops(&vic_syscore_ops); return 0; } diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig index 54bf5ee..40db34c 100644 --- a/arch/arm/configs/dove_defconfig +++ b/arch/arm/configs/dove_defconfig @@ -8,8 +8,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_ARCH_DOVE=y CONFIG_MACH_DOVE_DB=y -CONFIG_CPU_V6=y -CONFIG_CPU_32v6K=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_AEABI=y @@ -44,7 +42,6 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=1 -# CONFIG_MISC_DEVICES is not set # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOWLEVEL is not set @@ -59,12 +56,12 @@ CONFIG_INPUT_EVDEV=y # CONFIG_KEYBOARD_ATKBD is not set # CONFIG_MOUSE_PS2 is not set # CONFIG_SERIO is not set +CONFIG_LEGACY_PTY_COUNT=16 # CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_SERIAL_8250_PCI is not set CONFIG_SERIAL_8250_RUNTIME_UARTS=2 -CONFIG_LEGACY_PTY_COUNT=16 # CONFIG_HW_RANDOM is not set CONFIG_I2C=y CONFIG_I2C_CHARDEV=y @@ -72,12 +69,10 @@ CONFIG_I2C_MV64XXX=y CONFIG_SPI=y CONFIG_SPI_ORION=y # CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set CONFIG_USB=y CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_STORAGE=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_MV=y @@ -86,7 +81,6 @@ CONFIG_MV_XOR=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_FS_XATTR is not set -CONFIG_INOTIFY=y CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_UDF_FS=m @@ -110,23 +104,19 @@ CONFIG_DEBUG_KERNEL=y CONFIG_TIMER_STATS=y # CONFIG_DEBUG_BUGVERBOSE is not set CONFIG_DEBUG_INFO=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_DEBUG_USER=y CONFIG_DEBUG_ERRORS=y CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_MD4=y -CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_BLOWFISH=y -CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_TEA=y CONFIG_CRYPTO_TWOFISH=y CONFIG_CRYPTO_DEFLATE=y diff --git a/arch/arm/configs/mx1_defconfig b/arch/arm/configs/mx1_defconfig index b39b5ce..c9436d0 100644 --- a/arch/arm/configs/mx1_defconfig +++ b/arch/arm/configs/mx1_defconfig @@ -15,6 +15,7 @@ CONFIG_ARCH_MXC=y CONFIG_ARCH_MX1=y CONFIG_ARCH_MX1ADS=y CONFIG_MACH_SCB9328=y +CONFIG_MACH_APF9328=y CONFIG_MXC_IRQ_PRIOR=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/arm/configs/mx51_defconfig b/arch/arm/configs/mx51_defconfig index e3c9032..0ace16c 100644 --- a/arch/arm/configs/mx51_defconfig +++ b/arch/arm/configs/mx51_defconfig @@ -13,7 +13,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y # CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_ARCH_MXC=y -CONFIG_ARCH_MX5=y +CONFIG_ARCH_MX51=y CONFIG_MACH_MX51_BABBAGE=y CONFIG_MACH_MX51_3DS=y CONFIG_MACH_EUKREA_CPUIMX51=y diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig new file mode 100644 index 0000000..2bf2243 --- /dev/null +++ b/arch/arm/configs/mxs_defconfig @@ -0,0 +1,129 @@ +CONFIG_EXPERIMENTAL=y +CONFIG_SYSVIPC=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_UTS_NS is not set +# CONFIG_IPC_NS is not set +# CONFIG_USER_NS is not set +# CONFIG_PID_NS is not set +# CONFIG_NET_NS is not set +CONFIG_PERF_EVENTS=y +# CONFIG_COMPAT_BRK is not set +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_BLK_DEV_INTEGRITY=y +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +CONFIG_ARCH_MXS=y +CONFIG_MACH_STMP378X_DEVB=y +CONFIG_MACH_TX28=y +# CONFIG_ARM_THUMB is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_PREEMPT_VOLUNTARY=y +CONFIG_AEABI=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_AUTO_ZRELADDR=y +CONFIG_FPE_NWFPE=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_SYN_COOKIES=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +# CONFIG_IPV6 is not set +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_DEV=m +CONFIG_CAN_FLEXCAN=m +# CONFIG_WIRELESS is not set +CONFIG_DEVTMPFS=y +# CONFIG_FIRMWARE_IN_KERNEL is not set +# CONFIG_BLK_DEV is not set +CONFIG_NETDEVICES=y +CONFIG_NET_ETHERNET=y +CONFIG_ENC28J60=y +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set +# CONFIG_WLAN is not set +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_TSC2007=m +# CONFIG_SERIO is not set +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_DEVPTS_MULTIPLE_INSTANCES=y +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_I2C=m +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MXS=m +CONFIG_SPI=y +CONFIG_SPI_GPIO=m +CONFIG_DEBUG_GPIO=y +CONFIG_GPIO_SYSFS=y +# CONFIG_HWMON is not set +# CONFIG_MFD_SUPPORT is not set +CONFIG_DISPLAY_SUPPORT=m +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_MMC=y +CONFIG_MMC_MXS=y +CONFIG_RTC_CLASS=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_DMADEVICES=y +CONFIG_MXS_DMA=y +CONFIG_EXT3_FS=y +# CONFIG_DNOTIFY is not set +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_CACHEFILES=m +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y +CONFIG_PRINTK_TIME=y +CONFIG_FRAME_WARN=2048 +CONFIG_MAGIC_SYSRQ=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_LOCKUP_DETECTOR=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_TIMER_STATS=y +CONFIG_PROVE_LOCKING=y +CONFIG_DEBUG_SPINLOCK_SLEEP=y +CONFIG_DEBUG_INFO=y +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_STRICT_DEVMEM=y +CONFIG_DEBUG_USER=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_CRC32C=m +# CONFIG_CRYPTO_ANSI_CPRNG is not set +# CONFIG_CRYPTO_HW is not set +CONFIG_CRC_ITU_T=m +CONFIG_CRC7=m diff --git a/arch/arm/configs/ns9xxx_defconfig b/arch/arm/configs/ns9xxx_defconfig deleted file mode 100644 index 1f528a0..0000000 --- a/arch/arm/configs/ns9xxx_defconfig +++ /dev/null @@ -1,56 +0,0 @@ -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set -CONFIG_ARCH_NS9XXX=y -CONFIG_MACH_CC9P9360DEV=y -CONFIG_MACH_CC9P9360JS=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_FPE_NWFPE=y -CONFIG_NET=y -CONFIG_PACKET=m -CONFIG_INET=y -CONFIG_IP_PNP=y -CONFIG_SYN_COOKIES=y -CONFIG_MTD=m -CONFIG_MTD_CONCAT=m -CONFIG_MTD_CHAR=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_PHYSMAP=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_NETDEVICES=y -CONFIG_NET_ETHERNET=y -# CONFIG_SERIO_SERPORT is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_LEGACY_PTYS is not set -# CONFIG_HW_RANDOM is not set -CONFIG_I2C=m -CONFIG_I2C_GPIO=m -# CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_RTC_CLASS=m -CONFIG_EXT2_FS=m -CONFIG_TMPFS=y -CONFIG_JFFS2_FS=m -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_USER=y -CONFIG_DEBUG_ERRORS=y diff --git a/arch/arm/configs/realview-smp_defconfig b/arch/arm/configs/realview-smp_defconfig index 5ca7a61..abe61bf 100644 --- a/arch/arm/configs/realview-smp_defconfig +++ b/arch/arm/configs/realview-smp_defconfig @@ -38,7 +38,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y -CONFIG_MTD_ARM_INTEGRATOR=y +CONFIG_MTD_PHYSMAP=y CONFIG_ARM_CHARLCD=y CONFIG_NETDEVICES=y CONFIG_SMSC_PHY=y diff --git a/arch/arm/configs/realview_defconfig b/arch/arm/configs/realview_defconfig index fcaa603..7079cbe 100644 --- a/arch/arm/configs/realview_defconfig +++ b/arch/arm/configs/realview_defconfig @@ -37,7 +37,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y -CONFIG_MTD_ARM_INTEGRATOR=y +CONFIG_MTD_PHYSMAP=y CONFIG_ARM_CHARLCD=y CONFIG_NETDEVICES=y CONFIG_SMSC_PHY=y diff --git a/arch/arm/configs/spear300_defconfig b/arch/arm/configs/spear300_defconfig deleted file mode 100644 index cf29f3e..0000000 --- a/arch/arm/configs/spear300_defconfig +++ /dev/null @@ -1,51 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_SYSVIPC=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_PLAT_SPEAR=y -CONFIG_BINFMT_MISC=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_INPUT_FF_MEMLESS=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -CONFIG_SERIAL_AMBA_PL011=y -CONFIG_SERIAL_AMBA_PL011_CONSOLE=y -# CONFIG_LEGACY_PTYS is not set -# CONFIG_HW_RANDOM is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=8192 -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_PL061=y -# CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_AUTOFS4_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -CONFIG_TMPFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=m -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y -# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/spear310_defconfig b/arch/arm/configs/spear310_defconfig deleted file mode 100644 index 824e444..0000000 --- a/arch/arm/configs/spear310_defconfig +++ /dev/null @@ -1,52 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_SYSVIPC=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_PLAT_SPEAR=y -CONFIG_MACH_SPEAR310=y -CONFIG_BINFMT_MISC=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_INPUT_FF_MEMLESS=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -CONFIG_SERIAL_AMBA_PL011=y -CONFIG_SERIAL_AMBA_PL011_CONSOLE=y -# CONFIG_LEGACY_PTYS is not set -# CONFIG_HW_RANDOM is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=8192 -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_PL061=y -# CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_AUTOFS4_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -CONFIG_TMPFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=m -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y -# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/spear320_defconfig b/arch/arm/configs/spear320_defconfig deleted file mode 100644 index 842f7f3..0000000 --- a/arch/arm/configs/spear320_defconfig +++ /dev/null @@ -1,52 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_SYSVIPC=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_PLAT_SPEAR=y -CONFIG_MACH_SPEAR320=y -CONFIG_BINFMT_MISC=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_INPUT_FF_MEMLESS=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -CONFIG_SERIAL_AMBA_PL011=y -CONFIG_SERIAL_AMBA_PL011_CONSOLE=y -# CONFIG_LEGACY_PTYS is not set -# CONFIG_HW_RANDOM is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=8192 -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_PL061=y -# CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_AUTOFS4_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -CONFIG_TMPFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=m -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y -# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/spear3xx_defconfig b/arch/arm/configs/spear3xx_defconfig new file mode 100644 index 0000000..fea7e1f --- /dev/null +++ b/arch/arm/configs/spear3xx_defconfig @@ -0,0 +1,53 @@ +CONFIG_EXPERIMENTAL=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_EXTRA_PASS=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_PLAT_SPEAR=y +CONFIG_BOARD_SPEAR300_EVB=y +CONFIG_BOARD_SPEAR310_EVB=y +CONFIG_BOARD_SPEAR320_EVB=y +CONFIG_BINFMT_MISC=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_INPUT_FF_MEMLESS=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +# CONFIG_LEGACY_PTYS is not set +# CONFIG_HW_RANDOM is not set +CONFIG_RAW_DRIVER=y +CONFIG_MAX_RAW_DEVS=8192 +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_PL061=y +# CONFIG_HWMON is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_AUTOFS4_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_IOCHARSET="ascii" +CONFIG_TMPFS=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=m +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUG_SPINLOCK_SLEEP=y +CONFIG_DEBUG_INFO=y +# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/spear600_defconfig b/arch/arm/configs/spear600_defconfig deleted file mode 100644 index 6777c11..0000000 --- a/arch/arm/configs/spear600_defconfig +++ /dev/null @@ -1,49 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_SYSVIPC=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_PLAT_SPEAR=y -CONFIG_ARCH_SPEAR6XX=y -CONFIG_BINFMT_MISC=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_INPUT_FF_MEMLESS=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_SERIAL_AMBA_PL011=y -CONFIG_SERIAL_AMBA_PL011_CONSOLE=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=8192 -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_PL061=y -# CONFIG_HWMON is not set -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_AUTOFS4_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -CONFIG_TMPFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=m -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_INFO=y -# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/spear6xx_defconfig b/arch/arm/configs/spear6xx_defconfig new file mode 100644 index 0000000..cef2e83 --- /dev/null +++ b/arch/arm/configs/spear6xx_defconfig @@ -0,0 +1,49 @@ +CONFIG_EXPERIMENTAL=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_EXTRA_PASS=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_PLAT_SPEAR=y +CONFIG_ARCH_SPEAR6XX=y +CONFIG_BOARD_SPEAR600_EVB=y +CONFIG_BINFMT_MISC=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_INPUT_FF_MEMLESS=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_RAW_DRIVER=y +CONFIG_MAX_RAW_DEVS=8192 +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_PL061=y +# CONFIG_HWMON is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_AUTOFS4_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_IOCHARSET="ascii" +CONFIG_TMPFS=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=m +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUG_SPINLOCK_SLEEP=y +CONFIG_DEBUG_INFO=y +# CONFIG_CRC32 is not set diff --git a/arch/arm/configs/stmp378x_defconfig b/arch/arm/configs/stmp378x_defconfig deleted file mode 100644 index 1079c2b..0000000 --- a/arch/arm/configs/stmp378x_defconfig +++ /dev/null @@ -1,128 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_LOCALVERSION="-default" -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_ARCH_STMP3XXX=y -CONFIG_ARCH_STMP378X=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_PREEMPT=y -CONFIG_AEABI=y -CONFIG_HIGHMEM=y -CONFIG_ZBOOT_ROM_TEXT=0x0 -CONFIG_ZBOOT_ROM_BSS=0x0 -CONFIG_CMDLINE="console=ttySDBG0,115200 mem=32M" -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_IP_MROUTE=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -# CONFIG_INET_LRO is not set -# CONFIG_IPV6 is not set -CONFIG_NET_SCHED=y -# CONFIG_WIRELESS is not set -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -# CONFIG_STANDALONE is not set -CONFIG_MTD=y -CONFIG_MTD_CHAR=y -CONFIG_MTD_NAND=y -CONFIG_MTD_UBI=y -CONFIG_MTD_UBI_GLUEBI=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=4 -CONFIG_BLK_DEV_RAM_SIZE=6144 -# CONFIG_MISC_DEVICES is not set -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_SG=y -# CONFIG_SCSI_LOWLEVEL is not set -CONFIG_INPUT_POLLDEV=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=320 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=240 -CONFIG_INPUT_EVDEV=y -# CONFIG_KEYBOARD_ATKBD is not set -# CONFIG_INPUT_MOUSE is not set -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_INPUT_MISC=y -# CONFIG_SERIO_SERPORT is not set -CONFIG_VT_HW_CONSOLE_BINDING=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_HW_RANDOM=y -CONFIG_DEBUG_GPIO=y -CONFIG_GPIO_SYSFS=y -# CONFIG_HWMON is not set -CONFIG_FB=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_LCD_CLASS_DEVICE=y -CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_LOGO=y -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -# CONFIG_DNOTIFY is not set -CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m -# CONFIG_MISC_FILESYSTEMS is not set -# CONFIG_NETWORK_FILESYSTEMS is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_STRIP_ASM_SYMS=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_SHIRQ=y -# CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_OBJECTS=y -CONFIG_DEBUG_OBJECTS_SELFTEST=y -CONFIG_DEBUG_OBJECTS_FREE=y -CONFIG_DEBUG_OBJECTS_TIMERS=y -CONFIG_DEBUG_SLAB=y -CONFIG_DEBUG_SLAB_LEAK=y -CONFIG_DEBUG_RT_MUTEXES=y -CONFIG_PROVE_LOCKING=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -CONFIG_DEBUG_KOBJECT=y -# CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_DEBUG_INFO=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_SYSCTL_SYSCALL_CHECK=y -CONFIG_BOOT_TRACER=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KEYS=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y -CONFIG_SECURITY=y -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_SHA1=m -CONFIG_CRYPTO_AES=m -CONFIG_CRYPTO_DES=y -CONFIG_CRYPTO_DEFLATE=y -CONFIG_CRYPTO_LZO=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_CCITT=m -CONFIG_CRC16=y diff --git a/arch/arm/configs/stmp37xx_defconfig b/arch/arm/configs/stmp37xx_defconfig deleted file mode 100644 index 564a5cc..0000000 --- a/arch/arm/configs/stmp37xx_defconfig +++ /dev/null @@ -1,108 +0,0 @@ -CONFIG_EXPERIMENTAL=y -CONFIG_LOCALVERSION="-default" -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_ARCH_STMP3XXX=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_PREEMPT=y -CONFIG_AEABI=y -CONFIG_ZBOOT_ROM_TEXT=0x0 -CONFIG_ZBOOT_ROM_BSS=0x0 -CONFIG_CMDLINE="console=ttySDBG0,115200 mem=32M lcd_panel=lms350 rdinit=/bin/sh ignore_loglevel" -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_IP_MROUTE=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -# CONFIG_INET_LRO is not set -# CONFIG_IPV6 is not set -CONFIG_NET_SCHED=y -# CONFIG_WIRELESS is not set -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -# CONFIG_STANDALONE is not set -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=4 -CONFIG_BLK_DEV_RAM_SIZE=6144 -# CONFIG_MISC_DEVICES is not set -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_SG=y -# CONFIG_SCSI_LOWLEVEL is not set -CONFIG_INPUT_POLLDEV=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=320 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=240 -CONFIG_INPUT_EVDEV=y -# CONFIG_KEYBOARD_ATKBD is not set -# CONFIG_INPUT_MOUSE is not set -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_INPUT_MISC=y -# CONFIG_SERIO_SERPORT is not set -CONFIG_VT_HW_CONSOLE_BINDING=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_HW_RANDOM=y -CONFIG_DEBUG_GPIO=y -CONFIG_GPIO_SYSFS=y -# CONFIG_HWMON is not set -CONFIG_FB=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_LCD_CLASS_DEVICE=y -CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_LOGO=y -# CONFIG_HID_SUPPORT is not set -# CONFIG_USB_SUPPORT is not set -# CONFIG_DNOTIFY is not set -CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m -# CONFIG_MISC_FILESYSTEMS is not set -# CONFIG_NETWORK_FILESYSTEMS is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_DEBUG_KERNEL=y -# CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_SYSCTL_SYSCALL_CHECK=y -CONFIG_BOOT_TRACER=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_DEBUG_LL=y -CONFIG_KEYS=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y -CONFIG_SECURITY=y -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_SHA1=m -CONFIG_CRYPTO_AES=m -CONFIG_CRYPTO_DES=y -CONFIG_CRYPTO_DEFLATE=y -CONFIG_CRYPTO_LZO=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_CCITT=m -CONFIG_CRC16=y diff --git a/arch/arm/configs/versatile_defconfig b/arch/arm/configs/versatile_defconfig index 0ce710f..cdd4d2b 100644 --- a/arch/arm/configs/versatile_defconfig +++ b/arch/arm/configs/versatile_defconfig @@ -32,7 +32,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_ADV_OPTIONS=y CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_ARM_INTEGRATOR=y +CONFIG_MTD_PHYSMAP=y CONFIG_BLK_DEV_RAM=y CONFIG_EEPROM_LEGACY=m CONFIG_NETDEVICES=y diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h index ca51143..4200554 100644 --- a/arch/arm/include/asm/dma.h +++ b/arch/arm/include/asm/dma.h @@ -6,8 +6,10 @@ /* * This is the maximum virtual address which can be DMA'd from. */ -#ifndef MAX_DMA_ADDRESS +#ifndef ARM_DMA_ZONE_SIZE #define MAX_DMA_ADDRESS 0xffffffff +#else +#define MAX_DMA_ADDRESS (PAGE_OFFSET + ARM_DMA_ZONE_SIZE) #endif #ifdef CONFIG_ISA_DMA_API diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index c3cd875..0e9ce8d 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -108,6 +108,7 @@ struct task_struct; int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); #define ELF_CORE_COPY_TASK_REGS dump_task_regs +#define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 199a6b6..8c73900 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -3,16 +3,74 @@ #ifdef __KERNEL__ +#if defined(CONFIG_CPU_USE_DOMAINS) && defined(CONFIG_SMP) +/* ARM doesn't provide unprivileged exclusive memory accessors */ +#include +#else + +#include +#include +#include + +#define __futex_atomic_ex_table(err_reg) \ + "3:\n" \ + " .pushsection __ex_table,\"a\"\n" \ + " .align 3\n" \ + " .long 1b, 4f, 2b, 4f\n" \ + " .popsection\n" \ + " .pushsection .fixup,\"ax\"\n" \ + "4: mov %0, " err_reg "\n" \ + " b 3b\n" \ + " .popsection" + #ifdef CONFIG_SMP -#include +#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ + smp_mb(); \ + __asm__ __volatile__( \ + "1: ldrex %1, [%2]\n" \ + " " insn "\n" \ + "2: strex %1, %0, [%2]\n" \ + " teq %1, #0\n" \ + " bne 1b\n" \ + " mov %0, #0\n" \ + __futex_atomic_ex_table("%4") \ + : "=&r" (ret), "=&r" (oldval) \ + : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ + : "cc", "memory") + +static inline int +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + int ret; + u32 val; + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + smp_mb(); + __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" + "1: ldrex %1, [%4]\n" + " teq %1, %2\n" + " ite eq @ explicit IT needed for the 2b label\n" + "2: strexeq %0, %3, [%4]\n" + " movne %0, #0\n" + " teq %0, #0\n" + " bne 1b\n" + __futex_atomic_ex_table("%5") + : "=&r" (ret), "=&r" (val) + : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) + : "cc", "memory"); + smp_mb(); + + *uval = val; + return ret; +} #else /* !SMP, we can work around lack of atomic ops by disabling preemption */ -#include #include -#include -#include #include #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ @@ -21,20 +79,38 @@ " " insn "\n" \ "2: " T(str) " %0, [%2]\n" \ " mov %0, #0\n" \ - "3:\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 4f, 2b, 4f\n" \ - " .popsection\n" \ - " .pushsection .fixup,\"ax\"\n" \ - "4: mov %0, %4\n" \ - " b 3b\n" \ - " .popsection" \ + __futex_atomic_ex_table("%4") \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ : "cc", "memory") static inline int +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + int ret = 0; + u32 val; + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" + "1: " T(ldr) " %1, [%4]\n" + " teq %1, %2\n" + " it eq @ explicit IT needed for the 2b label\n" + "2: " T(streq) " %3, [%4]\n" + __futex_atomic_ex_table("%5") + : "+r" (ret), "=&r" (val) + : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) + : "cc", "memory"); + + *uval = val; + return ret; +} + +#endif /* !SMP */ + +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; @@ -87,39 +163,6 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) return ret; } -static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, - u32 oldval, u32 newval) -{ - int ret = 0; - u32 val; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - - __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" - "1: " T(ldr) " %1, [%4]\n" - " teq %1, %2\n" - " it eq @ explicit IT needed for the 2b label\n" - "2: " T(streq) " %3, [%4]\n" - "3:\n" - " .pushsection __ex_table,\"a\"\n" - " .align 3\n" - " .long 1b, 4f, 2b, 4f\n" - " .popsection\n" - " .pushsection .fixup,\"ax\"\n" - "4: mov %0, %5\n" - " b 3b\n" - " .popsection" - : "+r" (ret), "=&r" (val) - : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) - : "cc", "memory"); - - *uval = val; - return ret; -} - -#endif /* !SMP */ - +#endif /* !(CPU_USE_DOMAINS && SMP) */ #endif /* __KERNEL__ */ #endif /* _ASM_ARM_FUTEX_H */ diff --git a/arch/arm/include/asm/hardware/timer-sp.h b/arch/arm/include/asm/hardware/timer-sp.h index 21e75e3..4384d81 100644 --- a/arch/arm/include/asm/hardware/timer-sp.h +++ b/arch/arm/include/asm/hardware/timer-sp.h @@ -1,2 +1,2 @@ -void sp804_clocksource_init(void __iomem *); -void sp804_clockevents_init(void __iomem *, unsigned int); +void sp804_clocksource_init(void __iomem *, const char *); +void sp804_clockevents_init(void __iomem *, unsigned int, const char *); diff --git a/arch/arm/include/asm/i8253.h b/arch/arm/include/asm/i8253.h new file mode 100644 index 0000000..70656b6 --- /dev/null +++ b/arch/arm/include/asm/i8253.h @@ -0,0 +1,15 @@ +#ifndef __ASMARM_I8253_H +#define __ASMARM_I8253_H + +/* i8253A PIT registers */ +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +#define PIT_LATCH ((PIT_TICK_RATE + HZ / 2) / HZ) + +extern raw_spinlock_t i8253_lock; + +#define outb_pit outb_p +#define inb_pit inb_p + +#endif diff --git a/arch/arm/include/asm/mach/time.h b/arch/arm/include/asm/mach/time.h index 883f6be..d5adaae 100644 --- a/arch/arm/include/asm/mach/time.h +++ b/arch/arm/include/asm/mach/time.h @@ -34,7 +34,6 @@ * timer interrupt which may be pending. */ struct sys_timer { - struct sys_device dev; void (*init)(void); void (*suspend)(void); void (*resume)(void); diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 431077c..af44a8f 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -209,14 +209,10 @@ static inline unsigned long __phys_to_virt(unsigned long x) * allocations. This must be the smallest DMA mask in the system, * so a successful GFP_DMA allocation will always satisfy this. */ -#ifndef ISA_DMA_THRESHOLD +#ifndef ARM_DMA_ZONE_SIZE #define ISA_DMA_THRESHOLD (0xffffffffULL) -#endif - -#ifndef arch_adjust_zones -#define arch_adjust_zones(size,holes) do { } while (0) -#elif !defined(CONFIG_ZONE_DMA) -#error "custom arch_adjust_zones() requires CONFIG_ZONE_DMA" +#else +#define ISA_DMA_THRESHOLD (PHYS_OFFSET + ARM_DMA_ZONE_SIZE - 1) #endif /* diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index a8ff22b..312d108 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -128,6 +128,12 @@ struct pt_regs { #define ARM_r0 uregs[0] #define ARM_ORIG_r0 uregs[17] +/* + * The size of the user-visible VFP state as seen by PTRACE_GET/SETVFPREGS + * and core dumps. + */ +#define ARM_VFPREGS_SIZE ( 32 * 8 /*fpregs*/ + 4 /*fpscr*/ ) + #ifdef __KERNEL__ #define user_mode(regs) \ diff --git a/arch/arm/include/asm/sizes.h b/arch/arm/include/asm/sizes.h index 316bb2b..154b89b 100644 --- a/arch/arm/include/asm/sizes.h +++ b/arch/arm/include/asm/sizes.h @@ -16,44 +16,6 @@ /* Size definitions * Copyright (C) ARM Limited 1998. All rights reserved. */ +#include -#ifndef __sizes_h -#define __sizes_h 1 - -/* handy sizes */ -#define SZ_16 0x00000010 -#define SZ_32 0x00000020 -#define SZ_64 0x00000040 -#define SZ_128 0x00000080 -#define SZ_256 0x00000100 -#define SZ_512 0x00000200 - -#define SZ_1K 0x00000400 -#define SZ_2K 0x00000800 -#define SZ_4K 0x00001000 -#define SZ_8K 0x00002000 -#define SZ_16K 0x00004000 -#define SZ_32K 0x00008000 -#define SZ_64K 0x00010000 -#define SZ_128K 0x00020000 -#define SZ_256K 0x00040000 -#define SZ_512K 0x00080000 - -#define SZ_1M 0x00100000 -#define SZ_2M 0x00200000 -#define SZ_4M 0x00400000 -#define SZ_8M 0x00800000 -#define SZ_16M 0x01000000 -#define SZ_32M 0x02000000 -#define SZ_48M 0x03000000 -#define SZ_64M 0x04000000 -#define SZ_128M 0x08000000 -#define SZ_256M 0x10000000 -#define SZ_512M 0x20000000 - -#define SZ_1G 0x40000000 -#define SZ_2G 0x80000000 - -#endif - -/* END */ +#define SZ_48M (SZ_32M + SZ_16M) diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h index 96ed521..d2b514f 100644 --- a/arch/arm/include/asm/smp.h +++ b/arch/arm/include/asm/smp.h @@ -14,20 +14,12 @@ #include #include -#include - #ifndef CONFIG_SMP # error " included in non-SMP build" #endif #define raw_smp_processor_id() (current_thread_info()->cpu) -/* - * at the moment, there's not a big penalty for changing CPUs - * (the >big< penalty is running SMP in the first place) - */ -#define PROC_CHANGE_PENALTY 15 - struct seq_file; /* @@ -47,9 +39,9 @@ extern void smp_init_cpus(void); /* - * Raise an IPI cross call on CPUs in callmap. + * Provide a function to raise an IPI cross call on CPUs in callmap. */ -extern void smp_cross_call(const struct cpumask *mask, int ipi); +extern void set_smp_cross_call(void (*)(const struct cpumask *, unsigned int)); /* * Boot a secondary CPU, and assign it the specified idle task. diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index fdd3820..65fa3c8 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -5,6 +5,8 @@ #error SMP not supported on pre-ARMv6 CPUs #endif +#include + /* * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K * extensions, so when running on UP, we have to patch these instructions away. diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 82dfe5d..265f908 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -41,12 +41,12 @@ */ #if defined(CONFIG_SMP) || defined(CONFIG_CPU_32v7) #define tlb_fast_mode(tlb) 0 -#define FREE_PTE_NR 500 #else #define tlb_fast_mode(tlb) 1 -#define FREE_PTE_NR 0 #endif +#define MMU_GATHER_BUNDLE 8 + /* * TLB handling. This allows us to remove pages from the page * tables, and efficiently handle the TLB issues. @@ -58,7 +58,9 @@ struct mmu_gather { unsigned long range_start; unsigned long range_end; unsigned int nr; - struct page *pages[FREE_PTE_NR]; + unsigned int max; + struct page **pages; + struct page *local[MMU_GATHER_BUNDLE]; }; DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -97,26 +99,37 @@ static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr) } } +static inline void __tlb_alloc_page(struct mmu_gather *tlb) +{ + unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); + + if (addr) { + tlb->pages = (void *)addr; + tlb->max = PAGE_SIZE / sizeof(struct page *); + } +} + static inline void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush(tlb); if (!tlb_fast_mode(tlb)) { free_pages_and_swap_cache(tlb->pages, tlb->nr); tlb->nr = 0; + if (tlb->pages == tlb->local) + __tlb_alloc_page(tlb); } } -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->fullmm = fullmm; tlb->vma = NULL; + tlb->max = ARRAY_SIZE(tlb->local); + tlb->pages = tlb->local; tlb->nr = 0; - - return tlb; + __tlb_alloc_page(tlb); } static inline void @@ -127,7 +140,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + if (tlb->pages != tlb->local) + free_pages((unsigned long)tlb->pages, 0); } /* @@ -162,15 +176,22 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) tlb_flush(tlb); } -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); - } else { - tlb->pages[tlb->nr++] = page; - if (tlb->nr >= FREE_PTE_NR) - tlb_flush_mmu(tlb); + return 1; /* avoid calling tlb_flush_mmu */ } + + tlb->pages[tlb->nr++] = page; + VM_BUG_ON(tlb->nr > tlb->max); + return tlb->max - tlb->nr; +} + +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + if (!__tlb_remove_page(tlb, page)) + tlb_flush_mmu(tlb); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm/kernel/leds.c b/arch/arm/kernel/leds.c index 31a316c..0f107dc 100644 --- a/arch/arm/kernel/leds.c +++ b/arch/arm/kernel/leds.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -69,36 +70,37 @@ static ssize_t leds_store(struct sys_device *dev, static SYSDEV_ATTR(event, 0200, NULL, leds_store); -static int leds_suspend(struct sys_device *dev, pm_message_t state) +static struct sysdev_class leds_sysclass = { + .name = "leds", +}; + +static struct sys_device leds_device = { + .id = 0, + .cls = &leds_sysclass, +}; + +static int leds_suspend(void) { leds_event(led_stop); return 0; } -static int leds_resume(struct sys_device *dev) +static void leds_resume(void) { leds_event(led_start); - return 0; } -static int leds_shutdown(struct sys_device *dev) +static void leds_shutdown(void) { leds_event(led_halted); - return 0; } -static struct sysdev_class leds_sysclass = { - .name = "leds", +static struct syscore_ops leds_syscore_ops = { .shutdown = leds_shutdown, .suspend = leds_suspend, .resume = leds_resume, }; -static struct sys_device leds_device = { - .id = 0, - .cls = &leds_sysclass, -}; - static int __init leds_init(void) { int ret; @@ -107,6 +109,8 @@ static int __init leds_init(void) ret = sysdev_register(&leds_device); if (ret == 0) ret = sysdev_create_file(&leds_device, &attr_event); + if (ret == 0) + register_syscore_ops(&leds_syscore_ops); return ret; } diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 139e3c8..d53c0ab 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -560,11 +560,6 @@ static int armpmu_event_init(struct perf_event *event) event->destroy = hw_perf_event_destroy; if (!atomic_inc_not_zero(&active_events)) { - if (atomic_read(&active_events) > armpmu->num_events) { - atomic_dec(&active_events); - return -ENOSPC; - } - mutex_lock(&pmu_reserve_mutex); if (atomic_read(&active_events) == 0) { err = armpmu_reserve_hardware(); diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 8182f45..9726006 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -308,58 +309,6 @@ static int ptrace_write_user(struct task_struct *tsk, unsigned long off, return put_user_reg(tsk, off >> 2, val); } -/* - * Get all user integer registers. - */ -static int ptrace_getregs(struct task_struct *tsk, void __user *uregs) -{ - struct pt_regs *regs = task_pt_regs(tsk); - - return copy_to_user(uregs, regs, sizeof(struct pt_regs)) ? -EFAULT : 0; -} - -/* - * Set all user integer registers. - */ -static int ptrace_setregs(struct task_struct *tsk, void __user *uregs) -{ - struct pt_regs newregs; - int ret; - - ret = -EFAULT; - if (copy_from_user(&newregs, uregs, sizeof(struct pt_regs)) == 0) { - struct pt_regs *regs = task_pt_regs(tsk); - - ret = -EINVAL; - if (valid_user_regs(&newregs)) { - *regs = newregs; - ret = 0; - } - } - - return ret; -} - -/* - * Get the child FPU state. - */ -static int ptrace_getfpregs(struct task_struct *tsk, void __user *ufp) -{ - return copy_to_user(ufp, &task_thread_info(tsk)->fpstate, - sizeof(struct user_fp)) ? -EFAULT : 0; -} - -/* - * Set the child FPU state. - */ -static int ptrace_setfpregs(struct task_struct *tsk, void __user *ufp) -{ - struct thread_info *thread = task_thread_info(tsk); - thread->used_cp[1] = thread->used_cp[2] = 1; - return copy_from_user(&thread->fpstate, ufp, - sizeof(struct user_fp)) ? -EFAULT : 0; -} - #ifdef CONFIG_IWMMXT /* @@ -418,56 +367,6 @@ static int ptrace_setcrunchregs(struct task_struct *tsk, void __user *ufp) } #endif -#ifdef CONFIG_VFP -/* - * Get the child VFP state. - */ -static int ptrace_getvfpregs(struct task_struct *tsk, void __user *data) -{ - struct thread_info *thread = task_thread_info(tsk); - union vfp_state *vfp = &thread->vfpstate; - struct user_vfp __user *ufp = data; - - vfp_sync_hwstate(thread); - - /* copy the floating point registers */ - if (copy_to_user(&ufp->fpregs, &vfp->hard.fpregs, - sizeof(vfp->hard.fpregs))) - return -EFAULT; - - /* copy the status and control register */ - if (put_user(vfp->hard.fpscr, &ufp->fpscr)) - return -EFAULT; - - return 0; -} - -/* - * Set the child VFP state. - */ -static int ptrace_setvfpregs(struct task_struct *tsk, void __user *data) -{ - struct thread_info *thread = task_thread_info(tsk); - union vfp_state *vfp = &thread->vfpstate; - struct user_vfp __user *ufp = data; - - vfp_sync_hwstate(thread); - - /* copy the floating point registers */ - if (copy_from_user(&vfp->hard.fpregs, &ufp->fpregs, - sizeof(vfp->hard.fpregs))) - return -EFAULT; - - /* copy the status and control register */ - if (get_user(vfp->hard.fpscr, &ufp->fpscr)) - return -EFAULT; - - vfp_flush_hwstate(thread); - - return 0; -} -#endif - #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * Convert a virtual register number into an index for a thread_info @@ -694,6 +593,219 @@ out: } #endif +/* regset get/set implementations */ + +static int gpr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct pt_regs *regs = task_pt_regs(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + regs, + 0, sizeof(*regs)); +} + +static int gpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct pt_regs newregs; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &newregs, + 0, sizeof(newregs)); + if (ret) + return ret; + + if (!valid_user_regs(&newregs)) + return -EINVAL; + + *task_pt_regs(target) = newregs; + return 0; +} + +static int fpa_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &task_thread_info(target)->fpstate, + 0, sizeof(struct user_fp)); +} + +static int fpa_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct thread_info *thread = task_thread_info(target); + + thread->used_cp[1] = thread->used_cp[2] = 1; + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &thread->fpstate, + 0, sizeof(struct user_fp)); +} + +#ifdef CONFIG_VFP +/* + * VFP register get/set implementations. + * + * With respect to the kernel, struct user_fp is divided into three chunks: + * 16 or 32 real VFP registers (d0-d15 or d0-31) + * These are transferred to/from the real registers in the task's + * vfp_hard_struct. The number of registers depends on the kernel + * configuration. + * + * 16 or 0 fake VFP registers (d16-d31 or empty) + * i.e., the user_vfp structure has space for 32 registers even if + * the kernel doesn't have them all. + * + * vfp_get() reads this chunk as zero where applicable + * vfp_set() ignores this chunk + * + * 1 word for the FPSCR + * + * The bounds-checking logic built into user_regset_copyout and friends + * means that we can make a simple sequence of calls to map the relevant data + * to/from the specified slice of the user regset structure. + */ +static int vfp_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + struct thread_info *thread = task_thread_info(target); + struct vfp_hard_struct const *vfp = &thread->vfpstate.hard; + const size_t user_fpregs_offset = offsetof(struct user_vfp, fpregs); + const size_t user_fpscr_offset = offsetof(struct user_vfp, fpscr); + + vfp_sync_hwstate(thread); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &vfp->fpregs, + user_fpregs_offset, + user_fpregs_offset + sizeof(vfp->fpregs)); + if (ret) + return ret; + + ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, + user_fpregs_offset + sizeof(vfp->fpregs), + user_fpscr_offset); + if (ret) + return ret; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &vfp->fpscr, + user_fpscr_offset, + user_fpscr_offset + sizeof(vfp->fpscr)); +} + +/* + * For vfp_set() a read-modify-write is done on the VFP registers, + * in order to avoid writing back a half-modified set of registers on + * failure. + */ +static int vfp_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct thread_info *thread = task_thread_info(target); + struct vfp_hard_struct new_vfp = thread->vfpstate.hard; + const size_t user_fpregs_offset = offsetof(struct user_vfp, fpregs); + const size_t user_fpscr_offset = offsetof(struct user_vfp, fpscr); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &new_vfp.fpregs, + user_fpregs_offset, + user_fpregs_offset + sizeof(new_vfp.fpregs)); + if (ret) + return ret; + + ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + user_fpregs_offset + sizeof(new_vfp.fpregs), + user_fpscr_offset); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &new_vfp.fpscr, + user_fpscr_offset, + user_fpscr_offset + sizeof(new_vfp.fpscr)); + if (ret) + return ret; + + vfp_sync_hwstate(thread); + thread->vfpstate.hard = new_vfp; + vfp_flush_hwstate(thread); + + return 0; +} +#endif /* CONFIG_VFP */ + +enum arm_regset { + REGSET_GPR, + REGSET_FPR, +#ifdef CONFIG_VFP + REGSET_VFP, +#endif +}; + +static const struct user_regset arm_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, + .n = ELF_NGREG, + .size = sizeof(u32), + .align = sizeof(u32), + .get = gpr_get, + .set = gpr_set + }, + [REGSET_FPR] = { + /* + * For the FPA regs in fpstate, the real fields are a mixture + * of sizes, so pretend that the registers are word-sized: + */ + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_fp) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .get = fpa_get, + .set = fpa_set + }, +#ifdef CONFIG_VFP + [REGSET_VFP] = { + /* + * Pretend that the VFP regs are word-sized, since the FPSCR is + * a single word dangling at the end of struct user_vfp: + */ + .core_note_type = NT_ARM_VFP, + .n = ARM_VFPREGS_SIZE / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .get = vfp_get, + .set = vfp_set + }, +#endif /* CONFIG_VFP */ +}; + +static const struct user_regset_view user_arm_view = { + .name = "arm", .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI, + .regsets = arm_regsets, .n = ARRAY_SIZE(arm_regsets) +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ + return &user_arm_view; +} + long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -710,19 +822,31 @@ long arch_ptrace(struct task_struct *child, long request, break; case PTRACE_GETREGS: - ret = ptrace_getregs(child, datap); + ret = copy_regset_to_user(child, + &user_arm_view, REGSET_GPR, + 0, sizeof(struct pt_regs), + datap); break; case PTRACE_SETREGS: - ret = ptrace_setregs(child, datap); + ret = copy_regset_from_user(child, + &user_arm_view, REGSET_GPR, + 0, sizeof(struct pt_regs), + datap); break; case PTRACE_GETFPREGS: - ret = ptrace_getfpregs(child, datap); + ret = copy_regset_to_user(child, + &user_arm_view, REGSET_FPR, + 0, sizeof(union fp_state), + datap); break; - + case PTRACE_SETFPREGS: - ret = ptrace_setfpregs(child, datap); + ret = copy_regset_from_user(child, + &user_arm_view, REGSET_FPR, + 0, sizeof(union fp_state), + datap); break; #ifdef CONFIG_IWMMXT @@ -757,11 +881,17 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_VFP case PTRACE_GETVFPREGS: - ret = ptrace_getvfpregs(child, datap); + ret = copy_regset_to_user(child, + &user_arm_view, REGSET_VFP, + 0, ARM_VFPREGS_SIZE, + datap); break; case PTRACE_SETVFPREGS: - ret = ptrace_setvfpregs(child, datap); + ret = copy_regset_from_user(child, + &user_arm_view, REGSET_VFP, + 0, ARM_VFPREGS_SIZE, + datap); break; #endif diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 006c1e8..6dce209 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -672,11 +672,16 @@ __tagtable(ATAG_REVISION, parse_tag_revision); static int __init parse_tag_cmdline(const struct tag *tag) { -#ifndef CONFIG_CMDLINE_FORCE - strlcpy(default_command_line, tag->u.cmdline.cmdline, COMMAND_LINE_SIZE); -#else +#if defined(CONFIG_CMDLINE_EXTEND) + strlcat(default_command_line, " ", COMMAND_LINE_SIZE); + strlcat(default_command_line, tag->u.cmdline.cmdline, + COMMAND_LINE_SIZE); +#elif defined(CONFIG_CMDLINE_FORCE) pr_warning("Ignoring tag cmdline (using the default kernel command line)\n"); -#endif /* CONFIG_CMDLINE_FORCE */ +#else + strlcpy(default_command_line, tag->u.cmdline.cmdline, + COMMAND_LINE_SIZE); +#endif return 0; } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index f29b8a2..d439a8f 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -376,6 +376,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } +static void (*smp_cross_call)(const struct cpumask *, unsigned int); + +void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) +{ + smp_cross_call = fn; +} + void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_cross_call(mask, IPI_CALL_FUNC); @@ -560,10 +567,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 1ff46ca..cb634c3 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -115,48 +115,37 @@ void timer_tick(void) #endif #if defined(CONFIG_PM) && !defined(CONFIG_GENERIC_CLOCKEVENTS) -static int timer_suspend(struct sys_device *dev, pm_message_t state) +static int timer_suspend(void) { - struct sys_timer *timer = container_of(dev, struct sys_timer, dev); - - if (timer->suspend != NULL) - timer->suspend(); + if (system_timer->suspend) + system_timer->suspend(); return 0; } -static int timer_resume(struct sys_device *dev) +static void timer_resume(void) { - struct sys_timer *timer = container_of(dev, struct sys_timer, dev); - - if (timer->resume != NULL) - timer->resume(); - - return 0; + if (system_timer->resume) + system_timer->resume(); } #else #define timer_suspend NULL #define timer_resume NULL #endif -static struct sysdev_class timer_sysclass = { - .name = "timer", +static struct syscore_ops timer_syscore_ops = { .suspend = timer_suspend, .resume = timer_resume, }; -static int __init timer_init_sysfs(void) +static int __init timer_init_syscore_ops(void) { - int ret = sysdev_class_register(&timer_sysclass); - if (ret == 0) { - system_timer->dev.cls = &timer_sysclass; - ret = sysdev_register(&system_timer->dev); - } + register_syscore_ops(&timer_syscore_ops); - return ret; + return 0; } -device_initcall(timer_init_sysfs); +device_initcall(timer_init_syscore_ops); void __init time_init(void) { diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 3b54ad1..d52eec2 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -234,7 +234,6 @@ static int __die(const char *str, int err, struct thread_info *thread, struct pt printk(KERN_EMERG "Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", str, err, ++die_counter); - sysfs_printk_last_file(); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index b4348e6..e5287f2 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -82,7 +82,7 @@ SECTIONS #endif } - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) #ifndef CONFIG_XIP_KERNEL . = ALIGN(PAGE_SIZE); diff --git a/arch/arm/mach-at91/at91cap9_devices.c b/arch/arm/mach-at91/at91cap9_devices.c index 9ffbf3a..21020ce 100644 --- a/arch/arm/mach-at91/at91cap9_devices.c +++ b/arch/arm/mach-at91/at91cap9_devices.c @@ -171,7 +171,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c index 1e8f275..5e9f8a4 100644 --- a/arch/arm/mach-at91/at91sam9g45_devices.c +++ b/arch/arm/mach-at91/at91sam9g45_devices.c @@ -256,7 +256,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) { usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9rl_devices.c b/arch/arm/mach-at91/at91sam9rl_devices.c index 53aaa94..c49262b 100644 --- a/arch/arm/mach-at91/at91sam9rl_devices.c +++ b/arch/arm/mach-at91/at91sam9rl_devices.c @@ -145,7 +145,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-bcmring/arch.c b/arch/arm/mach-bcmring/arch.c index 73eb066..a604b9e 100644 --- a/arch/arm/mach-bcmring/arch.c +++ b/arch/arm/mach-bcmring/arch.c @@ -169,6 +169,7 @@ MACHINE_START(BCMRING, "BCMRING") /* Maintainer: Broadcom Corporation */ .fixup = bcmring_fixup, .map_io = bcmring_map_io, + .init_early = bcmring_init_early, .init_irq = bcmring_init_irq, .timer = &bcmring_timer, .init_machine = bcmring_init_machine diff --git a/arch/arm/mach-bcmring/core.c b/arch/arm/mach-bcmring/core.c index 8fc2035..43eadbc 100644 --- a/arch/arm/mach-bcmring/core.c +++ b/arch/arm/mach-bcmring/core.c @@ -28,8 +28,6 @@ #include #include #include -#include -#include #include #include @@ -37,6 +35,7 @@ #include #include #include +#include #include #include @@ -97,6 +96,35 @@ static struct clk dummy_apb_pclk = { .mode = CLK_MODE_XTAL, }; +/* Timer 0 - 25 MHz, Timer3 at bus clock rate, typically 150-166 MHz */ +#if defined(CONFIG_ARCH_FPGA11107) +/* fpga cpu/bus are currently 30 times slower so scale frequency as well to */ +/* slow down Linux's sense of time */ +#define TIMER0_FREQUENCY_MHZ (tmrHw_LOW_FREQUENCY_MHZ * 30) +#define TIMER1_FREQUENCY_MHZ (tmrHw_LOW_FREQUENCY_MHZ * 30) +#define TIMER3_FREQUENCY_MHZ (tmrHw_HIGH_FREQUENCY_MHZ * 30) +#define TIMER3_FREQUENCY_KHZ (tmrHw_HIGH_FREQUENCY_HZ / 1000 * 30) +#else +#define TIMER0_FREQUENCY_MHZ tmrHw_LOW_FREQUENCY_MHZ +#define TIMER1_FREQUENCY_MHZ tmrHw_LOW_FREQUENCY_MHZ +#define TIMER3_FREQUENCY_MHZ tmrHw_HIGH_FREQUENCY_MHZ +#define TIMER3_FREQUENCY_KHZ (tmrHw_HIGH_FREQUENCY_HZ / 1000) +#endif + +static struct clk sp804_timer012_clk = { + .name = "sp804-timer-0,1,2", + .type = CLK_TYPE_PRIMARY, + .mode = CLK_MODE_XTAL, + .rate_hz = TIMER1_FREQUENCY_MHZ * 1000000, +}; + +static struct clk sp804_timer3_clk = { + .name = "sp804-timer-3", + .type = CLK_TYPE_PRIMARY, + .mode = CLK_MODE_XTAL, + .rate_hz = TIMER3_FREQUENCY_KHZ * 1000, +}; + static struct clk_lookup lookups[] = { { /* Bus clock */ .con_id = "apb_pclk", @@ -107,6 +135,18 @@ static struct clk_lookup lookups[] = { }, { /* UART1 */ .dev_id = "uartb", .clk = &uart_clk, + }, { /* SP804 timer 0 */ + .dev_id = "sp804", + .con_id = "timer0", + .clk = &sp804_timer012_clk, + }, { /* SP804 timer 1 */ + .dev_id = "sp804", + .con_id = "timer1", + .clk = &sp804_timer012_clk, + }, { /* SP804 timer 3 */ + .dev_id = "sp804", + .con_id = "timer3", + .clk = &sp804_timer3_clk, } }; @@ -151,8 +191,6 @@ void __init bcmring_amba_init(void) chipcHw_busInterfaceClockEnable(bus_clock); - clkdev_add_table(lookups, ARRAY_SIZE(lookups)); - for (i = 0; i < ARRAY_SIZE(amba_devs); i++) { struct amba_device *d = amba_devs[i]; amba_device_register(d, &iomem_resource); @@ -162,170 +200,18 @@ void __init bcmring_amba_init(void) /* * Where is the timer (VA)? */ -#define TIMER0_VA_BASE MM_IO_BASE_TMR -#define TIMER1_VA_BASE (MM_IO_BASE_TMR + 0x20) -#define TIMER2_VA_BASE (MM_IO_BASE_TMR + 0x40) -#define TIMER3_VA_BASE (MM_IO_BASE_TMR + 0x60) - -/* Timer 0 - 25 MHz, Timer3 at bus clock rate, typically 150-166 MHz */ -#if defined(CONFIG_ARCH_FPGA11107) -/* fpga cpu/bus are currently 30 times slower so scale frequency as well to */ -/* slow down Linux's sense of time */ -#define TIMER0_FREQUENCY_MHZ (tmrHw_LOW_FREQUENCY_MHZ * 30) -#define TIMER1_FREQUENCY_MHZ (tmrHw_LOW_FREQUENCY_MHZ * 30) -#define TIMER3_FREQUENCY_MHZ (tmrHw_HIGH_FREQUENCY_MHZ * 30) -#define TIMER3_FREQUENCY_KHZ (tmrHw_HIGH_FREQUENCY_HZ / 1000 * 30) -#else -#define TIMER0_FREQUENCY_MHZ tmrHw_LOW_FREQUENCY_MHZ -#define TIMER1_FREQUENCY_MHZ tmrHw_LOW_FREQUENCY_MHZ -#define TIMER3_FREQUENCY_MHZ tmrHw_HIGH_FREQUENCY_MHZ -#define TIMER3_FREQUENCY_KHZ (tmrHw_HIGH_FREQUENCY_HZ / 1000) -#endif - -#define TICKS_PER_uSEC TIMER0_FREQUENCY_MHZ - -/* - * These are useconds NOT ticks. - * - */ -#define mSEC_1 1000 -#define mSEC_5 (mSEC_1 * 5) -#define mSEC_10 (mSEC_1 * 10) -#define mSEC_25 (mSEC_1 * 25) -#define SEC_1 (mSEC_1 * 1000) - -/* - * How long is the timer interval? - */ -#define TIMER_INTERVAL (TICKS_PER_uSEC * mSEC_10) -#if TIMER_INTERVAL >= 0x100000 -#define TIMER_RELOAD (TIMER_INTERVAL >> 8) -#define TIMER_DIVISOR (TIMER_CTRL_DIV256) -#define TICKS2USECS(x) (256 * (x) / TICKS_PER_uSEC) -#elif TIMER_INTERVAL >= 0x10000 -#define TIMER_RELOAD (TIMER_INTERVAL >> 4) /* Divide by 16 */ -#define TIMER_DIVISOR (TIMER_CTRL_DIV16) -#define TICKS2USECS(x) (16 * (x) / TICKS_PER_uSEC) -#else -#define TIMER_RELOAD (TIMER_INTERVAL) -#define TIMER_DIVISOR (TIMER_CTRL_DIV1) -#define TICKS2USECS(x) ((x) / TICKS_PER_uSEC) -#endif - -static void timer_set_mode(enum clock_event_mode mode, - struct clock_event_device *clk) -{ - unsigned long ctrl; - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_LOAD); - - ctrl = TIMER_CTRL_PERIODIC; - ctrl |= - TIMER_DIVISOR | TIMER_CTRL_32BIT | TIMER_CTRL_IE | - TIMER_CTRL_ENABLE; - break; - case CLOCK_EVT_MODE_ONESHOT: - /* period set, and timer enabled in 'next_event' hook */ - ctrl = TIMER_CTRL_ONESHOT; - ctrl |= TIMER_DIVISOR | TIMER_CTRL_32BIT | TIMER_CTRL_IE; - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - default: - ctrl = 0; - } - - writel(ctrl, TIMER0_VA_BASE + TIMER_CTRL); -} - -static int timer_set_next_event(unsigned long evt, - struct clock_event_device *unused) -{ - unsigned long ctrl = readl(TIMER0_VA_BASE + TIMER_CTRL); - - writel(evt, TIMER0_VA_BASE + TIMER_LOAD); - writel(ctrl | TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL); - - return 0; -} - -static struct clock_event_device timer0_clockevent = { - .name = "timer0", - .shift = 32, - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = timer_set_mode, - .set_next_event = timer_set_next_event, -}; - -/* - * IRQ handler for the timer - */ -static irqreturn_t bcmring_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = &timer0_clockevent; - - writel(1, TIMER0_VA_BASE + TIMER_INTCLR); - - evt->event_handler(evt); - - return IRQ_HANDLED; -} - -static struct irqaction bcmring_timer_irq = { - .name = "bcmring Timer Tick", - .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, - .handler = bcmring_timer_interrupt, -}; - -static cycle_t bcmring_get_cycles_timer1(struct clocksource *cs) -{ - return ~readl(TIMER1_VA_BASE + TIMER_VALUE); -} - -static cycle_t bcmring_get_cycles_timer3(struct clocksource *cs) -{ - return ~readl(TIMER3_VA_BASE + TIMER_VALUE); -} - -static struct clocksource clocksource_bcmring_timer1 = { - .name = "timer1", - .rating = 200, - .read = bcmring_get_cycles_timer1, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static struct clocksource clocksource_bcmring_timer3 = { - .name = "timer3", - .rating = 100, - .read = bcmring_get_cycles_timer3, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; +#define TIMER0_VA_BASE ((void __iomem *)MM_IO_BASE_TMR) +#define TIMER1_VA_BASE ((void __iomem *)(MM_IO_BASE_TMR + 0x20)) +#define TIMER2_VA_BASE ((void __iomem *)(MM_IO_BASE_TMR + 0x40)) +#define TIMER3_VA_BASE ((void __iomem *)(MM_IO_BASE_TMR + 0x60)) static int __init bcmring_clocksource_init(void) { /* setup timer1 as free-running clocksource */ - writel(0, TIMER1_VA_BASE + TIMER_CTRL); - writel(0xffffffff, TIMER1_VA_BASE + TIMER_LOAD); - writel(0xffffffff, TIMER1_VA_BASE + TIMER_VALUE); - writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC, - TIMER1_VA_BASE + TIMER_CTRL); - - clocksource_register_khz(&clocksource_bcmring_timer1, - TIMER1_FREQUENCY_MHZ * 1000); + sp804_clocksource_init(TIMER1_VA_BASE, "timer1"); /* setup timer3 as free-running clocksource */ - writel(0, TIMER3_VA_BASE + TIMER_CTRL); - writel(0xffffffff, TIMER3_VA_BASE + TIMER_LOAD); - writel(0xffffffff, TIMER3_VA_BASE + TIMER_VALUE); - writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC, - TIMER3_VA_BASE + TIMER_CTRL); - - clocksource_register_khz(&clocksource_bcmring_timer3, - TIMER3_FREQUENCY_KHZ); + sp804_clocksource_init(TIMER3_VA_BASE, "timer3"); return 0; } @@ -347,21 +233,16 @@ void __init bcmring_init_timer(void) /* * Make irqs happen for the system timer */ - setup_irq(IRQ_TIMER0, &bcmring_timer_irq); - bcmring_clocksource_init(); - timer0_clockevent.mult = - div_sc(1000000, NSEC_PER_SEC, timer0_clockevent.shift); - timer0_clockevent.max_delta_ns = - clockevent_delta2ns(0xffffffff, &timer0_clockevent); - timer0_clockevent.min_delta_ns = - clockevent_delta2ns(0xf, &timer0_clockevent); - - timer0_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&timer0_clockevent); + sp804_clockevents_register(TIMER0_VA_BASE, IRQ_TIMER0, "timer0"); } struct sys_timer bcmring_timer = { .init = bcmring_init_timer, }; + +void __init bcmring_init_early(void) +{ + clkdev_add_table(lookups, ARRAY_SIZE(lookups)); +} diff --git a/arch/arm/mach-bcmring/core.h b/arch/arm/mach-bcmring/core.h index b197ba4..e0e02c4 100644 --- a/arch/arm/mach-bcmring/core.h +++ b/arch/arm/mach-bcmring/core.h @@ -25,6 +25,7 @@ void __init bcmring_amba_init(void); void __init bcmring_map_io(void); void __init bcmring_init_irq(void); +void __init bcmring_init_early(void); extern struct sys_timer bcmring_timer; #endif diff --git a/arch/arm/mach-davinci/cpufreq.c b/arch/arm/mach-davinci/cpufreq.c index 0a95be1..41669ec 100644 --- a/arch/arm/mach-davinci/cpufreq.c +++ b/arch/arm/mach-davinci/cpufreq.c @@ -94,9 +94,7 @@ static int davinci_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return ret; - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, - dev_driver_string(cpufreq.dev), - "transition: %u --> %u\n", freqs.old, freqs.new); + dev_dbg(&cpufreq.dev, "transition: %u --> %u\n", freqs.old, freqs.new); ret = cpufreq_frequency_table_target(policy, pdata->freq_table, freqs.new, relation, &idx); diff --git a/arch/arm/mach-davinci/include/mach/memory.h b/arch/arm/mach-davinci/include/mach/memory.h index 7882272..491249e 100644 --- a/arch/arm/mach-davinci/include/mach/memory.h +++ b/arch/arm/mach-davinci/include/mach/memory.h @@ -41,27 +41,11 @@ */ #define CONSISTENT_DMA_SIZE (14<<20) -#ifndef __ASSEMBLY__ /* * Restrict DMA-able region to workaround silicon bug. The bug * restricts buffers available for DMA to video hardware to be * below 128M */ -static inline void -__arch_adjust_zones(unsigned long *size, unsigned long *holes) -{ - unsigned int sz = (128<<20) >> PAGE_SHIFT; - - size[1] = size[0] - sz; - size[0] = sz; -} - -#define arch_adjust_zones(zone_size, holes) \ - if ((meminfo.bank[0].size >> 20) > 128) __arch_adjust_zones(zone_size, holes) - -#define ISA_DMA_THRESHOLD (PHYS_OFFSET + (128<<20) - 1) -#define MAX_DMA_ADDRESS (PAGE_OFFSET + (128<<20)) - -#endif +#define ARM_DMA_ZONE_SIZE SZ_128M #endif /* __ASM_ARCH_MEMORY_H */ diff --git a/arch/arm/mach-davinci/include/mach/uncompress.h b/arch/arm/mach-davinci/include/mach/uncompress.h index 47723e8..78d8068 100644 --- a/arch/arm/mach-davinci/include/mach/uncompress.h +++ b/arch/arm/mach-davinci/include/mach/uncompress.h @@ -25,8 +25,7 @@ #include -static u32 *uart; -static u32 *uart_info = (u32 *)(DAVINCI_UART_INFO); +u32 *uart; /* PORT_16C550A, in polled non-fifo mode */ static void putc(char c) @@ -44,6 +43,8 @@ static inline void flush(void) static inline void set_uart_info(u32 phys, void * __iomem virt) { + u32 *uart_info = (u32 *)(DAVINCI_UART_INFO); + uart = (u32 *)phys; uart_info[0] = phys; uart_info[1] = (u32)virt; diff --git a/arch/arm/mach-davinci/irq.c b/arch/arm/mach-davinci/irq.c index e6269a6..bfe68ec 100644 --- a/arch/arm/mach-davinci/irq.c +++ b/arch/arm/mach-davinci/irq.c @@ -29,8 +29,6 @@ #include #include -#define IRQ_BIT(irq) ((irq) & 0x1f) - #define FIQ_REG0_OFFSET 0x0000 #define FIQ_REG1_OFFSET 0x0004 #define IRQ_REG0_OFFSET 0x0008 @@ -42,78 +40,33 @@ #define IRQ_INTPRI0_REG_OFFSET 0x0030 #define IRQ_INTPRI7_REG_OFFSET 0x004C -static inline unsigned int davinci_irq_readl(int offset) -{ - return __raw_readl(davinci_intc_base + offset); -} - static inline void davinci_irq_writel(unsigned long value, int offset) { __raw_writel(value, davinci_intc_base + offset); } -/* Disable interrupt */ -static void davinci_mask_irq(struct irq_data *d) +static __init void +davinci_alloc_gc(void __iomem *base, unsigned int irq_start, unsigned int num) { - unsigned int mask; - u32 l; - - mask = 1 << IRQ_BIT(d->irq); - - if (d->irq > 31) { - l = davinci_irq_readl(IRQ_ENT_REG1_OFFSET); - l &= ~mask; - davinci_irq_writel(l, IRQ_ENT_REG1_OFFSET); - } else { - l = davinci_irq_readl(IRQ_ENT_REG0_OFFSET); - l &= ~mask; - davinci_irq_writel(l, IRQ_ENT_REG0_OFFSET); - } -} - -/* Enable interrupt */ -static void davinci_unmask_irq(struct irq_data *d) -{ - unsigned int mask; - u32 l; - - mask = 1 << IRQ_BIT(d->irq); - - if (d->irq > 31) { - l = davinci_irq_readl(IRQ_ENT_REG1_OFFSET); - l |= mask; - davinci_irq_writel(l, IRQ_ENT_REG1_OFFSET); - } else { - l = davinci_irq_readl(IRQ_ENT_REG0_OFFSET); - l |= mask; - davinci_irq_writel(l, IRQ_ENT_REG0_OFFSET); - } + struct irq_chip_generic *gc; + struct irq_chip_type *ct; + + gc = irq_alloc_generic_chip("AINTC", 1, irq_start, base, handle_edge_irq); + ct = gc->chip_types; + ct->chip.irq_ack = irq_gc_ack; + ct->chip.irq_mask = irq_gc_mask_clr_bit; + ct->chip.irq_unmask = irq_gc_mask_set_bit; + + ct->regs.ack = IRQ_REG0_OFFSET; + ct->regs.mask = IRQ_ENT_REG0_OFFSET; + irq_setup_generic_chip(gc, IRQ_MSK(num), IRQ_GC_INIT_MASK_CACHE, + IRQ_NOREQUEST | IRQ_NOPROBE, 0); } -/* EOI interrupt */ -static void davinci_ack_irq(struct irq_data *d) -{ - unsigned int mask; - - mask = 1 << IRQ_BIT(d->irq); - - if (d->irq > 31) - davinci_irq_writel(mask, IRQ_REG1_OFFSET); - else - davinci_irq_writel(mask, IRQ_REG0_OFFSET); -} - -static struct irq_chip davinci_irq_chip_0 = { - .name = "AINTC", - .irq_ack = davinci_ack_irq, - .irq_mask = davinci_mask_irq, - .irq_unmask = davinci_unmask_irq, -}; - /* ARM Interrupt Controller Initialization */ void __init davinci_irq_init(void) { - unsigned i; + unsigned i, j; const u8 *davinci_def_priorities = davinci_soc_info.intc_irq_prios; davinci_intc_type = DAVINCI_INTC_TYPE_AINTC; @@ -144,7 +97,6 @@ void __init davinci_irq_init(void) davinci_irq_writel(~0x0, IRQ_REG1_OFFSET); for (i = IRQ_INTPRI0_REG_OFFSET; i <= IRQ_INTPRI7_REG_OFFSET; i += 4) { - unsigned j; u32 pri; for (j = 0, pri = 0; j < 32; j += 4, davinci_def_priorities++) @@ -152,13 +104,8 @@ void __init davinci_irq_init(void) davinci_irq_writel(pri, i); } - /* set up genirq dispatch for ARM INTC */ - for (i = 0; i < davinci_soc_info.intc_irq_num; i++) { - irq_set_chip(i, &davinci_irq_chip_0); - set_irq_flags(i, IRQF_VALID | IRQF_PROBE); - if (i != IRQ_TINT1_TINT34) - irq_set_handler(i, handle_edge_irq); - else - irq_set_handler(i, handle_level_irq); - } + for (i = 0, j = 0; i < davinci_soc_info.intc_irq_num; i += 32, j += 0x04) + davinci_alloc_gc(davinci_intc_base + j, i, 32); + + irq_set_handler(IRQ_TINT1_TINT34, handle_level_irq); } diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index e06a88f..5ed51b8 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -16,10 +16,8 @@ #include #include #include -#include -#include #include -#include +#include #include #include #include @@ -32,11 +30,12 @@ #include #include #include -#include -#include #include +#include #include "common.h" +static int get_tclk(void); + /***************************************************************************** * I/O Address Mapping ****************************************************************************/ @@ -70,463 +69,106 @@ void __init dove_map_io(void) } /***************************************************************************** - * EHCI - ****************************************************************************/ -static struct orion_ehci_data dove_ehci_data = { - .dram = &dove_mbus_dram_info, - .phy_version = EHCI_PHY_NA, -}; - -static u64 ehci_dmamask = DMA_BIT_MASK(32); - -/***************************************************************************** * EHCI0 ****************************************************************************/ -static struct resource dove_ehci0_resources[] = { - { - .start = DOVE_USB0_PHYS_BASE, - .end = DOVE_USB0_PHYS_BASE + SZ_4K - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_USB0, - .end = IRQ_DOVE_USB0, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_ehci0 = { - .name = "orion-ehci", - .id = 0, - .dev = { - .dma_mask = &ehci_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = &dove_ehci_data, - }, - .resource = dove_ehci0_resources, - .num_resources = ARRAY_SIZE(dove_ehci0_resources), -}; - void __init dove_ehci0_init(void) { - platform_device_register(&dove_ehci0); + orion_ehci_init(&dove_mbus_dram_info, + DOVE_USB0_PHYS_BASE, IRQ_DOVE_USB0); } /***************************************************************************** * EHCI1 ****************************************************************************/ -static struct resource dove_ehci1_resources[] = { - { - .start = DOVE_USB1_PHYS_BASE, - .end = DOVE_USB1_PHYS_BASE + SZ_4K - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_USB1, - .end = IRQ_DOVE_USB1, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_ehci1 = { - .name = "orion-ehci", - .id = 1, - .dev = { - .dma_mask = &ehci_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = &dove_ehci_data, - }, - .resource = dove_ehci1_resources, - .num_resources = ARRAY_SIZE(dove_ehci1_resources), -}; - void __init dove_ehci1_init(void) { - platform_device_register(&dove_ehci1); + orion_ehci_1_init(&dove_mbus_dram_info, + DOVE_USB1_PHYS_BASE, IRQ_DOVE_USB1); } /***************************************************************************** * GE00 ****************************************************************************/ -struct mv643xx_eth_shared_platform_data dove_ge00_shared_data = { - .t_clk = 0, - .dram = &dove_mbus_dram_info, -}; - -static struct resource dove_ge00_shared_resources[] = { - { - .name = "ge00 base", - .start = DOVE_GE00_PHYS_BASE + 0x2000, - .end = DOVE_GE00_PHYS_BASE + SZ_16K - 1, - .flags = IORESOURCE_MEM, - }, -}; - -static struct platform_device dove_ge00_shared = { - .name = MV643XX_ETH_SHARED_NAME, - .id = 0, - .dev = { - .platform_data = &dove_ge00_shared_data, - }, - .num_resources = 1, - .resource = dove_ge00_shared_resources, -}; - -static struct resource dove_ge00_resources[] = { - { - .name = "ge00 irq", - .start = IRQ_DOVE_GE00_SUM, - .end = IRQ_DOVE_GE00_SUM, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_ge00 = { - .name = MV643XX_ETH_NAME, - .id = 0, - .num_resources = 1, - .resource = dove_ge00_resources, - .dev = { - .coherent_dma_mask = 0xffffffff, - }, -}; - void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) { - eth_data->shared = &dove_ge00_shared; - dove_ge00.dev.platform_data = eth_data; - - platform_device_register(&dove_ge00_shared); - platform_device_register(&dove_ge00); + orion_ge00_init(eth_data, &dove_mbus_dram_info, + DOVE_GE00_PHYS_BASE, IRQ_DOVE_GE00_SUM, + 0, get_tclk()); } /***************************************************************************** * SoC RTC ****************************************************************************/ -static struct resource dove_rtc_resource[] = { - { - .start = DOVE_RTC_PHYS_BASE, - .end = DOVE_RTC_PHYS_BASE + 32 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_RTC, - .flags = IORESOURCE_IRQ, - } -}; - void __init dove_rtc_init(void) { - platform_device_register_simple("rtc-mv", -1, dove_rtc_resource, 2); + orion_rtc_init(DOVE_RTC_PHYS_BASE, IRQ_DOVE_RTC); } /***************************************************************************** * SATA ****************************************************************************/ -static struct resource dove_sata_resources[] = { - { - .name = "sata base", - .start = DOVE_SATA_PHYS_BASE, - .end = DOVE_SATA_PHYS_BASE + 0x5000 - 1, - .flags = IORESOURCE_MEM, - }, { - .name = "sata irq", - .start = IRQ_DOVE_SATA, - .end = IRQ_DOVE_SATA, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_sata = { - .name = "sata_mv", - .id = 0, - .dev = { - .coherent_dma_mask = DMA_BIT_MASK(32), - }, - .num_resources = ARRAY_SIZE(dove_sata_resources), - .resource = dove_sata_resources, -}; - void __init dove_sata_init(struct mv_sata_platform_data *sata_data) { - sata_data->dram = &dove_mbus_dram_info; - dove_sata.dev.platform_data = sata_data; - platform_device_register(&dove_sata); + orion_sata_init(sata_data, &dove_mbus_dram_info, + DOVE_SATA_PHYS_BASE, IRQ_DOVE_SATA); + } /***************************************************************************** * UART0 ****************************************************************************/ -static struct plat_serial8250_port dove_uart0_data[] = { - { - .mapbase = DOVE_UART0_PHYS_BASE, - .membase = (char *)DOVE_UART0_VIRT_BASE, - .irq = IRQ_DOVE_UART_0, - .flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF, - .iotype = UPIO_MEM, - .regshift = 2, - .uartclk = 0, - }, { - }, -}; - -static struct resource dove_uart0_resources[] = { - { - .start = DOVE_UART0_PHYS_BASE, - .end = DOVE_UART0_PHYS_BASE + SZ_256 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_UART_0, - .end = IRQ_DOVE_UART_0, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_uart0 = { - .name = "serial8250", - .id = 0, - .dev = { - .platform_data = dove_uart0_data, - }, - .resource = dove_uart0_resources, - .num_resources = ARRAY_SIZE(dove_uart0_resources), -}; - void __init dove_uart0_init(void) { - platform_device_register(&dove_uart0); + orion_uart0_init(DOVE_UART0_VIRT_BASE, DOVE_UART0_PHYS_BASE, + IRQ_DOVE_UART_0, get_tclk()); } /***************************************************************************** * UART1 ****************************************************************************/ -static struct plat_serial8250_port dove_uart1_data[] = { - { - .mapbase = DOVE_UART1_PHYS_BASE, - .membase = (char *)DOVE_UART1_VIRT_BASE, - .irq = IRQ_DOVE_UART_1, - .flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF, - .iotype = UPIO_MEM, - .regshift = 2, - .uartclk = 0, - }, { - }, -}; - -static struct resource dove_uart1_resources[] = { - { - .start = DOVE_UART1_PHYS_BASE, - .end = DOVE_UART1_PHYS_BASE + SZ_256 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_UART_1, - .end = IRQ_DOVE_UART_1, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_uart1 = { - .name = "serial8250", - .id = 1, - .dev = { - .platform_data = dove_uart1_data, - }, - .resource = dove_uart1_resources, - .num_resources = ARRAY_SIZE(dove_uart1_resources), -}; - void __init dove_uart1_init(void) { - platform_device_register(&dove_uart1); + orion_uart1_init(DOVE_UART1_VIRT_BASE, DOVE_UART1_PHYS_BASE, + IRQ_DOVE_UART_1, get_tclk()); } /***************************************************************************** * UART2 ****************************************************************************/ -static struct plat_serial8250_port dove_uart2_data[] = { - { - .mapbase = DOVE_UART2_PHYS_BASE, - .membase = (char *)DOVE_UART2_VIRT_BASE, - .irq = IRQ_DOVE_UART_2, - .flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF, - .iotype = UPIO_MEM, - .regshift = 2, - .uartclk = 0, - }, { - }, -}; - -static struct resource dove_uart2_resources[] = { - { - .start = DOVE_UART2_PHYS_BASE, - .end = DOVE_UART2_PHYS_BASE + SZ_256 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_UART_2, - .end = IRQ_DOVE_UART_2, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_uart2 = { - .name = "serial8250", - .id = 2, - .dev = { - .platform_data = dove_uart2_data, - }, - .resource = dove_uart2_resources, - .num_resources = ARRAY_SIZE(dove_uart2_resources), -}; - void __init dove_uart2_init(void) { - platform_device_register(&dove_uart2); + orion_uart2_init(DOVE_UART2_VIRT_BASE, DOVE_UART2_PHYS_BASE, + IRQ_DOVE_UART_2, get_tclk()); } /***************************************************************************** * UART3 ****************************************************************************/ -static struct plat_serial8250_port dove_uart3_data[] = { - { - .mapbase = DOVE_UART3_PHYS_BASE, - .membase = (char *)DOVE_UART3_VIRT_BASE, - .irq = IRQ_DOVE_UART_3, - .flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF, - .iotype = UPIO_MEM, - .regshift = 2, - .uartclk = 0, - }, { - }, -}; - -static struct resource dove_uart3_resources[] = { - { - .start = DOVE_UART3_PHYS_BASE, - .end = DOVE_UART3_PHYS_BASE + SZ_256 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_UART_3, - .end = IRQ_DOVE_UART_3, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_uart3 = { - .name = "serial8250", - .id = 3, - .dev = { - .platform_data = dove_uart3_data, - }, - .resource = dove_uart3_resources, - .num_resources = ARRAY_SIZE(dove_uart3_resources), -}; - void __init dove_uart3_init(void) { - platform_device_register(&dove_uart3); + orion_uart3_init(DOVE_UART3_VIRT_BASE, DOVE_UART3_PHYS_BASE, + IRQ_DOVE_UART_3, get_tclk()); } /***************************************************************************** - * SPI0 + * SPI ****************************************************************************/ -static struct orion_spi_info dove_spi0_data = { - .tclk = 0, -}; - -static struct resource dove_spi0_resources[] = { - { - .start = DOVE_SPI0_PHYS_BASE, - .end = DOVE_SPI0_PHYS_BASE + SZ_512 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_SPI0, - .end = IRQ_DOVE_SPI0, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_spi0 = { - .name = "orion_spi", - .id = 0, - .resource = dove_spi0_resources, - .dev = { - .platform_data = &dove_spi0_data, - }, - .num_resources = ARRAY_SIZE(dove_spi0_resources), -}; - void __init dove_spi0_init(void) { - platform_device_register(&dove_spi0); + orion_spi_init(DOVE_SPI0_PHYS_BASE, get_tclk()); } -/***************************************************************************** - * SPI1 - ****************************************************************************/ -static struct orion_spi_info dove_spi1_data = { - .tclk = 0, -}; - -static struct resource dove_spi1_resources[] = { - { - .start = DOVE_SPI1_PHYS_BASE, - .end = DOVE_SPI1_PHYS_BASE + SZ_512 - 1, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_DOVE_SPI1, - .end = IRQ_DOVE_SPI1, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_spi1 = { - .name = "orion_spi", - .id = 1, - .resource = dove_spi1_resources, - .dev = { - .platform_data = &dove_spi1_data, - }, - .num_resources = ARRAY_SIZE(dove_spi1_resources), -}; - void __init dove_spi1_init(void) { - platform_device_register(&dove_spi1); + orion_spi_init(DOVE_SPI1_PHYS_BASE, get_tclk()); } /***************************************************************************** * I2C ****************************************************************************/ -static struct mv64xxx_i2c_pdata dove_i2c_data = { - .freq_m = 10, /* assumes 166 MHz TCLK gets 94.3kHz */ - .freq_n = 3, - .timeout = 1000, /* Default timeout of 1 second */ -}; - -static struct resource dove_i2c_resources[] = { - { - .name = "i2c base", - .start = DOVE_I2C_PHYS_BASE, - .end = DOVE_I2C_PHYS_BASE + 0x20 - 1, - .flags = IORESOURCE_MEM, - }, { - .name = "i2c irq", - .start = IRQ_DOVE_I2C, - .end = IRQ_DOVE_I2C, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device dove_i2c = { - .name = MV64XXX_I2C_CTLR_NAME, - .id = 0, - .num_resources = ARRAY_SIZE(dove_i2c_resources), - .resource = dove_i2c_resources, - .dev = { - .platform_data = &dove_i2c_data, - }, -}; - void __init dove_i2c_init(void) { - platform_device_register(&dove_i2c); + orion_i2c_init(DOVE_I2C_PHYS_BASE, IRQ_DOVE_I2C, 10); } /***************************************************************************** @@ -554,208 +196,22 @@ struct sys_timer dove_timer = { }; /***************************************************************************** - * XOR - ****************************************************************************/ -static struct mv_xor_platform_shared_data dove_xor_shared_data = { - .dram = &dove_mbus_dram_info, -}; - -/***************************************************************************** * XOR 0 ****************************************************************************/ -static u64 dove_xor0_dmamask = DMA_BIT_MASK(32); - -static struct resource dove_xor0_shared_resources[] = { - { - .name = "xor 0 low", - .start = DOVE_XOR0_PHYS_BASE, - .end = DOVE_XOR0_PHYS_BASE + 0xff, - .flags = IORESOURCE_MEM, - }, { - .name = "xor 0 high", - .start = DOVE_XOR0_HIGH_PHYS_BASE, - .end = DOVE_XOR0_HIGH_PHYS_BASE + 0xff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct platform_device dove_xor0_shared = { - .name = MV_XOR_SHARED_NAME, - .id = 0, - .dev = { - .platform_data = &dove_xor_shared_data, - }, - .num_resources = ARRAY_SIZE(dove_xor0_shared_resources), - .resource = dove_xor0_shared_resources, -}; - -static struct resource dove_xor00_resources[] = { - [0] = { - .start = IRQ_DOVE_XOR_00, - .end = IRQ_DOVE_XOR_00, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct mv_xor_platform_data dove_xor00_data = { - .shared = &dove_xor0_shared, - .hw_id = 0, - .pool_size = PAGE_SIZE, -}; - -static struct platform_device dove_xor00_channel = { - .name = MV_XOR_NAME, - .id = 0, - .num_resources = ARRAY_SIZE(dove_xor00_resources), - .resource = dove_xor00_resources, - .dev = { - .dma_mask = &dove_xor0_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), - .platform_data = &dove_xor00_data, - }, -}; - -static struct resource dove_xor01_resources[] = { - [0] = { - .start = IRQ_DOVE_XOR_01, - .end = IRQ_DOVE_XOR_01, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct mv_xor_platform_data dove_xor01_data = { - .shared = &dove_xor0_shared, - .hw_id = 1, - .pool_size = PAGE_SIZE, -}; - -static struct platform_device dove_xor01_channel = { - .name = MV_XOR_NAME, - .id = 1, - .num_resources = ARRAY_SIZE(dove_xor01_resources), - .resource = dove_xor01_resources, - .dev = { - .dma_mask = &dove_xor0_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), - .platform_data = &dove_xor01_data, - }, -}; - void __init dove_xor0_init(void) { - platform_device_register(&dove_xor0_shared); - - /* - * two engines can't do memset simultaneously, this limitation - * satisfied by removing memset support from one of the engines. - */ - dma_cap_set(DMA_MEMCPY, dove_xor00_data.cap_mask); - dma_cap_set(DMA_XOR, dove_xor00_data.cap_mask); - platform_device_register(&dove_xor00_channel); - - dma_cap_set(DMA_MEMCPY, dove_xor01_data.cap_mask); - dma_cap_set(DMA_MEMSET, dove_xor01_data.cap_mask); - dma_cap_set(DMA_XOR, dove_xor01_data.cap_mask); - platform_device_register(&dove_xor01_channel); + orion_xor0_init(&dove_mbus_dram_info, + DOVE_XOR0_PHYS_BASE, DOVE_XOR0_HIGH_PHYS_BASE, + IRQ_DOVE_XOR_00, IRQ_DOVE_XOR_01); } /***************************************************************************** * XOR 1 ****************************************************************************/ -static u64 dove_xor1_dmamask = DMA_BIT_MASK(32); - -static struct resource dove_xor1_shared_resources[] = { - { - .name = "xor 0 low", - .start = DOVE_XOR1_PHYS_BASE, - .end = DOVE_XOR1_PHYS_BASE + 0xff, - .flags = IORESOURCE_MEM, - }, { - .name = "xor 0 high", - .start = DOVE_XOR1_HIGH_PHYS_BASE, - .end = DOVE_XOR1_HIGH_PHYS_BASE + 0xff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct platform_device dove_xor1_shared = { - .name = MV_XOR_SHARED_NAME, - .id = 1, - .dev = { - .platform_data = &dove_xor_shared_data, - }, - .num_resources = ARRAY_SIZE(dove_xor1_shared_resources), - .resource = dove_xor1_shared_resources, -}; - -static struct resource dove_xor10_resources[] = { - [0] = { - .start = IRQ_DOVE_XOR_10, - .end = IRQ_DOVE_XOR_10, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct mv_xor_platform_data dove_xor10_data = { - .shared = &dove_xor1_shared, - .hw_id = 0, - .pool_size = PAGE_SIZE, -}; - -static struct platform_device dove_xor10_channel = { - .name = MV_XOR_NAME, - .id = 2, - .num_resources = ARRAY_SIZE(dove_xor10_resources), - .resource = dove_xor10_resources, - .dev = { - .dma_mask = &dove_xor1_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), - .platform_data = &dove_xor10_data, - }, -}; - -static struct resource dove_xor11_resources[] = { - [0] = { - .start = IRQ_DOVE_XOR_11, - .end = IRQ_DOVE_XOR_11, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct mv_xor_platform_data dove_xor11_data = { - .shared = &dove_xor1_shared, - .hw_id = 1, - .pool_size = PAGE_SIZE, -}; - -static struct platform_device dove_xor11_channel = { - .name = MV_XOR_NAME, - .id = 3, - .num_resources = ARRAY_SIZE(dove_xor11_resources), - .resource = dove_xor11_resources, - .dev = { - .dma_mask = &dove_xor1_dmamask, - .coherent_dma_mask = DMA_BIT_MASK(64), - .platform_data = &dove_xor11_data, - }, -}; - void __init dove_xor1_init(void) { - platform_device_register(&dove_xor1_shared); - - /* - * two engines can't do memset simultaneously, this limitation - * satisfied by removing memset support from one of the engines. - */ - dma_cap_set(DMA_MEMCPY, dove_xor10_data.cap_mask); - dma_cap_set(DMA_XOR, dove_xor10_data.cap_mask); - platform_device_register(&dove_xor10_channel); - - dma_cap_set(DMA_MEMCPY, dove_xor11_data.cap_mask); - dma_cap_set(DMA_MEMSET, dove_xor11_data.cap_mask); - dma_cap_set(DMA_XOR, dove_xor11_data.cap_mask); - platform_device_register(&dove_xor11_channel); + orion_xor1_init(DOVE_XOR1_PHYS_BASE, DOVE_XOR1_HIGH_PHYS_BASE, + IRQ_DOVE_XOR_10, IRQ_DOVE_XOR_11); } /***************************************************************************** @@ -833,14 +289,6 @@ void __init dove_init(void) #endif dove_setup_cpu_mbus(); - dove_ge00_shared_data.t_clk = tclk; - dove_uart0_data[0].uartclk = tclk; - dove_uart1_data[0].uartclk = tclk; - dove_uart2_data[0].uartclk = tclk; - dove_uart3_data[0].uartclk = tclk; - dove_spi0_data.tclk = tclk; - dove_spi1_data.tclk = tclk; - /* internal devices that every board has */ dove_rtc_init(); dove_xor0_init(); diff --git a/arch/arm/mach-dove/mpp.c b/arch/arm/mach-dove/mpp.c index c66c763..51e0e41 100644 --- a/arch/arm/mach-dove/mpp.c +++ b/arch/arm/mach-dove/mpp.c @@ -11,24 +11,17 @@ #include #include #include - +#include #include - #include "mpp.h" -#define MPP_NR_REGS 4 -#define MPP_CTRL(i) ((i) == 3 ? \ - DOVE_MPP_CTRL4_VIRT_BASE : \ - DOVE_MPP_VIRT_BASE + (i) * 4) -#define PMU_SIG_REGS 2 -#define PMU_SIG_CTRL(i) (DOVE_PMU_SIG_CTRL + (i) * 4) - struct dove_mpp_grp { int start; int end; }; -static struct dove_mpp_grp dove_mpp_grp[] = { +/* Map a group to a range of GPIO pins in that group */ +static const struct dove_mpp_grp dove_mpp_grp[] = { [MPP_24_39] = { .start = 24, .end = 39, @@ -38,8 +31,8 @@ static struct dove_mpp_grp dove_mpp_grp[] = { .end = 45, }, [MPP_46_51] = { - .start = 40, - .end = 45, + .start = 46, + .end = 51, }, [MPP_58_61] = { .start = 58, @@ -51,6 +44,8 @@ static struct dove_mpp_grp dove_mpp_grp[] = { }, }; +/* Enable gpio for a range of pins. mode should be a combination of + GPIO_OUTPUT_OK | GPIO_INPUT_OK */ static void dove_mpp_gpio_mode(int start, int end, int gpio_mode) { int i; @@ -59,24 +54,17 @@ static void dove_mpp_gpio_mode(int start, int end, int gpio_mode) orion_gpio_set_valid(i, gpio_mode); } +/* Dump all the extra MPP registers. The platform code will dump the + registers for pins 0-23. */ static void dove_mpp_dump_regs(void) { -#ifdef DEBUG - int i; + pr_debug("PMU_CTRL4_CTRL: %08x\n", + readl(DOVE_MPP_CTRL4_VIRT_BASE)); - pr_debug("MPP_CTRL regs:"); - for (i = 0; i < MPP_NR_REGS; i++) - printk(" %08x", readl(MPP_CTRL(i))); - printk("\n"); + pr_debug("PMU_MPP_GENERAL_CTRL: %08x\n", + readl(DOVE_PMU_MPP_GENERAL_CTRL)); - pr_debug("PMU_SIG_CTRL regs:"); - for (i = 0; i < PMU_SIG_REGS; i++) - printk(" %08x", readl(PMU_SIG_CTRL(i))); - printk("\n"); - - pr_debug("PMU_MPP_GENERAL_CTRL: %08x\n", readl(DOVE_PMU_MPP_GENERAL_CTRL)); pr_debug("MPP_GENERAL: %08x\n", readl(DOVE_MPP_GENERAL_VIRT_BASE)); -#endif } static void dove_mpp_cfg_nfc(int sel) @@ -92,7 +80,7 @@ static void dove_mpp_cfg_nfc(int sel) static void dove_mpp_cfg_au1(int sel) { - u32 mpp_ctrl4 = readl(DOVE_MPP_CTRL4_VIRT_BASE); + u32 mpp_ctrl4 = readl(DOVE_MPP_CTRL4_VIRT_BASE); u32 ssp_ctrl1 = readl(DOVE_SSP_CTRL_STATUS_1); u32 mpp_gen_ctrl = readl(DOVE_MPP_GENERAL_VIRT_BASE); u32 global_cfg_2 = readl(DOVE_GLOBAL_CONFIG_2); @@ -128,82 +116,46 @@ static void dove_mpp_cfg_au1(int sel) writel(global_cfg_2, DOVE_GLOBAL_CONFIG_2); } -static void dove_mpp_conf_grp(int num, int sel, u32 *mpp_ctrl) -{ - int start = dove_mpp_grp[num].start; - int end = dove_mpp_grp[num].end; - int gpio_mode = sel ? GPIO_OUTPUT_OK | GPIO_INPUT_OK : 0; - - *mpp_ctrl &= ~(0x1 << num); - *mpp_ctrl |= sel << num; - - dove_mpp_gpio_mode(start, end, gpio_mode); -} - -void __init dove_mpp_conf(unsigned int *mpp_list) +/* Configure the group registers, enabling GPIO if sel indicates the + pin is to be used for GPIO */ +static void dove_mpp_conf_grp(unsigned int *mpp_grp_list) { - u32 mpp_ctrl[MPP_NR_REGS]; - u32 pmu_mpp_ctrl = 0; - u32 pmu_sig_ctrl[PMU_SIG_REGS]; - int i; - - for (i = 0; i < MPP_NR_REGS; i++) - mpp_ctrl[i] = readl(MPP_CTRL(i)); - - for (i = 0; i < PMU_SIG_REGS; i++) - pmu_sig_ctrl[i] = readl(PMU_SIG_CTRL(i)); - - pmu_mpp_ctrl = readl(DOVE_PMU_MPP_GENERAL_CTRL); + u32 mpp_ctrl4 = readl(DOVE_MPP_CTRL4_VIRT_BASE); + int gpio_mode; - dove_mpp_dump_regs(); - - for ( ; *mpp_list != MPP_END; mpp_list++) { - unsigned int num = MPP_NUM(*mpp_list); - unsigned int sel = MPP_SEL(*mpp_list); - int shift, gpio_mode; - - if (num > MPP_MAX) { - pr_err("dove: invalid MPP number (%u)\n", num); - continue; - } - - if (*mpp_list & MPP_NFC_MASK) { - dove_mpp_cfg_nfc(sel); - continue; - } + for ( ; *mpp_grp_list; mpp_grp_list++) { + unsigned int num = MPP_NUM(*mpp_grp_list); + unsigned int sel = MPP_SEL(*mpp_grp_list); - if (*mpp_list & MPP_AU1_MASK) { - dove_mpp_cfg_au1(sel); + if (num > MPP_GRP_MAX) { + pr_err("dove: invalid MPP GRP number (%u)\n", num); continue; } - if (*mpp_list & MPP_GRP_MASK) { - dove_mpp_conf_grp(num, sel, &mpp_ctrl[3]); - continue; - } - - shift = (num & 7) << 2; - if (*mpp_list & MPP_PMU_MASK) { - pmu_mpp_ctrl |= (0x1 << num); - pmu_sig_ctrl[num / 8] &= ~(0xf << shift); - pmu_sig_ctrl[num / 8] |= 0xf << shift; - gpio_mode = 0; - } else { - mpp_ctrl[num / 8] &= ~(0xf << shift); - mpp_ctrl[num / 8] |= sel << shift; - gpio_mode = GPIO_OUTPUT_OK | GPIO_INPUT_OK; - } + mpp_ctrl4 &= ~(0x1 << num); + mpp_ctrl4 |= sel << num; - orion_gpio_set_valid(num, gpio_mode); + gpio_mode = sel ? GPIO_OUTPUT_OK | GPIO_INPUT_OK : 0; + dove_mpp_gpio_mode(dove_mpp_grp[num].start, + dove_mpp_grp[num].end, gpio_mode); } + writel(mpp_ctrl4, DOVE_MPP_CTRL4_VIRT_BASE); +} - for (i = 0; i < MPP_NR_REGS; i++) - writel(mpp_ctrl[i], MPP_CTRL(i)); +/* Configure the various MPP pins on Dove */ +void __init dove_mpp_conf(unsigned int *mpp_list, + unsigned int *mpp_grp_list, + unsigned int grp_au1_52_57, + unsigned int grp_nfc_64_71) +{ + dove_mpp_dump_regs(); - for (i = 0; i < PMU_SIG_REGS; i++) - writel(pmu_sig_ctrl[i], PMU_SIG_CTRL(i)); + /* Use platform code for pins 0-23 */ + orion_mpp_conf(mpp_list, 0, MPP_MAX, DOVE_MPP_VIRT_BASE); - writel(pmu_mpp_ctrl, DOVE_PMU_MPP_GENERAL_CTRL); + dove_mpp_conf_grp(mpp_grp_list); + dove_mpp_cfg_au1(grp_au1_52_57); + dove_mpp_cfg_nfc(grp_nfc_64_71); dove_mpp_dump_regs(); } diff --git a/arch/arm/mach-dove/mpp.h b/arch/arm/mach-dove/mpp.h index 2a43ce4..fbec7c5 100644 --- a/arch/arm/mach-dove/mpp.h +++ b/arch/arm/mach-dove/mpp.h @@ -1,178 +1,150 @@ #ifndef __ARCH_DOVE_MPP_CODED_H #define __ARCH_DOVE_MPP_CODED_H -#define MPP(_num, _mode, _pmu, _grp, _au1, _nfc) ( \ -/* MPP/group number */ ((_num) & 0xff) | \ -/* MPP select value */ (((_mode) & 0xf) << 8) | \ -/* MPP PMU */ ((!!(_pmu)) << 12) | \ -/* group flag */ ((!!(_grp)) << 13) | \ -/* AU1 flag */ ((!!(_au1)) << 14) | \ -/* NFCE flag */ ((!!(_nfc)) << 15)) - -#define MPP_MAX 71 - -#define MPP_NUM(x) ((x) & 0xff) -#define MPP_SEL(x) (((x) >> 8) & 0xf) - -#define MPP_PMU_MASK MPP(0, 0x0, 1, 0, 0, 0) -#define MPP_GRP_MASK MPP(0, 0x0, 0, 1, 0, 0) -#define MPP_AU1_MASK MPP(0, 0x0, 0, 0, 1, 0) -#define MPP_NFC_MASK MPP(0, 0x0, 0, 0, 0, 1) - -#define MPP_END MPP(0xff, 0xf, 1, 1, 1, 1) - -#define MPP_PMU_DRIVE_0 0x1 -#define MPP_PMU_DRIVE_1 0x2 -#define MPP_PMU_SDI 0x3 -#define MPP_PMU_CPU_PWRDWN 0x4 -#define MPP_PMU_STBY_PWRDWN 0x5 -#define MPP_PMU_CORE_PWR_GOOD 0x8 -#define MPP_PMU_BAT_FAULT 0xa -#define MPP_PMU_EXT0_WU 0xb -#define MPP_PMU_EXT1_WU 0xc -#define MPP_PMU_EXT2_WU 0xd -#define MPP_PMU_BLINK 0xe -#define MPP_PMU(_num, _mode) MPP((_num), MPP_PMU_##_mode, 1, 0, 0, 0) - -#define MPP_PIN(_num, _mode) MPP((_num), (_mode), 0, 0, 0, 0) -#define MPP_GRP(_grp, _mode) MPP((_grp), (_mode), 0, 1, 0, 0) -#define MPP_GRP_AU1(_mode) MPP(0, (_mode), 0, 0, 1, 0) -#define MPP_GRP_NFC(_mode) MPP(0, (_mode), 0, 0, 0, 1) - -#define MPP0_GPIO0 MPP_PIN(0, 0x0) -#define MPP0_UA2_RTSn MPP_PIN(0, 0x2) -#define MPP0_SDIO0_CD MPP_PIN(0, 0x3) -#define MPP0_LCD0_PWM MPP_PIN(0, 0xf) - -#define MPP1_GPIO1 MPP_PIN(1, 0x0) -#define MPP1_UA2_CTSn MPP_PIN(1, 0x2) -#define MPP1_SDIO0_WP MPP_PIN(1, 0x3) -#define MPP1_LCD1_PWM MPP_PIN(1, 0xf) - -#define MPP2_GPIO2 MPP_PIN(2, 0x0) -#define MPP2_SATA_PRESENT MPP_PIN(2, 0x1) -#define MPP2_UA2_TXD MPP_PIN(2, 0x2) -#define MPP2_SDIO0_BUS_POWER MPP_PIN(2, 0x3) -#define MPP2_UA_RTSn1 MPP_PIN(2, 0x4) - -#define MPP3_GPIO3 MPP_PIN(3, 0x0) -#define MPP3_SATA_ACT MPP_PIN(3, 0x1) -#define MPP3_UA2_RXD MPP_PIN(3, 0x2) -#define MPP3_SDIO0_LED_CTRL MPP_PIN(3, 0x3) -#define MPP3_UA_CTSn1 MPP_PIN(3, 0x4) -#define MPP3_SPI_LCD_CS1 MPP_PIN(3, 0xf) - -#define MPP4_GPIO4 MPP_PIN(4, 0x0) -#define MPP4_UA3_RTSn MPP_PIN(4, 0x2) -#define MPP4_SDIO1_CD MPP_PIN(4, 0x3) -#define MPP4_SPI_1_MISO MPP_PIN(4, 0x4) - -#define MPP5_GPIO5 MPP_PIN(5, 0x0) -#define MPP5_UA3_CTSn MPP_PIN(5, 0x2) -#define MPP5_SDIO1_WP MPP_PIN(5, 0x3) -#define MPP5_SPI_1_CS MPP_PIN(5, 0x4) - -#define MPP6_GPIO6 MPP_PIN(6, 0x0) -#define MPP6_UA3_TXD MPP_PIN(6, 0x2) -#define MPP6_SDIO1_BUS_POWER MPP_PIN(6, 0x3) -#define MPP6_SPI_1_MOSI MPP_PIN(6, 0x4) - -#define MPP7_GPIO7 MPP_PIN(7, 0x0) -#define MPP7_UA3_RXD MPP_PIN(7, 0x2) -#define MPP7_SDIO1_LED_CTRL MPP_PIN(7, 0x3) -#define MPP7_SPI_1_SCK MPP_PIN(7, 0x4) - -#define MPP8_GPIO8 MPP_PIN(8, 0x0) -#define MPP8_WD_RST_OUT MPP_PIN(8, 0x1) - -#define MPP9_GPIO9 MPP_PIN(9, 0x0) -#define MPP9_PEX1_CLKREQn MPP_PIN(9, 0x5) - -#define MPP10_GPIO10 MPP_PIN(10, 0x0) -#define MPP10_SSP_SCLK MPP_PIN(10, 0x5) - -#define MPP11_GPIO11 MPP_PIN(11, 0x0) -#define MPP11_SATA_PRESENT MPP_PIN(11, 0x1) -#define MPP11_SATA_ACT MPP_PIN(11, 0x2) -#define MPP11_SDIO0_LED_CTRL MPP_PIN(11, 0x3) -#define MPP11_SDIO1_LED_CTRL MPP_PIN(11, 0x4) -#define MPP11_PEX0_CLKREQn MPP_PIN(11, 0x5) - -#define MPP12_GPIO12 MPP_PIN(12, 0x0) -#define MPP12_SATA_ACT MPP_PIN(12, 0x1) -#define MPP12_UA2_RTSn MPP_PIN(12, 0x2) -#define MPP12_AD0_I2S_EXT_MCLK MPP_PIN(12, 0x3) -#define MPP12_SDIO1_CD MPP_PIN(12, 0x4) - -#define MPP13_GPIO13 MPP_PIN(13, 0x0) -#define MPP13_UA2_CTSn MPP_PIN(13, 0x2) -#define MPP13_AD1_I2S_EXT_MCLK MPP_PIN(13, 0x3) -#define MPP13_SDIO1WP MPP_PIN(13, 0x4) -#define MPP13_SSP_EXTCLK MPP_PIN(13, 0x5) - -#define MPP14_GPIO14 MPP_PIN(14, 0x0) -#define MPP14_UA2_TXD MPP_PIN(14, 0x2) -#define MPP14_SDIO1_BUS_POWER MPP_PIN(14, 0x4) -#define MPP14_SSP_RXD MPP_PIN(14, 0x5) - -#define MPP15_GPIO15 MPP_PIN(15, 0x0) -#define MPP15_UA2_RXD MPP_PIN(15, 0x2) -#define MPP15_SDIO1_LED_CTRL MPP_PIN(15, 0x4) -#define MPP15_SSP_SFRM MPP_PIN(15, 0x5) - -#define MPP16_GPIO16 MPP_PIN(16, 0x0) -#define MPP16_UA3_RTSn MPP_PIN(16, 0x2) -#define MPP16_SDIO0_CD MPP_PIN(16, 0x3) -#define MPP16_SPI_LCD_CS1 MPP_PIN(16, 0x4) -#define MPP16_AC97_SDATA_IN1 MPP_PIN(16, 0x5) - -#define MPP17_GPIO17 MPP_PIN(17, 0x0) -#define MPP17_AC97_SYSCLK_OUT MPP_PIN(17, 0x1) -#define MPP17_UA3_CTSn MPP_PIN(17, 0x2) -#define MPP17_SDIO0_WP MPP_PIN(17, 0x3) -#define MPP17_TW_SDA2 MPP_PIN(17, 0x4) -#define MPP17_AC97_SDATA_IN2 MPP_PIN(17, 0x5) - -#define MPP18_GPIO18 MPP_PIN(18, 0x0) -#define MPP18_UA3_TXD MPP_PIN(18, 0x2) -#define MPP18_SDIO0_BUS_POWER MPP_PIN(18, 0x3) -#define MPP18_LCD0_PWM MPP_PIN(18, 0x4) -#define MPP18_AC_SDATA_IN3 MPP_PIN(18, 0x5) - -#define MPP19_GPIO19 MPP_PIN(19, 0x0) -#define MPP19_UA3_RXD MPP_PIN(19, 0x2) -#define MPP19_SDIO0_LED_CTRL MPP_PIN(19, 0x3) -#define MPP19_TW_SCK2 MPP_PIN(19, 0x4) - -#define MPP20_GPIO20 MPP_PIN(20, 0x0) -#define MPP20_AC97_SYSCLK_OUT MPP_PIN(20, 0x1) -#define MPP20_SPI_LCD_MISO MPP_PIN(20, 0x2) -#define MPP20_SDIO1_CD MPP_PIN(20, 0x3) -#define MPP20_SDIO0_CD MPP_PIN(20, 0x5) -#define MPP20_SPI_1_MISO MPP_PIN(20, 0x6) - -#define MPP21_GPIO21 MPP_PIN(21, 0x0) -#define MPP21_UA1_RTSn MPP_PIN(21, 0x1) -#define MPP21_SPI_LCD_CS0 MPP_PIN(21, 0x2) -#define MPP21_SDIO1_WP MPP_PIN(21, 0x3) -#define MPP21_SSP_SFRM MPP_PIN(21, 0x4) -#define MPP21_SDIO0_WP MPP_PIN(21, 0x5) -#define MPP21_SPI_1_CS MPP_PIN(21, 0x6) - -#define MPP22_GPIO22 MPP_PIN(22, 0x0) -#define MPP22_UA1_CTSn MPP_PIN(22, 0x1) -#define MPP22_SPI_LCD_MOSI MPP_PIN(22, 0x2) -#define MPP22_SDIO1_BUS_POWER MPP_PIN(22, 0x3) -#define MPP22_SSP_TXD MPP_PIN(22, 0x4) -#define MPP22_SDIO0_BUS_POWER MPP_PIN(22, 0x5) -#define MPP22_SPI_1_MOSI MPP_PIN(22, 0x6) - -#define MPP23_GPIO23 MPP_PIN(23, 0x0) -#define MPP23_SPI_LCD_SCK MPP_PIN(23, 0x2) -#define MPP23_SDIO1_LED_CTRL MPP_PIN(23, 0x3) -#define MPP23_SSP_SCLK MPP_PIN(23, 0x4) -#define MPP23_SDIO0_LED_CTRL MPP_PIN(23, 0x5) -#define MPP23_SPI_1_SCK MPP_PIN(23, 0x6) +#define MPP(_num, _sel, _in, _out) ( \ + /* MPP number */ ((_num) & 0xff) | \ + /* MPP select value */ (((_sel) & 0xf) << 8) | \ + /* may be input signal */ ((!!(_in)) << 12) | \ + /* may be output signal */ ((!!(_out)) << 13)) + +#define MPP0_GPIO0 MPP(0, 0x0, 1, 1) +#define MPP0_UA2_RTSn MPP(0, 0x2, 0, 0) +#define MPP0_SDIO0_CD MPP(0, 0x3, 0, 0) +#define MPP0_LCD0_PWM MPP(0, 0xf, 0, 0) + +#define MPP1_GPIO1 MPP(1, 0x0, 1, 1) +#define MPP1_UA2_CTSn MPP(1, 0x2, 0, 0) +#define MPP1_SDIO0_WP MPP(1, 0x3, 0, 0) +#define MPP1_LCD1_PWM MPP(1, 0xf, 0, 0) + +#define MPP2_GPIO2 MPP(2, 0x0, 1, 1) +#define MPP2_SATA_PRESENT MPP(2, 0x1, 0, 0) +#define MPP2_UA2_TXD MPP(2, 0x2, 0, 0) +#define MPP2_SDIO0_BUS_POWER MPP(2, 0x3, 0, 0) +#define MPP2_UA_RTSn1 MPP(2, 0x4, 0, 0) + +#define MPP3_GPIO3 MPP(3, 0x0, 1, 1) +#define MPP3_SATA_ACT MPP(3, 0x1, 0, 0) +#define MPP3_UA2_RXD MPP(3, 0x2, 0, 0) +#define MPP3_SDIO0_LED_CTRL MPP(3, 0x3, 0, 0) +#define MPP3_UA_CTSn1 MPP(3, 0x4, 0, 0) +#define MPP3_SPI_LCD_CS1 MPP(3, 0xf, 0, 0) + +#define MPP4_GPIO4 MPP(4, 0x0, 1, 1) +#define MPP4_UA3_RTSn MPP(4, 0x2, 0, 0) +#define MPP4_SDIO1_CD MPP(4, 0x3, 0, 0) +#define MPP4_SPI_1_MISO MPP(4, 0x4, 0, 0) + +#define MPP5_GPIO5 MPP(5, 0x0, 1, 1) +#define MPP5_UA3_CTSn MPP(5, 0x2, 0, 0) +#define MPP5_SDIO1_WP MPP(5, 0x3, 0, 0) +#define MPP5_SPI_1_CS MPP(5, 0x4, 0, 0) + +#define MPP6_GPIO6 MPP(6, 0x0, 1, 1) +#define MPP6_UA3_TXD MPP(6, 0x2, 0, 0) +#define MPP6_SDIO1_BUS_POWER MPP(6, 0x3, 0, 0) +#define MPP6_SPI_1_MOSI MPP(6, 0x4, 0, 0) + +#define MPP7_GPIO7 MPP(7, 0x0, 1, 1) +#define MPP7_UA3_RXD MPP(7, 0x2, 0, 0) +#define MPP7_SDIO1_LED_CTRL MPP(7, 0x3, 0, 0) +#define MPP7_SPI_1_SCK MPP(7, 0x4, 0, 0) + +#define MPP8_GPIO8 MPP(8, 0x0, 1, 1) +#define MPP8_WD_RST_OUT MPP(8, 0x1, 0, 0) + +#define MPP9_GPIO9 MPP(9, 0x0, 1, 1) +#define MPP9_PEX1_CLKREQn MPP(9, 0x5, 0, 0) + +#define MPP10_GPIO10 MPP(10, 0x0, 1, 1) +#define MPP10_SSP_SCLK MPP(10, 0x5, 0, 0) + +#define MPP11_GPIO11 MPP(11, 0x0, 1, 1) +#define MPP11_SATA_PRESENT MPP(11, 0x1, 0, 0) +#define MPP11_SATA_ACT MPP(11, 0x2, 0, 0) +#define MPP11_SDIO0_LED_CTRL MPP(11, 0x3, 0, 0) +#define MPP11_SDIO1_LED_CTRL MPP(11, 0x4, 0, 0) +#define MPP11_PEX0_CLKREQn MPP(11, 0x5, 0, 0) + +#define MPP12_GPIO12 MPP(12, 0x0, 1, 1) +#define MPP12_SATA_ACT MPP(12, 0x1, 0, 0) +#define MPP12_UA2_RTSn MPP(12, 0x2, 0, 0) +#define MPP12_AD0_I2S_EXT_MCLK MPP(12, 0x3, 0, 0) +#define MPP12_SDIO1_CD MPP(12, 0x4, 0, 0) + +#define MPP13_GPIO13 MPP(13, 0x0, 1, 1) +#define MPP13_UA2_CTSn MPP(13, 0x2, 0, 0) +#define MPP13_AD1_I2S_EXT_MCLK MPP(13, 0x3, 0, 0) +#define MPP13_SDIO1WP MPP(13, 0x4, 0, 0) +#define MPP13_SSP_EXTCLK MPP(13, 0x5, 0, 0) + +#define MPP14_GPIO14 MPP(14, 0x0, 1, 1) +#define MPP14_UA2_TXD MPP(14, 0x2, 0, 0) +#define MPP14_SDIO1_BUS_POWER MPP(14, 0x4, 0, 0) +#define MPP14_SSP_RXD MPP(14, 0x5, 0, 0) + +#define MPP15_GPIO15 MPP(15, 0x0, 1, 1) +#define MPP15_UA2_RXD MPP(15, 0x2, 0, 0) +#define MPP15_SDIO1_LED_CTRL MPP(15, 0x4, 0, 0) +#define MPP15_SSP_SFRM MPP(15, 0x5, 0, 0) + +#define MPP16_GPIO16 MPP(16, 0x0, 1, 1) +#define MPP16_UA3_RTSn MPP(16, 0x2, 0, 0) +#define MPP16_SDIO0_CD MPP(16, 0x3, 0, 0) +#define MPP16_SPI_LCD_CS1 MPP(16, 0x4, 0, 0) +#define MPP16_AC97_SDATA_IN1 MPP(16, 0x5, 0, 0) + +#define MPP17_GPIO17 MPP(17, 0x0, 1, 1) +#define MPP17_AC97_SYSCLK_OUT MPP(17, 0x1, 0, 0) +#define MPP17_UA3_CTSn MPP(17, 0x2, 0, 0) +#define MPP17_SDIO0_WP MPP(17, 0x3, 0, 0) +#define MPP17_TW_SDA2 MPP(17, 0x4, 0, 0) +#define MPP17_AC97_SDATA_IN2 MPP(17, 0x5, 0, 0) + +#define MPP18_GPIO18 MPP(18, 0x0, 1, 1) +#define MPP18_UA3_TXD MPP(18, 0x2, 0, 0) +#define MPP18_SDIO0_BUS_POWER MPP(18, 0x3, 0, 0) +#define MPP18_LCD0_PWM MPP(18, 0x4, 0, 0) +#define MPP18_AC_SDATA_IN3 MPP(18, 0x5, 0, 0) + +#define MPP19_GPIO19 MPP(19, 0x0, 1, 1) +#define MPP19_UA3_RXD MPP(19, 0x2, 0, 0) +#define MPP19_SDIO0_LED_CTRL MPP(19, 0x3, 0, 0) +#define MPP19_TW_SCK2 MPP(19, 0x4, 0, 0) + +#define MPP20_GPIO20 MPP(20, 0x0, 1, 1) +#define MPP20_AC97_SYSCLK_OUT MPP(20, 0x1, 0, 0) +#define MPP20_SPI_LCD_MISO MPP(20, 0x2, 0, 0) +#define MPP20_SDIO1_CD MPP(20, 0x3, 0, 0) +#define MPP20_SDIO0_CD MPP(20, 0x5, 0, 0) +#define MPP20_SPI_1_MISO MPP(20, 0x6, 0, 0) + +#define MPP21_GPIO21 MPP(21, 0x0, 1, 1) +#define MPP21_UA1_RTSn MPP(21, 0x1, 0, 0) +#define MPP21_SPI_LCD_CS0 MPP(21, 0x2, 0, 0) +#define MPP21_SDIO1_WP MPP(21, 0x3, 0, 0) +#define MPP21_SSP_SFRM MPP(21, 0x4, 0, 0) +#define MPP21_SDIO0_WP MPP(21, 0x5, 0, 0) +#define MPP21_SPI_1_CS MPP(21, 0x6, 0, 0) + +#define MPP22_GPIO22 MPP(22, 0x0, 1, 1) +#define MPP22_UA1_CTSn MPP(22, 0x1, 0, 0) +#define MPP22_SPI_LCD_MOSI MPP(22, 0x2, 0, 0) +#define MPP22_SDIO1_BUS_POWER MPP(22, 0x3, 0, 0) +#define MPP22_SSP_TXD MPP(22, 0x4, 0, 0) +#define MPP22_SDIO0_BUS_POWER MPP(22, 0x5, 0, 0) +#define MPP22_SPI_1_MOSI MPP(22, 0x6, 0, 0) + +#define MPP23_GPIO23 MPP(23, 0x0, 1, 1) +#define MPP23_SPI_LCD_SCK MPP(23, 0x2, 0, 0) +#define MPP23_SDIO1_LED_CTRL MPP(23, 0x3, 0, 0) +#define MPP23_SSP_SCLK MPP(23, 0x4, 0, 0) +#define MPP23_SDIO0_LED_CTRL MPP(23, 0x5, 0, 0) +#define MPP23_SPI_1_SCK MPP(23, 0x6, 0, 0) + +#define MPP_MAX 23 + +#define MPP_GRP(_grp, _mode) MPP((_grp), (_mode), 0, 0) /* for MPP groups _num is a group index */ enum dove_mpp_grp_idx { @@ -181,40 +153,44 @@ enum dove_mpp_grp_idx { MPP_46_51 = 1, MPP_58_61 = 5, MPP_62_63 = 4, + MPP_GRP_MAX = 5, }; -#define MPP24_39_GPIO MPP_GRP(MPP_24_39, 0x1) -#define MPP24_39_CAM MPP_GRP(MPP_24_39, 0x0) +#define MPP_GRP_24_39_GPIO MPP_GRP(MPP_24_39, 0x1) +#define MPP_GRP_24_39_CAM MPP_GRP(MPP_24_39, 0x0) -#define MPP40_45_GPIO MPP_GRP(MPP_40_45, 0x1) -#define MPP40_45_SD0 MPP_GRP(MPP_40_45, 0x0) +#define MPP_GRP_40_45_GPIO MPP_GRP(MPP_40_45, 0x1) +#define MPP_GRP_40_45_SD0 MPP_GRP(MPP_40_45, 0x0) -#define MPP46_51_GPIO MPP_GRP(MPP_46_51, 0x1) -#define MPP46_51_SD1 MPP_GRP(MPP_46_51, 0x0) +#define MPP_GRP_46_51_GPIO MPP_GRP(MPP_46_51, 0x1) +#define MPP_GRP_46_51_SD1 MPP_GRP(MPP_46_51, 0x0) -#define MPP58_61_GPIO MPP_GRP(MPP_58_61, 0x1) -#define MPP58_61_SPI MPP_GRP(MPP_58_61, 0x0) +#define MPP_GRP_58_61_GPIO MPP_GRP(MPP_58_61, 0x1) +#define MPP_GRP_58_61_SPI MPP_GRP(MPP_58_61, 0x0) -#define MPP62_63_GPIO MPP_GRP(MPP_62_63, 0x1) -#define MPP62_63_UA1 MPP_GRP(MPP_62_63, 0x0) +#define MPP_GRP_62_63_GPIO MPP_GRP(MPP_62_63, 0x1) +#define MPP_GRP_62_63_UA1 MPP_GRP(MPP_62_63, 0x0) /* The MPP[64:71] control differs from other groups */ -#define MPP64_71_GPO MPP_GRP_NFC(0x1) -#define MPP64_71_NFC MPP_GRP_NFC(0x0) +#define MPP_GRP_NFC_64_71_GPO 0x1 +#define MPP_GRP_NFC_64_71_NFC 0x0 /* * The MPP[52:57] functionality is encoded by 4 bits in different * registers. The _num field in this case encodes those bits in * correspodence with Table 135 of 88AP510 Functional specification */ -#define MPP52_57_AU1 MPP_GRP_AU1(0x0) -#define MPP52_57_AU1_GPIO57 MPP_GRP_AU1(0x2) -#define MPP52_57_GPIO MPP_GRP_AU1(0xa) -#define MPP52_57_TW_GPIO MPP_GRP_AU1(0xb) -#define MPP52_57_AU1_SSP MPP_GRP_AU1(0xc) -#define MPP52_57_SSP_GPIO MPP_GRP_AU1(0xe) -#define MPP52_57_SSP_TW MPP_GRP_AU1(0xf) - -void dove_mpp_conf(unsigned int *mpp_list); +#define MPP_GRP_AU1_52_57_AU1 0x0 +#define MPP_GRP_AU1_52_57_AU1_GPIO57 0x2 +#define MPP_GRP_AU1_52_57_GPIO 0xa +#define MPP_GRP_AU1_52_57_TW_GPIO 0xb +#define MPP_GRP_AU1_52_57_AU1_SSP 0xc +#define MPP_GRP_AU1_52_57_SSP_GPIO 0xe +#define MPP_GRP_AU1_52_57_SSP_TW 0xf + +void dove_mpp_conf(unsigned int *mpp_list, + unsigned int *mpp_grp_list, + unsigned int grp_au1_52_57, + unsigned int grp_nfc_64_71); #endif /* __ARCH_DOVE_MPP_CODED_H */ diff --git a/arch/arm/mach-ep93xx/gpio.c b/arch/arm/mach-ep93xx/gpio.c index a5a9ff7..415dce3 100644 --- a/arch/arm/mach-ep93xx/gpio.c +++ b/arch/arm/mach-ep93xx/gpio.c @@ -356,29 +356,6 @@ static int ep93xx_gpio_set_debounce(struct gpio_chip *chip, return 0; } -static void ep93xx_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) -{ - struct ep93xx_gpio_chip *ep93xx_chip = to_ep93xx_gpio_chip(chip); - u8 data_reg, data_dir_reg; - int gpio, i; - - data_reg = __raw_readb(ep93xx_chip->data_reg); - data_dir_reg = __raw_readb(ep93xx_chip->data_dir_reg); - - gpio = ep93xx_chip->chip.base; - for (i = 0; i < chip->ngpio; i++, gpio++) { - int is_out = data_dir_reg & (1 << i); - int irq = gpio_to_irq(gpio); - - seq_printf(s, " %s%d gpio-%-3d (%-12s) %s %s %s\n", - chip->label, i, gpio, - gpiochip_is_requested(chip, i) ? : "", - is_out ? "out" : "in ", - (data_reg & (1<< i)) ? "hi" : "lo", - (!is_out && irq>= 0) ? "(interrupt)" : ""); - } -} - #define EP93XX_GPIO_BANK(name, dr, ddr, base_gpio) \ { \ .chip = { \ @@ -387,7 +364,6 @@ static void ep93xx_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) .direction_output = ep93xx_gpio_direction_output, \ .get = ep93xx_gpio_get, \ .set = ep93xx_gpio_set, \ - .dbg_show = ep93xx_gpio_dbg_show, \ .base = base_gpio, \ .ngpio = 8, \ }, \ diff --git a/arch/arm/mach-exynos4/Kconfig b/arch/arm/mach-exynos4/Kconfig index e849f67..8051962 100644 --- a/arch/arm/mach-exynos4/Kconfig +++ b/arch/arm/mach-exynos4/Kconfig @@ -170,6 +170,7 @@ config MACH_NURI select S3C_DEV_HSMMC3 select S3C_DEV_I2C1 select S3C_DEV_I2C5 + select S5P_DEV_USB_EHCI select EXYNOS4_SETUP_I2C1 select EXYNOS4_SETUP_I2C5 select EXYNOS4_SETUP_SDHCI diff --git a/arch/arm/mach-exynos4/Makefile b/arch/arm/mach-exynos4/Makefile index 9be104f..7778975 100644 --- a/arch/arm/mach-exynos4/Makefile +++ b/arch/arm/mach-exynos4/Makefile @@ -54,3 +54,5 @@ obj-$(CONFIG_EXYNOS4_SETUP_I2C7) += setup-i2c7.o obj-$(CONFIG_EXYNOS4_SETUP_KEYPAD) += setup-keypad.o obj-$(CONFIG_EXYNOS4_SETUP_SDHCI) += setup-sdhci.o obj-$(CONFIG_EXYNOS4_SETUP_SDHCI_GPIO) += setup-sdhci-gpio.o + +obj-$(CONFIG_USB_SUPPORT) += usb-phy.o diff --git a/arch/arm/mach-exynos4/cpu.c b/arch/arm/mach-exynos4/cpu.c index 7930113..08813a6 100644 --- a/arch/arm/mach-exynos4/cpu.c +++ b/arch/arm/mach-exynos4/cpu.c @@ -97,7 +97,12 @@ static struct map_desc exynos4_iodesc[] __initdata = { .pfn = __phys_to_pfn(EXYNOS4_PA_SROMC), .length = SZ_4K, .type = MT_DEVICE, - }, + }, { + .virtual = (unsigned long)S5P_VA_USB_HSPHY, + .pfn = __phys_to_pfn(EXYNOS4_PA_HSPHY), + .length = SZ_4K, + .type = MT_DEVICE, + } }; static void exynos4_idle(void) diff --git a/arch/arm/mach-exynos4/include/mach/map.h b/arch/arm/mach-exynos4/include/mach/map.h index 6330b73..0009e77 100644 --- a/arch/arm/mach-exynos4/include/mach/map.h +++ b/arch/arm/mach-exynos4/include/mach/map.h @@ -101,6 +101,9 @@ #define EXYNOS4_PA_SROMC 0x12570000 +#define EXYNOS4_PA_EHCI 0x12580000 +#define EXYNOS4_PA_HSPHY 0x125B0000 + #define EXYNOS4_PA_UART 0x13800000 #define EXYNOS4_PA_IIC(x) (0x13860000 + ((x) * 0x10000)) @@ -143,6 +146,7 @@ #define S5P_PA_SROMC EXYNOS4_PA_SROMC #define S5P_PA_SYSCON EXYNOS4_PA_SYSCON #define S5P_PA_TIMER EXYNOS4_PA_TIMER +#define S5P_PA_EHCI EXYNOS4_PA_EHCI #define SAMSUNG_PA_KEYPAD EXYNOS4_PA_KEYPAD diff --git a/arch/arm/mach-exynos4/include/mach/regs-pmu.h b/arch/arm/mach-exynos4/include/mach/regs-pmu.h index 62b0014..a964337 100644 --- a/arch/arm/mach-exynos4/include/mach/regs-pmu.h +++ b/arch/arm/mach-exynos4/include/mach/regs-pmu.h @@ -33,6 +33,9 @@ #define S5P_EINT_WAKEUP_MASK S5P_PMUREG(0x0604) #define S5P_WAKEUP_MASK S5P_PMUREG(0x0608) +#define S5P_USBHOST_PHY_CONTROL S5P_PMUREG(0x0708) +#define S5P_USBHOST_PHY_ENABLE (1 << 0) + #define S5P_MIPI_DPHY_CONTROL(n) S5P_PMUREG(0x0710 + (n) * 4) #define S5P_MIPI_DPHY_ENABLE (1 << 0) #define S5P_MIPI_DPHY_SRESETN (1 << 1) diff --git a/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h b/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h new file mode 100644 index 0000000..703118d --- /dev/null +++ b/arch/arm/mach-exynos4/include/mach/regs-usb-phy.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __PLAT_S5P_REGS_USB_PHY_H +#define __PLAT_S5P_REGS_USB_PHY_H + +#define EXYNOS4_HSOTG_PHYREG(x) ((x) + S5P_VA_USB_HSPHY) + +#define EXYNOS4_PHYPWR EXYNOS4_HSOTG_PHYREG(0x00) +#define PHY1_HSIC_NORMAL_MASK (0xf << 9) +#define PHY1_HSIC1_SLEEP (1 << 12) +#define PHY1_HSIC1_FORCE_SUSPEND (1 << 11) +#define PHY1_HSIC0_SLEEP (1 << 10) +#define PHY1_HSIC0_FORCE_SUSPEND (1 << 9) + +#define PHY1_STD_NORMAL_MASK (0x7 << 6) +#define PHY1_STD_SLEEP (1 << 8) +#define PHY1_STD_ANALOG_POWERDOWN (1 << 7) +#define PHY1_STD_FORCE_SUSPEND (1 << 6) + +#define PHY0_NORMAL_MASK (0x39 << 0) +#define PHY0_SLEEP (1 << 5) +#define PHY0_OTG_DISABLE (1 << 4) +#define PHY0_ANALOG_POWERDOWN (1 << 3) +#define PHY0_FORCE_SUSPEND (1 << 0) + +#define EXYNOS4_PHYCLK EXYNOS4_HSOTG_PHYREG(0x04) +#define PHY1_COMMON_ON_N (1 << 7) +#define PHY0_COMMON_ON_N (1 << 4) +#define PHY0_ID_PULLUP (1 << 2) +#define CLKSEL_MASK (0x3 << 0) +#define CLKSEL_SHIFT (0) +#define CLKSEL_48M (0x0 << 0) +#define CLKSEL_12M (0x2 << 0) +#define CLKSEL_24M (0x3 << 0) + +#define EXYNOS4_RSTCON EXYNOS4_HSOTG_PHYREG(0x08) +#define HOST_LINK_PORT_SWRST_MASK (0xf << 6) +#define HOST_LINK_PORT2_SWRST (1 << 9) +#define HOST_LINK_PORT1_SWRST (1 << 8) +#define HOST_LINK_PORT0_SWRST (1 << 7) +#define HOST_LINK_ALL_SWRST (1 << 6) + +#define PHY1_SWRST_MASK (0x7 << 3) +#define PHY1_HSIC_SWRST (1 << 5) +#define PHY1_STD_SWRST (1 << 4) +#define PHY1_ALL_SWRST (1 << 3) + +#define PHY0_SWRST_MASK (0x7 << 0) +#define PHY0_PHYLINK_SWRST (1 << 2) +#define PHY0_HLINK_SWRST (1 << 1) +#define PHY0_SWRST (1 << 0) + +#define EXYNOS4_PHY1CON EXYNOS4_HSOTG_PHYREG(0x34) +#define FPENABLEN (1 << 0) + +#endif /* __PLAT_S5P_REGS_USB_PHY_H */ diff --git a/arch/arm/mach-exynos4/include/mach/smp.h b/arch/arm/mach-exynos4/include/mach/smp.h deleted file mode 100644 index a463dce..0000000 --- a/arch/arm/mach-exynos4/include/mach/smp.h +++ /dev/null @@ -1,19 +0,0 @@ -/* linux/arch/arm/mach-exynos4/include/mach/smp.h - * - * Cloned from arch/arm/mach-realview/include/mach/smp.h -*/ - -#ifndef ASM_ARCH_SMP_H -#define ASM_ARCH_SMP_H __FILE__ - -#include - -/* - * We use IRQ1 as the IPI - */ -static inline void smp_cross_call(const struct cpumask *mask, int ipi) -{ - gic_raise_softirq(mask, ipi); -} - -#endif diff --git a/arch/arm/mach-exynos4/irq-combiner.c b/arch/arm/mach-exynos4/irq-combiner.c index f488b66..5a2758a 100644 --- a/arch/arm/mach-exynos4/irq-combiner.c +++ b/arch/arm/mach-exynos4/irq-combiner.c @@ -59,8 +59,7 @@ static void combiner_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) unsigned int cascade_irq, combiner_irq; unsigned long status; - /* primary controller ack'ing */ - chip->irq_ack(&desc->irq_data); + chained_irq_enter(chip, desc); spin_lock(&irq_controller_lock); status = __raw_readl(chip_data->base + COMBINER_INT_STATUS); @@ -79,8 +78,7 @@ static void combiner_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) generic_handle_irq(cascade_irq); out: - /* primary controller unmasking */ - chip->irq_unmask(&desc->irq_data); + chained_irq_exit(chip, desc); } static struct irq_chip combiner_chip = { diff --git a/arch/arm/mach-exynos4/mach-nuri.c b/arch/arm/mach-exynos4/mach-nuri.c index b79ad01..bb5d12f 100644 --- a/arch/arm/mach-exynos4/mach-nuri.c +++ b/arch/arm/mach-exynos4/mach-nuri.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include @@ -262,6 +264,16 @@ static struct i2c_board_info i2c5_devs[] __initdata = { /* max8997, To be updated */ }; +/* USB EHCI */ +static struct s5p_ehci_platdata nuri_ehci_pdata; + +static void __init nuri_ehci_init(void) +{ + struct s5p_ehci_platdata *pdata = &nuri_ehci_pdata; + + s5p_ehci_set_platdata(pdata); +} + static struct platform_device *nuri_devices[] __initdata = { /* Samsung Platform Devices */ &emmc_fixed_voltage, @@ -270,6 +282,7 @@ static struct platform_device *nuri_devices[] __initdata = { &s3c_device_hsmmc3, &s3c_device_wdt, &s3c_device_timer[0], + &s5p_device_ehci, /* NURI Devices */ &nuri_gpio_keys, @@ -291,6 +304,9 @@ static void __init nuri_machine_init(void) i2c_register_board_info(1, i2c1_devs, ARRAY_SIZE(i2c1_devs)); i2c_register_board_info(5, i2c5_devs, ARRAY_SIZE(i2c5_devs)); + nuri_ehci_init(); + clk_xusbxti.rate = 24000000; + /* Last */ platform_add_devices(nuri_devices, ARRAY_SIZE(nuri_devices)); } diff --git a/arch/arm/mach-exynos4/platsmp.c b/arch/arm/mach-exynos4/platsmp.c index 6d35878..c5e65a0 100644 --- a/arch/arm/mach-exynos4/platsmp.c +++ b/arch/arm/mach-exynos4/platsmp.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -104,7 +105,7 @@ int __cpuinit boot_secondary(unsigned int cpu, struct task_struct *idle) * the boot monitor to read the system wide flags register, * and branch to the address found there. */ - smp_cross_call(cpumask_of(cpu), 1); + gic_raise_softirq(cpumask_of(cpu), 1); timeout = jiffies + (1 * HZ); while (time_before(jiffies, timeout)) { @@ -147,6 +148,8 @@ void __init smp_init_cpus(void) for (i = 0; i < ncores; i++) set_cpu_possible(i, true); + + set_smp_cross_call(gic_raise_softirq); } void __init platform_smp_prepare_cpus(unsigned int max_cpus) diff --git a/arch/arm/mach-exynos4/pm.c b/arch/arm/mach-exynos4/pm.c index 10d917d..8755ca8 100644 --- a/arch/arm/mach-exynos4/pm.c +++ b/arch/arm/mach-exynos4/pm.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -372,7 +373,27 @@ void exynos4_scu_enable(void __iomem *scu_base) flush_cache_all(); } -static int exynos4_pm_resume(struct sys_device *dev) +static struct sysdev_driver exynos4_pm_driver = { + .add = exynos4_pm_add, +}; + +static __init int exynos4_pm_drvinit(void) +{ + unsigned int tmp; + + s3c_pm_init(); + + /* All wakeup disable */ + + tmp = __raw_readl(S5P_WAKEUP_MASK); + tmp |= ((0xFF << 8) | (0x1F << 1)); + __raw_writel(tmp, S5P_WAKEUP_MASK); + + return sysdev_driver_register(&exynos4_sysclass, &exynos4_pm_driver); +} +arch_initcall(exynos4_pm_drvinit); + +static void exynos4_pm_resume(void) { /* For release retention */ @@ -394,27 +415,15 @@ static int exynos4_pm_resume(struct sys_device *dev) /* enable L2X0*/ writel_relaxed(1, S5P_VA_L2CC + L2X0_CTRL); #endif - - return 0; } -static struct sysdev_driver exynos4_pm_driver = { - .add = exynos4_pm_add, +static struct syscore_ops exynos4_pm_syscore_ops = { .resume = exynos4_pm_resume, }; -static __init int exynos4_pm_drvinit(void) +static __init int exynos4_pm_syscore_init(void) { - unsigned int tmp; - - s3c_pm_init(); - - /* All wakeup disable */ - - tmp = __raw_readl(S5P_WAKEUP_MASK); - tmp |= ((0xFF << 8) | (0x1F << 1)); - __raw_writel(tmp, S5P_WAKEUP_MASK); - - return sysdev_driver_register(&exynos4_sysclass, &exynos4_pm_driver); + register_syscore_ops(&exynos4_pm_syscore_ops); + return 0; } -arch_initcall(exynos4_pm_drvinit); +arch_initcall(exynos4_pm_syscore_init); diff --git a/arch/arm/mach-exynos4/usb-phy.c b/arch/arm/mach-exynos4/usb-phy.c new file mode 100644 index 0000000..0883c1b --- /dev/null +++ b/arch/arm/mach-exynos4/usb-phy.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * Author: Joonyoung Shim + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int exynos4_usb_phy1_init(struct platform_device *pdev) +{ + struct clk *otg_clk; + struct clk *xusbxti_clk; + u32 phyclk; + u32 rstcon; + int err; + + otg_clk = clk_get(&pdev->dev, "otg"); + if (IS_ERR(otg_clk)) { + dev_err(&pdev->dev, "Failed to get otg clock\n"); + return PTR_ERR(otg_clk); + } + + err = clk_enable(otg_clk); + if (err) { + clk_put(otg_clk); + return err; + } + + writel(readl(S5P_USBHOST_PHY_CONTROL) | S5P_USBHOST_PHY_ENABLE, + S5P_USBHOST_PHY_CONTROL); + + /* set clock frequency for PLL */ + phyclk = readl(EXYNOS4_PHYCLK) & ~CLKSEL_MASK; + + xusbxti_clk = clk_get(&pdev->dev, "xusbxti"); + if (xusbxti_clk && !IS_ERR(xusbxti_clk)) { + switch (clk_get_rate(xusbxti_clk)) { + case 12 * MHZ: + phyclk |= CLKSEL_12M; + break; + case 24 * MHZ: + phyclk |= CLKSEL_24M; + break; + default: + case 48 * MHZ: + /* default reference clock */ + break; + } + clk_put(xusbxti_clk); + } + + writel(phyclk, EXYNOS4_PHYCLK); + + /* floating prevention logic: disable */ + writel((readl(EXYNOS4_PHY1CON) | FPENABLEN), EXYNOS4_PHY1CON); + + /* set to normal HSIC 0 and 1 of PHY1 */ + writel((readl(EXYNOS4_PHYPWR) & ~PHY1_HSIC_NORMAL_MASK), + EXYNOS4_PHYPWR); + + /* set to normal standard USB of PHY1 */ + writel((readl(EXYNOS4_PHYPWR) & ~PHY1_STD_NORMAL_MASK), EXYNOS4_PHYPWR); + + /* reset all ports of both PHY and Link */ + rstcon = readl(EXYNOS4_RSTCON) | HOST_LINK_PORT_SWRST_MASK | + PHY1_SWRST_MASK; + writel(rstcon, EXYNOS4_RSTCON); + udelay(10); + + rstcon &= ~(HOST_LINK_PORT_SWRST_MASK | PHY1_SWRST_MASK); + writel(rstcon, EXYNOS4_RSTCON); + udelay(50); + + clk_disable(otg_clk); + clk_put(otg_clk); + + return 0; +} + +static int exynos4_usb_phy1_exit(struct platform_device *pdev) +{ + struct clk *otg_clk; + int err; + + otg_clk = clk_get(&pdev->dev, "otg"); + if (IS_ERR(otg_clk)) { + dev_err(&pdev->dev, "Failed to get otg clock\n"); + return PTR_ERR(otg_clk); + } + + err = clk_enable(otg_clk); + if (err) { + clk_put(otg_clk); + return err; + } + + writel((readl(EXYNOS4_PHYPWR) | PHY1_STD_ANALOG_POWERDOWN), + EXYNOS4_PHYPWR); + + writel(readl(S5P_USBHOST_PHY_CONTROL) & ~S5P_USBHOST_PHY_ENABLE, + S5P_USBHOST_PHY_CONTROL); + + clk_disable(otg_clk); + clk_put(otg_clk); + + return 0; +} + +int s5p_usb_phy_init(struct platform_device *pdev, int type) +{ + if (type == S5P_USB_PHY_HOST) + return exynos4_usb_phy1_init(pdev); + + return -EINVAL; +} + +int s5p_usb_phy_exit(struct platform_device *pdev, int type) +{ + if (type == S5P_USB_PHY_HOST) + return exynos4_usb_phy1_exit(pdev); + + return -EINVAL; +} diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index bdd2579..46adca0 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -4,6 +4,7 @@ menu "Footbridge Implementations" config ARCH_CATS bool "CATS" + select CLKSRC_I8253 select FOOTBRIDGE_HOST select ISA select ISA_DMA @@ -59,6 +60,7 @@ config ARCH_EBSA285_HOST config ARCH_NETWINDER bool "NetWinder" + select CLKSRC_I8253 select FOOTBRIDGE_HOST select ISA select ISA_DMA diff --git a/arch/arm/mach-footbridge/isa-timer.c b/arch/arm/mach-footbridge/isa-timer.c index 441c6ce..7020f1a 100644 --- a/arch/arm/mach-footbridge/isa-timer.c +++ b/arch/arm/mach-footbridge/isa-timer.c @@ -10,53 +10,16 @@ #include #include #include +#include #include #include - +#include #include #include "common.h" -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 - -#define PIT_LATCH ((PIT_TICK_RATE + HZ / 2) / HZ) - -static cycle_t pit_read(struct clocksource *cs) -{ - unsigned long flags; - static int old_count; - static u32 old_jifs; - int count; - u32 jifs; - - raw_local_irq_save(flags); - - jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; - - if (count > old_count && jifs == old_jifs) - count = old_count; - - old_count = count; - old_jifs = jifs; - - raw_local_irq_restore(flags); - - count = (PIT_LATCH - 1) - count; - - return (cycle_t)(jifs * PIT_LATCH) + count; -} - -static struct clocksource pit_cs = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), -}; +DEFINE_RAW_SPINLOCK(i8253_lock); static void pit_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) @@ -121,7 +84,7 @@ static void __init isa_timer_init(void) pit_ce.max_delta_ns = clockevent_delta2ns(0x7fff, &pit_ce); pit_ce.min_delta_ns = clockevent_delta2ns(0x000f, &pit_ce); - clocksource_register_hz(&pit_cs, PIT_TICK_RATE); + clocksource_i8253_init(); setup_irq(pit_ce.irq, &pit_timer_irq); clockevents_register_device(&pit_ce); diff --git a/arch/arm/mach-gemini/include/mach/uncompress.h b/arch/arm/mach-gemini/include/mach/uncompress.h index 5483f61..0efa262 100644 --- a/arch/arm/mach-gemini/include/mach/uncompress.h +++ b/arch/arm/mach-gemini/include/mach/uncompress.h @@ -16,7 +16,7 @@ #include #include -static volatile unsigned long *UART = (unsigned long *)GEMINI_UART_BASE; +static volatile unsigned long * const UART = (unsigned long *)GEMINI_UART_BASE; /* * The following code assumes the serial port has already been diff --git a/arch/arm/mach-h720x/include/mach/memory.h b/arch/arm/mach-h720x/include/mach/memory.h index 9d36876..b0b3bae 100644 --- a/arch/arm/mach-h720x/include/mach/memory.h +++ b/arch/arm/mach-h720x/include/mach/memory.h @@ -13,7 +13,6 @@ * There should not be more than (0xd0000000 - 0xc0000000) * bytes of RAM. */ -#define ISA_DMA_THRESHOLD (PHYS_OFFSET + SZ_256M - 1) -#define MAX_DMA_ADDRESS (PAGE_OFFSET + SZ_256M) +#define ARM_DMA_ZONE_SIZE SZ_256M #endif diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index 56b930a..59c97a3 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -1,5 +1,15 @@ config IMX_HAVE_DMA_V1 bool +# +# ARCH_MX31 and ARCH_MX35 are left for compatibility +# Some usages assume that having one of them implies not having (e.g.) ARCH_MX2. +# To easily distinguish good and reviewed from unreviewed usages new (and IMHO +# more sensible) names are used: SOC_IMX31 and SOC_IMX35 +config ARCH_MX31 + bool + +config ARCH_MX35 + bool config SOC_IMX1 bool @@ -31,6 +41,24 @@ config SOC_IMX27 select IMX_HAVE_IOMUX_V1 select MXC_AVIC +config SOC_IMX31 + bool + select CPU_V6 + select IMX_HAVE_PLATFORM_MXC_RNGA + select ARCH_MXC_AUDMUX_V2 + select ARCH_MX31 + select MXC_AVIC + +config SOC_IMX35 + bool + select CPU_V6 + select ARCH_MXC_IOMUX_V3 + select ARCH_MXC_AUDMUX_V2 + select HAVE_EPIT + select ARCH_MX35 + select MXC_AVIC + + if ARCH_MX1 comment "MX1 platforms:" @@ -40,6 +68,7 @@ config MACH_MXLADS config ARCH_MX1ADS bool "MX1ADS platform" select MACH_MXLADS + select SOC_IMX1 select IMX_HAVE_PLATFORM_IMX_I2C select IMX_HAVE_PLATFORM_IMX_UART help @@ -51,6 +80,13 @@ config MACH_SCB9328 help Say Y here if you are using a Synertronixx scb9328 board +config MACH_APF9328 + bool "APF9328" + select SOC_IMX1 + select IMX_HAVE_PLATFORM_IMX_UART + help + Say Yes here if you are using the Armadeus APF9328 development board + endif if ARCH_MX2 @@ -129,6 +165,7 @@ choice config MACH_EUKREA_MBIMXSD25_BASEBOARD bool "Eukrea MBIMXSD development board" + select IMX_HAVE_PLATFORM_GPIO_KEYS select IMX_HAVE_PLATFORM_IMX_SSI help This adds board specific devices that can be found on Eukrea's @@ -254,6 +291,7 @@ config MACH_MX27_3DS config MACH_IMX27_VISSTRIM_M10 bool "Vista Silicon i.MX27 Visstrim_m10" select SOC_IMX27 + select IMX_HAVE_PLATFORM_GPIO_KEYS select IMX_HAVE_PLATFORM_IMX_I2C select IMX_HAVE_PLATFORM_IMX_SSI select IMX_HAVE_PLATFORM_IMX_UART @@ -314,3 +352,251 @@ config MACH_IMX27IPCAM configurations for the board and its peripherals. endif + +if ARCH_MX3 + +comment "MX31 platforms:" + +config MACH_MX31ADS + bool "Support MX31ADS platforms" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_SSI + select IMX_HAVE_PLATFORM_IMX_UART + default y + help + Include support for MX31ADS platform. This includes specific + configurations for the board and its peripherals. + +config MACH_MX31ADS_WM1133_EV1 + bool "Support Wolfson Microelectronics 1133-EV1 module" + depends on MACH_MX31ADS + depends on MFD_WM8350_I2C + depends on REGULATOR_WM8350 + select MFD_WM8350_CONFIG_MODE_0 + select MFD_WM8352_CONFIG_MODE_0 + help + Include support for the Wolfson Microelectronics 1133-EV1 PMU + and audio module for the MX31ADS platform. + +config MACH_MX31LILLY + bool "Support MX31 LILLY-1131 platforms (INCO startec)" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_MMC + select IMX_HAVE_PLATFORM_SPI_IMX + select MXC_ULPI if USB_ULPI + help + Include support for mx31 based LILLY1131 modules. This includes + specific configurations for the board and its peripherals. + +config MACH_MX31LITE + bool "Support MX31 LITEKIT (LogicPD)" + select SOC_IMX31 + select MXC_ULPI if USB_ULPI + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_MMC + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_MXC_RTC + select IMX_HAVE_PLATFORM_SPI_IMX + help + Include support for MX31 LITEKIT platform. This includes specific + configurations for the board and its peripherals. + +config MACH_PCM037 + bool "Support Phytec pcm037 (i.MX31) platforms" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_MMC + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_MXC_W1 + select MXC_ULPI if USB_ULPI + help + Include support for Phytec pcm037 platform. This includes + specific configurations for the board and its peripherals. + +config MACH_PCM037_EET + bool "Support pcm037 EET board extensions" + depends on MACH_PCM037 + select IMX_HAVE_PLATFORM_GPIO_KEYS + select IMX_HAVE_PLATFORM_SPI_IMX + help + Add support for PCM037 EET baseboard extensions. If you are using the + OLED display with EET, use "video=mx3fb:CMEL-OLED" kernel + command-line parameter. + +config MACH_MX31_3DS + bool "Support MX31PDK (3DS)" + select SOC_IMX31 + select MXC_DEBUG_BOARD + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_KEYPAD + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_SPI_IMX + select MXC_ULPI if USB_ULPI + help + Include support for MX31PDK (3DS) platform. This includes specific + configurations for the board and its peripherals. + +config MACH_MX31_3DS_MXC_NAND_USE_BBT + bool "Make the MXC NAND driver use the in flash Bad Block Table" + depends on MACH_MX31_3DS + depends on MTD_NAND_MXC + help + Enable this if you want that the MXC NAND driver uses the in flash + Bad Block Table to know what blocks are bad instead of scanning the + entire flash looking for bad block markers. + +config MACH_MX31MOBOARD + bool "Support mx31moboard platforms (EPFL Mobots group)" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_MMC + select IMX_HAVE_PLATFORM_SPI_IMX + select MXC_ULPI if USB_ULPI + help + Include support for mx31moboard platform. This includes specific + configurations for the board and its peripherals. + +config MACH_QONG + bool "Support Dave/DENX QongEVB-LITE platform" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_IMX_UART + help + Include support for Dave/DENX QongEVB-LITE platform. This includes + specific configurations for the board and its peripherals. + +config MACH_ARMADILLO5X0 + bool "Support Atmark Armadillo-500 Development Base Board" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_GPIO_KEYS + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_MMC + select IMX_HAVE_PLATFORM_MXC_NAND + select MXC_ULPI if USB_ULPI + help + Include support for Atmark Armadillo-500 platform. This includes + specific configurations for the board and its peripherals. + +config MACH_KZM_ARM11_01 + bool "Support KZM-ARM11-01(Kyoto Microcomputer)" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_IMX_UART + help + Include support for KZM-ARM11-01. This includes specific + configurations for the board and its peripherals. + +config MACH_BUG + bool "Support Buglabs BUGBase platform" + select SOC_IMX31 + select IMX_HAVE_PLATFORM_IMX_UART + default y + help + Include support for BUGBase 1.3 platform. This includes specific + configurations for the board and its peripherals. + +comment "MX35 platforms:" + +config MACH_PCM043 + bool "Support Phytec pcm043 (i.MX35) platforms" + select SOC_IMX35 + select IMX_HAVE_PLATFORM_FLEXCAN + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_SSI + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX + select MXC_ULPI if USB_ULPI + help + Include support for Phytec pcm043 platform. This includes + specific configurations for the board and its peripherals. + +config MACH_MX35_3DS + bool "Support MX35PDK platform" + select SOC_IMX35 + select MXC_DEBUG_BOARD + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX + help + Include support for MX35PDK platform. This includes specific + configurations for the board and its peripherals. + +config MACH_EUKREA_CPUIMX35 + bool "Support Eukrea CPUIMX35 Platform" + select SOC_IMX35 + select IMX_HAVE_PLATFORM_FLEXCAN + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX + select MXC_ULPI if USB_ULPI + help + Include support for Eukrea CPUIMX35 platform. This includes + specific configurations for the board and its peripherals. + +choice + prompt "Baseboard" + depends on MACH_EUKREA_CPUIMX35 + default MACH_EUKREA_MBIMXSD35_BASEBOARD + +config MACH_EUKREA_MBIMXSD35_BASEBOARD + bool "Eukrea MBIMXSD development board" + select IMX_HAVE_PLATFORM_GPIO_KEYS + select IMX_HAVE_PLATFORM_IMX_SSI + select IMX_HAVE_PLATFORM_IPU_CORE + help + This adds board specific devices that can be found on Eukrea's + MBIMXSD evaluation board. + +endchoice + +config MACH_VPR200 + bool "Support VPR200 platform" + select SOC_IMX35 + select IMX_HAVE_PLATFORM_FSL_USB2_UDC + select IMX_HAVE_PLATFORM_GPIO_KEYS + select IMX_HAVE_PLATFORM_IMX2_WDT + select IMX_HAVE_PLATFORM_IMX_UART + select IMX_HAVE_PLATFORM_IMX_I2C + select IMX_HAVE_PLATFORM_IPU_CORE + select IMX_HAVE_PLATFORM_MXC_EHCI + select IMX_HAVE_PLATFORM_MXC_NAND + select IMX_HAVE_PLATFORM_SDHCI_ESDHC_IMX + help + Include support for VPR200 platform. This includes specific + configurations for the board and its peripherals. + +endif diff --git a/arch/arm/mach-imx/Makefile b/arch/arm/mach-imx/Makefile index b85794d..e9eb36d 100644 --- a/arch/arm/mach-imx/Makefile +++ b/arch/arm/mach-imx/Makefile @@ -1,9 +1,3 @@ -# -# Makefile for the linux kernel. -# - -# Object file lists. - obj-$(CONFIG_IMX_HAVE_DMA_V1) += dma-v1.o obj-$(CONFIG_ARCH_MX1) += clock-imx1.o mm-imx1.o @@ -14,18 +8,27 @@ obj-$(CONFIG_ARCH_MX25) += clock-imx25.o mm-imx25.o ehci-imx25.o obj-$(CONFIG_MACH_MX27) += cpu-imx27.o pm-imx27.o obj-$(CONFIG_MACH_MX27) += clock-imx27.o mm-imx27.o ehci-imx27.o +obj-$(CONFIG_SOC_IMX31) += mm-imx31.o cpu-imx31.o clock-imx31.o iomux-imx31.o ehci-imx31.o +obj-$(CONFIG_SOC_IMX35) += mm-imx35.o cpu-imx35.o clock-imx35.o ehci-imx35.o +obj-$(CONFIG_CACHE_L2X0) += cache-l2x0.o + # Support for CMOS sensor interface -obj-$(CONFIG_MX1_VIDEO) += mx1-camera-fiq.o mx1-camera-fiq-ksym.o +obj-$(CONFIG_MX1_VIDEO) += mx1-camera-fiq.o mx1-camera-fiq-ksym.o +# i.MX1 based machines obj-$(CONFIG_ARCH_MX1ADS) += mach-mx1ads.o obj-$(CONFIG_MACH_SCB9328) += mach-scb9328.o +obj-$(CONFIG_MACH_APF9328) += mach-apf9328.o +# i.MX21 based machines obj-$(CONFIG_MACH_MX21ADS) += mach-mx21ads.o +# i.MX25 based machines obj-$(CONFIG_MACH_MX25_3DS) += mach-mx25_3ds.o obj-$(CONFIG_MACH_EUKREA_CPUIMX25) += mach-eukrea_cpuimx25.o obj-$(CONFIG_MACH_EUKREA_MBIMXSD25_BASEBOARD) += eukrea_mbimxsd25-baseboard.o +# i.MX27 based machines obj-$(CONFIG_MACH_MX27ADS) += mach-mx27ads.o obj-$(CONFIG_MACH_PCM038) += mach-pcm038.o obj-$(CONFIG_MACH_PCM970_BASEBOARD) += pcm970-baseboard.o @@ -37,3 +40,24 @@ obj-$(CONFIG_MACH_EUKREA_MBIMX27_BASEBOARD) += eukrea_mbimx27-baseboard.o obj-$(CONFIG_MACH_PCA100) += mach-pca100.o obj-$(CONFIG_MACH_MXT_TD60) += mach-mxt_td60.o obj-$(CONFIG_MACH_IMX27IPCAM) += mach-imx27ipcam.o + +# i.MX31 based machines +obj-$(CONFIG_MACH_MX31ADS) += mach-mx31ads.o +obj-$(CONFIG_MACH_MX31LILLY) += mach-mx31lilly.o mx31lilly-db.o +obj-$(CONFIG_MACH_MX31LITE) += mach-mx31lite.o mx31lite-db.o +obj-$(CONFIG_MACH_PCM037) += mach-pcm037.o +obj-$(CONFIG_MACH_PCM037_EET) += mach-pcm037_eet.o +obj-$(CONFIG_MACH_MX31_3DS) += mach-mx31_3ds.o +obj-$(CONFIG_MACH_MX31MOBOARD) += mach-mx31moboard.o mx31moboard-devboard.o \ + mx31moboard-marxbot.o mx31moboard-smartbot.o +obj-$(CONFIG_MACH_QONG) += mach-qong.o +obj-$(CONFIG_MACH_ARMADILLO5X0) += mach-armadillo5x0.o +obj-$(CONFIG_MACH_KZM_ARM11_01) += mach-kzm_arm11_01.o +obj-$(CONFIG_MACH_BUG) += mach-bug.o + +# i.MX35 based machines +obj-$(CONFIG_MACH_PCM043) += mach-pcm043.o +obj-$(CONFIG_MACH_MX35_3DS) += mach-mx35_3ds.o +obj-$(CONFIG_MACH_EUKREA_CPUIMX35) += mach-cpuimx35.o +obj-$(CONFIG_MACH_EUKREA_MBIMXSD35_BASEBOARD) += eukrea_mbimxsd35-baseboard.o +obj-$(CONFIG_MACH_VPR200) += mach-vpr200.o diff --git a/arch/arm/mach-imx/Makefile.boot b/arch/arm/mach-imx/Makefile.boot index 3953d60..ebee18b 100644 --- a/arch/arm/mach-imx/Makefile.boot +++ b/arch/arm/mach-imx/Makefile.boot @@ -13,3 +13,7 @@ initrd_phys-$(CONFIG_ARCH_MX25) := 0x80800000 zreladdr-$(CONFIG_MACH_MX27) := 0xA0008000 params_phys-$(CONFIG_MACH_MX27) := 0xA0000100 initrd_phys-$(CONFIG_MACH_MX27) := 0xA0800000 + +zreladdr-$(CONFIG_ARCH_MX3) := 0x80008000 +params_phys-$(CONFIG_ARCH_MX3) := 0x80000100 +initrd_phys-$(CONFIG_ARCH_MX3) := 0x80800000 diff --git a/arch/arm/mach-imx/cache-l2x0.c b/arch/arm/mach-imx/cache-l2x0.c new file mode 100644 index 0000000..69d1322 --- /dev/null +++ b/arch/arm/mach-imx/cache-l2x0.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2009-2010 Pengutronix + * Sascha Hauer + * Juergen Beisert + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. + */ + +#include +#include +#include + +#include + +#include + +static int mxc_init_l2x0(void) +{ + void __iomem *l2x0_base; + void __iomem *clkctl_base; + + if (!cpu_is_mx31() && !cpu_is_mx35()) + return 0; + +/* + * First of all, we must repair broken chip settings. There are some + * i.MX35 CPUs in the wild, comming with bogus L2 cache settings. These + * misconfigured CPUs will run amok immediately when the L2 cache gets enabled. + * Workaraound is to setup the correct register setting prior enabling the + * L2 cache. This should not hurt already working CPUs, as they are using the + * same value. + */ +#define L2_MEM_VAL 0x10 + + clkctl_base = ioremap(MX35_CLKCTL_BASE_ADDR, 4096); + if (clkctl_base != NULL) { + writel(0x00000515, clkctl_base + L2_MEM_VAL); + iounmap(clkctl_base); + } else { + pr_err("L2 cache: Cannot fix timing. Trying to continue without\n"); + } + + l2x0_base = ioremap(MX3x_L2CC_BASE_ADDR, 4096); + if (IS_ERR(l2x0_base)) { + printk(KERN_ERR "remapping L2 cache area failed with %ld\n", + PTR_ERR(l2x0_base)); + return 0; + } + + l2x0_init(l2x0_base, 0x00030024, 0x00000000); + + return 0; +} +arch_initcall(mxc_init_l2x0); diff --git a/arch/arm/mach-imx/clock-imx31.c b/arch/arm/mach-imx/clock-imx31.c new file mode 100644 index 0000000..25f343f --- /dev/null +++ b/arch/arm/mach-imx/clock-imx31.c @@ -0,0 +1,629 @@ +/* + * Copyright 2005-2007 Freescale Semiconductor, Inc. All Rights Reserved. + * Copyright (C) 2008 by Sascha Hauer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "crmregs-imx31.h" + +#define PRE_DIV_MIN_FREQ 10000000 /* Minimum Frequency after Predivider */ + +static void __calc_pre_post_dividers(u32 div, u32 *pre, u32 *post) +{ + u32 min_pre, temp_pre, old_err, err; + + if (div >= 512) { + *pre = 8; + *post = 64; + } else if (div >= 64) { + min_pre = (div - 1) / 64 + 1; + old_err = 8; + for (temp_pre = 8; temp_pre >= min_pre; temp_pre--) { + err = div % temp_pre; + if (err == 0) { + *pre = temp_pre; + break; + } + err = temp_pre - err; + if (err < old_err) { + old_err = err; + *pre = temp_pre; + } + } + *post = (div + *pre - 1) / *pre; + } else if (div <= 8) { + *pre = div; + *post = 1; + } else { + *pre = 1; + *post = div; + } +} + +static struct clk mcu_pll_clk; +static struct clk serial_pll_clk; +static struct clk ipg_clk; +static struct clk ckih_clk; + +static int cgr_enable(struct clk *clk) +{ + u32 reg; + + if (!clk->enable_reg) + return 0; + + reg = __raw_readl(clk->enable_reg); + reg |= 3 << clk->enable_shift; + __raw_writel(reg, clk->enable_reg); + + return 0; +} + +static void cgr_disable(struct clk *clk) +{ + u32 reg; + + if (!clk->enable_reg) + return; + + reg = __raw_readl(clk->enable_reg); + reg &= ~(3 << clk->enable_shift); + + /* special case for EMI clock */ + if (clk->enable_reg == MXC_CCM_CGR2 && clk->enable_shift == 8) + reg |= (1 << clk->enable_shift); + + __raw_writel(reg, clk->enable_reg); +} + +static unsigned long pll_ref_get_rate(void) +{ + unsigned long ccmr; + unsigned int prcs; + + ccmr = __raw_readl(MXC_CCM_CCMR); + prcs = (ccmr & MXC_CCM_CCMR_PRCS_MASK) >> MXC_CCM_CCMR_PRCS_OFFSET; + if (prcs == 0x1) + return CKIL_CLK_FREQ * 1024; + else + return clk_get_rate(&ckih_clk); +} + +static unsigned long usb_pll_get_rate(struct clk *clk) +{ + unsigned long reg; + + reg = __raw_readl(MXC_CCM_UPCTL); + + return mxc_decode_pll(reg, pll_ref_get_rate()); +} + +static unsigned long serial_pll_get_rate(struct clk *clk) +{ + unsigned long reg; + + reg = __raw_readl(MXC_CCM_SRPCTL); + + return mxc_decode_pll(reg, pll_ref_get_rate()); +} + +static unsigned long mcu_pll_get_rate(struct clk *clk) +{ + unsigned long reg, ccmr; + + ccmr = __raw_readl(MXC_CCM_CCMR); + + if (!(ccmr & MXC_CCM_CCMR_MPE) || (ccmr & MXC_CCM_CCMR_MDS)) + return clk_get_rate(&ckih_clk); + + reg = __raw_readl(MXC_CCM_MPCTL); + + return mxc_decode_pll(reg, pll_ref_get_rate()); +} + +static int usb_pll_enable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(MXC_CCM_CCMR); + reg |= MXC_CCM_CCMR_UPE; + __raw_writel(reg, MXC_CCM_CCMR); + + /* No lock bit on MX31, so using max time from spec */ + udelay(80); + + return 0; +} + +static void usb_pll_disable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(MXC_CCM_CCMR); + reg &= ~MXC_CCM_CCMR_UPE; + __raw_writel(reg, MXC_CCM_CCMR); +} + +static int serial_pll_enable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(MXC_CCM_CCMR); + reg |= MXC_CCM_CCMR_SPE; + __raw_writel(reg, MXC_CCM_CCMR); + + /* No lock bit on MX31, so using max time from spec */ + udelay(80); + + return 0; +} + +static void serial_pll_disable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(MXC_CCM_CCMR); + reg &= ~MXC_CCM_CCMR_SPE; + __raw_writel(reg, MXC_CCM_CCMR); +} + +#define PDR0(mask, off) ((__raw_readl(MXC_CCM_PDR0) & mask) >> off) +#define PDR1(mask, off) ((__raw_readl(MXC_CCM_PDR1) & mask) >> off) +#define PDR2(mask, off) ((__raw_readl(MXC_CCM_PDR2) & mask) >> off) + +static unsigned long mcu_main_get_rate(struct clk *clk) +{ + u32 pmcr0 = __raw_readl(MXC_CCM_PMCR0); + + if ((pmcr0 & MXC_CCM_PMCR0_DFSUP1) == MXC_CCM_PMCR0_DFSUP1_SPLL) + return clk_get_rate(&serial_pll_clk); + else + return clk_get_rate(&mcu_pll_clk); +} + +static unsigned long ahb_get_rate(struct clk *clk) +{ + unsigned long max_pdf; + + max_pdf = PDR0(MXC_CCM_PDR0_MAX_PODF_MASK, + MXC_CCM_PDR0_MAX_PODF_OFFSET); + return clk_get_rate(clk->parent) / (max_pdf + 1); +} + +static unsigned long ipg_get_rate(struct clk *clk) +{ + unsigned long ipg_pdf; + + ipg_pdf = PDR0(MXC_CCM_PDR0_IPG_PODF_MASK, + MXC_CCM_PDR0_IPG_PODF_OFFSET); + return clk_get_rate(clk->parent) / (ipg_pdf + 1); +} + +static unsigned long nfc_get_rate(struct clk *clk) +{ + unsigned long nfc_pdf; + + nfc_pdf = PDR0(MXC_CCM_PDR0_NFC_PODF_MASK, + MXC_CCM_PDR0_NFC_PODF_OFFSET); + return clk_get_rate(clk->parent) / (nfc_pdf + 1); +} + +static unsigned long hsp_get_rate(struct clk *clk) +{ + unsigned long hsp_pdf; + + hsp_pdf = PDR0(MXC_CCM_PDR0_HSP_PODF_MASK, + MXC_CCM_PDR0_HSP_PODF_OFFSET); + return clk_get_rate(clk->parent) / (hsp_pdf + 1); +} + +static unsigned long usb_get_rate(struct clk *clk) +{ + unsigned long usb_pdf, usb_prepdf; + + usb_pdf = PDR1(MXC_CCM_PDR1_USB_PODF_MASK, + MXC_CCM_PDR1_USB_PODF_OFFSET); + usb_prepdf = PDR1(MXC_CCM_PDR1_USB_PRDF_MASK, + MXC_CCM_PDR1_USB_PRDF_OFFSET); + return clk_get_rate(clk->parent) / (usb_prepdf + 1) / (usb_pdf + 1); +} + +static unsigned long csi_get_rate(struct clk *clk) +{ + u32 reg, pre, post; + + reg = __raw_readl(MXC_CCM_PDR0); + pre = (reg & MXC_CCM_PDR0_CSI_PRDF_MASK) >> + MXC_CCM_PDR0_CSI_PRDF_OFFSET; + pre++; + post = (reg & MXC_CCM_PDR0_CSI_PODF_MASK) >> + MXC_CCM_PDR0_CSI_PODF_OFFSET; + post++; + return clk_get_rate(clk->parent) / (pre * post); +} + +static unsigned long csi_round_rate(struct clk *clk, unsigned long rate) +{ + u32 pre, post, parent = clk_get_rate(clk->parent); + u32 div = parent / rate; + + if (parent % rate) + div++; + + __calc_pre_post_dividers(div, &pre, &post); + + return parent / (pre * post); +} + +static int csi_set_rate(struct clk *clk, unsigned long rate) +{ + u32 reg, div, pre, post, parent = clk_get_rate(clk->parent); + + div = parent / rate; + + if ((parent / div) != rate) + return -EINVAL; + + __calc_pre_post_dividers(div, &pre, &post); + + /* Set CSI clock divider */ + reg = __raw_readl(MXC_CCM_PDR0) & + ~(MXC_CCM_PDR0_CSI_PODF_MASK | MXC_CCM_PDR0_CSI_PRDF_MASK); + reg |= (post - 1) << MXC_CCM_PDR0_CSI_PODF_OFFSET; + reg |= (pre - 1) << MXC_CCM_PDR0_CSI_PRDF_OFFSET; + __raw_writel(reg, MXC_CCM_PDR0); + + return 0; +} + +static unsigned long ssi1_get_rate(struct clk *clk) +{ + unsigned long ssi1_pdf, ssi1_prepdf; + + ssi1_pdf = PDR1(MXC_CCM_PDR1_SSI1_PODF_MASK, + MXC_CCM_PDR1_SSI1_PODF_OFFSET); + ssi1_prepdf = PDR1(MXC_CCM_PDR1_SSI1_PRE_PODF_MASK, + MXC_CCM_PDR1_SSI1_PRE_PODF_OFFSET); + return clk_get_rate(clk->parent) / (ssi1_prepdf + 1) / (ssi1_pdf + 1); +} + +static unsigned long ssi2_get_rate(struct clk *clk) +{ + unsigned long ssi2_pdf, ssi2_prepdf; + + ssi2_pdf = PDR1(MXC_CCM_PDR1_SSI2_PODF_MASK, + MXC_CCM_PDR1_SSI2_PODF_OFFSET); + ssi2_prepdf = PDR1(MXC_CCM_PDR1_SSI2_PRE_PODF_MASK, + MXC_CCM_PDR1_SSI2_PRE_PODF_OFFSET); + return clk_get_rate(clk->parent) / (ssi2_prepdf + 1) / (ssi2_pdf + 1); +} + +static unsigned long firi_get_rate(struct clk *clk) +{ + unsigned long firi_pdf, firi_prepdf; + + firi_pdf = PDR1(MXC_CCM_PDR1_FIRI_PODF_MASK, + MXC_CCM_PDR1_FIRI_PODF_OFFSET); + firi_prepdf = PDR1(MXC_CCM_PDR1_FIRI_PRE_PODF_MASK, + MXC_CCM_PDR1_FIRI_PRE_PODF_OFFSET); + return clk_get_rate(clk->parent) / (firi_prepdf + 1) / (firi_pdf + 1); +} + +static unsigned long firi_round_rate(struct clk *clk, unsigned long rate) +{ + u32 pre, post; + u32 parent = clk_get_rate(clk->parent); + u32 div = parent / rate; + + if (parent % rate) + div++; + + __calc_pre_post_dividers(div, &pre, &post); + + return parent / (pre * post); + +} + +static int firi_set_rate(struct clk *clk, unsigned long rate) +{ + u32 reg, div, pre, post, parent = clk_get_rate(clk->parent); + + div = parent / rate; + + if ((parent / div) != rate) + return -EINVAL; + + __calc_pre_post_dividers(div, &pre, &post); + + /* Set FIRI clock divider */ + reg = __raw_readl(MXC_CCM_PDR1) & + ~(MXC_CCM_PDR1_FIRI_PODF_MASK | MXC_CCM_PDR1_FIRI_PRE_PODF_MASK); + reg |= (pre - 1) << MXC_CCM_PDR1_FIRI_PRE_PODF_OFFSET; + reg |= (post - 1) << MXC_CCM_PDR1_FIRI_PODF_OFFSET; + __raw_writel(reg, MXC_CCM_PDR1); + + return 0; +} + +static unsigned long mbx_get_rate(struct clk *clk) +{ + return clk_get_rate(clk->parent) / 2; +} + +static unsigned long mstick1_get_rate(struct clk *clk) +{ + unsigned long msti_pdf; + + msti_pdf = PDR2(MXC_CCM_PDR2_MST1_PDF_MASK, + MXC_CCM_PDR2_MST1_PDF_OFFSET); + return clk_get_rate(clk->parent) / (msti_pdf + 1); +} + +static unsigned long mstick2_get_rate(struct clk *clk) +{ + unsigned long msti_pdf; + + msti_pdf = PDR2(MXC_CCM_PDR2_MST2_PDF_MASK, + MXC_CCM_PDR2_MST2_PDF_OFFSET); + return clk_get_rate(clk->parent) / (msti_pdf + 1); +} + +static unsigned long ckih_rate; + +static unsigned long clk_ckih_get_rate(struct clk *clk) +{ + return ckih_rate; +} + +static unsigned long clk_ckil_get_rate(struct clk *clk) +{ + return CKIL_CLK_FREQ; +} + +static struct clk ckih_clk = { + .get_rate = clk_ckih_get_rate, +}; + +static struct clk mcu_pll_clk = { + .parent = &ckih_clk, + .get_rate = mcu_pll_get_rate, +}; + +static struct clk mcu_main_clk = { + .parent = &mcu_pll_clk, + .get_rate = mcu_main_get_rate, +}; + +static struct clk serial_pll_clk = { + .parent = &ckih_clk, + .get_rate = serial_pll_get_rate, + .enable = serial_pll_enable, + .disable = serial_pll_disable, +}; + +static struct clk usb_pll_clk = { + .parent = &ckih_clk, + .get_rate = usb_pll_get_rate, + .enable = usb_pll_enable, + .disable = usb_pll_disable, +}; + +static struct clk ahb_clk = { + .parent = &mcu_main_clk, + .get_rate = ahb_get_rate, +}; + +#define DEFINE_CLOCK(name, i, er, es, gr, s, p) \ + static struct clk name = { \ + .id = i, \ + .enable_reg = er, \ + .enable_shift = es, \ + .get_rate = gr, \ + .enable = cgr_enable, \ + .disable = cgr_disable, \ + .secondary = s, \ + .parent = p, \ + } + +#define DEFINE_CLOCK1(name, i, er, es, getsetround, s, p) \ + static struct clk name = { \ + .id = i, \ + .enable_reg = er, \ + .enable_shift = es, \ + .get_rate = getsetround##_get_rate, \ + .set_rate = getsetround##_set_rate, \ + .round_rate = getsetround##_round_rate, \ + .enable = cgr_enable, \ + .disable = cgr_disable, \ + .secondary = s, \ + .parent = p, \ + } + +DEFINE_CLOCK(perclk_clk, 0, NULL, 0, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(ckil_clk, 0, NULL, 0, clk_ckil_get_rate, NULL, NULL); + +DEFINE_CLOCK(sdhc1_clk, 0, MXC_CCM_CGR0, 0, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(sdhc2_clk, 1, MXC_CCM_CGR0, 2, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(gpt_clk, 0, MXC_CCM_CGR0, 4, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(epit1_clk, 0, MXC_CCM_CGR0, 6, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(epit2_clk, 1, MXC_CCM_CGR0, 8, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(iim_clk, 0, MXC_CCM_CGR0, 10, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(ata_clk, 0, MXC_CCM_CGR0, 12, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(sdma_clk1, 0, MXC_CCM_CGR0, 14, NULL, NULL, &ahb_clk); +DEFINE_CLOCK(cspi3_clk, 2, MXC_CCM_CGR0, 16, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(rng_clk, 0, MXC_CCM_CGR0, 18, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(uart1_clk, 0, MXC_CCM_CGR0, 20, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(uart2_clk, 1, MXC_CCM_CGR0, 22, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(ssi1_clk, 0, MXC_CCM_CGR0, 24, ssi1_get_rate, NULL, &serial_pll_clk); +DEFINE_CLOCK(i2c1_clk, 0, MXC_CCM_CGR0, 26, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(i2c2_clk, 1, MXC_CCM_CGR0, 28, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(i2c3_clk, 2, MXC_CCM_CGR0, 30, NULL, NULL, &perclk_clk); + +DEFINE_CLOCK(mpeg4_clk, 0, MXC_CCM_CGR1, 0, NULL, NULL, &ahb_clk); +DEFINE_CLOCK(mstick1_clk, 0, MXC_CCM_CGR1, 2, mstick1_get_rate, NULL, &usb_pll_clk); +DEFINE_CLOCK(mstick2_clk, 1, MXC_CCM_CGR1, 4, mstick2_get_rate, NULL, &usb_pll_clk); +DEFINE_CLOCK1(csi_clk, 0, MXC_CCM_CGR1, 6, csi, NULL, &serial_pll_clk); +DEFINE_CLOCK(rtc_clk, 0, MXC_CCM_CGR1, 8, NULL, NULL, &ckil_clk); +DEFINE_CLOCK(wdog_clk, 0, MXC_CCM_CGR1, 10, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(pwm_clk, 0, MXC_CCM_CGR1, 12, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(usb_clk2, 0, MXC_CCM_CGR1, 18, usb_get_rate, NULL, &ahb_clk); +DEFINE_CLOCK(kpp_clk, 0, MXC_CCM_CGR1, 20, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(ipu_clk, 0, MXC_CCM_CGR1, 22, hsp_get_rate, NULL, &mcu_main_clk); +DEFINE_CLOCK(uart3_clk, 2, MXC_CCM_CGR1, 24, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(uart4_clk, 3, MXC_CCM_CGR1, 26, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(uart5_clk, 4, MXC_CCM_CGR1, 28, NULL, NULL, &perclk_clk); +DEFINE_CLOCK(owire_clk, 0, MXC_CCM_CGR1, 30, NULL, NULL, &perclk_clk); + +DEFINE_CLOCK(ssi2_clk, 1, MXC_CCM_CGR2, 0, ssi2_get_rate, NULL, &serial_pll_clk); +DEFINE_CLOCK(cspi1_clk, 0, MXC_CCM_CGR2, 2, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(cspi2_clk, 1, MXC_CCM_CGR2, 4, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(mbx_clk, 0, MXC_CCM_CGR2, 6, mbx_get_rate, NULL, &ahb_clk); +DEFINE_CLOCK(emi_clk, 0, MXC_CCM_CGR2, 8, NULL, NULL, &ahb_clk); +DEFINE_CLOCK(rtic_clk, 0, MXC_CCM_CGR2, 10, NULL, NULL, &ahb_clk); +DEFINE_CLOCK1(firi_clk, 0, MXC_CCM_CGR2, 12, firi, NULL, &usb_pll_clk); + +DEFINE_CLOCK(sdma_clk2, 0, NULL, 0, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(usb_clk1, 0, NULL, 0, usb_get_rate, NULL, &usb_pll_clk); +DEFINE_CLOCK(nfc_clk, 0, NULL, 0, nfc_get_rate, NULL, &ahb_clk); +DEFINE_CLOCK(scc_clk, 0, NULL, 0, NULL, NULL, &ipg_clk); +DEFINE_CLOCK(ipg_clk, 0, NULL, 0, ipg_get_rate, NULL, &ahb_clk); + +#define _REGISTER_CLOCK(d, n, c) \ + { \ + .dev_id = d, \ + .con_id = n, \ + .clk = &c, \ + }, + +static struct clk_lookup lookups[] = { + _REGISTER_CLOCK(NULL, "emi", emi_clk) + _REGISTER_CLOCK("imx31-cspi.0", NULL, cspi1_clk) + _REGISTER_CLOCK("imx31-cspi.1", NULL, cspi2_clk) + _REGISTER_CLOCK("imx31-cspi.2", NULL, cspi3_clk) + _REGISTER_CLOCK(NULL, "gpt", gpt_clk) + _REGISTER_CLOCK(NULL, "pwm", pwm_clk) + _REGISTER_CLOCK("imx2-wdt.0", NULL, wdog_clk) + _REGISTER_CLOCK(NULL, "rtc", rtc_clk) + _REGISTER_CLOCK(NULL, "epit", epit1_clk) + _REGISTER_CLOCK(NULL, "epit", epit2_clk) + _REGISTER_CLOCK("mxc_nand.0", NULL, nfc_clk) + _REGISTER_CLOCK("ipu-core", NULL, ipu_clk) + _REGISTER_CLOCK("mx3_sdc_fb", NULL, ipu_clk) + _REGISTER_CLOCK(NULL, "kpp", kpp_clk) + _REGISTER_CLOCK("mxc-ehci.0", "usb", usb_clk1) + _REGISTER_CLOCK("mxc-ehci.0", "usb_ahb", usb_clk2) + _REGISTER_CLOCK("mxc-ehci.1", "usb", usb_clk1) + _REGISTER_CLOCK("mxc-ehci.1", "usb_ahb", usb_clk2) + _REGISTER_CLOCK("mxc-ehci.2", "usb", usb_clk1) + _REGISTER_CLOCK("mxc-ehci.2", "usb_ahb", usb_clk2) + _REGISTER_CLOCK("fsl-usb2-udc", "usb", usb_clk1) + _REGISTER_CLOCK("fsl-usb2-udc", "usb_ahb", usb_clk2) + _REGISTER_CLOCK("mx3-camera.0", NULL, csi_clk) + _REGISTER_CLOCK("imx-uart.0", NULL, uart1_clk) + _REGISTER_CLOCK("imx-uart.1", NULL, uart2_clk) + _REGISTER_CLOCK("imx-uart.2", NULL, uart3_clk) + _REGISTER_CLOCK("imx-uart.3", NULL, uart4_clk) + _REGISTER_CLOCK("imx-uart.4", NULL, uart5_clk) + _REGISTER_CLOCK("imx-i2c.0", NULL, i2c1_clk) + _REGISTER_CLOCK("imx-i2c.1", NULL, i2c2_clk) + _REGISTER_CLOCK("imx-i2c.2", NULL, i2c3_clk) + _REGISTER_CLOCK("mxc_w1.0", NULL, owire_clk) + _REGISTER_CLOCK("mxc-mmc.0", NULL, sdhc1_clk) + _REGISTER_CLOCK("mxc-mmc.1", NULL, sdhc2_clk) + _REGISTER_CLOCK("imx-ssi.0", NULL, ssi1_clk) + _REGISTER_CLOCK("imx-ssi.1", NULL, ssi2_clk) + _REGISTER_CLOCK(NULL, "firi", firi_clk) + _REGISTER_CLOCK(NULL, "ata", ata_clk) + _REGISTER_CLOCK(NULL, "rtic", rtic_clk) + _REGISTER_CLOCK(NULL, "rng", rng_clk) + _REGISTER_CLOCK("imx-sdma", NULL, sdma_clk1) + _REGISTER_CLOCK(NULL, "sdma_ipg", sdma_clk2) + _REGISTER_CLOCK(NULL, "mstick", mstick1_clk) + _REGISTER_CLOCK(NULL, "mstick", mstick2_clk) + _REGISTER_CLOCK(NULL, "scc", scc_clk) + _REGISTER_CLOCK(NULL, "iim", iim_clk) + _REGISTER_CLOCK(NULL, "mpeg4", mpeg4_clk) + _REGISTER_CLOCK(NULL, "mbx", mbx_clk) +}; + +int __init mx31_clocks_init(unsigned long fref) +{ + u32 reg; + + ckih_rate = fref; + + clkdev_add_table(lookups, ARRAY_SIZE(lookups)); + + /* change the csi_clk parent if necessary */ + reg = __raw_readl(MXC_CCM_CCMR); + if (!(reg & MXC_CCM_CCMR_CSCS)) + if (clk_set_parent(&csi_clk, &usb_pll_clk)) + pr_err("%s: error changing csi_clk parent\n", __func__); + + + /* Turn off all possible clocks */ + __raw_writel((3 << 4), MXC_CCM_CGR0); + __raw_writel(0, MXC_CCM_CGR1); + __raw_writel((3 << 8) | (3 << 14) | (3 << 16)| + 1 << 27 | 1 << 28, /* Bit 27 and 28 are not defined for + MX32, but still required to be set */ + MXC_CCM_CGR2); + + /* + * Before turning off usb_pll make sure ipg_per_clk is generated + * by ipg_clk and not usb_pll. + */ + __raw_writel(__raw_readl(MXC_CCM_CCMR) | (1 << 24), MXC_CCM_CCMR); + + usb_pll_disable(&usb_pll_clk); + + pr_info("Clock input source is %ld\n", clk_get_rate(&ckih_clk)); + + clk_enable(&gpt_clk); + clk_enable(&emi_clk); + clk_enable(&iim_clk); + + clk_enable(&serial_pll_clk); + + mx31_read_cpu_rev(); + + if (mx31_revision() >= IMX_CHIP_REVISION_2_0) { + reg = __raw_readl(MXC_CCM_PMCR1); + /* No PLL restart on DVFS switch; enable auto EMI handshake */ + reg |= MXC_CCM_PMCR1_PLLRDIS | MXC_CCM_PMCR1_EMIRQ_EN; + __raw_writel(reg, MXC_CCM_PMCR1); + } + + mxc_timer_init(&ipg_clk, MX31_IO_ADDRESS(MX31_GPT1_BASE_ADDR), + MX31_INT_GPT); + + return 0; +} diff --git a/arch/arm/mach-imx/clock-imx35.c b/arch/arm/mach-imx/clock-imx35.c new file mode 100644 index 0000000..5a4cc1e --- /dev/null +++ b/arch/arm/mach-imx/clock-imx35.c @@ -0,0 +1,549 @@ +/* + * Copyright (C) 2009 by Sascha Hauer, Pengutronix + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define CCM_BASE MX35_IO_ADDRESS(MX35_CCM_BASE_ADDR) + +#define CCM_CCMR 0x00 +#define CCM_PDR0 0x04 +#define CCM_PDR1 0x08 +#define CCM_PDR2 0x0C +#define CCM_PDR3 0x10 +#define CCM_PDR4 0x14 +#define CCM_RCSR 0x18 +#define CCM_MPCTL 0x1C +#define CCM_PPCTL 0x20 +#define CCM_ACMR 0x24 +#define CCM_COSR 0x28 +#define CCM_CGR0 0x2C +#define CCM_CGR1 0x30 +#define CCM_CGR2 0x34 +#define CCM_CGR3 0x38 + +#ifdef HAVE_SET_RATE_SUPPORT +static void calc_dividers(u32 div, u32 *pre, u32 *post, u32 maxpost) +{ + u32 min_pre, temp_pre, old_err, err; + + min_pre = (div - 1) / maxpost + 1; + old_err = 8; + + for (temp_pre = 8; temp_pre >= min_pre; temp_pre--) { + if (div > (temp_pre * maxpost)) + break; + + if (div < (temp_pre * temp_pre)) + continue; + + err = div % temp_pre; + + if (err == 0) { + *pre = temp_pre; + break; + } + + err = temp_pre - err; + + if (err < old_err) { + old_err = err; + *pre = temp_pre; + } + } + + *post = (div + *pre - 1) / *pre; +} + +/* get the best values for a 3-bit divider combined with a 6-bit divider */ +static void calc_dividers_3_6(u32 div, u32 *pre, u32 *post) +{ + if (div >= 512) { + *pre = 8; + *post = 64; + } else if (div >= 64) { + calc_dividers(div, pre, post, 64); + } else if (div <= 8) { + *pre = div; + *post = 1; + } else { + *pre = 1; + *post = div; + } +} + +/* get the best values for two cascaded 3-bit dividers */ +static void calc_dividers_3_3(u32 div, u32 *pre, u32 *post) +{ + if (div >= 64) { + *pre = *post = 8; + } else if (div > 8) { + calc_dividers(div, pre, post, 8); + } else { + *pre = 1; + *post = div; + } +} +#endif + +static unsigned long get_rate_mpll(void) +{ + ulong mpctl = __raw_readl(CCM_BASE + CCM_MPCTL); + + return mxc_decode_pll(mpctl, 24000000); +} + +static unsigned long get_rate_ppll(void) +{ + ulong ppctl = __raw_readl(CCM_BASE + CCM_PPCTL); + + return mxc_decode_pll(ppctl, 24000000); +} + +struct arm_ahb_div { + unsigned char arm, ahb, sel; +}; + +static struct arm_ahb_div clk_consumer[] = { + { .arm = 1, .ahb = 4, .sel = 0}, + { .arm = 1, .ahb = 3, .sel = 1}, + { .arm = 2, .ahb = 2, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 4, .ahb = 1, .sel = 0}, + { .arm = 1, .ahb = 5, .sel = 0}, + { .arm = 1, .ahb = 8, .sel = 0}, + { .arm = 1, .ahb = 6, .sel = 1}, + { .arm = 2, .ahb = 4, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, + { .arm = 4, .ahb = 2, .sel = 0}, + { .arm = 0, .ahb = 0, .sel = 0}, +}; + +static unsigned long get_rate_arm(void) +{ + unsigned long pdr0 = __raw_readl(CCM_BASE + CCM_PDR0); + struct arm_ahb_div *aad; + unsigned long fref = get_rate_mpll(); + + aad = &clk_consumer[(pdr0 >> 16) & 0xf]; + if (aad->sel) + fref = fref * 3 / 4; + + return fref / aad->arm; +} + +static unsigned long get_rate_ahb(struct clk *clk) +{ + unsigned long pdr0 = __raw_readl(CCM_BASE + CCM_PDR0); + struct arm_ahb_div *aad; + unsigned long fref = get_rate_arm(); + + aad = &clk_consumer[(pdr0 >> 16) & 0xf]; + + return fref / aad->ahb; +} + +static unsigned long get_rate_ipg(struct clk *clk) +{ + return get_rate_ahb(NULL) >> 1; +} + +static unsigned long get_rate_uart(struct clk *clk) +{ + unsigned long pdr3 = __raw_readl(CCM_BASE + CCM_PDR3); + unsigned long pdr4 = __raw_readl(CCM_BASE + CCM_PDR4); + unsigned long div = ((pdr4 >> 10) & 0x3f) + 1; + + if (pdr3 & (1 << 14)) + return get_rate_arm() / div; + else + return get_rate_ppll() / div; +} + +static unsigned long get_rate_sdhc(struct clk *clk) +{ + unsigned long pdr3 = __raw_readl(CCM_BASE + CCM_PDR3); + unsigned long div, rate; + + if (pdr3 & (1 << 6)) + rate = get_rate_arm(); + else + rate = get_rate_ppll(); + + switch (clk->id) { + default: + case 0: + div = pdr3 & 0x3f; + break; + case 1: + div = (pdr3 >> 8) & 0x3f; + break; + case 2: + div = (pdr3 >> 16) & 0x3f; + break; + } + + return rate / (div + 1); +} + +static unsigned long get_rate_mshc(struct clk *clk) +{ + unsigned long pdr1 = __raw_readl(CCM_BASE + CCM_PDR1); + unsigned long div1, div2, rate; + + if (pdr1 & (1 << 7)) + rate = get_rate_arm(); + else + rate = get_rate_ppll(); + + div1 = (pdr1 >> 29) & 0x7; + div2 = (pdr1 >> 22) & 0x3f; + + return rate / ((div1 + 1) * (div2 + 1)); +} + +static unsigned long get_rate_ssi(struct clk *clk) +{ + unsigned long pdr2 = __raw_readl(CCM_BASE + CCM_PDR2); + unsigned long div1, div2, rate; + + if (pdr2 & (1 << 6)) + rate = get_rate_arm(); + else + rate = get_rate_ppll(); + + switch (clk->id) { + default: + case 0: + div1 = pdr2 & 0x3f; + div2 = (pdr2 >> 24) & 0x7; + break; + case 1: + div1 = (pdr2 >> 8) & 0x3f; + div2 = (pdr2 >> 27) & 0x7; + break; + } + + return rate / ((div1 + 1) * (div2 + 1)); +} + +static unsigned long get_rate_csi(struct clk *clk) +{ + unsigned long pdr2 = __raw_readl(CCM_BASE + CCM_PDR2); + unsigned long rate; + + if (pdr2 & (1 << 7)) + rate = get_rate_arm(); + else + rate = get_rate_ppll(); + + return rate / (((pdr2 >> 16) & 0x3f) + 1); +} + +static unsigned long get_rate_otg(struct clk *clk) +{ + unsigned long pdr4 = __raw_readl(CCM_BASE + CCM_PDR4); + unsigned long rate; + + if (pdr4 & (1 << 9)) + rate = get_rate_arm(); + else + rate = get_rate_ppll(); + + return rate / (((pdr4 >> 22) & 0x3f) + 1); +} + +static unsigned long get_rate_ipg_per(struct clk *clk) +{ + unsigned long pdr0 = __raw_readl(CCM_BASE + CCM_PDR0); + unsigned long pdr4 = __raw_readl(CCM_BASE + CCM_PDR4); + unsigned long div; + + if (pdr0 & (1 << 26)) { + div = (pdr4 >> 16) & 0x3f; + return get_rate_arm() / (div + 1); + } else { + div = (pdr0 >> 12) & 0x7; + return get_rate_ahb(NULL) / (div + 1); + } +} + +static unsigned long get_rate_hsp(struct clk *clk) +{ + unsigned long hsp_podf = (__raw_readl(CCM_BASE + CCM_PDR0) >> 20) & 0x03; + unsigned long fref = get_rate_mpll(); + + if (fref > 400 * 1000 * 1000) { + switch (hsp_podf) { + case 0: + return fref >> 2; + case 1: + return fref >> 3; + case 2: + return fref / 3; + } + } else { + switch (hsp_podf) { + case 0: + case 2: + return fref / 3; + case 1: + return fref / 6; + } + } + + return 0; +} + +static int clk_cgr_enable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(clk->enable_reg); + reg |= 3 << clk->enable_shift; + __raw_writel(reg, clk->enable_reg); + + return 0; +} + +static void clk_cgr_disable(struct clk *clk) +{ + u32 reg; + + reg = __raw_readl(clk->enable_reg); + reg &= ~(3 << clk->enable_shift); + __raw_writel(reg, clk->enable_reg); +} + +#define DEFINE_CLOCK(name, i, er, es, gr, sr) \ + static struct clk name = { \ + .id = i, \ + .enable_reg = CCM_BASE + er, \ + .enable_shift = es, \ + .get_rate = gr, \ + .set_rate = sr, \ + .enable = clk_cgr_enable, \ + .disable = clk_cgr_disable, \ + } + +DEFINE_CLOCK(asrc_clk, 0, CCM_CGR0, 0, NULL, NULL); +DEFINE_CLOCK(ata_clk, 0, CCM_CGR0, 2, get_rate_ipg, NULL); +/* DEFINE_CLOCK(audmux_clk, 0, CCM_CGR0, 4, NULL, NULL); */ +DEFINE_CLOCK(can1_clk, 0, CCM_CGR0, 6, get_rate_ipg, NULL); +DEFINE_CLOCK(can2_clk, 1, CCM_CGR0, 8, get_rate_ipg, NULL); +DEFINE_CLOCK(cspi1_clk, 0, CCM_CGR0, 10, get_rate_ipg, NULL); +DEFINE_CLOCK(cspi2_clk, 1, CCM_CGR0, 12, get_rate_ipg, NULL); +DEFINE_CLOCK(ect_clk, 0, CCM_CGR0, 14, get_rate_ipg, NULL); +DEFINE_CLOCK(edio_clk, 0, CCM_CGR0, 16, NULL, NULL); +DEFINE_CLOCK(emi_clk, 0, CCM_CGR0, 18, get_rate_ipg, NULL); +DEFINE_CLOCK(epit1_clk, 0, CCM_CGR0, 20, get_rate_ipg, NULL); +DEFINE_CLOCK(epit2_clk, 1, CCM_CGR0, 22, get_rate_ipg, NULL); +DEFINE_CLOCK(esai_clk, 0, CCM_CGR0, 24, NULL, NULL); +DEFINE_CLOCK(esdhc1_clk, 0, CCM_CGR0, 26, get_rate_sdhc, NULL); +DEFINE_CLOCK(esdhc2_clk, 1, CCM_CGR0, 28, get_rate_sdhc, NULL); +DEFINE_CLOCK(esdhc3_clk, 2, CCM_CGR0, 30, get_rate_sdhc, NULL); + +DEFINE_CLOCK(fec_clk, 0, CCM_CGR1, 0, get_rate_ipg, NULL); +DEFINE_CLOCK(gpio1_clk, 0, CCM_CGR1, 2, NULL, NULL); +DEFINE_CLOCK(gpio2_clk, 1, CCM_CGR1, 4, NULL, NULL); +DEFINE_CLOCK(gpio3_clk, 2, CCM_CGR1, 6, NULL, NULL); +DEFINE_CLOCK(gpt_clk, 0, CCM_CGR1, 8, get_rate_ipg, NULL); +DEFINE_CLOCK(i2c1_clk, 0, CCM_CGR1, 10, get_rate_ipg_per, NULL); +DEFINE_CLOCK(i2c2_clk, 1, CCM_CGR1, 12, get_rate_ipg_per, NULL); +DEFINE_CLOCK(i2c3_clk, 2, CCM_CGR1, 14, get_rate_ipg_per, NULL); +DEFINE_CLOCK(iomuxc_clk, 0, CCM_CGR1, 16, NULL, NULL); +DEFINE_CLOCK(ipu_clk, 0, CCM_CGR1, 18, get_rate_hsp, NULL); +DEFINE_CLOCK(kpp_clk, 0, CCM_CGR1, 20, get_rate_ipg, NULL); +DEFINE_CLOCK(mlb_clk, 0, CCM_CGR1, 22, get_rate_ahb, NULL); +DEFINE_CLOCK(mshc_clk, 0, CCM_CGR1, 24, get_rate_mshc, NULL); +DEFINE_CLOCK(owire_clk, 0, CCM_CGR1, 26, get_rate_ipg_per, NULL); +DEFINE_CLOCK(pwm_clk, 0, CCM_CGR1, 28, get_rate_ipg_per, NULL); +DEFINE_CLOCK(rngc_clk, 0, CCM_CGR1, 30, get_rate_ipg, NULL); + +DEFINE_CLOCK(rtc_clk, 0, CCM_CGR2, 0, get_rate_ipg, NULL); +DEFINE_CLOCK(rtic_clk, 0, CCM_CGR2, 2, get_rate_ahb, NULL); +DEFINE_CLOCK(scc_clk, 0, CCM_CGR2, 4, get_rate_ipg, NULL); +DEFINE_CLOCK(sdma_clk, 0, CCM_CGR2, 6, NULL, NULL); +DEFINE_CLOCK(spba_clk, 0, CCM_CGR2, 8, get_rate_ipg, NULL); +DEFINE_CLOCK(spdif_clk, 0, CCM_CGR2, 10, NULL, NULL); +DEFINE_CLOCK(ssi1_clk, 0, CCM_CGR2, 12, get_rate_ssi, NULL); +DEFINE_CLOCK(ssi2_clk, 1, CCM_CGR2, 14, get_rate_ssi, NULL); +DEFINE_CLOCK(uart1_clk, 0, CCM_CGR2, 16, get_rate_uart, NULL); +DEFINE_CLOCK(uart2_clk, 1, CCM_CGR2, 18, get_rate_uart, NULL); +DEFINE_CLOCK(uart3_clk, 2, CCM_CGR2, 20, get_rate_uart, NULL); +DEFINE_CLOCK(usbotg_clk, 0, CCM_CGR2, 22, get_rate_otg, NULL); +DEFINE_CLOCK(wdog_clk, 0, CCM_CGR2, 24, NULL, NULL); +DEFINE_CLOCK(max_clk, 0, CCM_CGR2, 26, NULL, NULL); +DEFINE_CLOCK(audmux_clk, 0, CCM_CGR2, 30, NULL, NULL); + +DEFINE_CLOCK(csi_clk, 0, CCM_CGR3, 0, get_rate_csi, NULL); +DEFINE_CLOCK(iim_clk, 0, CCM_CGR3, 2, NULL, NULL); +DEFINE_CLOCK(gpu2d_clk, 0, CCM_CGR3, 4, NULL, NULL); + +DEFINE_CLOCK(usbahb_clk, 0, 0, 0, get_rate_ahb, NULL); + +static int clk_dummy_enable(struct clk *clk) +{ + return 0; +} + +static void clk_dummy_disable(struct clk *clk) +{ +} + +static unsigned long get_rate_nfc(struct clk *clk) +{ + unsigned long div1; + + div1 = (__raw_readl(CCM_BASE + CCM_PDR4) >> 28) + 1; + + return get_rate_ahb(NULL) / div1; +} + +/* NAND Controller: It seems it can't be disabled */ +static struct clk nfc_clk = { + .id = 0, + .enable_reg = 0, + .enable_shift = 0, + .get_rate = get_rate_nfc, + .set_rate = NULL, /* set_rate_nfc, */ + .enable = clk_dummy_enable, + .disable = clk_dummy_disable +}; + +#define _REGISTER_CLOCK(d, n, c) \ + { \ + .dev_id = d, \ + .con_id = n, \ + .clk = &c, \ + }, + +static struct clk_lookup lookups[] = { + _REGISTER_CLOCK(NULL, "asrc", asrc_clk) + _REGISTER_CLOCK(NULL, "ata", ata_clk) + _REGISTER_CLOCK("flexcan.0", NULL, can1_clk) + _REGISTER_CLOCK("flexcan.1", NULL, can2_clk) + _REGISTER_CLOCK("imx35-cspi.0", NULL, cspi1_clk) + _REGISTER_CLOCK("imx35-cspi.1", NULL, cspi2_clk) + _REGISTER_CLOCK(NULL, "ect", ect_clk) + _REGISTER_CLOCK(NULL, "edio", edio_clk) + _REGISTER_CLOCK(NULL, "emi", emi_clk) + _REGISTER_CLOCK("imx-epit.0", NULL, epit1_clk) + _REGISTER_CLOCK("imx-epit.1", NULL, epit2_clk) + _REGISTER_CLOCK(NULL, "esai", esai_clk) + _REGISTER_CLOCK("sdhci-esdhc-imx.0", NULL, esdhc1_clk) + _REGISTER_CLOCK("sdhci-esdhc-imx.1", NULL, esdhc2_clk) + _REGISTER_CLOCK("sdhci-esdhc-imx.2", NULL, esdhc3_clk) + _REGISTER_CLOCK("fec.0", NULL, fec_clk) + _REGISTER_CLOCK(NULL, "gpio", gpio1_clk) + _REGISTER_CLOCK(NULL, "gpio", gpio2_clk) + _REGISTER_CLOCK(NULL, "gpio", gpio3_clk) + _REGISTER_CLOCK("gpt.0", NULL, gpt_clk) + _REGISTER_CLOCK("imx-i2c.0", NULL, i2c1_clk) + _REGISTER_CLOCK("imx-i2c.1", NULL, i2c2_clk) + _REGISTER_CLOCK("imx-i2c.2", NULL, i2c3_clk) + _REGISTER_CLOCK(NULL, "iomuxc", iomuxc_clk) + _REGISTER_CLOCK("ipu-core", NULL, ipu_clk) + _REGISTER_CLOCK("mx3_sdc_fb", NULL, ipu_clk) + _REGISTER_CLOCK(NULL, "kpp", kpp_clk) + _REGISTER_CLOCK(NULL, "mlb", mlb_clk) + _REGISTER_CLOCK(NULL, "mshc", mshc_clk) + _REGISTER_CLOCK("mxc_w1", NULL, owire_clk) + _REGISTER_CLOCK(NULL, "pwm", pwm_clk) + _REGISTER_CLOCK(NULL, "rngc", rngc_clk) + _REGISTER_CLOCK(NULL, "rtc", rtc_clk) + _REGISTER_CLOCK(NULL, "rtic", rtic_clk) + _REGISTER_CLOCK(NULL, "scc", scc_clk) + _REGISTER_CLOCK("imx-sdma", NULL, sdma_clk) + _REGISTER_CLOCK(NULL, "spba", spba_clk) + _REGISTER_CLOCK(NULL, "spdif", spdif_clk) + _REGISTER_CLOCK("imx-ssi.0", NULL, ssi1_clk) + _REGISTER_CLOCK("imx-ssi.1", NULL, ssi2_clk) + _REGISTER_CLOCK("imx-uart.0", NULL, uart1_clk) + _REGISTER_CLOCK("imx-uart.1", NULL, uart2_clk) + _REGISTER_CLOCK("imx-uart.2", NULL, uart3_clk) + _REGISTER_CLOCK("mxc-ehci.0", "usb", usbotg_clk) + _REGISTER_CLOCK("mxc-ehci.1", "usb", usbotg_clk) + _REGISTER_CLOCK("mxc-ehci.2", "usb", usbotg_clk) + _REGISTER_CLOCK("fsl-usb2-udc", "usb", usbotg_clk) + _REGISTER_CLOCK("fsl-usb2-udc", "usb_ahb", usbahb_clk) + _REGISTER_CLOCK("imx2-wdt.0", NULL, wdog_clk) + _REGISTER_CLOCK(NULL, "max", max_clk) + _REGISTER_CLOCK(NULL, "audmux", audmux_clk) + _REGISTER_CLOCK(NULL, "csi", csi_clk) + _REGISTER_CLOCK(NULL, "iim", iim_clk) + _REGISTER_CLOCK(NULL, "gpu2d", gpu2d_clk) + _REGISTER_CLOCK("mxc_nand.0", NULL, nfc_clk) +}; + +int __init mx35_clocks_init() +{ + unsigned int cgr2 = 3 << 26, cgr3 = 0; + +#if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_ICEDCC) + cgr2 |= 3 << 16; +#endif + + clkdev_add_table(lookups, ARRAY_SIZE(lookups)); + + /* Turn off all clocks except the ones we need to survive, namely: + * EMI, GPIO1/2/3, GPT, IOMUX, MAX and eventually uart + */ + __raw_writel((3 << 18), CCM_BASE + CCM_CGR0); + __raw_writel((3 << 2) | (3 << 4) | (3 << 6) | (3 << 8) | (3 << 16), + CCM_BASE + CCM_CGR1); + + /* + * Check if we came up in internal boot mode. If yes, we need some + * extra clocks turned on, otherwise the MX35 boot ROM code will + * hang after a watchdog reset. + */ + if (!(__raw_readl(CCM_BASE + CCM_RCSR) & (3 << 10))) { + /* Additionally turn on UART1, SCC, and IIM clocks */ + cgr2 |= 3 << 16 | 3 << 4; + cgr3 |= 3 << 2; + } + + __raw_writel(cgr2, CCM_BASE + CCM_CGR2); + __raw_writel(cgr3, CCM_BASE + CCM_CGR3); + + clk_enable(&iim_clk); + mx35_read_cpu_rev(); + +#ifdef CONFIG_MXC_USE_EPIT + epit_timer_init(&epit1_clk, + MX35_IO_ADDRESS(MX35_EPIT1_BASE_ADDR), MX35_INT_EPIT1); +#else + mxc_timer_init(&gpt_clk, + MX35_IO_ADDRESS(MX35_GPT1_BASE_ADDR), MX35_INT_GPT); +#endif + + return 0; +} diff --git a/arch/arm/mach-imx/cpu-imx31.c b/arch/arm/mach-imx/cpu-imx31.c new file mode 100644 index 0000000..a378070 --- /dev/null +++ b/arch/arm/mach-imx/cpu-imx31.c @@ -0,0 +1,57 @@ +/* + * MX31 CPU type detection + * + * Copyright (c) 2009 Daniel Mack + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +unsigned int mx31_cpu_rev; +EXPORT_SYMBOL(mx31_cpu_rev); + +static struct { + u8 srev; + const char *name; + const char *v; + unsigned int rev; +} mx31_cpu_type[] __initdata = { + { .srev = 0x00, .name = "i.MX31(L)", .v = "1.0", .rev = IMX_CHIP_REVISION_1_0 }, + { .srev = 0x10, .name = "i.MX31", .v = "1.1", .rev = IMX_CHIP_REVISION_1_1 }, + { .srev = 0x11, .name = "i.MX31L", .v = "1.1", .rev = IMX_CHIP_REVISION_1_1 }, + { .srev = 0x12, .name = "i.MX31", .v = "1.15", .rev = IMX_CHIP_REVISION_1_1 }, + { .srev = 0x13, .name = "i.MX31L", .v = "1.15", .rev = IMX_CHIP_REVISION_1_1 }, + { .srev = 0x14, .name = "i.MX31", .v = "1.2", .rev = IMX_CHIP_REVISION_1_2 }, + { .srev = 0x15, .name = "i.MX31L", .v = "1.2", .rev = IMX_CHIP_REVISION_1_2 }, + { .srev = 0x28, .name = "i.MX31", .v = "2.0", .rev = IMX_CHIP_REVISION_2_0 }, + { .srev = 0x29, .name = "i.MX31L", .v = "2.0", .rev = IMX_CHIP_REVISION_2_0 }, +}; + +void __init mx31_read_cpu_rev(void) +{ + u32 i, srev; + + /* read SREV register from IIM module */ + srev = __raw_readl(MX31_IO_ADDRESS(MX31_IIM_BASE_ADDR + MXC_IIMSREV)); + + for (i = 0; i < ARRAY_SIZE(mx31_cpu_type); i++) + if (srev == mx31_cpu_type[i].srev) { + printk(KERN_INFO + "CPU identified as %s, silicon rev %s\n", + mx31_cpu_type[i].name, mx31_cpu_type[i].v); + + mx31_cpu_rev = mx31_cpu_type[i].rev; + return; + } + + mx31_cpu_rev = IMX_CHIP_REVISION_UNKNOWN; + + printk(KERN_WARNING "Unknown CPU identifier. srev = %02x\n", srev); +} diff --git a/arch/arm/mach-imx/cpu-imx35.c b/arch/arm/mach-imx/cpu-imx35.c new file mode 100644 index 0000000..6637cd8 --- /dev/null +++ b/arch/arm/mach-imx/cpu-imx35.c @@ -0,0 +1,44 @@ +/* + * MX35 CPU type detection + * + * Copyright (c) 2009 Daniel Mack + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include +#include +#include + +unsigned int mx35_cpu_rev; +EXPORT_SYMBOL(mx35_cpu_rev); + +void __init mx35_read_cpu_rev(void) +{ + u32 rev; + char *srev; + + rev = __raw_readl(MX35_IO_ADDRESS(MX35_IIM_BASE_ADDR + MXC_IIMSREV)); + switch (rev) { + case 0x00: + mx35_cpu_rev = IMX_CHIP_REVISION_1_0; + srev = "1.0"; + break; + case 0x10: + mx35_cpu_rev = IMX_CHIP_REVISION_2_0; + srev = "2.0"; + break; + case 0x11: + mx35_cpu_rev = IMX_CHIP_REVISION_2_1; + srev = "2.1"; + break; + default: + mx35_cpu_rev = IMX_CHIP_REVISION_UNKNOWN; + srev = "unknown"; + } + + printk(KERN_INFO "CPU identified as i.MX35, silicon rev %s\n", srev); +} diff --git a/arch/arm/mach-imx/crmregs-imx31.h b/arch/arm/mach-imx/crmregs-imx31.h new file mode 100644 index 0000000..37a8a07 --- /dev/null +++ b/arch/arm/mach-imx/crmregs-imx31.h @@ -0,0 +1,248 @@ +/* + * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved. + * Copyright (C) 2008 by Sascha Hauer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef __ARCH_ARM_MACH_MX3_CRM_REGS_H__ +#define __ARCH_ARM_MACH_MX3_CRM_REGS_H__ + +#define CKIH_CLK_FREQ 26000000 +#define CKIH_CLK_FREQ_27MHZ 27000000 +#define CKIL_CLK_FREQ 32768 + +#define MXC_CCM_BASE MX31_IO_ADDRESS(MX31_CCM_BASE_ADDR) + +/* Register addresses */ +#define MXC_CCM_CCMR (MXC_CCM_BASE + 0x00) +#define MXC_CCM_PDR0 (MXC_CCM_BASE + 0x04) +#define MXC_CCM_PDR1 (MXC_CCM_BASE + 0x08) +#define MXC_CCM_RCSR (MXC_CCM_BASE + 0x0C) +#define MXC_CCM_MPCTL (MXC_CCM_BASE + 0x10) +#define MXC_CCM_UPCTL (MXC_CCM_BASE + 0x14) +#define MXC_CCM_SRPCTL (MXC_CCM_BASE + 0x18) +#define MXC_CCM_COSR (MXC_CCM_BASE + 0x1C) +#define MXC_CCM_CGR0 (MXC_CCM_BASE + 0x20) +#define MXC_CCM_CGR1 (MXC_CCM_BASE + 0x24) +#define MXC_CCM_CGR2 (MXC_CCM_BASE + 0x28) +#define MXC_CCM_WIMR (MXC_CCM_BASE + 0x2C) +#define MXC_CCM_LDC (MXC_CCM_BASE + 0x30) +#define MXC_CCM_DCVR0 (MXC_CCM_BASE + 0x34) +#define MXC_CCM_DCVR1 (MXC_CCM_BASE + 0x38) +#define MXC_CCM_DCVR2 (MXC_CCM_BASE + 0x3C) +#define MXC_CCM_DCVR3 (MXC_CCM_BASE + 0x40) +#define MXC_CCM_LTR0 (MXC_CCM_BASE + 0x44) +#define MXC_CCM_LTR1 (MXC_CCM_BASE + 0x48) +#define MXC_CCM_LTR2 (MXC_CCM_BASE + 0x4C) +#define MXC_CCM_LTR3 (MXC_CCM_BASE + 0x50) +#define MXC_CCM_LTBR0 (MXC_CCM_BASE + 0x54) +#define MXC_CCM_LTBR1 (MXC_CCM_BASE + 0x58) +#define MXC_CCM_PMCR0 (MXC_CCM_BASE + 0x5C) +#define MXC_CCM_PMCR1 (MXC_CCM_BASE + 0x60) +#define MXC_CCM_PDR2 (MXC_CCM_BASE + 0x64) + +/* Register bit definitions */ +#define MXC_CCM_CCMR_WBEN (1 << 27) +#define MXC_CCM_CCMR_CSCS (1 << 25) +#define MXC_CCM_CCMR_PERCS (1 << 24) +#define MXC_CCM_CCMR_SSI1S_OFFSET 18 +#define MXC_CCM_CCMR_SSI1S_MASK (0x3 << 18) +#define MXC_CCM_CCMR_SSI2S_OFFSET 21 +#define MXC_CCM_CCMR_SSI2S_MASK (0x3 << 21) +#define MXC_CCM_CCMR_LPM_OFFSET 14 +#define MXC_CCM_CCMR_LPM_MASK (0x3 << 14) +#define MXC_CCM_CCMR_FIRS_OFFSET 11 +#define MXC_CCM_CCMR_FIRS_MASK (0x3 << 11) +#define MXC_CCM_CCMR_UPE (1 << 9) +#define MXC_CCM_CCMR_SPE (1 << 8) +#define MXC_CCM_CCMR_MDS (1 << 7) +#define MXC_CCM_CCMR_SBYCS (1 << 4) +#define MXC_CCM_CCMR_MPE (1 << 3) +#define MXC_CCM_CCMR_PRCS_OFFSET 1 +#define MXC_CCM_CCMR_PRCS_MASK (0x3 << 1) + +#define MXC_CCM_PDR0_CSI_PODF_OFFSET 26 +#define MXC_CCM_PDR0_CSI_PODF_MASK (0x3F << 26) +#define MXC_CCM_PDR0_CSI_PRDF_OFFSET 23 +#define MXC_CCM_PDR0_CSI_PRDF_MASK (0x7 << 23) +#define MXC_CCM_PDR0_PER_PODF_OFFSET 16 +#define MXC_CCM_PDR0_PER_PODF_MASK (0x1F << 16) +#define MXC_CCM_PDR0_HSP_PODF_OFFSET 11 +#define MXC_CCM_PDR0_HSP_PODF_MASK (0x7 << 11) +#define MXC_CCM_PDR0_NFC_PODF_OFFSET 8 +#define MXC_CCM_PDR0_NFC_PODF_MASK (0x7 << 8) +#define MXC_CCM_PDR0_IPG_PODF_OFFSET 6 +#define MXC_CCM_PDR0_IPG_PODF_MASK (0x3 << 6) +#define MXC_CCM_PDR0_MAX_PODF_OFFSET 3 +#define MXC_CCM_PDR0_MAX_PODF_MASK (0x7 << 3) +#define MXC_CCM_PDR0_MCU_PODF_OFFSET 0 +#define MXC_CCM_PDR0_MCU_PODF_MASK 0x7 + +#define MXC_CCM_PDR1_USB_PRDF_OFFSET 30 +#define MXC_CCM_PDR1_USB_PRDF_MASK (0x3 << 30) +#define MXC_CCM_PDR1_USB_PODF_OFFSET 27 +#define MXC_CCM_PDR1_USB_PODF_MASK (0x7 << 27) +#define MXC_CCM_PDR1_FIRI_PRE_PODF_OFFSET 24 +#define MXC_CCM_PDR1_FIRI_PRE_PODF_MASK (0x7 << 24) +#define MXC_CCM_PDR1_FIRI_PODF_OFFSET 18 +#define MXC_CCM_PDR1_FIRI_PODF_MASK (0x3F << 18) +#define MXC_CCM_PDR1_SSI2_PRE_PODF_OFFSET 15 +#define MXC_CCM_PDR1_SSI2_PRE_PODF_MASK (0x7 << 15) +#define MXC_CCM_PDR1_SSI2_PODF_OFFSET 9 +#define MXC_CCM_PDR1_SSI2_PODF_MASK (0x3F << 9) +#define MXC_CCM_PDR1_SSI1_PRE_PODF_OFFSET 6 +#define MXC_CCM_PDR1_SSI1_PRE_PODF_MASK (0x7 << 6) +#define MXC_CCM_PDR1_SSI1_PODF_OFFSET 0 +#define MXC_CCM_PDR1_SSI1_PODF_MASK 0x3F + +/* Bit definitions for RCSR */ +#define MXC_CCM_RCSR_NF16B 0x80000000 + +/* + * LTR0 register offsets + */ +#define MXC_CCM_LTR0_DIV3CK_OFFSET 1 +#define MXC_CCM_LTR0_DIV3CK_MASK (0x3 << 1) +#define MXC_CCM_LTR0_DNTHR_OFFSET 16 +#define MXC_CCM_LTR0_DNTHR_MASK (0x3F << 16) +#define MXC_CCM_LTR0_UPTHR_OFFSET 22 +#define MXC_CCM_LTR0_UPTHR_MASK (0x3F << 22) + +/* + * LTR1 register offsets + */ +#define MXC_CCM_LTR1_PNCTHR_OFFSET 0 +#define MXC_CCM_LTR1_PNCTHR_MASK 0x3F +#define MXC_CCM_LTR1_UPCNT_OFFSET 6 +#define MXC_CCM_LTR1_UPCNT_MASK (0xFF << 6) +#define MXC_CCM_LTR1_DNCNT_OFFSET 14 +#define MXC_CCM_LTR1_DNCNT_MASK (0xFF << 14) +#define MXC_CCM_LTR1_LTBRSR_MASK 0x400000 +#define MXC_CCM_LTR1_LTBRSR_OFFSET 22 +#define MXC_CCM_LTR1_LTBRSR 0x400000 +#define MXC_CCM_LTR1_LTBRSH 0x800000 + +/* + * LTR2 bit definitions. x ranges from 0 for WSW9 to 6 for WSW15 + */ +#define MXC_CCM_LTR2_WSW_OFFSET(x) (11 + (x) * 3) +#define MXC_CCM_LTR2_WSW_MASK(x) (0x7 << \ + MXC_CCM_LTR2_WSW_OFFSET((x))) +#define MXC_CCM_LTR2_EMAC_OFFSET 0 +#define MXC_CCM_LTR2_EMAC_MASK 0x1FF + +/* + * LTR3 bit definitions. x ranges from 0 for WSW0 to 8 for WSW8 + */ +#define MXC_CCM_LTR3_WSW_OFFSET(x) (5 + (x) * 3) +#define MXC_CCM_LTR3_WSW_MASK(x) (0x7 << \ + MXC_CCM_LTR3_WSW_OFFSET((x))) + +#define MXC_CCM_PMCR0_DFSUP1 0x80000000 +#define MXC_CCM_PMCR0_DFSUP1_SPLL (0 << 31) +#define MXC_CCM_PMCR0_DFSUP1_MPLL (1 << 31) +#define MXC_CCM_PMCR0_DFSUP0 0x40000000 +#define MXC_CCM_PMCR0_DFSUP0_PLL (0 << 30) +#define MXC_CCM_PMCR0_DFSUP0_PDR (1 << 30) +#define MXC_CCM_PMCR0_DFSUP_MASK (0x3 << 30) + +#define DVSUP_TURBO 0 +#define DVSUP_HIGH 1 +#define DVSUP_MEDIUM 2 +#define DVSUP_LOW 3 +#define MXC_CCM_PMCR0_DVSUP_TURBO (DVSUP_TURBO << 28) +#define MXC_CCM_PMCR0_DVSUP_HIGH (DVSUP_HIGH << 28) +#define MXC_CCM_PMCR0_DVSUP_MEDIUM (DVSUP_MEDIUM << 28) +#define MXC_CCM_PMCR0_DVSUP_LOW (DVSUP_LOW << 28) +#define MXC_CCM_PMCR0_DVSUP_OFFSET 28 +#define MXC_CCM_PMCR0_DVSUP_MASK (0x3 << 28) +#define MXC_CCM_PMCR0_UDSC 0x08000000 +#define MXC_CCM_PMCR0_UDSC_MASK (1 << 27) +#define MXC_CCM_PMCR0_UDSC_UP (1 << 27) +#define MXC_CCM_PMCR0_UDSC_DOWN (0 << 27) + +#define MXC_CCM_PMCR0_VSCNT_1 (0x0 << 24) +#define MXC_CCM_PMCR0_VSCNT_2 (0x1 << 24) +#define MXC_CCM_PMCR0_VSCNT_3 (0x2 << 24) +#define MXC_CCM_PMCR0_VSCNT_4 (0x3 << 24) +#define MXC_CCM_PMCR0_VSCNT_5 (0x4 << 24) +#define MXC_CCM_PMCR0_VSCNT_6 (0x5 << 24) +#define MXC_CCM_PMCR0_VSCNT_7 (0x6 << 24) +#define MXC_CCM_PMCR0_VSCNT_8 (0x7 << 24) +#define MXC_CCM_PMCR0_VSCNT_OFFSET 24 +#define MXC_CCM_PMCR0_VSCNT_MASK (0x7 << 24) +#define MXC_CCM_PMCR0_DVFEV 0x00800000 +#define MXC_CCM_PMCR0_DVFIS 0x00400000 +#define MXC_CCM_PMCR0_LBMI 0x00200000 +#define MXC_CCM_PMCR0_LBFL 0x00100000 +#define MXC_CCM_PMCR0_LBCF_4 (0x0 << 18) +#define MXC_CCM_PMCR0_LBCF_8 (0x1 << 18) +#define MXC_CCM_PMCR0_LBCF_12 (0x2 << 18) +#define MXC_CCM_PMCR0_LBCF_16 (0x3 << 18) +#define MXC_CCM_PMCR0_LBCF_OFFSET 18 +#define MXC_CCM_PMCR0_LBCF_MASK (0x3 << 18) +#define MXC_CCM_PMCR0_PTVIS 0x00020000 +#define MXC_CCM_PMCR0_UPDTEN 0x00010000 +#define MXC_CCM_PMCR0_UPDTEN_MASK (0x1 << 16) +#define MXC_CCM_PMCR0_FSVAIM 0x00008000 +#define MXC_CCM_PMCR0_FSVAI_OFFSET 13 +#define MXC_CCM_PMCR0_FSVAI_MASK (0x3 << 13) +#define MXC_CCM_PMCR0_DPVCR 0x00001000 +#define MXC_CCM_PMCR0_DPVV 0x00000800 +#define MXC_CCM_PMCR0_WFIM 0x00000400 +#define MXC_CCM_PMCR0_DRCE3 0x00000200 +#define MXC_CCM_PMCR0_DRCE2 0x00000100 +#define MXC_CCM_PMCR0_DRCE1 0x00000080 +#define MXC_CCM_PMCR0_DRCE0 0x00000040 +#define MXC_CCM_PMCR0_DCR 0x00000020 +#define MXC_CCM_PMCR0_DVFEN 0x00000010 +#define MXC_CCM_PMCR0_PTVAIM 0x00000008 +#define MXC_CCM_PMCR0_PTVAI_OFFSET 1 +#define MXC_CCM_PMCR0_PTVAI_MASK (0x3 << 1) +#define MXC_CCM_PMCR0_DPTEN 0x00000001 + +#define MXC_CCM_PMCR1_DVGP_OFFSET 0 +#define MXC_CCM_PMCR1_DVGP_MASK (0xF) + +#define MXC_CCM_PMCR1_PLLRDIS (0x1 << 7) +#define MXC_CCM_PMCR1_EMIRQ_EN (0x1 << 8) + +#define MXC_CCM_DCVR_ULV_MASK (0x3FF << 22) +#define MXC_CCM_DCVR_ULV_OFFSET 22 +#define MXC_CCM_DCVR_LLV_MASK (0x3FF << 12) +#define MXC_CCM_DCVR_LLV_OFFSET 12 +#define MXC_CCM_DCVR_ELV_MASK (0x3FF << 2) +#define MXC_CCM_DCVR_ELV_OFFSET 2 + +#define MXC_CCM_PDR2_MST2_PDF_MASK (0x3F << 7) +#define MXC_CCM_PDR2_MST2_PDF_OFFSET 7 +#define MXC_CCM_PDR2_MST1_PDF_MASK 0x3F +#define MXC_CCM_PDR2_MST1_PDF_OFFSET 0 + +#define MXC_CCM_COSR_CLKOSEL_MASK 0x0F +#define MXC_CCM_COSR_CLKOSEL_OFFSET 0 +#define MXC_CCM_COSR_CLKOUTDIV_MASK (0x07 << 6) +#define MXC_CCM_COSR_CLKOUTDIV_OFFSET 6 +#define MXC_CCM_COSR_CLKOEN (1 << 9) + +/* + * PMCR0 register offsets + */ +#define MXC_CCM_PMCR0_LBFL_OFFSET 20 +#define MXC_CCM_PMCR0_DFSUP0_OFFSET 30 +#define MXC_CCM_PMCR0_DFSUP1_OFFSET 31 + +#endif /* __ARCH_ARM_MACH_MX3_CRM_REGS_H__ */ diff --git a/arch/arm/mach-imx/devices-imx1.h b/arch/arm/mach-imx/devices-imx1.h index da59365..3aad1e7 100644 --- a/arch/arm/mach-imx/devices-imx1.h +++ b/arch/arm/mach-imx/devices-imx1.h @@ -9,21 +9,21 @@ #include #include -extern const struct imx_imx_fb_data imx1_imx_fb_data __initconst; +extern const struct imx_imx_fb_data imx1_imx_fb_data; #define imx1_add_imx_fb(pdata) \ imx_add_imx_fb(&imx1_imx_fb_data, pdata) -extern const struct imx_imx_i2c_data imx1_imx_i2c_data __initconst; +extern const struct imx_imx_i2c_data imx1_imx_i2c_data; #define imx1_add_imx_i2c(pdata) \ imx_add_imx_i2c(&imx1_imx_i2c_data, pdata) -extern const struct imx_imx_uart_3irq_data imx1_imx_uart_data[] __initconst; +extern const struct imx_imx_uart_3irq_data imx1_imx_uart_data[]; #define imx1_add_imx_uart(id, pdata) \ imx_add_imx_uart_3irq(&imx1_imx_uart_data[id], pdata) #define imx1_add_imx_uart0(pdata) imx1_add_imx_uart(0, pdata) #define imx1_add_imx_uart1(pdata) imx1_add_imx_uart(1, pdata) -extern const struct imx_spi_imx_data imx1_cspi_data[] __initconst; +extern const struct imx_spi_imx_data imx1_cspi_data[]; #define imx1_add_cspi(id, pdata) \ imx_add_spi_imx(&imx1_cspi_data[id], pdata) diff --git a/arch/arm/mach-imx/devices-imx21.h b/arch/arm/mach-imx/devices-imx21.h index 16744d2..2628e0c 100644 --- a/arch/arm/mach-imx/devices-imx21.h +++ b/arch/arm/mach-imx/devices-imx21.h @@ -9,31 +9,31 @@ #include #include -extern const struct imx_imx21_hcd_data imx21_imx21_hcd_data __initconst; +extern const struct imx_imx21_hcd_data imx21_imx21_hcd_data; #define imx21_add_imx21_hcd(pdata) \ imx_add_imx21_hcd(&imx21_imx21_hcd_data, pdata) -extern const struct imx_imx2_wdt_data imx21_imx2_wdt_data __initconst; +extern const struct imx_imx2_wdt_data imx21_imx2_wdt_data; #define imx21_add_imx2_wdt(pdata) \ imx_add_imx2_wdt(&imx21_imx2_wdt_data) -extern const struct imx_imx_fb_data imx21_imx_fb_data __initconst; +extern const struct imx_imx_fb_data imx21_imx_fb_data; #define imx21_add_imx_fb(pdata) \ imx_add_imx_fb(&imx21_imx_fb_data, pdata) -extern const struct imx_imx_i2c_data imx21_imx_i2c_data __initconst; +extern const struct imx_imx_i2c_data imx21_imx_i2c_data; #define imx21_add_imx_i2c(pdata) \ imx_add_imx_i2c(&imx21_imx_i2c_data, pdata) -extern const struct imx_imx_keypad_data imx21_imx_keypad_data __initconst; +extern const struct imx_imx_keypad_data imx21_imx_keypad_data; #define imx21_add_imx_keypad(pdata) \ imx_add_imx_keypad(&imx21_imx_keypad_data, pdata) -extern const struct imx_imx_ssi_data imx21_imx_ssi_data[] __initconst; +extern const struct imx_imx_ssi_data imx21_imx_ssi_data[]; #define imx21_add_imx_ssi(id, pdata) \ imx_add_imx_ssi(&imx21_imx_ssi_data[id], pdata) -extern const struct imx_imx_uart_1irq_data imx21_imx_uart_data[] __initconst; +extern const struct imx_imx_uart_1irq_data imx21_imx_uart_data[]; #define imx21_add_imx_uart(id, pdata) \ imx_add_imx_uart_1irq(&imx21_imx_uart_data[id], pdata) #define imx21_add_imx_uart0(pdata) imx21_add_imx_uart(0, pdata) @@ -41,19 +41,19 @@ extern const struct imx_imx_uart_1irq_data imx21_imx_uart_data[] __initconst; #define imx21_add_imx_uart2(pdata) imx21_add_imx_uart(2, pdata) #define imx21_add_imx_uart3(pdata) imx21_add_imx_uart(3, pdata) -extern const struct imx_mxc_mmc_data imx21_mxc_mmc_data[] __initconst; +extern const struct imx_mxc_mmc_data imx21_mxc_mmc_data[]; #define imx21_add_mxc_mmc(id, pdata) \ imx_add_mxc_mmc(&imx21_mxc_mmc_data[id], pdata) -extern const struct imx_mxc_nand_data imx21_mxc_nand_data __initconst; +extern const struct imx_mxc_nand_data imx21_mxc_nand_data; #define imx21_add_mxc_nand(pdata) \ imx_add_mxc_nand(&imx21_mxc_nand_data, pdata) -extern const struct imx_mxc_w1_data imx21_mxc_w1_data __initconst; +extern const struct imx_mxc_w1_data imx21_mxc_w1_data; #define imx21_add_mxc_w1(pdata) \ imx_add_mxc_w1(&imx21_mxc_w1_data) -extern const struct imx_spi_imx_data imx21_cspi_data[] __initconst; +extern const struct imx_spi_imx_data imx21_cspi_data[]; #define imx21_add_cspi(id, pdata) \ imx_add_spi_imx(&imx21_cspi_data[id], pdata) #define imx21_add_spi_imx0(pdata) imx21_add_cspi(0, pdata) diff --git a/arch/arm/mach-imx/devices-imx25.h b/arch/arm/mach-imx/devices-imx25.h index b591d72..efa0761 100644 --- a/arch/arm/mach-imx/devices-imx25.h +++ b/arch/arm/mach-imx/devices-imx25.h @@ -9,48 +9,48 @@ #include #include -extern const struct imx_fec_data imx25_fec_data __initconst; +extern const struct imx_fec_data imx25_fec_data; #define imx25_add_fec(pdata) \ imx_add_fec(&imx25_fec_data, pdata) -extern const struct imx_flexcan_data imx25_flexcan_data[] __initconst; +extern const struct imx_flexcan_data imx25_flexcan_data[]; #define imx25_add_flexcan(id, pdata) \ imx_add_flexcan(&imx25_flexcan_data[id], pdata) #define imx25_add_flexcan0(pdata) imx25_add_flexcan(0, pdata) #define imx25_add_flexcan1(pdata) imx25_add_flexcan(1, pdata) -extern const struct imx_fsl_usb2_udc_data imx25_fsl_usb2_udc_data __initconst; +extern const struct imx_fsl_usb2_udc_data imx25_fsl_usb2_udc_data; #define imx25_add_fsl_usb2_udc(pdata) \ imx_add_fsl_usb2_udc(&imx25_fsl_usb2_udc_data, pdata) -extern struct imx_imxdi_rtc_data imx25_imxdi_rtc_data __initconst; +extern struct imx_imxdi_rtc_data imx25_imxdi_rtc_data; #define imx25_add_imxdi_rtc(pdata) \ imx_add_imxdi_rtc(&imx25_imxdi_rtc_data) -extern const struct imx_imx2_wdt_data imx25_imx2_wdt_data __initconst; +extern const struct imx_imx2_wdt_data imx25_imx2_wdt_data; #define imx25_add_imx2_wdt(pdata) \ imx_add_imx2_wdt(&imx25_imx2_wdt_data) -extern const struct imx_imx_fb_data imx25_imx_fb_data __initconst; +extern const struct imx_imx_fb_data imx25_imx_fb_data; #define imx25_add_imx_fb(pdata) \ imx_add_imx_fb(&imx25_imx_fb_data, pdata) -extern const struct imx_imx_i2c_data imx25_imx_i2c_data[] __initconst; +extern const struct imx_imx_i2c_data imx25_imx_i2c_data[]; #define imx25_add_imx_i2c(id, pdata) \ imx_add_imx_i2c(&imx25_imx_i2c_data[id], pdata) #define imx25_add_imx_i2c0(pdata) imx25_add_imx_i2c(0, pdata) #define imx25_add_imx_i2c1(pdata) imx25_add_imx_i2c(1, pdata) #define imx25_add_imx_i2c2(pdata) imx25_add_imx_i2c(2, pdata) -extern const struct imx_imx_keypad_data imx25_imx_keypad_data __initconst; +extern const struct imx_imx_keypad_data imx25_imx_keypad_data; #define imx25_add_imx_keypad(pdata) \ imx_add_imx_keypad(&imx25_imx_keypad_data, pdata) -extern const struct imx_imx_ssi_data imx25_imx_ssi_data[] __initconst; +extern const struct imx_imx_ssi_data imx25_imx_ssi_data[]; #define imx25_add_imx_ssi(id, pdata) \ imx_add_imx_ssi(&imx25_imx_ssi_data[id], pdata) -extern const struct imx_imx_uart_1irq_data imx25_imx_uart_data[] __initconst; +extern const struct imx_imx_uart_1irq_data imx25_imx_uart_data[]; #define imx25_add_imx_uart(id, pdata) \ imx_add_imx_uart_1irq(&imx25_imx_uart_data[id], pdata) #define imx25_add_imx_uart0(pdata) imx25_add_imx_uart(0, pdata) @@ -59,33 +59,32 @@ extern const struct imx_imx_uart_1irq_data imx25_imx_uart_data[] __initconst; #define imx25_add_imx_uart3(pdata) imx25_add_imx_uart(3, pdata) #define imx25_add_imx_uart4(pdata) imx25_add_imx_uart(4, pdata) -extern const struct imx_mx2_camera_data imx25_mx2_camera_data __initconst; +extern const struct imx_mx2_camera_data imx25_mx2_camera_data; #define imx25_add_mx2_camera(pdata) \ imx_add_mx2_camera(&imx25_mx2_camera_data, pdata) -extern const struct imx_mxc_ehci_data imx25_mxc_ehci_otg_data __initconst; +extern const struct imx_mxc_ehci_data imx25_mxc_ehci_otg_data; #define imx25_add_mxc_ehci_otg(pdata) \ imx_add_mxc_ehci(&imx25_mxc_ehci_otg_data, pdata) -extern const struct imx_mxc_ehci_data imx25_mxc_ehci_hs_data __initconst; +extern const struct imx_mxc_ehci_data imx25_mxc_ehci_hs_data; #define imx25_add_mxc_ehci_hs(pdata) \ imx_add_mxc_ehci(&imx25_mxc_ehci_hs_data, pdata) -extern const struct imx_mxc_nand_data imx25_mxc_nand_data __initconst; +extern const struct imx_mxc_nand_data imx25_mxc_nand_data; #define imx25_add_mxc_nand(pdata) \ imx_add_mxc_nand(&imx25_mxc_nand_data, pdata) -extern const struct imx_sdhci_esdhc_imx_data -imx25_sdhci_esdhc_imx_data[] __initconst; +extern const struct imx_sdhci_esdhc_imx_data imx25_sdhci_esdhc_imx_data[]; #define imx25_add_sdhci_esdhc_imx(id, pdata) \ imx_add_sdhci_esdhc_imx(&imx25_sdhci_esdhc_imx_data[id], pdata) -extern const struct imx_spi_imx_data imx25_cspi_data[] __initconst; +extern const struct imx_spi_imx_data imx25_cspi_data[]; #define imx25_add_spi_imx(id, pdata) \ imx_add_spi_imx(&imx25_cspi_data[id], pdata) #define imx25_add_spi_imx0(pdata) imx25_add_spi_imx(0, pdata) #define imx25_add_spi_imx1(pdata) imx25_add_spi_imx(1, pdata) #define imx25_add_spi_imx2(pdata) imx25_add_spi_imx(2, pdata) -extern struct imx_mxc_pwm_data imx25_mxc_pwm_data[] __initconst; +extern struct imx_mxc_pwm_data imx25_mxc_pwm_data[]; #define imx25_add_mxc_pwm(id) \ imx_add_mxc_pwm(&imx25_mxc_pwm_data[id]) diff --git a/arch/arm/mach-imx/devices-imx27.h b/arch/arm/mach-imx/devices-imx27.h index f1272d4..7f97a3c 100644 --- a/arch/arm/mach-imx/devices-imx27.h +++ b/arch/arm/mach-imx/devices-imx27.h @@ -9,35 +9,35 @@ #include #include -extern const struct imx_fec_data imx27_fec_data __initconst; +extern const struct imx_fec_data imx27_fec_data; #define imx27_add_fec(pdata) \ imx_add_fec(&imx27_fec_data, pdata) -extern const struct imx_fsl_usb2_udc_data imx27_fsl_usb2_udc_data __initconst; +extern const struct imx_fsl_usb2_udc_data imx27_fsl_usb2_udc_data; #define imx27_add_fsl_usb2_udc(pdata) \ imx_add_fsl_usb2_udc(&imx27_fsl_usb2_udc_data, pdata) -extern const struct imx_imx2_wdt_data imx27_imx2_wdt_data __initconst; +extern const struct imx_imx2_wdt_data imx27_imx2_wdt_data; #define imx27_add_imx2_wdt(pdata) \ imx_add_imx2_wdt(&imx27_imx2_wdt_data) -extern const struct imx_imx_fb_data imx27_imx_fb_data __initconst; +extern const struct imx_imx_fb_data imx27_imx_fb_data; #define imx27_add_imx_fb(pdata) \ imx_add_imx_fb(&imx27_imx_fb_data, pdata) -extern const struct imx_imx_i2c_data imx27_imx_i2c_data[] __initconst; +extern const struct imx_imx_i2c_data imx27_imx_i2c_data[]; #define imx27_add_imx_i2c(id, pdata) \ imx_add_imx_i2c(&imx27_imx_i2c_data[id], pdata) -extern const struct imx_imx_keypad_data imx27_imx_keypad_data __initconst; +extern const struct imx_imx_keypad_data imx27_imx_keypad_data; #define imx27_add_imx_keypad(pdata) \ imx_add_imx_keypad(&imx27_imx_keypad_data, pdata) -extern const struct imx_imx_ssi_data imx27_imx_ssi_data[] __initconst; +extern const struct imx_imx_ssi_data imx27_imx_ssi_data[]; #define imx27_add_imx_ssi(id, pdata) \ imx_add_imx_ssi(&imx27_imx_ssi_data[id], pdata) -extern const struct imx_imx_uart_1irq_data imx27_imx_uart_data[] __initconst; +extern const struct imx_imx_uart_1irq_data imx27_imx_uart_data[]; #define imx27_add_imx_uart(id, pdata) \ imx_add_imx_uart_1irq(&imx27_imx_uart_data[id], pdata) #define imx27_add_imx_uart0(pdata) imx27_add_imx_uart(0, pdata) @@ -47,30 +47,30 @@ extern const struct imx_imx_uart_1irq_data imx27_imx_uart_data[] __initconst; #define imx27_add_imx_uart4(pdata) imx27_add_imx_uart(4, pdata) #define imx27_add_imx_uart5(pdata) imx27_add_imx_uart(5, pdata) -extern const struct imx_mx2_camera_data imx27_mx2_camera_data __initconst; +extern const struct imx_mx2_camera_data imx27_mx2_camera_data; #define imx27_add_mx2_camera(pdata) \ imx_add_mx2_camera(&imx27_mx2_camera_data, pdata) -extern const struct imx_mxc_ehci_data imx27_mxc_ehci_otg_data __initconst; +extern const struct imx_mxc_ehci_data imx27_mxc_ehci_otg_data; #define imx27_add_mxc_ehci_otg(pdata) \ imx_add_mxc_ehci(&imx27_mxc_ehci_otg_data, pdata) -extern const struct imx_mxc_ehci_data imx27_mxc_ehci_hs_data[] __initconst; +extern const struct imx_mxc_ehci_data imx27_mxc_ehci_hs_data[]; #define imx27_add_mxc_ehci_hs(id, pdata) \ imx_add_mxc_ehci(&imx27_mxc_ehci_hs_data[id - 1], pdata) -extern const struct imx_mxc_mmc_data imx27_mxc_mmc_data[] __initconst; +extern const struct imx_mxc_mmc_data imx27_mxc_mmc_data[]; #define imx27_add_mxc_mmc(id, pdata) \ imx_add_mxc_mmc(&imx27_mxc_mmc_data[id], pdata) -extern const struct imx_mxc_nand_data imx27_mxc_nand_data __initconst; +extern const struct imx_mxc_nand_data imx27_mxc_nand_data; #define imx27_add_mxc_nand(pdata) \ imx_add_mxc_nand(&imx27_mxc_nand_data, pdata) -extern const struct imx_mxc_w1_data imx27_mxc_w1_data __initconst; +extern const struct imx_mxc_w1_data imx27_mxc_w1_data; #define imx27_add_mxc_w1(pdata) \ imx_add_mxc_w1(&imx27_mxc_w1_data) -extern const struct imx_spi_imx_data imx27_cspi_data[] __initconst; +extern const struct imx_spi_imx_data imx27_cspi_data[]; #define imx27_add_cspi(id, pdata) \ imx_add_spi_imx(&imx27_cspi_data[id], pdata) #define imx27_add_spi_imx0(pdata) imx27_add_cspi(0, pdata) diff --git a/arch/arm/mach-imx/devices-imx31.h b/arch/arm/mach-imx/devices-imx31.h new file mode 100644 index 0000000..dbe940d --- /dev/null +++ b/arch/arm/mach-imx/devices-imx31.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2010 Pengutronix + * Uwe Kleine-Koenig + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. + */ +#include +#include + +extern const struct imx_fsl_usb2_udc_data imx31_fsl_usb2_udc_data; +#define imx31_add_fsl_usb2_udc(pdata) \ + imx_add_fsl_usb2_udc(&imx31_fsl_usb2_udc_data, pdata) + +extern const struct imx_imx2_wdt_data imx31_imx2_wdt_data; +#define imx31_add_imx2_wdt(pdata) \ + imx_add_imx2_wdt(&imx31_imx2_wdt_data) + +extern const struct imx_imx_i2c_data imx31_imx_i2c_data[]; +#define imx31_add_imx_i2c(id, pdata) \ + imx_add_imx_i2c(&imx31_imx_i2c_data[id], pdata) +#define imx31_add_imx_i2c0(pdata) imx31_add_imx_i2c(0, pdata) +#define imx31_add_imx_i2c1(pdata) imx31_add_imx_i2c(1, pdata) +#define imx31_add_imx_i2c2(pdata) imx31_add_imx_i2c(2, pdata) + +extern const struct imx_imx_keypad_data imx31_imx_keypad_data; +#define imx31_add_imx_keypad(pdata) \ + imx_add_imx_keypad(&imx31_imx_keypad_data, pdata) + +extern const struct imx_imx_ssi_data imx31_imx_ssi_data[]; +#define imx31_add_imx_ssi(id, pdata) \ + imx_add_imx_ssi(&imx31_imx_ssi_data[id], pdata) + +extern const struct imx_imx_uart_1irq_data imx31_imx_uart_data[]; +#define imx31_add_imx_uart(id, pdata) \ + imx_add_imx_uart_1irq(&imx31_imx_uart_data[id], pdata) +#define imx31_add_imx_uart0(pdata) imx31_add_imx_uart(0, pdata) +#define imx31_add_imx_uart1(pdata) imx31_add_imx_uart(1, pdata) +#define imx31_add_imx_uart2(pdata) imx31_add_imx_uart(2, pdata) +#define imx31_add_imx_uart3(pdata) imx31_add_imx_uart(3, pdata) +#define imx31_add_imx_uart4(pdata) imx31_add_imx_uart(4, pdata) + +extern const struct imx_ipu_core_data imx31_ipu_core_data; +#define imx31_add_ipu_core(pdata) \ + imx_add_ipu_core(&imx31_ipu_core_data, pdata) +#define imx31_alloc_mx3_camera(pdata) \ + imx_alloc_mx3_camera(&imx31_ipu_core_data, pdata) +#define imx31_add_mx3_sdc_fb(pdata) \ + imx_add_mx3_sdc_fb(&imx31_ipu_core_data, pdata) + +extern const struct imx_mxc_ehci_data imx31_mxc_ehci_otg_data; +#define imx31_add_mxc_ehci_otg(pdata) \ + imx_add_mxc_ehci(&imx31_mxc_ehci_otg_data, pdata) +extern const struct imx_mxc_ehci_data imx31_mxc_ehci_hs_data[]; +#define imx31_add_mxc_ehci_hs(id, pdata) \ + imx_add_mxc_ehci(&imx31_mxc_ehci_hs_data[id - 1], pdata) + +extern const struct imx_mxc_mmc_data imx31_mxc_mmc_data[]; +#define imx31_add_mxc_mmc(id, pdata) \ + imx_add_mxc_mmc(&imx31_mxc_mmc_data[id], pdata) + +extern const struct imx_mxc_nand_data imx31_mxc_nand_data; +#define imx31_add_mxc_nand(pdata) \ + imx_add_mxc_nand(&imx31_mxc_nand_data, pdata) + +extern const struct imx_mxc_rtc_data imx31_mxc_rtc_data; +#define imx31_add_mxc_rtc(pdata) \ + imx_add_mxc_rtc(&imx31_mxc_rtc_data) + +extern const struct imx_mxc_w1_data imx31_mxc_w1_data; +#define imx31_add_mxc_w1(pdata) \ + imx_add_mxc_w1(&imx31_mxc_w1_data) + +extern const struct imx_spi_imx_data imx31_cspi_data[]; +#define imx31_add_cspi(id, pdata) \ + imx_add_spi_imx(&imx31_cspi_data[id], pdata) +#define imx31_add_spi_imx0(pdata) imx31_add_cspi(0, pdata) +#define imx31_add_spi_imx1(pdata) imx31_add_cspi(1, pdata) +#define imx31_add_spi_imx2(pdata) imx31_add_cspi(2, pdata) diff --git a/arch/arm/mach-imx/devices-imx35.h b/arch/arm/mach-imx/devices-imx35.h new file mode 100644 index 0000000..234cbd3 --- /dev/null +++ b/arch/arm/mach-imx/devices-imx35.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2010 Pengutronix + * Uwe Kleine-Koenig + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. + */ +#include +#include + +extern const struct imx_fec_data imx35_fec_data; +#define imx35_add_fec(pdata) \ + imx_add_fec(&imx35_fec_data, pdata) + +extern const struct imx_fsl_usb2_udc_data imx35_fsl_usb2_udc_data; +#define imx35_add_fsl_usb2_udc(pdata) \ + imx_add_fsl_usb2_udc(&imx35_fsl_usb2_udc_data, pdata) + +extern const struct imx_flexcan_data imx35_flexcan_data[]; +#define imx35_add_flexcan(id, pdata) \ + imx_add_flexcan(&imx35_flexcan_data[id], pdata) +#define imx35_add_flexcan0(pdata) imx35_add_flexcan(0, pdata) +#define imx35_add_flexcan1(pdata) imx35_add_flexcan(1, pdata) + +extern const struct imx_imx2_wdt_data imx35_imx2_wdt_data; +#define imx35_add_imx2_wdt(pdata) \ + imx_add_imx2_wdt(&imx35_imx2_wdt_data) + +extern const struct imx_imx_i2c_data imx35_imx_i2c_data[]; +#define imx35_add_imx_i2c(id, pdata) \ + imx_add_imx_i2c(&imx35_imx_i2c_data[id], pdata) +#define imx35_add_imx_i2c0(pdata) imx35_add_imx_i2c(0, pdata) +#define imx35_add_imx_i2c1(pdata) imx35_add_imx_i2c(1, pdata) +#define imx35_add_imx_i2c2(pdata) imx35_add_imx_i2c(2, pdata) + +extern const struct imx_imx_keypad_data imx35_imx_keypad_data; +#define imx35_add_imx_keypad(pdata) \ + imx_add_imx_keypad(&imx35_imx_keypad_data, pdata) + +extern const struct imx_imx_ssi_data imx35_imx_ssi_data[]; +#define imx35_add_imx_ssi(id, pdata) \ + imx_add_imx_ssi(&imx35_imx_ssi_data[id], pdata) + +extern const struct imx_imx_uart_1irq_data imx35_imx_uart_data[]; +#define imx35_add_imx_uart(id, pdata) \ + imx_add_imx_uart_1irq(&imx35_imx_uart_data[id], pdata) +#define imx35_add_imx_uart0(pdata) imx35_add_imx_uart(0, pdata) +#define imx35_add_imx_uart1(pdata) imx35_add_imx_uart(1, pdata) +#define imx35_add_imx_uart2(pdata) imx35_add_imx_uart(2, pdata) + +extern const struct imx_ipu_core_data imx35_ipu_core_data; +#define imx35_add_ipu_core(pdata) \ + imx_add_ipu_core(&imx35_ipu_core_data, pdata) +#define imx35_alloc_mx3_camera(pdata) \ + imx_alloc_mx3_camera(&imx35_ipu_core_data, pdata) +#define imx35_add_mx3_sdc_fb(pdata) \ + imx_add_mx3_sdc_fb(&imx35_ipu_core_data, pdata) + +extern const struct imx_mxc_ehci_data imx35_mxc_ehci_otg_data; +#define imx35_add_mxc_ehci_otg(pdata) \ + imx_add_mxc_ehci(&imx35_mxc_ehci_otg_data, pdata) +extern const struct imx_mxc_ehci_data imx35_mxc_ehci_hs_data; +#define imx35_add_mxc_ehci_hs(pdata) \ + imx_add_mxc_ehci(&imx35_mxc_ehci_hs_data, pdata) + +extern const struct imx_mxc_nand_data imx35_mxc_nand_data; +#define imx35_add_mxc_nand(pdata) \ + imx_add_mxc_nand(&imx35_mxc_nand_data, pdata) + +extern const struct imx_mxc_w1_data imx35_mxc_w1_data; +#define imx35_add_mxc_w1(pdata) \ + imx_add_mxc_w1(&imx35_mxc_w1_data) + +extern const struct imx_sdhci_esdhc_imx_data imx35_sdhci_esdhc_imx_data[]; +#define imx35_add_sdhci_esdhc_imx(id, pdata) \ + imx_add_sdhci_esdhc_imx(&imx35_sdhci_esdhc_imx_data[id], pdata) + +extern const struct imx_spi_imx_data imx35_cspi_data[]; +#define imx35_add_cspi(id, pdata) \ + imx_add_spi_imx(&imx35_cspi_data[id], pdata) +#define imx35_add_spi_imx0(pdata) imx35_add_cspi(0, pdata) +#define imx35_add_spi_imx1(pdata) imx35_add_cspi(1, pdata) diff --git a/arch/arm/mach-imx/ehci-imx31.c b/arch/arm/mach-imx/ehci-imx31.c new file mode 100644 index 0000000..faad0f1 --- /dev/null +++ b/arch/arm/mach-imx/ehci-imx31.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2009 Daniel Mack + * Copyright (C) 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include +#include + +#include +#include + +#define USBCTRL_OTGBASE_OFFSET 0x600 + +#define MX31_OTG_SIC_SHIFT 29 +#define MX31_OTG_SIC_MASK (0x3 << MX31_OTG_SIC_SHIFT) +#define MX31_OTG_PM_BIT (1 << 24) + +#define MX31_H2_SIC_SHIFT 21 +#define MX31_H2_SIC_MASK (0x3 << MX31_H2_SIC_SHIFT) +#define MX31_H2_PM_BIT (1 << 16) +#define MX31_H2_DT_BIT (1 << 5) + +#define MX31_H1_SIC_SHIFT 13 +#define MX31_H1_SIC_MASK (0x3 << MX31_H1_SIC_SHIFT) +#define MX31_H1_PM_BIT (1 << 8) +#define MX31_H1_DT_BIT (1 << 4) + +int mx31_initialize_usb_hw(int port, unsigned int flags) +{ + unsigned int v; + + v = readl(MX31_IO_ADDRESS(MX31_USB_BASE_ADDR + USBCTRL_OTGBASE_OFFSET)); + + switch (port) { + case 0: /* OTG port */ + v &= ~(MX31_OTG_SIC_MASK | MX31_OTG_PM_BIT); + v |= (flags & MXC_EHCI_INTERFACE_MASK) << MX31_OTG_SIC_SHIFT; + + if (!(flags & MXC_EHCI_POWER_PINS_ENABLED)) + v |= MX31_OTG_PM_BIT; + + break; + case 1: /* H1 port */ + v &= ~(MX31_H1_SIC_MASK | MX31_H1_PM_BIT | MX31_H1_DT_BIT); + v |= (flags & MXC_EHCI_INTERFACE_MASK) << MX31_H1_SIC_SHIFT; + + if (!(flags & MXC_EHCI_POWER_PINS_ENABLED)) + v |= MX31_H1_PM_BIT; + + if (!(flags & MXC_EHCI_TTL_ENABLED)) + v |= MX31_H1_DT_BIT; + + break; + case 2: /* H2 port */ + v &= ~(MX31_H2_SIC_MASK | MX31_H2_PM_BIT | MX31_H2_DT_BIT); + v |= (flags & MXC_EHCI_INTERFACE_MASK) << MX31_H2_SIC_SHIFT; + + if (!(flags & MXC_EHCI_POWER_PINS_ENABLED)) + v |= MX31_H2_PM_BIT; + + if (!(flags & MXC_EHCI_TTL_ENABLED)) + v |= MX31_H2_DT_BIT; + + break; + default: + return -EINVAL; + } + + writel(v, MX31_IO_ADDRESS(MX31_USB_BASE_ADDR + USBCTRL_OTGBASE_OFFSET)); + + return 0; +} diff --git a/arch/arm/mach-imx/ehci-imx35.c b/arch/arm/mach-imx/ehci-imx35.c new file mode 100644 index 0000000..001ec39 --- /dev/null +++ b/arch/arm/mach-imx/ehci-imx35.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2009 Daniel Mack + * Copyright (C) 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include +#include + +#include +#include + +#define USBCTRL_OTGBASE_OFFSET 0x600 + +#define MX35_OTG_SIC_SHIFT 29 +#define MX35_OTG_SIC_MASK (0x3 << MX35_OTG_SIC_SHIFT) +#define MX35_OTG_PM_BIT (1 << 24) + +#define MX35_H1_SIC_SHIFT 21 +#define MX35_H1_SIC_MASK (0x3 << MX35_H1_SIC_SHIFT) +#define MX35_H1_PM_BIT (1 << 8) +#define MX35_H1_IPPUE_UP_BIT (1 << 7) +#define MX35_H1_IPPUE_DOWN_BIT (1 << 6) +#define MX35_H1_TLL_BIT (1 << 5) +#define MX35_H1_USBTE_BIT (1 << 4) + +int mx35_initialize_usb_hw(int port, unsigned int flags) +{ + unsigned int v; + + v = readl(MX35_IO_ADDRESS(MX35_USB_BASE_ADDR + USBCTRL_OTGBASE_OFFSET)); + + switch (port) { + case 0: /* OTG port */ + v &= ~(MX35_OTG_SIC_MASK | MX35_OTG_PM_BIT); + v |= (flags & MXC_EHCI_INTERFACE_MASK) << MX35_OTG_SIC_SHIFT; + + if (!(flags & MXC_EHCI_POWER_PINS_ENABLED)) + v |= MX35_OTG_PM_BIT; + + break; + case 1: /* H1 port */ + v &= ~(MX35_H1_SIC_MASK | MX35_H1_PM_BIT | MX35_H1_TLL_BIT | + MX35_H1_USBTE_BIT | MX35_H1_IPPUE_DOWN_BIT | MX35_H1_IPPUE_UP_BIT); + v |= (flags & MXC_EHCI_INTERFACE_MASK) << MX35_H1_SIC_SHIFT; + + if (!(flags & MXC_EHCI_POWER_PINS_ENABLED)) + v |= MX35_H1_PM_BIT; + + if (!(flags & MXC_EHCI_TTL_ENABLED)) + v |= MX35_H1_TLL_BIT; + + if (flags & MXC_EHCI_INTERNAL_PHY) + v |= MX35_H1_USBTE_BIT; + + if (flags & MXC_EHCI_IPPUE_DOWN) + v |= MX35_H1_IPPUE_DOWN_BIT; + + if (flags & MXC_EHCI_IPPUE_UP) + v |= MX35_H1_IPPUE_UP_BIT; + + break; + default: + return -EINVAL; + } + + writel(v, MX35_IO_ADDRESS(MX35_USB_BASE_ADDR + USBCTRL_OTGBASE_OFFSET)); + + return 0; +} diff --git a/arch/arm/mach-imx/eukrea_mbimx27-baseboard.c b/arch/arm/mach-imx/eukrea_mbimx27-baseboard.c index fa5288018..5911281 100644 --- a/arch/arm/mach-imx/eukrea_mbimx27-baseboard.c +++ b/arch/arm/mach-imx/eukrea_mbimx27-baseboard.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include "devices-imx27.h" diff --git a/arch/arm/mach-imx/eukrea_mbimxsd25-baseboard.c b/arch/arm/mach-imx/eukrea_mbimxsd25-baseboard.c index 6269053..f9ef04a 100644 --- a/arch/arm/mach-imx/eukrea_mbimxsd25-baseboard.c +++ b/arch/arm/mach-imx/eukrea_mbimxsd25-baseboard.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include