From: Paul Jackson This patch provides a minimal mechanism to support the safe cpuset-relative management of CPU and Memory placement from user library code, in the face of possible external migration to different CPUs and Memory Nodes. The interface presented to user space for cpusets uses system wide numbering of CPUs and Memory Nodes. It is the responsibility of user level code, presumably in a library, to present cpuset-relative numbering to applications when that would be more useful to them. However, if a task is moved to a different cpuset, or if the 'cpus' or 'mems' of a cpuset are changed, then we need a way for such library code to detect that its cpuset-relative numbering has changed, when expressed using system wide numbering. The kernel cannot safely allow user code to lock kernel resources. The kernel could deliver out-of-band notice of cpuset changes by such mechanisms as signals or usermodehelper callbacks; however, this can't be delivered to library code linked in applications without intruding on the IPC mechanisms available to the app. The kernel could require user level code to do all the work, tracking the cpuset state before and during changes, to verify no unexpected change occurred, but this becomes an onerous task. The "marker_pid" cpuset field provides a simple way to make this task less onerous on user library code. The code writes its pid to a cpuset's "marker_pid" at the start of a sequence of queries and updates, and checks as it goes that the cpuset's marker_pid doesn't change. The pread(2) system call does a seek and read in a single call. If the marker_pid changes, the library code should retry the required sequence of operations. Anytime that a task modifies the "cpus" or "mems" of a cpuset, unless its pid is in the cpuset's marker_pid field, the kernel zeros this field. The above was inspired by the load linked and store conditional (ll/sc) instructions in the MIPS II instruction set. 
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton --- kernel/cpuset.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 74 insertions(+) diff -puN kernel/cpuset.c~cpuset-change-marker-for-relative-numbering kernel/cpuset.c --- devel/kernel/cpuset.c~cpuset-change-marker-for-relative-numbering 2005-11-04 23:13:49.000000000 -0800 +++ devel-akpm/kernel/cpuset.c 2005-11-04 23:13:50.000000000 -0800 @@ -60,6 +60,7 @@ struct cpuset { unsigned long flags; /* "unsigned long" so bitops work */ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + pid_t marker_pid; /* pid of task doing marked updates */ /* * Count is atomic so can incr (fork) or decr (exit) without a lock. @@ -130,10 +131,49 @@ static inline int notify_on_release(cons */ static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); +/* + * marker_pid -- managing cpuset changes safely from user space. + * + * The interface presented to user space for cpusets uses system wide + * numbering of CPUs and Memory Nodes. It is the responsibility of + * user level code, presumably in a library, to present cpuset-relative + * numbering to applications when that would be more useful to them. + * + * However if a task is moved to a different cpuset, or if the 'cpus' + * or 'mems' of a cpuset are changed, then we need a way for such + * library code to detect that its cpuset-relative numbering has + * changed, when expressed using system wide numbering. + * + * The kernel cannot safely allow user code to lock kernel resources. + * The kernel could deliver out-of-band notice of cpuset changes by + * such mechanisms as signals or usermodehelper callbacks, however + * this can't be delivered to library code linked in applications + * without intruding on the IPC mechanisms available to the app. 
+ * The kernel could require user level code to do all the work, + * tracking the cpuset state before and during changes, to verify no + * unexpected change occurred, but this becomes an onerous task. + * + * The "marker_pid" cpuset field provides a simple way to make this + * task less onerous on user library code. A task writes its pid + * to a cpuset's "marker_pid" at the start of a sequence of queries + * and updates, and checks as it goes that the cpuset's marker_pid + * doesn't change. The pread(2) system call does a seek and read in + * a single call. If the marker_pid changes, the user code should + * retry the required sequence of operations. + * + * Anytime that a task modifies the "cpus" or "mems" of a cpuset, + * unless its pid is in the cpuset's marker_pid field, the kernel + * zeros this field. + * + * The above was inspired by the load linked and store conditional + * (ll/sc) instructions in the MIPS II instruction set. + */ + static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .cpus_allowed = CPU_MASK_ALL, .mems_allowed = NODE_MASK_ALL, + .marker_pid = 0, .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), @@ -798,6 +838,19 @@ static int update_nodemask(struct cpuset } /* + * Call with manage_sem held. 
+ */ + +static int update_marker_pid(struct cpuset *cs, char *buf) +{ + if (simple_strtoul(buf, NULL, 10) != 0) + cs->marker_pid = current->pid; + else + cs->marker_pid = 0; + return 0; +} + +/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, * CS_NOTIFY_ON_RELEASE) @@ -915,6 +968,7 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_NOTIFY_ON_RELEASE, + FILE_MARKER_PID, FILE_TASKLIST, } cpuset_filetype_t; @@ -927,6 +981,7 @@ static ssize_t cpuset_common_file_write( char *buffer; char *pathbuf = NULL; int retval = 0; + int marked_change; /* Crude upper limit on largest legitimate cpulist user might write. */ if (nbytes > 100 + 6 * NR_CPUS) @@ -949,12 +1004,15 @@ static ssize_t cpuset_common_file_write( goto out2; } + marked_change = 0; switch (type) { case FILE_CPULIST: retval = update_cpumask(cs, buffer); + marked_change = 1; break; case FILE_MEMLIST: retval = update_nodemask(cs, buffer); + marked_change = 1; break; case FILE_CPU_EXCLUSIVE: retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); @@ -965,6 +1023,9 @@ static ssize_t cpuset_common_file_write( case FILE_NOTIFY_ON_RELEASE: retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; + case FILE_MARKER_PID: + retval = update_marker_pid(cs, buffer); + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -973,6 +1034,9 @@ static ssize_t cpuset_common_file_write( goto out2; } + if (marked_change && retval == 0 && cs->marker_pid != current->pid) + cs->marker_pid = 0; + if (retval == 0) retval = nbytes; out2: @@ -1065,6 +1129,9 @@ static ssize_t cpuset_common_file_read(s case FILE_NOTIFY_ON_RELEASE: *s++ = notify_on_release(cs) ? 
'1' : '0'; break; + case FILE_MARKER_PID: + s += sprintf(s, "%d", cs->marker_pid); + break; default: retval = -EINVAL; goto out; @@ -1413,6 +1480,11 @@ static struct cftype cft_notify_on_relea .private = FILE_NOTIFY_ON_RELEASE, }; +static struct cftype cft_marker_pid = { + .name = "marker_pid", + .private = FILE_MARKER_PID, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1427,6 +1499,8 @@ static int cpuset_populate_dir(struct de return err; if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_marker_pid)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; _