--- linux-next.orig/mm/page-writeback.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-03-02 14:12:04.000000000 +0800 @@ -37,24 +37,9 @@ #include /* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. + * Don't sleep more than 200ms at a time in balance_dirty_pages(). */ -static long ratelimit_pages = 32; - -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than dirtied pages to ensure that reasonably - * large amounts of I/O are submitted. - */ -static inline long sync_writeback_pages(unsigned long dirtied) -{ - if (dirtied < ratelimit_pages) - dirtied = ratelimit_pages; - - return dirtied + dirtied / 2; -} +#define MAX_PAUSE max(HZ/5, 1) /* The following parameters are exported via /proc/sys/vm */ @@ -111,7 +96,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -145,7 +129,7 @@ static int calc_period_shift(void) else dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; - return 2 + ilog2(dirty_total - 1); + return ilog2(dirty_total - 1); } /* @@ -219,6 +203,7 @@ int dirty_bytes_handler(struct ctl_table */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } @@ -241,19 +226,14 @@ void task_dirty_inc(struct task_struct * /* * Obtain an accurate fraction of the BDI's portion. */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, +void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } } -static inline void task_dirties_fraction(struct task_struct *tsk, +void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { prop_fraction_single(&vm_dirties, &tsk->dirties, @@ -261,36 +241,6 @@ static inline void task_dirties_fraction } /* - * task_dirty_limit - scale down dirty throttling threshold for one task - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - * - * To protect light/slow dirtying tasks from heavier/fast ones, we start - * throttling individual tasks before reaching the bdi dirty limit. - * Relatively low thresholds will be allocated to heavy dirtiers. So when - * dirty pages grow large, heavy dirtiers will be throttled first, which will - * effectively curb the growth of dirty pages. Light dirtiers with high enough - * dirty threshold may never get throttled. 
- */ -static unsigned long task_dirty_limit(struct task_struct *tsk, - unsigned long bdi_dirty) -{ - long numerator, denominator; - unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - - return max(dirty, bdi_dirty/2); -} - -/* * */ static unsigned int bdi_min_ratio; @@ -403,8 +353,6 @@ unsigned long determine_dirtyable_memory * Calculate the dirty thresholds based on sysctl parameters * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { @@ -426,28 +374,45 @@ void global_dirty_limits(unsigned long * else background = (dirty_background_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + /* + * Ensure at least 1/4 gap between background and dirty thresholds, so + * that when dirty throttling starts at (background + dirty)/2, it's + * below or at the entrance of the soft dirty throttle scope. + */ + if (background > dirty - dirty / (DIRTY_SCOPE / 2)) + background = dirty - dirty / (DIRTY_SCOPE / 2); + tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } +EXPORT_SYMBOL_GPL(global_dirty_limits); -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * @dirty_pages: current number of dirty pages * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The bdi's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * There is a chicken and egg problem: when bdi A (eg. /pub) is heavy dirtied + * and bdi B (eg. /) is light dirtied hence has 0 dirty limit, tasks writing to + * B always get heavily throttled and bdi B's dirty limit might never be able + * to grow up from 0. So we do tricks to reserve some global margin and honour + * it to the bdi's that run low. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty) { u64 bdi_dirty; long numerator, denominator; @@ -462,6 +427,7 @@ unsigned long bdi_dirty_limit(struct bac do_div(bdi_dirty, denominator); bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) bdi_dirty = dirty * bdi->max_ratio / 100; @@ -469,6 +435,719 @@ unsigned long bdi_dirty_limit(struct bac } /* + * If we can dirty N more pages globally, honour N/8 to the bdi that + * runs low, so as to help it ramp up. 
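To put numbers on the background/dirty gap enforced above in global_dirty_limits(), here is a small standalone sketch. It assumes DIRTY_SCOPE = 8 from the include/linux/writeback.h hunk further down; clamp_background() is an illustration-only helper and the page counts are made up.

#include <stdio.h>

#define DIRTY_SCOPE     8       /* from the include/linux/writeback.h hunk */

/* illustration-only model of the background clamp in global_dirty_limits() */
static unsigned long clamp_background(unsigned long background,
                                      unsigned long dirty)
{
        /* keep background at least dirty/4 below the dirty threshold */
        if (background > dirty - dirty / (DIRTY_SCOPE / 2))
                background = dirty - dirty / (DIRTY_SCOPE / 2);
        return background;
}

int main(void)
{
        /* made-up numbers: the dirty limit maps to 20000 pages and the
         * background ratio was set almost as high by the admin */
        unsigned long dirty = 20000, background = 19000;

        background = clamp_background(background, dirty);
        /* prints "background=15000 dirty=20000": throttling then starts
         * at (15000 + 20000) / 2 = 17500 pages */
        printf("background=%lu dirty=%lu\n", background, dirty);
        return 0;
}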
+ */ +static unsigned long dirty_rampup_size(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty + MIN_WRITEBACK_PAGES) + return min(MIN_WRITEBACK_PAGES * 2, (thresh - dirty) / 8); + + return MIN_WRITEBACK_PAGES / 8; +} + +/* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * will look to see if it needs to start dirty throttling. + * + * If ratelimit_pages is too low then big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long ratelimit_pages(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* + * last time exceeded (limit - limit/DIRTY_MARGIN) + */ +static bool dirty_exceeded_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_exceed_time <= time_window; +} + +/* + * last time dropped below (thresh - 2*thresh/DIRTY_SCOPE + thresh/DIRTY_MARGIN) + */ +static bool dirty_free_run_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_free_run <= time_window; +} + +/* + * Position based bandwidth control. + * + * (1) hard dirty limiting areas + * + * The block area is required to stop large number of slow dirtiers, because + * the max throttle area is only able to throttle a task at 1page/200ms=20KB/s. + * + * The max throttle area is sufficient for normal workloads, and has the virtue + * of bounded latency for light dirtiers. + * + * The brake area is typically enough to hold off the dirtiers as long as the + * dirtyable memory is not so tight. + * + * The block area and max throttle area are enforced inside the loop of + * balance_dirty_pages(). Others can be found in dirty_throttle_bandwidth(). + * + * block area, loop until drop below the area -------------------|<=== + * max throttle area, sleep(max_pause) and return -----------|<=====>| + * brake area, bw scaled from 1 down to 0 ---|<=====>| + * --------------------------------------------------------o-------o-------o---- + * ^ ^ ^ + * limit - limit/DIRTY_MARGIN ---' | | + * limit -----------' | + * limit + limit/DIRTY_MARGIN -------------------' + * + * (2) global control scope + * + * The rampup area is for ramping up the base bandwidth whereas the above brake + * area is for scaling down the base bandwidth. + * + * The global thresh is typically equal to the above global limit. The + * difference is, @thresh is real-time computed from global_dirty_limits() and + * @limit is tracking @thresh at 100ms intervals in update_dirty_limit(). The + * point is to track @thresh slowly if it dropped below the number of dirty + * pages, so as to avoid unnecessarily entering the three areas in (1). + * + *rampup area setpoint/goal + *|<=======>| v + * [-------------------------------*-------------------------------]------------ + * ^ ^ ^ + * thresh - 2*thresh/DIRTY_SCOPE thresh - thresh/DIRTY_SCOPE thresh + * + * (3) bdi control scope + * + * The bdi reserve area tries to keep a reasonable number of dirty pages for + * preventing block queue underrun. 
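The near-sqrt scaling of ratelimit_pages() above can be checked with a standalone sketch; ilog2_approx() and ratelimit_pages_model() are illustration-only stand-ins for the kernel's ilog2() and the helper above, and the margins are made-up page counts.

#include <stdio.h>

/* crude stand-in for the kernel's ilog2() */
static unsigned int ilog2_approx(unsigned long v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

/* mirror of ratelimit_pages(): near-sqrt of the safety margin */
static unsigned long ratelimit_pages_model(unsigned long dirty,
                                           unsigned long thresh)
{
        if (thresh > dirty)
                return 1UL << (ilog2_approx(thresh - dirty) >> 1);
        return 1;
}

int main(void)
{
        /* 64MB of margin (16384 x 4KB pages) -> poll every 128 pages;
         * 256KB of margin (64 pages)         -> poll every 8 pages */
        printf("%lu\n", ratelimit_pages_model(0, 16384));
        printf("%lu\n", ratelimit_pages_model(0, 64));
        return 0;
}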
+ * + * reserve area, scale up bw as dirty pages drop low bdi_setpoint + * |<=============================================>| v + * |-------------------------------------------------------*-------|---------- + * 0 bdi_thresh - bdi_thresh/DIRTY_SCOPE^ ^bdi_thresh + * + * (4) global/bdi control lines + * + * dirty_throttle_bandwidth() applies 2 main and 3 regional control lines for + * scaling up/down the base bandwidth based on the position of dirty pages. + * + * The two main control lines for the global/bdi control scopes do not end at + * thresh/bdi_thresh. They are centered at setpoint/bdi_setpoint and cover the + * whole [0, limit]. If the control line drops below 0 before reaching @limit, + * an auxiliary line will be setup to connect them. The below figure illustrates + * the main bdi control line with an auxiliary line extending it to @limit. + * + * This allows smoothly throttling down bdi_dirty back to normal if it starts + * high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's bdi_dirty may rush to 5 times higher than bdi_setpoint. + * - the global/bdi dirty thresh/goal may be knocked down suddenly either on + * user request or on increased memory consumption. + * + * o + * o + * o [o] main control line + * o [*] auxiliary control line + * o + * o + * o + * o + * o + * o + * o--------------------- balance point, bw scale = 1 + * | o + * | o + * | o + * | o + * | o + * | o + * | o------- connect point, bw scale = 1/2 + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * [--------------------*-----------------------------.--------------------*] + * 0 bdi_setpoint bdi_origin limit + * + * The bdi control line: if (bdi_origin < limit), an auxiliary control line (*) + * will be setup to extend the main control line (o) to @limit. + */ +static unsigned long dirty_throttle_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + struct task_struct *tsk) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long origin; + unsigned long goal; + unsigned long long span; + unsigned long long bw; + + if (unlikely(dirty >= limit)) + return 0; + + /* + * global setpoint + */ + origin = 2 * thresh; + goal = thresh - thresh / DIRTY_SCOPE; + + if (unlikely(origin < limit && dirty > (goal + origin) / 2)) { + origin = limit; + goal = (goal + origin) / 2; + bw >>= 1; + } + bw = origin - dirty; + bw <<= BASE_BW_SHIFT; + do_div(bw, origin - goal + 1); + + /* + * brake area to prevent global dirty exceeding + */ + if (dirty > limit - limit / DIRTY_MARGIN) { + bw *= limit - dirty; + do_div(bw, limit / DIRTY_MARGIN + 1); + } + + /* + * rampup area, immediately above the unthrottled free-run region. + * It's setup mainly to get an estimation of ref_bw for reliably + * ramping up the base bandwidth. 
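As a rough illustration of the main global control line used by dirty_throttle_bandwidth(), the sketch below computes the position scale in BASE_BW_SHIFT fixed point. The auxiliary-line branch, the brake area and the per-bdi scaling are deliberately omitted; global_pos_ratio() is an illustration-only name and the threshold is a made-up page count.

#include <stdio.h>

#define DIRTY_SCOPE     8
#define BASE_BW_SHIFT   24

/* scale of the main global control line, in BASE_BW_SHIFT fixed point */
static unsigned long long global_pos_ratio(unsigned long thresh,
                                           unsigned long dirty)
{
        unsigned long origin = 2 * thresh;
        unsigned long goal = thresh - thresh / DIRTY_SCOPE;
        unsigned long long bw;

        if (dirty >= origin)
                return 0;
        bw = (unsigned long long)(origin - dirty) << BASE_BW_SHIFT;
        return bw / (origin - goal + 1);
}

int main(void)
{
        unsigned long thresh = 80000;   /* pages, illustrative */

        /* at the setpoint the scale is ~1.0; at thresh it drops to ~8/9 */
        printf("%.3f\n", global_pos_ratio(thresh, 70000) /
                         (double)(1 << BASE_BW_SHIFT));
        printf("%.3f\n", global_pos_ratio(thresh, 80000) /
                         (double)(1 << BASE_BW_SHIFT));
        return 0;
}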
+ */ + dirty = default_backing_dev_info.avg_dirty; + origin = thresh - thresh / (DIRTY_SCOPE/2) + thresh / DIRTY_MARGIN; + if (dirty < origin) { + span = (origin - dirty) * bw; + do_div(span, thresh / (8 * DIRTY_MARGIN) + 1); + bw += span; + } + + /* + * bdi setpoint + */ + if (unlikely(bdi_thresh > thresh)) + bdi_thresh = thresh; + goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + span = (u64) bdi_thresh * (thresh - bdi_thresh) + + (2 * bdi->avg_bandwidth) * bdi_thresh; + do_div(span, thresh + 1); + origin = goal + 2 * span; + + dirty = bdi->avg_dirty; + if (unlikely(dirty > goal + span)) { + if (dirty > limit) + return 0; + if (origin < limit) { + origin = limit; + goal += span; + bw >>= 1; + } + } + bw *= origin - dirty; + do_div(bw, origin - goal + 1); + + /* + * bdi reserve area, safeguard against bdi dirty underflow and disk idle + */ + origin = bdi_thresh - bdi_thresh / (DIRTY_SCOPE / 2); + if (bdi_dirty < origin) + bw = bw * origin / (bdi_dirty | 1); + + /* + * honour light dirtiers higher bandwidth: + * + * bw *= sqrt(1 / task_dirty_weight); + */ + if (tsk) { + unsigned long numerator, denominator; + + task_dirties_fraction(tsk, &numerator, &denominator); + bw *= int_sqrt((denominator << 10) / (numerator + 1)); + bw >>= 5 + BASE_BW_SHIFT / 2; + bw = (unsigned long)bw * bdi->throttle_bandwidth; + bw >>= 2 * BASE_BW_SHIFT - BASE_BW_SHIFT / 2; + + /* + * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + * + * The avg_bandwidth bound is necessary because + * bdi_update_throttle_bandwidth() blindly sets base bandwidth + * to avg_bandwidth for more stable estimation, when it + * believes the current task is the only dirtier. + */ + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + return min(2 * (unsigned long)bw, bdi->avg_bandwidth); + } + + return bw; +} + +static void bdi_update_dirty_smooth(struct backing_dev_info *bdi, + unsigned long dirty) +{ + unsigned long avg = bdi->avg_dirty; + unsigned long old = bdi->old_dirty; + + if (unlikely(!avg)) { + avg = dirty; + goto update; + } + + /* + * dirty pages are departing upwards, follow up + */ + if (avg < old && old <= dirty) { + avg += (old - avg) >> 3; + goto update; + } + + /* + * dirty pages are departing downwards, follow down + */ + if (avg > old && old >= dirty) { + avg -= (avg - old) >> 3; + goto update; + } + + /* + * This can filter out one half unnecessary updates when bdi_dirty is + * fluctuating around the balance point, and is most effective on XFS, + * whose theoretic pattern is + * . + * [.] dirty [-] avg . . + * . . + * . . . . . . + * --------------------------------------- . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . + * . . . . (flucuated) + * . . . . + * . . . . + * + * @avg will remain flat at the cost of being biased towards high. In + * practice the error tend to be much smaller: thanks to more coarse + * grained fluctuations, @avg becomes the real average number for the + * last two rising lines of @dirty. 
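The sqrt(1 / task_dirty_weight) boost in dirty_throttle_bandwidth() above can be pictured with a standalone sketch; int_sqrt_model() is a crude stand-in for the kernel's int_sqrt(), the 1/4 task weight is an arbitrary example, and the surrounding fixed-point shifts are left out.

#include <stdio.h>
#include <math.h>

/* rough stand-in for the kernel's int_sqrt() */
static unsigned long int_sqrt_model(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        /* a task doing 1/4 of the recent dirtying: weight = 1/4 */
        unsigned long numerator = 250, denominator = 1000;
        unsigned long factor_x32 = int_sqrt_model((denominator << 10) /
                                                  (numerator + 1));

        /* ~63/32 ~= 2.0: the lighter dirtier gets about sqrt(4) = 2x
         * the bandwidth of an average dirtier */
        printf("x%.2f (exact sqrt: %.2f)\n", factor_x32 / 32.0,
               sqrt((double)denominator / numerator));
        return 0;
}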
+ */ + goto out; + +update: + bdi->avg_dirty = avg; +out: + bdi->old_dirty = dirty; +} + +static void __bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_bandwidth; + unsigned long old = bdi->write_bandwidth; + unsigned long cur; + u64 bw; + + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period / 2)) { + do_div(bw, elapsed); + elapsed = period / 2; + bw *= elapsed; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + cur = bw >> ilog2(period); + bdi->write_bandwidth = cur; + + /* + * one more level of smoothing + */ + if (avg > old && old > cur) + avg -= (avg - old) >> 5; + + if (avg < old && old < cur) + avg += (old - avg) >> 5; + + bdi->avg_bandwidth = avg; +} + +static void update_dirty_limit(unsigned long thresh, + unsigned long dirty) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long min = dirty + limit / DIRTY_MARGIN; + + if (limit < thresh) { + limit = thresh; + goto out; + } + + /* take care not to follow into the brake area */ + if (limit > thresh + thresh / (DIRTY_MARGIN * 8) && + limit > min) { + limit -= (limit - max(thresh, min)) >> 3; + goto out; + } + + return; +out: + default_backing_dev_info.dirty_threshold = limit; +} + +static void bdi_update_dirty_threshold(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty) +{ + unsigned long old = bdi->old_dirty_threshold; + unsigned long avg = bdi->dirty_threshold; + unsigned long min; + + min = dirty_rampup_size(dirty, thresh); + thresh = bdi_dirty_limit(bdi, thresh); + + if (avg > old && old >= thresh) + avg -= (avg - old) >> 4; + + if (avg < old && old <= thresh) + avg += (old - avg) >> 4; + + bdi->dirty_threshold = max(avg, min); + bdi->old_dirty_threshold = thresh; +} + +/* + * ref_bw typically fluctuates within a small range, with large isolated points + * from time to time. The smoothed reference_bandwidth can effectively filter + * out 1 such standalone point. When there comes 2+ isolated points together -- + * observed in ext4 on sudden redirty -- reference_bandwidth may surge high and + * take long time to return to normal, which can mostly be counteracted by + * xref_bw and other update restrictions in bdi_update_throttle_bandwidth(). + */ +static void bdi_update_reference_bandwidth(struct backing_dev_info *bdi, + unsigned long ref_bw) +{ + unsigned long old = bdi->old_ref_bandwidth; + unsigned long avg = bdi->reference_bandwidth; + + if (avg > old && old >= ref_bw && avg - old >= old - ref_bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= ref_bw && old - avg >= ref_bw - old) + avg += (old - avg) >> 3; + + bdi->reference_bandwidth = avg; + bdi->old_ref_bandwidth = ref_bw; +} + +/* + * Base throttle bandwidth. 
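Before moving on to the base throttle bandwidth below: the write bandwidth estimation in __bdi_update_write_bandwidth() above is a weighted average over a ~3 second window. A standalone sketch of the common (elapsed <= period/2) path, assuming HZ = 100 and 4KB pages; update_write_bw() is an illustration-only reduction and the extra avg_bandwidth smoothing is omitted.

#include <stdio.h>

#define HZ      100
#define PERIOD  512     /* roundup_pow_of_two(3 * HZ) for HZ = 100 */

/* model of __bdi_update_write_bandwidth(), elapsed <= PERIOD/2 path */
static unsigned long update_write_bw(unsigned long old_bw,      /* pages/s */
                                     unsigned long elapsed,     /* jiffies */
                                     unsigned long written)     /* pages since stamp */
{
        unsigned long long bw = (unsigned long long)written * HZ;

        bw += (unsigned long long)old_bw * (PERIOD - elapsed);
        return bw / PERIOD;
}

int main(void)
{
        /* start from INIT_BW = 50MB/s = 12800 pages/s, then observe
         * 10MB written in 250ms (an instantaneous 40MB/s) */
        unsigned long bw = 12800;

        bw = update_write_bw(bw, 25, 2560);
        printf("%lu pages/s (~%lu MB/s)\n", bw, bw / 256); /* ~12675, ~49 */
        return 0;
}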
+ */ +static void bdi_update_throttle_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long margin = limit / DIRTY_MARGIN; + unsigned long goal = thresh - thresh / DIRTY_SCOPE; + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long bdi_goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + unsigned long long bw = bdi->throttle_bandwidth; + unsigned long long dirty_bw; + unsigned long long pos_bw; + unsigned long long delta; + unsigned long long ref_bw = 0; + unsigned long long xref_bw; + unsigned long pos_ratio; + unsigned long spread; + + if (dirty > limit - margin) + bdi->dirty_exceed_time = jiffies; + + if (dirty < thresh - thresh / (DIRTY_SCOPE/2) + margin) + bdi->dirty_free_run = jiffies; + + /* + * The dirty rate should match the writeback rate exactly, except when + * dirty pages are truncated before IO submission. The mismatches are + * hopefully small and hence ignored. So a continuous stream of dirty + * page trucates will result in errors in ref_bw, fortunately pos_bw + * can effectively stop the base bw from being driven away endlessly + * by the errors. + * + * It'd be nicer for the filesystems to not redirty too much pages + * either on IO or lock contention, or on sub-page writes. ext4 is + * known to redirty pages in big bursts, leading to + * - surges of dirty_bw, which can be mostly safeguarded by the + * min/max'ed xref_bw + * - the temporary drop of task weight and hence surge of task bw + * It could possibly be fixed in the FS. + */ + dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + + pos_ratio = dirty_throttle_bandwidth(bdi, thresh, dirty, + bdi_dirty, NULL); + /* + * pos_bw = task_bw, assuming 100% task dirty weight + * + * (pos_bw > bw) means the position of the number of dirty pages is + * lower than the global and/or bdi setpoints. It does not necessarily + * mean the base throttle bandwidth is larger than its balanced value. + * The latter is likely only when + * - (position) the dirty pages are at some distance from the setpoint, + * - (speed) and either stands still or is departing from the setpoint. + */ + pos_bw = (bw >> (BASE_BW_SHIFT/2)) * pos_ratio >> + (BASE_BW_SHIFT/2); + + /* + * A typical desktop has only 1 task writing to 1 disk, in which case + * the dirtier task should be throttled at the disk's write bandwidth. + * Note that we ignore minor dirty/writeback mismatches such as + * redirties and truncated dirty pages. + */ + if (bdi_thresh > thresh - thresh / 16) { + unsigned long numerator, denominator; + + task_dirties_fraction(current, &numerator, &denominator); + if (numerator > denominator - denominator / 16) + ref_bw = bdi->avg_bandwidth << BASE_BW_SHIFT; + } + /* + * Otherwise there may be + * 1) N dd tasks writing to the current disk, or + * 2) X dd tasks and Y "rsync --bwlimit" tasks. + * The below estimation is accurate enough for (1). For (2), where not + * all task's dirty rate can be changed proportionally by adjusting the + * base throttle bandwidth, it would require multiple adjust-reestimate + * cycles to approach the rate matching point. Which is not a big + * concern as we always do small steps to approach the target. The + * un-controllable tasks may only slow down the progress. 
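The rate-matching estimate taken in the !ref_bw branch just below can be sketched as follows; ref_bw_model() is an illustration-only reduction, the 50MB/s disk and two equally heavy dirtiers are assumptions, and the smoothing and safeguards applied afterwards are omitted.

#include <stdio.h>

#define BASE_BW_SHIFT   24

/* scale the base bandwidth by how far the observed dirty rate
 * overshoots the device write bandwidth */
static unsigned long long ref_bw_model(unsigned long long base_bw,   /* fixed point */
                                       unsigned long long pos_ratio, /* fixed point */
                                       unsigned long avg_bw,         /* pages/s */
                                       unsigned long dirty_bw)       /* pages/s */
{
        unsigned long long ref = pos_ratio * avg_bw / (dirty_bw | 1);

        return (base_bw >> (BASE_BW_SHIFT / 2)) * ref >> (BASE_BW_SHIFT / 2);
}

int main(void)
{
        /* two equal dirtiers on a 50MB/s disk dirty at 100MB/s together,
         * so the balanced per-task bandwidth is about half the base */
        unsigned long long base = 12800ULL << BASE_BW_SHIFT;    /* 50MB/s */
        unsigned long long pos = 1ULL << BASE_BW_SHIFT;         /* at setpoint */
        unsigned long long ref = ref_bw_model(base, pos, 12800, 25600);

        /* prints "12800 -> ~6399 pages/s": roughly halved */
        printf("%llu -> %llu pages/s\n",
               base >> BASE_BW_SHIFT, ref >> BASE_BW_SHIFT);
        return 0;
}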
+ */ + if (!ref_bw) { + ref_bw = pos_ratio * bdi->avg_bandwidth; + do_div(ref_bw, dirty_bw | 1); + ref_bw = (bw >> (BASE_BW_SHIFT/2)) * (unsigned long)ref_bw >> + (BASE_BW_SHIFT/2); + } + + /* + * The average dirty pages typically fluctuates within this scope. + */ + spread = min(bdi->write_bandwidth / 8, bdi_thresh / DIRTY_MARGIN); + + /* + * Update the base throttle bandwidth rigidly: eg. only try lowering it + * when both the global/bdi dirty pages are away from their setpoints, + * and are either standing still or continue departing away. + * + * The "+ avg_dirty / 256" tricks mainly help btrfs, which behaves + * amazingly smoothly. Its average dirty pages simply tracks more and + * more close to the number of dirty pages without any overshooting, + * thus its dirty pages may be ever moving towards the setpoint and + * @avg_dirty ever approaching @dirty, slower and slower, but very hard + * to cross it to trigger a base bandwidth update. What the trick does + * is "when @avg_dirty is _close enough_ to @dirty, it indicates slowed + * down @dirty change rate, hence the other inequalities are now a good + * indication of something unbalanced in the current bdi". + * + * In the cases of hitting the upper/lower margins, it's obviously + * necessary to adjust the (possibly very unbalanced) base bandwidth, + * unless the opposite margin was also been hit recently, which + * indicates that the dirty control scope may be smaller than the bdi + * write bandwidth and hence the dirty pages are quickly fluctuating + * between the upper/lower margins. + */ + if (bw < pos_bw) { + if (dirty < goal && + dirty <= default_backing_dev_info.avg_dirty + + (default_backing_dev_info.avg_dirty >> 8) && + bdi->avg_dirty + spread < bdi_goal && + bdi_dirty <= bdi->avg_dirty + (bdi->avg_dirty >> 8) && + bdi_dirty <= bdi->old_dirty) + goto adjust; + if (dirty < thresh - thresh / (DIRTY_SCOPE/2) + margin && + !dirty_exceeded_recently(bdi, HZ)) + goto adjust; + } + + if (bw > pos_bw) { + if (dirty > goal && + dirty >= default_backing_dev_info.avg_dirty - + (default_backing_dev_info.avg_dirty >> 8) && + bdi->avg_dirty > bdi_goal + spread && + bdi_dirty >= bdi->avg_dirty - (bdi->avg_dirty >> 8) && + bdi_dirty >= bdi->old_dirty) + goto adjust; + if (dirty > limit - margin && + !dirty_free_run_recently(bdi, HZ)) + goto adjust; + } + + goto out; + +adjust: + /* + * The min/max'ed xref_bw is an effective safeguard. The most dangerous + * case that could unnecessarily disturb the base bandwith is: when the + * control scope is roughly equal to the write bandwidth, the dirty + * pages may rush into the upper/lower margins regularly. It typically + * hits the upper margin in a blink, making a sudden drop of pos_bw and + * ref_bw. Assume 5 points A, b, c, D, E, where b, c have the dropped + * down number of pages, and A, D, E are at normal level. At point b, + * the xref_bw will be the good A; at c, the xref_bw will be the + * dragged-down-by-b reference_bandwidth which is bad; at D and E, the + * still-low reference_bandwidth will no longer bring the base + * bandwidth down, as xref_bw will take the larger values from D and E. 
+ */ + if (pos_bw > bw) { + xref_bw = min(ref_bw, bdi->old_ref_bandwidth); + xref_bw = min(xref_bw, bdi->reference_bandwidth); + if (xref_bw > bw) + delta = xref_bw - bw; + else + delta = 0; + } else { + xref_bw = max(ref_bw, bdi->reference_bandwidth); + xref_bw = max(xref_bw, bdi->reference_bandwidth); + if (xref_bw < bw) + delta = bw - xref_bw; + else + delta = 0; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. Also limit the step size in + * various ways to avoid overshooting. + */ + delta >>= bw / (2 * delta + 1); + delta = min(delta, (u64)abs64(pos_bw - bw)); + delta >>= 1; + delta = min(delta, bw / 8); + + if (pos_bw > bw) + bw += delta; + else + bw -= delta; + + bdi->throttle_bandwidth = bw; +out: + bdi_update_reference_bandwidth(bdi, ref_bw); + trace_throttle_bandwidth(bdi, dirty_bw, pos_bw, ref_bw); +} + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time) +{ + static DEFINE_SPINLOCK(dirty_lock); + unsigned long now = jiffies; + unsigned long elapsed; + unsigned long dirtied; + unsigned long written; + + if (!spin_trylock(&dirty_lock)) + return; + + elapsed = now - bdi->bw_time_stamp; + dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* skip quiet periods when disk bandwidth is under-utilized */ + if (elapsed > HZ/2 && + elapsed > now - start_time) + goto snapshot; + + /* + * rate-limit, only update once every 100ms. Demand higher threshold + * on the flusher so that the throttled task(s) can do most updates. + */ + if (!thresh && elapsed <= HZ/4) + goto unlock; + if (elapsed <= HZ/10) + goto unlock; + + if (thresh) { + update_dirty_limit(thresh, dirty); + bdi_update_dirty_threshold(bdi, thresh, dirty); + bdi_update_throttle_bandwidth(bdi, thresh, dirty, + bdi_dirty, dirtied, elapsed); + } + __bdi_update_write_bandwidth(bdi, elapsed, written); + if (thresh) { + bdi_update_dirty_smooth(bdi, bdi_dirty); + bdi_update_dirty_smooth(&default_backing_dev_info, dirty); + } + +snapshot: + bdi->dirtied_stamp = dirtied; + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +unlock: + spin_unlock(&dirty_lock); +} + +/* + * Limit pause time for small memory systems. If sleeping for too long time, + * the small pool of dirty/writeback pages may go empty and disk go idle. + */ +static unsigned long max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) +{ + unsigned long t; /* jiffies */ + + /* 1ms for every 1MB; may further consider bdi bandwidth */ + t = bdi_dirty >> (30 - PAGE_CACHE_SHIFT - ilog2(HZ)); + t += 2; + + return min_t(unsigned long, t, MAX_PAUSE); +} + +/* + * Scale up pause time for concurrent dirtiers in order to reduce CPU overheads. + * But ensure reasonably large [min_pause, max_pause] range size, so that + * nr_dirtied_pause (and hence future pause time) can stay reasonably stable. 
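For max_pause() above, a worked sketch assuming HZ = 100 and 4KB pages (so ilog2(HZ) = 6 and the shift is 30 - 12 - 6 = 12); max_pause_model() is an illustration-only reduction that ignores the possible bdi bandwidth refinement mentioned in the comment.

#include <stdio.h>

#define HZ              100
#define MAX_PAUSE       (HZ / 5)        /* 200ms, as in the patch */
#define PAUSE_SHIFT     (30 - 12 - 6)   /* PAGE_CACHE_SHIFT = 12, ilog2(HZ) = 6 */

/* model of max_pause(): scale the allowed sleep with the bdi dirty pool */
static unsigned long max_pause_model(unsigned long bdi_dirty)   /* pages */
{
        unsigned long t = (bdi_dirty >> PAUSE_SHIFT) + 2;

        return t < MAX_PAUSE ? t : MAX_PAUSE;
}

int main(void)
{
        /* a 10MB dirty pool only tolerates ~20ms sleeps, while a 1GB
         * pool is clamped to the global 200ms cap */
        printf("%lu jiffies\n", max_pause_model(2560));         /* 2  -> 20ms  */
        printf("%lu jiffies\n", max_pause_model(262144));       /* 20 -> 200ms */
        return 0;
}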
+ */ +static unsigned long min_pause(struct backing_dev_info *bdi, + unsigned long max) +{ + unsigned long hi = ilog2(bdi->write_bandwidth); + unsigned long lo = ilog2(bdi->throttle_bandwidth) - BASE_BW_SHIFT; + unsigned long t = 1 + max / 8; /* jiffies */ + + if (lo >= hi) + return t; + + /* (N * 10ms) on 2^N concurrent tasks */ + t += (hi - lo) * (10 * HZ) / 1024; + + return min(t, max / 2); +} + +/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. @@ -476,45 +1155,34 @@ unsigned long bdi_dirty_limit(struct bac * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, - unsigned long write_chunk) + unsigned long pages_dirtied) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirty; + unsigned long bdi_dirty; /* = file_dirty + writeback + unstable_nfs */ unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; - unsigned long pages_written = 0; - unsigned long pause = 1; - bool dirty_exceeded = false; + unsigned long bw; + unsigned long period; + unsigned long pause = 0; + unsigned long pause_max; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - + /* + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. - */ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) - break; - - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); - - /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. @@ -524,62 +1192,107 @@ static void balance_dirty_pages(struct a * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + if (bdi->dirty_threshold < 2*bdi_stat_error(bdi)) { + bdi_dirty = bdi_stat_sum(bdi, BDI_RECLAIMABLE) + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); } /* - * The bdi thresh is somehow "soft" limit derived from the - * global "hard" limit. The former helps to prevent heavy IO - * bdi or process from holding back light ones; The latter is - * the last resort safeguard. + * Throttle it only when the background writeback cannot + * catch-up. 
This avoids (excessively) small writeouts + * when the bdi limits are ramping up. */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); - - if (!dirty_exceeded) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; break; + } - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, start_time); - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. + if (unlikely(!writeback_in_progress(bdi))) + bdi_start_background_writeback(bdi); + + pause_max = max_pause(bdi, bdi_dirty); + + bw = dirty_throttle_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, current); + if (unlikely(bw == 0)) { + period = pause_max; + pause = pause_max; + goto pause; + } + period = (HZ * pages_dirtied + bw / 2) / bw; + pause = current->paused_when + period - jiffies; + /* + * Take it as long think time if pause falls into (-10s, 0). + * If it's less than 500ms (ext2 blocks the dirtier task for + * up to 400ms from time to time on 1-HDD; so does xfs, however + * at much less frequency), try to compensate it in future by + * updating the virtual time; otherwise just reset the time, as + * it may be a light dirtier. */ - trace_wbc_balance_dirty_start(&wbc, bdi); - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (unlikely(-pause < HZ*10)) { + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + bw, + pages_dirtied, + period, + pause, + start_time); + if (-pause > HZ/2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; + pause = 0; + } else if (period) { + current->paused_when += period; + current->nr_dirtied = 0; + pause = 1; + } else + current->nr_dirtied_pause <<= 1; + break; } - trace_wbc_balance_dirty_wait(&wbc, bdi); + if (pause > pause_max) + pause = pause_max; + +pause: + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + bw, + pages_dirtied, + period, + pause, + start_time); + current->paused_when = jiffies; __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + current->paused_when += pause; + current->nr_dirtied = 0; - /* - * Increase the delay for each loop, up to our previous - * default of taking a 100ms nap. - */ - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + if (nr_dirty < default_backing_dev_info.dirty_threshold + + default_backing_dev_info.dirty_threshold / DIRTY_MARGIN) + break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (pause == 0) + current->nr_dirtied_pause = + ratelimit_pages(nr_dirty, dirty_thresh); + else if (pause <= min_pause(bdi, pause_max)) + current->nr_dirtied_pause += current->nr_dirtied_pause / 32 + 1; + else if (pause >= pause_max) + /* + * when repeated, writing 1 page per 100ms on slow devices, + * i-(i+2)/4 will be able to reach 1 but never reduce to 0. 
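Two quick standalone checks of the arithmetic above, assuming HZ = 100 and 4KB pages: the pause period computed from the task's throttle bandwidth, and the nr_dirtied_pause shrink rule i -= (i+2)/4 described in the comment just above; all numbers are made up.

#include <stdio.h>

#define HZ      100

int main(void)
{
        /* pause length for one throttle round: a task allowed 2560 pages/s
         * (10MB/s) that has dirtied 256 pages sleeps for ~100ms */
        unsigned long bw = 2560, pages_dirtied = 256;
        unsigned long period = (HZ * pages_dirtied + bw / 2) / bw;
        int i;

        printf("period=%lu jiffies\n", period);         /* 10 -> 100ms */

        /* the shrink rule converges to 1 and never reaches 0 */
        for (i = 64; i > 0; i -= (i + 2) >> 2) {
                printf("%d ", i);
                if (i == 1)
                        break;
        }
        printf("\n");   /* 64 48 36 27 20 15 11 8 6 4 3 2 1 */
        return 0;
}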
+ */ + current->nr_dirtied_pause -= (current->nr_dirtied_pause+2) >> 2; if (writeback_in_progress(bdi)) return; @@ -592,8 +1305,10 @@ static void balance_dirty_pages(struct a * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + if (laptop_mode) + return; + + if (nr_reclaimable > background_thresh) bdi_start_background_writeback(bdi); } @@ -607,8 +1322,6 @@ void set_page_dirty_balance(struct page } } -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; - /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -618,36 +1331,35 @@ static DEFINE_PER_CPU(unsigned long, bdp * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid + * On really big machines, global_page_state() is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * limit we disable the ratelimiting, to prevent individual processes from + * overshooting the limit by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - unsigned long ratelimit; - unsigned long *p; + struct backing_dev_info *bdi = mapping->backing_dev_info; - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) - ratelimit = 8; + if (!bdi_cap_account_dirty(bdi)) + return; + + current->nr_dirtied += nr_pages_dirtied; + + if (dirty_exceeded_recently(bdi, MAX_PAUSE)) { + unsigned long max = current->nr_dirtied + + (128 >> (PAGE_SHIFT - 10)); + + if (current->nr_dirtied_pause > max) + current->nr_dirtied_pause = max; + } /* * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - preempt_disable(); - p = &__get_cpu_var(bdp_ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - ratelimit = sync_writeback_pages(*p); - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping, ratelimit); - return; - } - preempt_enable(); + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause)) + balance_dirty_pages(mapping, current->nr_dirtied); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); @@ -735,44 +1447,6 @@ void laptop_sync_completion(void) #endif /* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. 
- */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory @@ -794,9 +1468,6 @@ void __init page_writeback_init(void) { int shift; - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); prop_descriptor_init(&vm_dirties, shift); @@ -1120,6 +1791,7 @@ void account_page_dirtied(struct page *p __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1134,7 +1806,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1341,8 +2012,10 @@ int test_clear_page_writeback(struct pag } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } --- linux-next.orig/mm/backing-dev.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-03-02 10:45:58.000000000 +0800 @@ -87,20 +87,26 @@ static int bdi_debug_stats_show(struct s #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), K(dirty_thresh), K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -637,6 +643,11 @@ static void bdi_wb_init(struct bdi_write setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * initial write bandwidth: 50 MB/s + */ +#define INIT_BW (50 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -658,7 +669,17 @@ int bdi_init(struct backing_dev_info *bd goto err; } - bdi->dirty_exceeded = 0; + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_bandwidth = INIT_BW; 
+ bdi->throttle_bandwidth = (u64)INIT_BW << BASE_BW_SHIFT; + + bdi->avg_dirty = 0; + bdi->old_dirty = 0; + bdi->dirty_threshold = MIN_WRITEBACK_PAGES; + err = prop_local_init_percpu(&bdi->completions); if (err) { --- linux-next.orig/include/linux/writeback.h 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/include/linux/writeback.h 2011-03-02 10:45:58.000000000 +0800 @@ -12,6 +12,47 @@ struct backing_dev_info; extern spinlock_t inode_lock; /* + * The 1/4 region under the global dirty thresh is for elastic dirty throttling: + * + * (thresh - 2*thresh/DIRTY_SCOPE, thresh) + * + * The 1/32 region under the global dirty limit will be more rigidly throttled: + * + * (limit - limit/DIRTY_MARGIN, limit) + * + * The 1/32 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MARGIN) + * + * Further beyond, the dirtier task will enter a loop waiting (possibly long + * time) for the dirty pages to drop below (limit + limit/DIRTY_MARGIN). + * + * The last case may happen lightly when memory is very tight or at sudden + * workload rampup. Or under DoS situations such as a fork bomb where every new + * task dirties some more pages, or creating 10,000 tasks each writing to a USB + * key slowly in 4KB/s. + * + * The global dirty threshold is normally equal to global dirty limit, except + * when the system suddenly allocates a lot of anonymous memory and knocks down + * the global dirty threshold quickly, in which case the global dirty limit + * will follow down slowly to prevent livelocking all dirtier tasks. + */ +#define DIRTY_SCOPE 8 +#define DIRTY_MARGIN (DIRTY_SCOPE * 4) + +/* + * The base throttle bandwidth will be 1000 times smaller than write bandwidth + * when there are 100 concurrent heavy dirtiers. This shift can work with up to + * 40 bits dirty size and 2^16 concurrent dirtiers. + */ +#define BASE_BW_SHIFT 24 + +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* * fs/fs-writeback.c */ enum writeback_sync_modes { @@ -33,6 +74,7 @@ struct writeback_control { extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ + long per_file_limit; /* Write this many pages for one file */ long pages_skipped; /* Pages which were not written */ /* @@ -126,7 +168,23 @@ int dirty_writeback_centisecs_handler(st void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); + unsigned long dirty); + +void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator); +void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator); + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time); +static inline void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + bdi_update_bandwidth(bdi, 0, 0, 0, start_time); +} void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, --- linux-next.orig/mm/filemap.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/filemap.c 2011-03-02 10:45:57.000000000 +0800 @@ -2253,6 +2253,7 @@ static ssize_t generic_perform_write(str long status = 0; ssize_t written = 0; unsigned int flags = 0; + unsigned int dirty; /* * Copies from kernel address space cannot fail (NFSD is a big user). 
@@ -2301,6 +2302,7 @@ again: pagefault_enable(); flush_dcache_page(page); + dirty = PageDirty(page); mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); @@ -2327,7 +2329,8 @@ again: pos += copied; written += copied; - balance_dirty_pages_ratelimited(mapping); + if (!dirty) + balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); --- linux-next.orig/include/linux/sched.h 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/include/linux/sched.h 2011-03-02 10:45:57.000000000 +0800 @@ -1487,6 +1487,14 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + /* + * when (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for some dirty throttling pause + */ + int nr_dirtied; + int nr_dirtied_pause; + unsigned long paused_when; /* start of a write-and-pause period */ + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; --- linux-next.orig/mm/memory_hotplug.c 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/mm/memory_hotplug.c 2011-03-02 10:45:57.000000000 +0800 @@ -468,8 +468,6 @@ int online_pages(unsigned long pfn, unsi vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); - if (onlined_pages) memory_notify(MEM_ONLINE, &arg); unlock_memory_hotplug(); @@ -901,7 +899,6 @@ repeat: } vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); unlock_memory_hotplug(); --- linux-next.orig/include/linux/backing-dev.h 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/include/linux/backing-dev.h 2011-03-02 10:45:58.000000000 +0800 @@ -39,7 +39,9 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, + BDI_DIRTIED, BDI_WRITEBACK, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; @@ -73,8 +75,25 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; + unsigned long dirtied_stamp; + unsigned long written_stamp; + unsigned long write_bandwidth; + unsigned long avg_bandwidth; + unsigned long long throttle_bandwidth; + unsigned long long reference_bandwidth; + unsigned long long old_ref_bandwidth; + unsigned long avg_dirty; + unsigned long old_dirty; + unsigned long dirty_threshold; + unsigned long old_dirty_threshold; + struct prop_local_percpu completions; - int dirty_exceeded; + + /* last time exceeded (limit - limit/DIRTY_MARGIN) */ + unsigned long dirty_exceed_time; + /* last time dropped below (background_thresh + dirty_thresh) / 2 */ + unsigned long dirty_free_run; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; --- linux-next.orig/fs/fs-writeback.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-03-02 10:45:58.000000000 +0800 @@ -330,6 +330,8 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long per_file_limit = wbc->per_file_limit; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; @@ -349,7 +351,8 @@ writeback_single_inode(struct inode *ino */ if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); - return 0; + ret = 0; + goto out; } /* @@ -365,8 +368,14 @@ writeback_single_inode(struct inode *ino inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); + if (per_file_limit) + wbc->nr_to_write = per_file_limit; + ret = do_writepages(mapping, wbc); + if (per_file_limit) + wbc->nr_to_write += nr_to_write - per_file_limit; + /* * Make sure to wait on 
the data before writing out the metadata. * This is important for filesystems that modify metadata on data @@ -436,6 +445,9 @@ writeback_single_inode(struct inode *ino } } inode_sync_complete(inode); +out: + trace_writeback_single_inode(inode, wbc, + nr_to_write - wbc->nr_to_write); return ret; } @@ -584,15 +596,6 @@ static void __writeback_inodes_sb(struct spin_unlock(&inode_lock); } -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - static inline bool over_bground_thresh(void) { unsigned long background_thresh, dirty_thresh; @@ -604,6 +607,39 @@ static inline bool over_bground_thresh(v } /* + * Give each inode a nr_to_write that can complete within 1 second. + */ +static unsigned long writeback_chunk_size(struct backing_dev_info *bdi, + int sync_mode) +{ + unsigned long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (sync_mode == WB_SYNC_ALL) + return LONG_MAX; + + pages = min(bdi->avg_bandwidth, + bdi->dirty_threshold / DIRTY_SCOPE); + + if (pages <= MIN_WRITEBACK_PAGES) + return MIN_WRITEBACK_PAGES; + + return rounddown_pow_of_two(pages); +} + +/* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the @@ -643,25 +679,9 @@ static long wb_writeback(struct bdi_writ wbc.range_end = LLONG_MAX; } - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * __writeback_inodes_sb() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else - write_chunk = LONG_MAX; - wbc.wb_start = jiffies; /* livelock avoidance */ + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -687,7 +707,9 @@ static long wb_writeback(struct bdi_writ break; wbc.more_io = 0; + write_chunk = writeback_chunk_size(wb->bdi, wbc.sync_mode); wbc.nr_to_write = write_chunk; + wbc.per_file_limit = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -697,6 +719,8 @@ static long wb_writeback(struct bdi_writ writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; @@ -720,6 +744,12 @@ static long wb_writeback(struct bdi_writ * become available for writeback. Otherwise * we'll just busyloop. 
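The writeback_chunk_size() helper above sizes nr_to_write so that one inode's writeout completes in roughly one second. A standalone sketch, assuming 4KB pages (MIN_WRITEBACK_PAGES = 1024) and illustration-only helper names.

#include <stdio.h>

#define DIRTY_SCOPE             8
#define MIN_WRITEBACK_PAGES     1024UL  /* 4MB with 4KB pages */

static unsigned long rounddown_pow_of_two_model(unsigned long v)
{
        unsigned long r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

/* model of writeback_chunk_size() for WB_SYNC_NONE writeback */
static unsigned long chunk_size(unsigned long avg_bw,           /* pages/s */
                                unsigned long dirty_thresh)     /* pages   */
{
        unsigned long pages = avg_bw < dirty_thresh / DIRTY_SCOPE ?
                              avg_bw : dirty_thresh / DIRTY_SCOPE;

        if (pages <= MIN_WRITEBACK_PAGES)
                return MIN_WRITEBACK_PAGES;
        return rounddown_pow_of_two_model(pages);
}

int main(void)
{
        /* ~100MB/s disk with a 256MB bdi threshold gets 8192-page (32MB)
         * chunks; a slow USB key falls back to the 4MB minimum */
        printf("%lu\n", chunk_size(25600, 65536));      /* 8192 */
        printf("%lu\n", chunk_size(200, 65536));        /* 1024 */
        return 0;
}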
*/ + if (list_empty(&wb->b_more_io)) { + trace_wbc_writeback_wait(&wbc, wb->bdi); + __set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(max(HZ/100, 1)); + continue; + } spin_lock(&inode_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); --- linux-next.orig/include/trace/events/writeback.h 2011-03-02 10:45:45.000000000 +0800 +++ linux-next/include/trace/events/writeback.h 2011-03-02 10:45:58.000000000 +0800 @@ -7,9 +7,23 @@ #include #include #include +#include struct wb_writeback_work; +#define show_inode_state(state) \ + __print_flags(state, "|", \ + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ + {I_NEW, "I_NEW"}, \ + {I_WILL_FREE, "I_WILL_FREE"}, \ + {I_FREEING, "I_FREEING"}, \ + {I_CLEAR, "I_CLEAR"}, \ + {I_SYNC, "I_SYNC"}, \ + {I_REFERENCED, "I_REFERENCED"} \ + ) + DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), TP_ARGS(bdi, work), @@ -147,11 +161,238 @@ DEFINE_EVENT(wbc_class, name, \ DEFINE_WBC_EVENT(wbc_writeback_start); DEFINE_WBC_EVENT(wbc_writeback_written); DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +TRACE_EVENT(writeback_single_inode, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long wrote + ), + + TP_ARGS(inode, wbc, wrote), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned long, state) + __field(unsigned long, age) + __field(unsigned long, wrote) + __field(long, nr_to_write) + __field(unsigned long, writeback_index) + ), + + TP_fast_assign( + strncpy(__entry->name, + dev_name(inode->i_mapping->backing_dev_info->dev), 32); + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->age = (jiffies - inode->dirtied_when) * + 1000 / HZ; + __entry->wrote = wrote; + __entry->nr_to_write = wbc->nr_to_write; + __entry->writeback_index = inode->i_mapping->writeback_index; + ), + + TP_printk("bdi %s: ino=%lu state=%s age=%lu " + "wrote=%lu to_write=%ld index=%lu", + __entry->name, + __entry->ino, + show_inode_state(__entry->state), + __entry->age, + __entry->wrote, + __entry->nr_to_write, + __entry->writeback_index + ) +); + +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, poll_thresh) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->poll_thresh = current->nr_dirtied_pause; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu gap=%ld poll=%ld " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + 
__entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_thresh - __entry->nr_dirty - + __entry->nr_writeback - __entry->nr_unstable, + __entry->poll_thresh, + __entry->nr_dirtied, + __entry->nr_written + ) +); + +#define KBps(x) ((x) << (PAGE_SHIFT - 10)) +#define Bps(x) ((x) >> (BASE_BW_SHIFT - PAGE_SHIFT)) + +TRACE_EVENT(throttle_bandwidth, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long dirty_bw, + unsigned long long pos_bw, + unsigned long long ref_bw), + + TP_ARGS(bdi, dirty_bw, pos_bw, ref_bw), + + TP_STRUCT__entry( + __array(char, bdi, 32) + __field(unsigned long, write_bw) + __field(unsigned long, avg_bw) + __field(unsigned long, dirty_bw) + __field(unsigned long long, base_bw) + __field(unsigned long long, pos_bw) + __field(unsigned long long, ref_bw) + __field(unsigned long long, avg_ref_bw) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + __entry->write_bw = KBps(bdi->write_bandwidth); + __entry->avg_bw = KBps(bdi->avg_bandwidth); + __entry->dirty_bw = KBps(dirty_bw); + __entry->base_bw = Bps(bdi->throttle_bandwidth); + __entry->pos_bw = Bps(pos_bw); + __entry->ref_bw = Bps(ref_bw); + __entry->avg_ref_bw = Bps(bdi->reference_bandwidth); + ), + + + TP_printk("bdi %s: " + "write_bw=%lu avg_bw=%lu dirty_bw=%lu " + "base_bw=%llu pos_bw=%llu ref_bw=%llu aref_bw=%lu", + __entry->bdi, + __entry->write_bw, /* bdi write bandwidth */ + __entry->avg_bw, /* bdi avg write bandwidth */ + __entry->dirty_bw, /* bdi dirty bandwidth */ + __entry->base_bw, /* base throttle bandwidth */ + __entry->pos_bw, /* position control bandwidth */ + __entry->ref_bw, /* reference throttle bandwidth */ + __entry->avg_ref_bw /* smoothed reference bandwidth */ + ) +); + + +TRACE_EVENT(balance_dirty_pages, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long task_bw, + unsigned long dirtied, + unsigned long period, + long pause, + unsigned long start_time), + + TP_ARGS(bdi, thresh, dirty, bdi_dirty, + task_bw, dirtied, period, pause, start_time), + + TP_STRUCT__entry( + __array( char, bdi, 32) + __field(unsigned long, bdi_weight) + __field(unsigned long, task_weight) + __field(unsigned long, limit) + __field(unsigned long, goal) + __field(unsigned long, dirty) + __field(unsigned long, bdi_goal) + __field(unsigned long, bdi_dirty) + __field(unsigned long, avg_dirty) + __field(unsigned long, base_bw) + __field(unsigned long, task_bw) + __field(unsigned long, dirtied) + __field(unsigned long, period) + __field( long, think) + __field( long, pause) + __field(unsigned long, paused) + ), + + TP_fast_assign( + long numerator; + long denominator; + + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + + bdi_writeout_fraction(bdi, &numerator, &denominator); + __entry->bdi_weight = 1000 * numerator / denominator; + task_dirties_fraction(current, &numerator, &denominator); + __entry->task_weight = 1000 * numerator / denominator; + + __entry->limit = default_backing_dev_info.dirty_threshold; + __entry->goal = thresh - thresh / DIRTY_SCOPE; + __entry->dirty = dirty; + __entry->bdi_goal = bdi->dirty_threshold - + bdi->dirty_threshold / DIRTY_SCOPE; + __entry->bdi_dirty = bdi_dirty; + __entry->avg_dirty = bdi->avg_dirty; + __entry->base_bw = KBps(bdi->throttle_bandwidth) >> + BASE_BW_SHIFT; + __entry->task_bw = KBps(task_bw); + __entry->dirtied = dirtied; + __entry->think = current->paused_when == 0 ? 
0 : + (long)(jiffies - current->paused_when) * 1000 / HZ; + __entry->period = period * 1000 / HZ; + __entry->pause = pause * 1000 / HZ; + __entry->paused = (jiffies - start_time) * 1000 / HZ; + ), + + + TP_printk("bdi %s: bdi_weight=%lu task_weight=%lu " + "limit=%lu goal=%lu dirty=%lu " + "bdi_goal=%lu bdi_dirty=%lu avg_dirty=%lu " + "base_bw=%lu task_bw=%lu " + "dirtied=%lu " + "period=%lu think=%ld pause=%ld paused=%lu", + __entry->bdi, + __entry->bdi_weight, + __entry->task_weight, + __entry->limit, + __entry->goal, + __entry->dirty, + __entry->bdi_goal, + __entry->bdi_dirty, + __entry->avg_dirty, + __entry->base_bw, /* base throttle bandwidth */ + __entry->task_bw, /* task throttle bandwidth */ + __entry->dirtied, + __entry->period, /* ms */ + __entry->think, /* ms */ + __entry->pause, /* ms */ + __entry->paused /* ms */ + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), --- linux-next.orig/fs/btrfs/file.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/file.c 2011-03-02 10:45:57.000000000 +0800 @@ -770,7 +770,8 @@ out: static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, loff_t pos, unsigned long first_index, - unsigned long last_index, size_t write_bytes) + unsigned long last_index, size_t write_bytes, + int *nr_dirtied) { struct extent_state *cached_state = NULL; int i; @@ -837,7 +838,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (!clear_page_dirty_for_io(pages[i])) + (*nr_dirtied)++; set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } @@ -947,9 +949,9 @@ static ssize_t btrfs_file_aio_write(stru } iov_iter_init(&i, iov, nr_segs, count, num_written); - nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(&i), PAGE_CACHE_SIZE), + min(PAGE_CACHE_SIZE / sizeof(struct page *), + current->nr_dirtied_pause)); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; @@ -989,6 +991,7 @@ static ssize_t btrfs_file_aio_write(stru } while (iov_iter_count(&i) > 0) { + int nr_dirtied = 0; size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), nrptrs * (size_t)PAGE_CACHE_SIZE - @@ -1015,7 +1018,7 @@ static ssize_t btrfs_file_aio_write(stru ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, - write_bytes); + write_bytes, &nr_dirtied); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); @@ -1050,7 +1053,7 @@ static ssize_t btrfs_file_aio_write(stru } else { balance_dirty_pages_ratelimited_nr( inode->i_mapping, - dirty_pages); + nr_dirtied); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); --- linux-next.orig/fs/btrfs/ioctl.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/ioctl.c 2011-03-02 10:45:57.000000000 +0800 @@ -654,6 +654,7 @@ static int btrfs_defrag_file(struct file u64 skip = 0; u64 defrag_end = 0; unsigned long i; + int dirtied; int ret; int compress_type = BTRFS_COMPRESS_ZLIB; @@ -766,7 +767,7 @@ again: btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); ClearPageChecked(page); - set_page_dirty(page); + dirtied = set_page_dirty(page); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); loop_unlock: @@ -774,7 +775,8 @@ loop_unlock: page_cache_release(page); 
mutex_unlock(&inode->i_mutex); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + if (dirtied) + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } --- linux-next.orig/fs/btrfs/relocation.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/relocation.c 2011-03-02 10:45:57.000000000 +0800 @@ -2902,6 +2902,7 @@ static int relocate_file_extent_cluster( struct file_ra_state *ra; int nr = 0; int ret = 0; + int dirtied; if (!cluster->nr) return 0; @@ -2978,7 +2979,7 @@ static int relocate_file_extent_cluster( } btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); - set_page_dirty(page); + dirtied = set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); @@ -2986,7 +2987,8 @@ static int relocate_file_extent_cluster( page_cache_release(page); index++; - balance_dirty_pages_ratelimited(inode->i_mapping); + if (dirtied) + balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); --- linux-next.orig/fs/btrfs/disk-io.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/disk-io.c 2011-03-02 10:45:57.000000000 +0800 @@ -616,6 +616,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_ extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; + int limit; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) @@ -643,6 +644,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_ btrfs_queue_worker(&fs_info->workers, &async->work); + limit = btrfs_async_submit_limit(fs_info); + + if (atomic_read(&fs_info->nr_async_bios) > limit) + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit)); + while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, --- linux-next.orig/fs/nfs/file.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/file.c 2011-03-02 10:45:57.000000000 +0800 @@ -392,15 +392,6 @@ static int nfs_write_begin(struct file * IOMODE_RW); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; --- linux-next.orig/fs/nfs/write.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/write.c 2011-03-02 10:45:58.000000000 +0800 @@ -29,6 +29,9 @@ #include "nfs4_fs.h" #include "fscache.h" +#define CREATE_TRACE_POINTS +#include + #define NFSDBG_FACILITY NFSDBG_PAGECACHE #define MIN_POOL_WRITE (32) @@ -185,11 +188,68 @@ static int wb_priority(struct writeback_ * NFS congestion control */ +#define NFS_WAIT_PAGES (1024L >> (PAGE_SHIFT - 10)) int nfs_congestion_kb; -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) +/* + * SYNC requests will block on (2*limit) and wakeup on (2*limit-NFS_WAIT_PAGES) + * ASYNC requests will block on (limit) and wakeup on (limit - NFS_WAIT_PAGES) + * In this way SYNC writes will never be blocked by ASYNC ones. 
+ */ + +static void nfs_set_congested(long nr, struct backing_dev_info *bdi) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr > limit && !test_bit(BDI_async_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_ASYNC); + else if (nr > 2 * limit && !test_bit(BDI_sync_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_SYNC); +} + +static void nfs_wait_contested(int is_sync, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + int waitbit = is_sync ? BDI_sync_congested : BDI_async_congested; + DEFINE_WAIT(wait); + + if (!test_bit(waitbit, &bdi->state)) + return; + + for (;;) { + prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(waitbit, &bdi->state)) + break; + + io_schedule(); + } + finish_wait(&wqh[is_sync], &wait); +} + +static void nfs_wakeup_congested(long nr, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr < 2 * limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_sync_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_SYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_SYNC])) + wake_up(&wqh[BLK_RW_SYNC]); + } + if (nr < limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_async_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_ASYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_ASYNC])) + wake_up(&wqh[BLK_RW_ASYNC]); + } +} static int nfs_set_page_writeback(struct page *page) { @@ -200,11 +260,8 @@ static int nfs_set_page_writeback(struct struct nfs_server *nfss = NFS_SERVER(inode); page_cache_get(page); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + nfs_set_congested(atomic_long_inc_return(&nfss->writeback), + &nfss->backing_dev_info); } return ret; } @@ -216,8 +273,10 @@ static void nfs_end_page_writeback(struc end_page_writeback(page); page_cache_release(page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_wakeup_congested(atomic_long_dec_return(&nfss->writeback), + &nfss->backing_dev_info, + nfss->writeback_wait); } static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) @@ -318,45 +377,49 @@ static int nfs_writepage_locked(struct p int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_writepage_locked(page, wbc); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_do_writepage(page, wbc, data); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = 
wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; @@ -1244,7 +1307,7 @@ static void nfs_commitdata_release(void */ static int nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, - int how) + int how, pgoff_t offset, pgoff_t count) { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->path.dentry->d_inode; @@ -1276,8 +1339,8 @@ static int nfs_commit_rpcsetup(struct li data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ - data->args.offset = 0; - data->args.count = 0; + data->args.offset = offset; + data->args.count = count; data->args.context = get_nfs_open_context(first->wb_context); data->res.count = 0; data->res.fattr = &data->fattr; @@ -1300,7 +1363,8 @@ static int nfs_commit_rpcsetup(struct li * Commit dirty pages */ static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, + pgoff_t offset, pgoff_t count) { struct nfs_write_data *data; struct nfs_page *req; @@ -1311,7 +1375,7 @@ nfs_commit_list(struct inode *inode, str goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + return nfs_commit_rpcsetup(head, data, how, offset, count); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -1379,6 +1443,9 @@ static void nfs_commit_release(void *cal nfs_clear_page_tag_locked(req); } nfs_commit_clear_lock(NFS_I(data->inode)); + trace_nfs_commit_release(data->inode, + data->args.offset, + data->args.count); nfs_commitdata_release(calldata); } @@ -1393,6 +1460,8 @@ static const struct rpc_call_ops nfs_com int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); + pgoff_t first_index; + pgoff_t last_index; int may_wait = how & FLUSH_SYNC; int res = 0; @@ -1400,9 +1469,14 @@ int nfs_commit_inode(struct inode *inode goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); + if (res) { + first_index = nfs_list_entry(head.next)->wb_index; + last_index = nfs_list_entry(head.prev)->wb_index; + } spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error = nfs_commit_list(inode, &head, how, first_index, + last_index - first_index + 1); if (error < 0) return error; if (may_wait) @@ -1432,9 +1506,10 @@ static int nfs_commit_unstable_pages(str if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there - * are a lot of outstanding writes for this mapping. + * are a lot of outstanding writes for this mapping, until + * collected enough pages to commit. 
*/ - if (nfsi->ncommit <= (nfsi->npages >> 1)) + if (nfsi->ncommit <= nfsi->npages / DIRTY_MARGIN) goto out_mark_dirty; /* don't wait for the COMMIT response */ @@ -1443,17 +1518,15 @@ static int nfs_commit_unstable_pages(str ret = nfs_commit_inode(inode, flags); if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; + wbc->nr_to_write -= ret; + goto out; } + out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; +out: + trace_nfs_commit_unstable_pages(inode, wbc, flags, ret); + return ret >= 0 ? 0 : ret; } #else static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) @@ -1582,6 +1655,9 @@ out: int __init nfs_init_writepagecache(void) { + unsigned long background_thresh; + unsigned long dirty_thresh; + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", sizeof(struct nfs_write_data), 0, SLAB_HWCACHE_ALIGN, @@ -1619,6 +1695,16 @@ int __init nfs_init_writepagecache(void) if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; + /* + * Limit to 1/8 dirty threshold, so that writeback+in_commit pages + * won't overnumber dirty+to_commit pages. + */ + global_dirty_limits(&background_thresh, &dirty_thresh); + dirty_thresh <<= PAGE_SHIFT - 10; + + if (nfs_congestion_kb > dirty_thresh / 8) + nfs_congestion_kb = dirty_thresh / 8; + return 0; } --- linux-next.orig/include/linux/nfs_fs.h 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/include/linux/nfs_fs.h 2011-03-02 10:45:57.000000000 +0800 @@ -215,7 +215,6 @@ struct nfs_inode { #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ --- linux-next.orig/include/linux/nfs_fs_sb.h 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/include/linux/nfs_fs_sb.h 2011-03-02 10:45:57.000000000 +0800 @@ -102,6 +102,7 @@ struct nfs_server { struct nfs_iostats __percpu *io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ + wait_queue_head_t writeback_wait[2]; int flags; /* various flags */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ --- linux-next.orig/fs/nfs/client.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/client.c 2011-03-02 10:45:57.000000000 +0800 @@ -1042,6 +1042,8 @@ static struct nfs_server *nfs_alloc_serv INIT_LIST_HEAD(&server->delegations); atomic_set(&server->active, 0); + init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]); + init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-next/include/trace/events/nfs.h 2011-03-02 10:45:58.000000000 +0800 @@ -0,0 +1,88 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include + + +TRACE_EVENT(nfs_commit_unstable_pages, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + int sync, + int ret + ), + + TP_ARGS(inode, wbc, sync, ret), + + TP_STRUCT__entry( + __array(char, name, 32) + 
__field(unsigned long, ino)
+	__field(unsigned long, npages)
+	__field(unsigned long, to_commit)
+	__field(unsigned long, write_chunk)
+	__field(int, sync)
+	__field(int, ret)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino = inode->i_ino;
+		__entry->npages = NFS_I(inode)->npages;
+		__entry->to_commit = NFS_I(inode)->ncommit;
+		__entry->write_chunk = wbc->per_file_limit;
+		__entry->sync = sync;
+		__entry->ret = ret;
+	),
+
+	TP_printk("bdi %s: ino=%lu npages=%lu tocommit=%lu "
+		  "write_chunk=%lu sync=%d ret=%d",
+		  __entry->name,
+		  __entry->ino,
+		  __entry->npages,
+		  __entry->to_commit,
+		  __entry->write_chunk,
+		  __entry->sync,
+		  __entry->ret
+	)
+);
+
+TRACE_EVENT(nfs_commit_release,
+
+	TP_PROTO(struct inode *inode,
+		 unsigned long offset,
+		 unsigned long len),
+
+	TP_ARGS(inode, offset, len),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, offset)
+		__field(unsigned long, len)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino = inode->i_ino;
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+
+	TP_printk("bdi %s: ino=%lu offset=%lu len=%lu",
+		  __entry->name,
+		  __entry->ino,
+		  __entry->offset,
+		  __entry->len
+	)
+);
+
+
+#endif /* _TRACE_NFS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
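
A note on the NFS congestion thresholds added in fs/nfs/write.c above: nfs_set_congested() marks the bdi async-congested once the per-server writeback page count exceeds limit (nfs_congestion_kb converted to pages) and sync-congested above 2*limit, while nfs_wakeup_congested() clears each state a little below its set point, so WB_SYNC_ALL writers are only ever throttled after WB_SYNC_NONE ones. The following is a minimal userspace sketch of that threshold arithmetic only, not kernel code; congestion_thresholds(), min_long() and the 4k PAGE_SHIFT / sample nfs_congestion_kb values are illustrative assumptions:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumes 4k pages */
#define NFS_WAIT_PAGES	(1024L >> (PAGE_SHIFT - 10))	/* 1MB worth of pages */

struct thresholds {
	long async_block, async_wake;	/* WB_SYNC_NONE writers */
	long sync_block, sync_wake;	/* WB_SYNC_ALL writers */
};

static long min_long(long a, long b)
{
	return a < b ? a : b;
}

/*
 * Mirror of the block/wakeup points used by nfs_set_congested() and
 * nfs_wakeup_congested(): block above limit (async) or 2*limit (sync),
 * wake up min(limit/8, NFS_WAIT_PAGES) pages below each block point.
 */
static struct thresholds congestion_thresholds(long nfs_congestion_kb)
{
	long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10);	/* KB -> pages */
	long gap = min_long(limit / 8, NFS_WAIT_PAGES);

	return (struct thresholds) {
		.async_block	= limit,
		.async_wake	= limit - gap,
		.sync_block	= 2 * limit,
		.sync_wake	= 2 * limit - gap,
	};
}

int main(void)
{
	struct thresholds t = congestion_thresholds(64 * 1024);	/* 64MB sample */

	printf("async: block above %ld pages, wake below %ld\n",
	       t.async_block, t.async_wake);
	printf("sync:  block above %ld pages, wake below %ld\n",
	       t.sync_block, t.sync_wake);
	return 0;
}

With numbers like these, background writeback stalls well before a data-integrity flush does, which is what the "SYNC writes will never be blocked by ASYNC ones" comment in the hunk is promising.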
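
Along the same lines, the WB_SYNC_NONE path of nfs_commit_unstable_pages() now defers the COMMIT until the inode has accumulated a meaningful share of to-commit pages (the nfsi->ncommit <= nfsi->npages / DIRTY_MARGIN test). A small sketch of that check follows; DIRTY_MARGIN comes from elsewhere in this series, so the value used here is only a placeholder:

#include <stdbool.h>
#include <stdio.h>

#define DIRTY_MARGIN	8	/* placeholder; the real value is defined elsewhere in the series */

/*
 * Mirror of the batching test: skip the COMMIT on a non-blocking flush
 * until the pages awaiting commit exceed npages / DIRTY_MARGIN.
 */
static bool commit_worthwhile(unsigned long ncommit, unsigned long npages)
{
	return ncommit > npages / DIRTY_MARGIN;
}

int main(void)
{
	printf("%d\n", commit_worthwhile(100, 1000));	/* 100 <= 125 -> 0, keep collecting */
	printf("%d\n", commit_worthwhile(200, 1000));	/* 200 >  125 -> 1, send the COMMIT  */
	return 0;
}

Per the updated comment in that hunk, the point is simply not to fire a COMMIT for a handful of pages on every non-blocking flush.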