--- linux-next.orig/mm/page-writeback.c 2011-04-13 17:18:06.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-04-15 13:48:58.000000000 +0800 @@ -37,24 +37,11 @@ #include /* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. + * Sleep at most 200ms at a time in balance_dirty_pages(). */ -static long ratelimit_pages = 32; +#define MAX_PAUSE max(HZ/5, 1) -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than dirtied pages to ensure that reasonably - * large amounts of I/O are submitted. - */ -static inline long sync_writeback_pages(unsigned long dirtied) -{ - if (dirtied < ratelimit_pages) - dirtied = ratelimit_pages; - - return dirtied + dirtied / 2; -} +#define RATIO_SHIFT 10 /* The following parameters are exported via /proc/sys/vm */ @@ -219,6 +206,7 @@ int dirty_bytes_handler(struct ctl_table */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } @@ -244,53 +232,11 @@ void task_dirty_inc(struct task_struct * static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, - numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } -} - -static inline void task_dirties_fraction(struct task_struct *tsk, - long *numerator, long *denominator) -{ - prop_fraction_single(&vm_dirties, &tsk->dirties, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); } /* - * task_dirty_limit - scale down dirty throttling threshold for one task - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - * - * To protect light/slow dirtying tasks from heavier/fast ones, we start - * throttling individual tasks before reaching the bdi dirty limit. - * Relatively low thresholds will be allocated to heavy dirtiers. So when - * dirty pages grow large, heavy dirtiers will be throttled first, which will - * effectively curb the growth of dirty pages. Light dirtiers with high enough - * dirty threshold may never get throttled. - */ -static unsigned long task_dirty_limit(struct task_struct *tsk, - unsigned long bdi_dirty) -{ - long numerator, denominator; - unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - - return max(dirty, bdi_dirty/2); -} - -/* * */ static unsigned int bdi_min_ratio; @@ -397,14 +343,18 @@ unsigned long determine_dirtyable_memory return x + 1; /* Ensure that we never return 0 */ } +static unsigned long hard_dirty_limit(unsigned long thresh) +{ + return max(thresh + thresh / DIRTY_BRAKE, + default_backing_dev_info.dirty_threshold); +} + /* * global_dirty_limits - background-writeback and dirty-throttling thresholds * * Calculate the dirty thresholds based on sysctl parameters * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. 
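To make the threshold arithmetic concrete, here is a minimal standalone C sketch (not kernel code) of what global_dirty_limits() below ends up computing, assuming DIRTY_SCOPE = 8 / DIRTY_FULL_SCOPE = 4 from this patch and made-up memory and sysctl numbers. It shows how the enforced 1/4 gap keeps the point where throttling starts, (background + dirty)/2, at or below the global setpoint.

#include <stdio.h>

#define DIRTY_SCOPE             8
#define DIRTY_FULL_SCOPE        (DIRTY_SCOPE / 2)

int main(void)
{
        unsigned long available_memory = 1000000;       /* pages, hypothetical */
        unsigned long dirty_ratio = 20;                 /* vm.dirty_ratio */
        unsigned long dirty_background_ratio = 18;      /* deliberately too close */
        unsigned long dirty, background;

        dirty = dirty_ratio * available_memory / 100;
        background = dirty_background_ratio * available_memory / 100;

        /* keep at least a dirty/4 gap below the dirty threshold */
        if (background > dirty - dirty / DIRTY_FULL_SCOPE)
                background = dirty - dirty / DIRTY_FULL_SCOPE;

        printf("dirty thresh       = %lu pages\n", dirty);
        printf("background thresh  = %lu pages\n", background);
        printf("throttling starts  = %lu pages\n", (background + dirty) / 2);
        printf("global setpoint    = %lu pages\n", dirty - dirty / DIRTY_SCOPE);
        return 0;
}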
*/ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { @@ -426,21 +376,31 @@ void global_dirty_limits(unsigned long * else background = (dirty_background_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + /* + * Ensure at least 1/4 gap between background and dirty thresholds, so + * that when dirty throttling starts at (background + dirty)/2, it's + * below or at the entrance of the soft dirty throttle scope. + */ + if (background > dirty - dirty / DIRTY_FULL_SCOPE) + background = dirty - dirty / DIRTY_FULL_SCOPE; + tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * And the "limit" in the name is not seriously taken as hard limit in + * balance_dirty_pages(). * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * @@ -469,6 +429,643 @@ unsigned long bdi_dirty_limit(struct bac } /* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * will look to see if it needs to start dirty throttling. + * + * If ratelimit_pages is too low then big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long ratelimit_pages(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* + * last time exceeded (limit - limit/DIRTY_BRAKE) + */ +static bool dirty_exceeded_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_exceed_time <= time_window; +} + +/* + * last time dropped below (thresh - 2*thresh/DIRTY_SCOPE + thresh/DIRTY_RAMPUP) + */ +static bool dirty_free_run_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_free_run <= time_window; +} + +/* + * Position based bandwidth control. + * + * (1) boundary guarding areas + * + * The loop area is required to stop large number of slow dirtiers, because + * the max-pause area is only able to throttle a task at 1page/200ms=20KB/s. + * + * The pass-good region can stop a slow UKEY with 100+ slow dirtiers, while + * still avoid looping for the other good disk, so that their performance won't + * be impacted. + * + * The max-pause area can safeguard unknown bugs in the control algorithms + * as well as the possible surges in small memory boxes. + * + * The brake area is a good leeway for holding off the dirty pages in sudden + * workload change, or when some bdi dirty goal is excessively exceeded. + * + * The loop, pass-good and max-pause areas are enforced inside the loop of + * balance_dirty_pages(). Others can be found in bdi_position_ratio(). 
+ * + * loop area, loop until drop below the area -----------------------|<=== + * pass-good area, dirty exceeded bdi's will loop -----------------|<===>| + * max-pause area, sleep(max_pause) and return -----------|<===>| + * brake area, bw scaled from 1 down to 0 ---|<=====>| + * ----------------------------------------------------o-------o-----o-----o---- + * ^ ^ ^ ^ + * limit - limit/DIRTY_BRAKE ---' | | | + * limit -----------' | | + * limit + limit/DIRTY_MAXPAUSE -----------------' | + * limit + limit/DIRTY_PASSGOOD -----------------------' + * + * (2) global control areas + * + * The rampup area is for ramping up the base bandwidth whereas the above brake + * area is for scaling down the base bandwidth. + * + * The global thresh typically lies at the bottom of the brake area. @thresh + * is real-time computed from global_dirty_limits() and @limit is tracking + * (thresh + thresh/DIRTY_BRAKE) at 200ms intervals in update_dirty_limit(). + * + *rampup area setpoint/goal + *|<=======>| v + * |-------------------------------*-------------------------------|------------ + * ^ ^ ^ + * thresh - 2*thresh/DIRTY_SCOPE thresh - thresh/DIRTY_SCOPE thresh + * + * (3) bdi control areas + * + * The bdi reserve area tries to keep a reasonable number of dirty pages for + * preventing block queue underrun. + * + * reserve area, scale up bw as dirty pages drop low bdi_setpoint + * |<=============================================>| v + * |-------------------------------------------------------*-------|---------- + * 0 bdi_thresh - bdi_thresh/DIRTY_SCOPE^ ^bdi_thresh + * + * (4) global/bdi control lines + * + * bdi_position_ratio() applies 2 main and 3 regional control lines for + * scaling up/down the base bandwidth based on the position of dirty pages. + * + * The two main control lines for the global/bdi control scopes do not end at + * thresh/bdi_thresh. They are centered at setpoint/bdi_setpoint and cover the + * whole [0, limit]. If the control line drops below 0 before reaching @limit, + * an auxiliary line will be setup to connect them. The below figure illustrates + * the main bdi control line with an auxiliary line extending it to @limit. + * + * This allows smoothly throttling down bdi_dirty back to normal if it starts + * high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's bdi_dirty may rush to 5 times higher than bdi_setpoint. + * - the global/bdi dirty thresh/goal may be knocked down suddenly either on + * user request or on increased memory consumption. + * + * o + * o + * o [o] main control line + * o [*] auxiliary control line + * o + * o + * o + * o + * o + * o + * o--------------------- balance point, bw scale = 1 + * | o + * | o + * | o + * | o + * | o + * | o + * | o------- connect point, bw scale = 1/2 + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * [--------------------*-----------------------------.--------------------*] + * 0 bdi_setpoint bdi_origin limit + * + * The bdi control line: if (bdi_origin < limit), an auxiliary control line (*) + * will be setup to extend the main control line (o) to @limit. 
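The following standalone C sketch illustrates just the main global control line described above, using DIRTY_SCOPE = 8 and RATIO_SHIFT = 10 as defined elsewhere in this patch; the brake, rampup, bdi-reserve and auxiliary-line refinements applied by bdi_position_ratio() are left out. The ratio is 1024/1024 (i.e. 1.0) at the setpoint and falls linearly to 0 at the origin.

#include <stdio.h>

#define DIRTY_SCOPE     8
#define RATIO_SHIFT     10

/* 1 << RATIO_SHIFT at the setpoint, 0 at the origin, linear in between */
static unsigned long pos_ratio(unsigned long thresh, unsigned long dirty)
{
        unsigned long goal = thresh - thresh / DIRTY_SCOPE;     /* global setpoint */
        unsigned long origin = goal + 2 * thresh;               /* ratio reaches 0 here */

        if (dirty >= origin)
                return 0;
        return ((origin - dirty) << RATIO_SHIFT) / (origin - goal + 1);
}

int main(void)
{
        unsigned long thresh = 1000;    /* hypothetical global dirty thresh, pages */
        unsigned long dirty;

        for (dirty = 0; dirty <= 2 * thresh; dirty += 250)
                printf("dirty=%4lu  pos_ratio=%4lu/1024\n",
                       dirty, pos_ratio(thresh, dirty));
        return 0;
}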
+ */ +static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty) +{ + unsigned long limit = hard_dirty_limit(thresh); + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long origin; + unsigned long goal; + unsigned long long span; + unsigned long long bw; + + if (unlikely(dirty >= limit)) + return 0; + + /* + * global setpoint + */ + goal = thresh - thresh / DIRTY_SCOPE; + origin = goal + 2 * thresh; + + if (unlikely(origin < limit && dirty > (goal + origin) / 2)) { + origin = limit; + goal = (goal + origin) / 2; + bw >>= 1; + } + bw = origin - dirty; + bw <<= RATIO_SHIFT; + do_div(bw, origin - goal + 1); + + /* + * brake area, hold off dirtiers when the estimated dirty_ratelimit + * and/or write_bandwidth are adapting to sudden workload changes. + * It also balances the pressure to push global pages high when some + * bdi dirty pages are over-committed (eg. a UKEY's bdi goal could be + * exceeded a lot in the free run area; an unresponsing server may make + * an NFS bdi's dirty goal drop much lower than its dirty pages). + */ + if (dirty > limit - limit / DIRTY_BRAKE) { + bw *= limit - dirty; + do_div(bw, limit / DIRTY_BRAKE + 1); + } + + /* + * rampup area, immediately above the unthrottled free-run region. + * It's setup mainly to get an estimation of ref_bw for reliably + * ramping up the base bandwidth. + */ + dirty = default_backing_dev_info.avg_dirty; + origin = thresh - thresh / DIRTY_FULL_SCOPE + thresh / DIRTY_RAMPUP; + if (dirty < origin) { + span = (origin - dirty) * bw; + do_div(span, thresh / (4 * DIRTY_RAMPUP) + 1); + bw += min(span, 4 * bw); + } + + /* + * bdi reserve area, safeguard against bdi dirty underflow and disk idle + */ + origin = bdi->avg_write_bandwidth / 2 + 2 * MIN_WRITEBACK_PAGES; + origin = min(origin, thresh - thresh / DIRTY_FULL_SCOPE); + if (bdi_dirty < origin) { + if (bdi_dirty > origin / 4) + bw = bw * origin / bdi_dirty; + else + bw = bw * 4; + } + + /* + * bdi setpoint + */ + if (unlikely(bdi_thresh > thresh)) + bdi_thresh = thresh; + goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + /* + * In JBOD case, bdi_thresh could fluctuate proportional to its own + * size. Otherwise the bdi write bandwidth is good for limiting the + * floating area, to compensate for the global control line being too + * flat in large memory systems. + */ + span = (u64) bdi_thresh * (thresh - bdi_thresh) + + (2 * bdi->avg_write_bandwidth) * bdi_thresh; + do_div(span, thresh + 1); + origin = goal + 2 * span; + + if (likely(bdi->avg_dirty)) + bdi_dirty = bdi->avg_dirty; + if (unlikely(bdi_dirty > goal + span)) { + if (bdi_dirty > limit) + return 0; + if (origin < limit) { + origin = limit; + goal += span; + bw >>= 1; + } + } + bw *= origin - bdi_dirty; + do_div(bw, origin - goal + 1); + + return bw; +} + +static void bdi_update_dirty_smooth(struct backing_dev_info *bdi, + unsigned long dirty) +{ + unsigned long avg = bdi->avg_dirty; + unsigned long old = bdi->old_dirty; + + if (unlikely(!avg)) { + avg = dirty; + goto update; + } + + /* + * dirty pages are departing upwards, follow up + */ + if (avg < old && old <= dirty) { + avg += (old - avg) >> 2; + goto update; + } + + /* + * dirty pages are departing downwards, follow down + */ + if (avg > old && old >= dirty) { + avg -= (avg - old) >> 2; + goto update; + } + + /* + * This can filter out one half unnecessary updates when bdi_dirty is + * fluctuating around the balance point, and is most effective on XFS, + * whose pattern is + * . 
+ * [.] dirty [-] avg . . + * . . + * . . . . . . + * --------------------------------------- . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . + * . . . . (fluctuated) + * . . . . + * . . . . + * + * @avg will remain flat at the cost of being biased towards high. In + * practice the error tend to be much smaller: thanks to more coarse + * grained fluctuations, @avg becomes the real average number for the + * last two rising lines of @dirty. + */ + goto out; + +update: + bdi->avg_dirty = avg; +out: + bdi->old_dirty = dirty; +} + +static void __bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + unsigned long cur; + u64 bw; + + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period / 2)) { + do_div(bw, elapsed); + elapsed = period / 2; + bw *= elapsed; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + cur = bw >> ilog2(period); + bdi->write_bandwidth = cur; + + /* + * one more level of smoothing + */ + if (avg > old && old > cur) + avg -= (avg - old) >> 3; + + if (avg < old && old < cur) + avg += (old - avg) >> 3; + + bdi->avg_write_bandwidth = avg; +} + +static void update_dirty_limit(unsigned long thresh, + unsigned long dirty) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long min = dirty + limit / DIRTY_BRAKE; + + thresh += thresh / DIRTY_BRAKE; + + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* take care not to follow into the brake area */ + if (limit > thresh && + limit > min) { + limit -= (limit - max(thresh, min)) >> 5; + goto update; + } + return; +update: + default_backing_dev_info.dirty_threshold = limit; +} + +static void bdi_update_dirty_threshold(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty) +{ + unsigned long old = bdi->old_dirty_threshold; + unsigned long avg = bdi->dirty_threshold; + + thresh = bdi_dirty_limit(bdi, thresh); + + if (avg > old && old >= thresh) + avg -= (avg - old) >> 3; + + if (avg < old && old <= thresh) + avg += (old - avg) >> 3; + + bdi->dirty_threshold = avg; + bdi->old_dirty_threshold = thresh; +} + +/* + * ref_bw typically fluctuates within a small range, with large isolated points + * from time to time. The smoothed reference_ratelimit can effectively filter + * out 1 such standalone point. When there comes 2+ isolated points together -- + * observed in ext4 on sudden redirty -- reference_ratelimit may surge high and + * take long time to return to normal, which can mostly be counteracted by + * xref_bw and other update restrictions in bdi_update_dirty_ratelimit(). + */ +static void bdi_update_reference_ratelimit(struct backing_dev_info *bdi, + unsigned long ref_bw) +{ + unsigned long old = bdi->old_ref_ratelimit; + unsigned long avg = bdi->reference_ratelimit; + + if (avg > old && old >= ref_bw && avg - old >= old - ref_bw) + avg -= (avg - old) >> 2; + + if (avg < old && old <= ref_bw && old - avg >= ref_bw - old) + avg += (old - avg) >> 2; + + bdi->reference_ratelimit = avg; + bdi->old_ref_ratelimit = ref_bw; +} + +/* + * Base throttle bandwidth. 
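The write bandwidth estimator above can be read as a period-weighted running average: the new sample is weighted by elapsed/period, the previous estimate by the remainder, and the sample weight is capped at period/2 so one long interval cannot wipe out the history. The small userspace sketch below reproduces only that weighting; HZ = 1000 and the numbers fed in are assumptions of the sketch, not values from the patch.

#include <stdio.h>
#include <stdint.h>

#define HZ      1000u
#define PERIOD  4096u                   /* ~ roundup_pow_of_two(3 * HZ) */

static unsigned long update_write_bw(unsigned long old_bw,      /* pages/s */
                                     unsigned long written,     /* pages this interval */
                                     unsigned long elapsed)     /* jiffies */
{
        uint64_t bw = (uint64_t)written * HZ;   /* == rate * elapsed */

        if (elapsed > PERIOD / 2) {             /* cap the sample weight */
                bw /= elapsed;
                elapsed = PERIOD / 2;
                bw *= elapsed;
        }
        bw += (uint64_t)old_bw * (PERIOD - elapsed);
        return (unsigned long)(bw / PERIOD);
}

int main(void)
{
        unsigned long bw = 25600;       /* initial guess: 100 MB/s in 4k pages */
        int i;

        /* feed ten 200ms intervals of a disk really writing ~5000 pages/s */
        for (i = 0; i < 10; i++) {
                bw = update_write_bw(bw, 1000, HZ / 5);
                printf("estimate after %2d intervals: %lu pages/s\n", i + 1, bw);
        }
        return 0;
}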
+ */ +static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long goal = thresh - thresh / DIRTY_SCOPE; + unsigned long bw = bdi->dirty_ratelimit; + unsigned long dirty_bw; + unsigned long pos_bw; + unsigned long delta; + unsigned long ref_bw; + unsigned long xref_bw; + unsigned long long pos_ratio; + + if (dirty > limit - limit / DIRTY_BRAKE) + bdi->dirty_exceed_time = jiffies; + + if (dirty < thresh - thresh / DIRTY_FULL_SCOPE + thresh / DIRTY_RAMPUP) + bdi->dirty_free_run = jiffies; + + /* + * The dirty rate will match the writeback rate in long term, except + * when dirty pages are truncated by userspace before IO submission, or + * re-dirtied when the FS finds it not suitable to do IO at the time. + */ + dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + + pos_ratio = bdi_position_ratio(bdi, thresh, dirty, bdi_dirty); + /* + * (pos_bw > bw) means the position of the number of dirty pages is + * lower than the global and/or bdi setpoints. It does not necessarily + * mean the base throttle bandwidth is larger than its balanced value. + * The latter is likely only when + * - (position) the dirty pages are at some distance from the setpoint, + * - (speed) and either stands still or is departing from the setpoint. + */ + pos_bw = bw * pos_ratio >> RATIO_SHIFT; + + /* + * There may be + * 1) X dd tasks writing to the current disk, and/or + * 2) Y "rsync --bwlimit" tasks. + * The below estimation is accurate enough for (1). For (2), where not + * all task's dirty rate can be changed proportionally by adjusting the + * base throttle bandwidth, it would require multiple adjust-reestimate + * cycles to approach the rate balance point. That is not a big concern + * as we do small steps anyway for the sake of other unknown noises. + * The un-controllable tasks may only slow down the approximating + * progress and is harmless otherwise. + */ + pos_ratio *= bdi->avg_write_bandwidth; + do_div(pos_ratio, dirty_bw | 1); + ref_bw = bw * pos_ratio >> RATIO_SHIFT; + ref_bw = min(ref_bw, bdi->avg_write_bandwidth); + + /* + * Update the base throttle bandwidth rigidly: eg. only try lowering it + * when both the global/bdi dirty pages are away from their setpoints, + * and are either standing still or continue departing away. + * + * The "+ (avg_dirty >> 8)" margin mainly help btrfs, which behaves + * amazingly smoothly. Its @avg_dirty is ever approaching @dirty, + * slower and slower, but very hard to cross it to trigger a base + * bandwidth update. The added margin says "when @avg_dirty is _close + * enough_ to @dirty, it indicates slowed down @dirty change rate, + * hence the other inequalities are now a good indication of something + * unbalanced in the current bdi". + * + * In the cases of hitting the upper/lower margins, it's obviously + * necessary to adjust the (possibly very unbalanced) base bandwidth, + * unless the opposite margin was also been hit recently, which + * indicates that the dirty control scope may be smaller than the bdi + * write bandwidth and hence the dirty pages are quickly fluctuating + * between the upper/lower margins. 
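The balanced-rate estimate ref_bw above boils down to scaling the current base bandwidth by write_bw/dirty_bw (and by the position ratio). The standalone sketch below walks one hypothetical case: two dd tasks throttled at 10000 pages/s each while the disk writes back 12000 pages/s, with dirty pages sitting at the setpoint, giving the expected ~6000 pages/s per task. RATIO_SHIFT = 10 as in the patch; everything else is made up.

#include <stdio.h>
#include <stdint.h>

#define RATIO_SHIFT 10

static unsigned long ref_bw(unsigned long base_bw,   /* current per-task limit, pages/s */
                            unsigned long dirty_bw,  /* observed dirtying rate, pages/s */
                            unsigned long write_bw,  /* observed writeback rate, pages/s */
                            unsigned long pos_ratio) /* from position control, <<RATIO_SHIFT */
{
        uint64_t bw = (uint64_t)pos_ratio * write_bw;

        bw /= (dirty_bw | 1);                   /* mirrors do_div(..., dirty_bw | 1) */
        bw = bw * base_bw >> RATIO_SHIFT;
        if (bw > write_bw)                      /* never aim above the disk itself */
                bw = write_bw;
        return (unsigned long)bw;
}

int main(void)
{
        /* two dd's at 10000 pages/s each dirty 20000 pages/s in total, the disk
         * writes 12000 pages/s, dirty pages are at the setpoint (pos_ratio ~ 1.0) */
        printf("ref_bw = %lu pages/s\n",
               ref_bw(10000, 20000, 12000, 1 << RATIO_SHIFT));
        return 0;
}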
+ */ + if (bw < pos_bw) { + if (dirty < goal && + dirty <= default_backing_dev_info.avg_dirty + + (default_backing_dev_info.avg_dirty >> 8) && + bdi_dirty <= bdi->avg_dirty + (bdi->avg_dirty >> 8) && + bdi_dirty <= bdi->old_dirty) + goto adjust; + if (dirty < thresh - thresh / DIRTY_FULL_SCOPE + + thresh / DIRTY_RAMPUP && + !dirty_exceeded_recently(bdi, HZ)) + goto adjust; + } + + if (bw > pos_bw) { + if (dirty > goal && + dirty >= default_backing_dev_info.avg_dirty - + (default_backing_dev_info.avg_dirty >> 8) && + bdi_dirty >= bdi->avg_dirty - (bdi->avg_dirty >> 8) && + bdi_dirty >= bdi->old_dirty) + goto adjust; + if (dirty > limit - limit / DIRTY_BRAKE && + !dirty_free_run_recently(bdi, HZ)) + goto adjust; + } + + goto out; + +adjust: + /* + * The min/max'ed xref_bw is an effective safeguard against transient + * large deviations. By considering not only the current ref_bw value, + * but also the old/avg values, the sudden drop can be filtered out. + */ + if (pos_bw > bw) { + xref_bw = min(ref_bw, bdi->old_ref_ratelimit); + xref_bw = min(xref_bw, bdi->reference_ratelimit); + if (xref_bw > bw) + delta = xref_bw - bw; + else + delta = 0; + } else { + xref_bw = max(ref_bw, bdi->old_ref_ratelimit); + xref_bw = max(xref_bw, bdi->reference_ratelimit); + if (xref_bw < bw) + delta = bw - xref_bw; + else + delta = 0; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. This avoids possible oscillations. + * Also limit the step size to avoid overshooting. + */ + delta >>= bw / (8 * delta + 1); + + if (pos_bw > bw) + bw += min(delta, pos_bw - bw) >> 2; + else + bw -= min(delta, bw - pos_bw) >> 2; + + bdi->dirty_ratelimit = bw; +out: + bdi_update_reference_ratelimit(bdi, ref_bw); + trace_dirty_ratelimit(bdi, dirty_bw, pos_bw, ref_bw); +} + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time) +{ + static DEFINE_SPINLOCK(dirty_lock); + unsigned long now = jiffies; + unsigned long elapsed; + unsigned long dirtied; + unsigned long written; + + if (!spin_trylock(&dirty_lock)) + return; + + elapsed = now - bdi->bw_time_stamp; + dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* skip quiet periods when disk bandwidth is under-utilized */ + if (elapsed > 4 * MAX_PAUSE && + elapsed > now - start_time) + goto snapshot; + + /* + * rate-limit, only update once every 200ms. Demand higher threshold + * on the flusher so that the throttled task(s) can do most updates. 
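The step-size limiting applied above ("delta >>= bw / (8 * delta + 1)" plus the quarter step) can be seen in isolation in the sketch below: the tracked value approaches a fixed target quickly at first, then in ever smaller steps, and finally stalls just short of it instead of oscillating. This is a simplified model with made-up numbers; the kernel additionally bounds the step by the pos_bw gap.

#include <stdio.h>

static unsigned long step_toward(unsigned long bw, unsigned long target)
{
        unsigned long gap = (target > bw) ? target - bw : bw - target;
        unsigned long step = gap;
        unsigned long shift;

        if (!gap)
                return bw;
        /* shrink the step exponentially as bw closes in on the target... */
        shift = bw / (8 * gap + 1);
        step = shift < 8 * sizeof(step) ? step >> shift : 0;
        /* ...and apply only a quarter of it per update */
        if (target > bw)
                return bw + step / 4;
        return bw - step / 4;
}

int main(void)
{
        unsigned long bw = 2000, target = 6000;         /* pages/s, hypothetical */
        int i;

        for (i = 0; i < 12; i++) {
                bw = step_toward(bw, target);
                printf("update %2d: dirty_ratelimit = %lu\n", i + 1, bw);
        }
        return 0;
}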
+ */ + if (!thresh && elapsed <= 2 * MAX_PAUSE) + goto unlock; + if (elapsed <= MAX_PAUSE) + goto unlock; + + if (thresh && + now - default_backing_dev_info.bw_time_stamp >= MAX_PAUSE) { + default_backing_dev_info.bw_time_stamp = now; + update_dirty_limit(thresh, dirty); + bdi_update_dirty_smooth(&default_backing_dev_info, dirty); + } + if (thresh) { + bdi_update_dirty_ratelimit(bdi, thresh, dirty, + bdi_dirty, dirtied, elapsed); + bdi_update_dirty_threshold(bdi, thresh, dirty); + bdi_update_dirty_smooth(bdi, bdi_dirty); + } + __bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->dirtied_stamp = dirtied; + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +unlock: + spin_unlock(&dirty_lock); +} + +static unsigned long max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) +{ + unsigned long hi = ilog2(bdi->write_bandwidth); + unsigned long lo = ilog2(bdi->dirty_ratelimit); + unsigned long t; + + /* target for 10ms pause on 1-dd case */ + t = HZ / 50; + + /* + * Scale up pause time for concurrent dirtiers in order to reduce CPU + * overheads. + * + * (N * 20ms) on 2^N concurrent tasks. + */ + if (hi > lo) + t += (hi - lo) * (20 * HZ) / 1024; + + /* + * Limit pause time for small memory systems. If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 1ms for every 1MB; may further consider bdi bandwidth. + */ + if (bdi_dirty) + t = min(t, bdi_dirty >> (30 - PAGE_CACHE_SHIFT - ilog2(HZ))); + + return clamp_val(t, 4, MAX_PAUSE); +} + +/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. @@ -476,29 +1073,32 @@ unsigned long bdi_dirty_limit(struct bac * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, - unsigned long write_chunk) + unsigned long pages_dirtied) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ + unsigned long bdi_dirty; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; - unsigned long pages_written = 0; - unsigned long pause = 1; - bool dirty_exceeded = false; + unsigned long bw; + unsigned long base_bw; + unsigned long period; + unsigned long pause = 0; + unsigned long pause_max; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - + unsigned long now = jiffies; + /* + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -507,12 +1107,11 @@ static void balance_dirty_pages(struct a * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. 
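A userspace rendering of the max_pause() heuristic above: start from a 10ms target, add roughly 20ms per power-of-two of concurrent dirtiers (estimated from write_bandwidth vs dirty_ratelimit), and cap the result by about 1ms per dirty megabyte so a small dirty pool cannot be slept empty. HZ = 1000 and 4k pages are assumptions of the sketch.

#include <stdio.h>

#define HZ              1000
#define MAX_PAUSE       (HZ / 5)        /* 200ms, matching the patch */
#define PAGE_SHIFT      12              /* 4k pages assumed */
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

static unsigned int ilog2_ul(unsigned long v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static unsigned long max_pause(unsigned long write_bw,          /* pages/s */
                               unsigned long dirty_ratelimit,   /* pages/s */
                               unsigned long bdi_dirty)         /* pages */
{
        unsigned long hi = ilog2_ul(write_bw);
        unsigned long lo = ilog2_ul(dirty_ratelimit);
        unsigned long t = HZ / 50;              /* 10ms base for the 1-dd case */

        /* roughly +20ms per doubling of concurrent dirtiers */
        if (hi > lo)
                t += (hi - lo) * (20 * HZ) / 1024;

        /* don't sleep a small dirty pool empty: ~1ms per dirty MB */
        if (bdi_dirty)
                t = MIN(t, bdi_dirty >> (30 - PAGE_SHIFT - ilog2_ul(HZ)));

        if (t < 4)
                t = 4;
        return MIN(t, (unsigned long)MAX_PAUSE);
}

int main(void)
{
        /* 100MB/s disk shared by ~16 dd's, plenty of dirty pages: ~100ms */
        printf("max_pause = %lu jiffies\n", max_pause(25600, 1600, 100000));
        /* same disk, but only ~8MB of bdi dirty pages: a few jiffies */
        printf("max_pause = %lu jiffies\n", max_pause(25600, 1600, 2048));
        return 0;
}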
*/ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; break; - - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); + } /* * In order to avoid the stacked BDI deadlock we need @@ -524,62 +1123,107 @@ static void balance_dirty_pages(struct a * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + if (bdi->dirty_threshold < 2*bdi_stat_error(bdi)) { + bdi_dirty = bdi_stat_sum(bdi, BDI_RECLAIMABLE) + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); } - /* - * The bdi thresh is somehow "soft" limit derived from the - * global "hard" limit. The former helps to prevent heavy IO - * bdi or process from holding back light ones; The latter is - * the last resort safeguard. - */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, start_time); - if (!dirty_exceeded) - break; + if (unlikely(!writeback_in_progress(bdi))) + bdi_start_background_writeback(bdi); - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + pause_max = max_pause(bdi, bdi_dirty); - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. + base_bw = bdi->dirty_ratelimit; + /* + * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. */ - trace_wbc_balance_dirty_start(&wbc, bdi); - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (current->flags & PF_LESS_THROTTLE || rt_task(current)) + base_bw *= 2; + bw = bdi_position_ratio(bdi, dirty_thresh, nr_dirty, bdi_dirty); + if (unlikely(bw == 0)) { + period = pause_max; + pause = pause_max; + goto pause; } - trace_wbc_balance_dirty_wait(&wbc, bdi); + bw = (u64)base_bw * bw >> RATIO_SHIFT; + period = (HZ * pages_dirtied + bw / 2) / (bw | 1); + pause = current->paused_when + period - now; + /* + * Take it as long think time if pause falls into (-10s, 0). + * If it's less than 500ms (ext2 blocks the dirtier task for + * up to 400ms from time to time on 1-HDD; so does xfs, however + * at much less frequency), try to compensate it in future by + * updating the virtual time; otherwise just reset the time, as + * it may be a light dirtier. 
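The period/pause arithmetic above, reduced to two cases in a standalone sketch (HZ = 1000 assumed, so jiffies read as milliseconds): a task that dirtied its pages back-to-back sleeps for the full period, while a task that spent part of that period thinking is charged only the remainder.

#include <stdio.h>

#define HZ 1000         /* one jiffy is 1ms in this sketch */

int main(void)
{
        long task_bw = 5000;            /* task throttle bandwidth, pages/s */
        long pages_dirtied = 250;       /* pages dirtied since the last pause */
        long now = 100000;              /* current time, jiffies */
        long paused_when;               /* end of the task's previous sleep */
        long period, pause;

        /* charge 250 pages at 5000 pages/s: a 50ms period */
        period = (HZ * pages_dirtied + task_bw / 2) / task_bw;

        /* case 1: the pages were dirtied back-to-back - sleep the full period */
        paused_when = now;
        pause = paused_when + period - now;
        printf("no think time:   pause = %ld ms\n", pause);

        /* case 2: the task spent 30ms computing in between - sleep only the rest */
        paused_when = now - 30;
        pause = paused_when + period - now;
        printf("30ms think time: pause = %ld ms\n", pause);

        return 0;
}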
+ */ + if (unlikely(-pause < HZ*10)) { + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + base_bw, + bw, + pages_dirtied, + period, + pause, + start_time); + if (-pause > HZ/2) { + current->paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->paused_when += period; + current->nr_dirtied = 0; + } + pause = 1; + break; + } + pause = min(pause, pause_max); + +pause: + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + base_bw, + bw, + pages_dirtied, + period, + pause, + start_time); + current->paused_when = now; __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + current->paused_when += pause; + current->nr_dirtied = 0; - /* - * Increase the delay for each loop, up to our previous - * default of taking a 100ms nap. - */ - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + dirty_thresh = hard_dirty_limit(dirty_thresh); + if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE) + break; + if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD && + bdi_dirty < bdi->dirty_threshold) + break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (pause == 0) + current->nr_dirtied_pause = + ratelimit_pages(nr_dirty, dirty_thresh); + else if (period <= pause_max / 4) + current->nr_dirtied_pause = clamp_val( + base_bw * (pause_max/2) / HZ, + pages_dirtied + pages_dirtied/8, + pages_dirtied * 4); + else if (pause >= pause_max) + current->nr_dirtied_pause = 1 | clamp_val( + base_bw * (pause_max*3/8) / HZ, + current->nr_dirtied_pause / 4, + current->nr_dirtied_pause*7/8); if (writeback_in_progress(bdi)) return; @@ -592,8 +1236,10 @@ static void balance_dirty_pages(struct a * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + if (laptop_mode) + return; + + if (nr_reclaimable > background_thresh) bdi_start_background_writeback(bdi); } @@ -607,8 +1253,6 @@ void set_page_dirty_balance(struct page } } -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; - /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -618,36 +1262,35 @@ static DEFINE_PER_CPU(unsigned long, bdp * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid + * On really big machines, global_page_state() is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * limit we disable the ratelimiting, to prevent individual processes from + * overshooting the limit by (ratelimit_pages) each. 
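A minimal model of the per-task ratelimiting described above (plain userspace C, not the kernel code): nr_dirtied accumulates cheaply on every write and the expensive balancing path runs only once nr_dirtied_pause pages have been dirtied.

#include <stdio.h>

struct task {
        int nr_dirtied;
        int nr_dirtied_pause;
};

/* stand-in for balance_dirty_pages(); a real task would sleep here */
static void balance(struct task *t)
{
        printf("throttle after %d pages\n", t->nr_dirtied);
        t->nr_dirtied = 0;
}

static void dirtied(struct task *t, int nr_pages)
{
        t->nr_dirtied += nr_pages;
        if (t->nr_dirtied >= t->nr_dirtied_pause)
                balance(t);
}

int main(void)
{
        struct task t = { .nr_dirtied = 0, .nr_dirtied_pause = 32 };
        int i;

        for (i = 0; i < 100; i++)       /* write 100 pages, one at a time */
                dirtied(&t, 1);
        return 0;
}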
*/ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - unsigned long ratelimit; - unsigned long *p; + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_account_dirty(bdi)) + return; + + current->nr_dirtied += nr_pages_dirtied; - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) - ratelimit = 8; + if (dirty_exceeded_recently(bdi, MAX_PAUSE)) { + unsigned long max = current->nr_dirtied + + (128 >> (PAGE_SHIFT - 10)); + + if (current->nr_dirtied_pause > max) + current->nr_dirtied_pause = max; + } /* * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - preempt_disable(); - p = &__get_cpu_var(bdp_ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - ratelimit = sync_writeback_pages(*p); - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping, ratelimit); - return; - } - preempt_enable(); + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause)) + balance_dirty_pages(mapping, current->nr_dirtied); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); @@ -735,44 +1378,6 @@ void laptop_sync_completion(void) #endif /* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. - */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* * Called early on to tune the page writeback dirty limits. 
* * We used to scale dirty pages according to how total memory @@ -794,9 +1399,6 @@ void __init page_writeback_init(void) { int shift; - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); prop_descriptor_init(&vm_dirties, shift); @@ -1127,6 +1729,7 @@ void account_page_dirtied(struct page *p __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } --- linux-next.orig/mm/filemap.c 2011-04-13 17:18:06.000000000 +0800 +++ linux-next/mm/filemap.c 2011-04-13 17:18:10.000000000 +0800 @@ -2313,6 +2313,7 @@ static ssize_t generic_perform_write(str long status = 0; ssize_t written = 0; unsigned int flags = 0; + unsigned int dirty; /* * Copies from kernel address space cannot fail (NFSD is a big user). @@ -2361,6 +2362,7 @@ again: pagefault_enable(); flush_dcache_page(page); + dirty = PageDirty(page); mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); @@ -2387,7 +2389,8 @@ again: pos += copied; written += copied; - balance_dirty_pages_ratelimited(mapping); + if (!dirty) + balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); --- linux-next.orig/include/linux/backing-dev.h 2011-04-14 09:20:56.000000000 +0800 +++ linux-next/include/linux/backing-dev.h 2011-04-15 13:41:32.000000000 +0800 @@ -40,6 +40,8 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, BDI_WRITEBACK, + BDI_DIRTIED, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; @@ -71,8 +73,27 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; + unsigned long dirtied_stamp; + unsigned long written_stamp; + unsigned long write_bandwidth; + unsigned long avg_write_bandwidth; + /* the base bandwidth, the task's dirty rate will be curbed under it */ + unsigned long dirty_ratelimit; + /* the estimated balance point, base bw will follow it step by step */ + unsigned long reference_ratelimit; + unsigned long old_ref_ratelimit; + unsigned long avg_dirty; + unsigned long old_dirty; + unsigned long dirty_threshold; + unsigned long old_dirty_threshold; + struct prop_local_percpu completions; - int dirty_exceeded; + + /* last time exceeded (limit - limit/DIRTY_BRAKE) */ + unsigned long dirty_exceed_time; + /* last time dropped to the rampup area or even the unthrottled area */ + unsigned long dirty_free_run; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; --- linux-next.orig/mm/backing-dev.c 2011-04-14 09:20:56.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-04-15 13:42:45.000000000 +0800 @@ -81,20 +81,30 @@ static int bdi_debug_stats_show(struct s #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) 
K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -631,6 +641,11 @@ static void bdi_wb_init(struct bdi_write setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -652,7 +667,17 @@ int bdi_init(struct backing_dev_info *bd goto err; } - bdi->dirty_exceeded = 0; + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + bdi->dirty_ratelimit = INIT_BW; + + bdi->avg_dirty = 0; + bdi->old_dirty = 0; + bdi->dirty_threshold = MIN_WRITEBACK_PAGES; + err = prop_local_init_percpu(&bdi->completions); if (err) { --- linux-next.orig/fs/fs-writeback.c 2011-04-14 21:51:23.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-04-15 13:48:58.000000000 +0800 @@ -689,6 +689,7 @@ static long wb_writeback(struct bdi_writ write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -724,6 +725,8 @@ static long wb_writeback(struct bdi_writ writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; --- linux-next.orig/include/linux/writeback.h 2011-04-14 21:51:23.000000000 +0800 +++ linux-next/include/linux/writeback.h 2011-04-15 13:48:58.000000000 +0800 @@ -12,6 +12,44 @@ struct backing_dev_info; extern spinlock_t inode_wb_list_lock; /* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* + * The 1/4 region under the global dirty thresh is for smooth dirty throttling: + * + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) + * + * The 1/4 region under the global dirty limit will be more rigidly throttled: + * + * (limit - limit/DIRTY_BRAKE, limit) + * + * The 1/32 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MAXPAUSE) + * + * The 1/16 region above the global dirty limit, dirty exceeded bdi's will be + * put to loops: + * + * (limit, limit + limit/DIRTY_PASSGOOD) + * + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long + * time) for the dirty pages to drop. + * + * The global dirty threshold is normally at the lower bound of the brake + * region, except when the system suddenly allocates a lot of anonymous memory + * and knocks down the global dirty threshold quickly, in which case the global + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. 
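For a sense of scale, the sketch below prints the region boundaries described above for a hypothetical global thresh of 1000 pages, using the DIRTY_* values defined right below and hard_dirty_limit()'s thresh + thresh/DIRTY_BRAKE as the limit. Standalone userspace C only.

#include <stdio.h>

#define DIRTY_RAMPUP            32
#define DIRTY_SCOPE             8
#define DIRTY_FULL_SCOPE        (DIRTY_SCOPE / 2)
#define DIRTY_BRAKE             8
#define DIRTY_MAXPAUSE          32
#define DIRTY_PASSGOOD          16

int main(void)
{
        unsigned long thresh = 1000;                            /* pages, hypothetical */
        unsigned long limit = thresh + thresh / DIRTY_BRAKE;    /* hard_dirty_limit() */

        printf("throttle scope starts  %lu\n", thresh - thresh / DIRTY_FULL_SCOPE);
        printf("rampup area ends       %lu\n", thresh - thresh / DIRTY_FULL_SCOPE
                                               + thresh / DIRTY_RAMPUP);
        printf("global setpoint        %lu\n", thresh - thresh / DIRTY_SCOPE);
        printf("global thresh          %lu\n", thresh);
        printf("brake area             %lu .. %lu\n",
               limit - limit / DIRTY_BRAKE, limit);
        printf("max-pause area ends    %lu\n", limit + limit / DIRTY_MAXPAUSE);
        printf("pass-good area ends    %lu\n", limit + limit / DIRTY_PASSGOOD);
        return 0;
}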
+ */ +#define DIRTY_RAMPUP 32 +#define DIRTY_SCOPE 8 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) +#define DIRTY_BRAKE 8 +#define DIRTY_MAXPAUSE 32 +#define DIRTY_PASSGOOD 16 + +/* * fs/fs-writeback.c */ enum writeback_sync_modes { @@ -128,6 +166,17 @@ void global_dirty_limits(unsigned long * unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time); +static inline void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + bdi_update_bandwidth(bdi, 0, 0, 0, start_time); +} + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); --- linux-next.orig/include/linux/sched.h 2011-04-15 13:41:32.000000000 +0800 +++ linux-next/include/linux/sched.h 2011-04-15 13:41:32.000000000 +0800 @@ -1493,6 +1493,14 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + /* + * when (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for some dirty throttling pause + */ + int nr_dirtied; + int nr_dirtied_pause; + unsigned long paused_when; /* start of a write-and-pause period */ + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; --- linux-next.orig/mm/memory_hotplug.c 2011-04-15 13:41:32.000000000 +0800 +++ linux-next/mm/memory_hotplug.c 2011-04-15 13:41:32.000000000 +0800 @@ -468,8 +468,6 @@ int online_pages(unsigned long pfn, unsi vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); - if (onlined_pages) memory_notify(MEM_ONLINE, &arg); unlock_memory_hotplug(); @@ -901,7 +899,6 @@ repeat: } vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); unlock_memory_hotplug(); --- linux-next.orig/include/trace/events/writeback.h 2011-04-15 13:41:31.000000000 +0800 +++ linux-next/include/trace/events/writeback.h 2011-04-15 13:48:58.000000000 +0800 @@ -147,11 +147,184 @@ DEFINE_EVENT(wbc_class, name, \ DEFINE_WBC_EVENT(wbc_writeback_start); DEFINE_WBC_EVENT(wbc_writeback_written); DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +#define KBps(x) ((x) << (PAGE_SHIFT - 10)) + +TRACE_EVENT(dirty_ratelimit, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long dirty_bw, + unsigned long pos_bw, + unsigned long ref_bw), + + TP_ARGS(bdi, dirty_bw, pos_bw, ref_bw), + + TP_STRUCT__entry( + __array(char, bdi, 32) + __field(unsigned long, write_bw) + __field(unsigned long, avg_bw) + __field(unsigned long, dirty_bw) + __field(unsigned long, base_bw) + __field(unsigned long, pos_bw) + __field(unsigned long, ref_bw) + __field(unsigned long, avg_ref_bw) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + __entry->write_bw = KBps(bdi->write_bandwidth); + __entry->avg_bw = KBps(bdi->avg_write_bandwidth); + __entry->dirty_bw = KBps(dirty_bw); + __entry->base_bw = KBps(bdi->dirty_ratelimit); + __entry->pos_bw = KBps(pos_bw); + __entry->ref_bw = KBps(ref_bw); + __entry->avg_ref_bw = KBps(bdi->reference_ratelimit); + ), + + + TP_printk("bdi %s: " + "write_bw=%lu awrite_bw=%lu dirty_bw=%lu " + "base_bw=%lu pos_bw=%lu ref_bw=%lu aref_bw=%lu", + __entry->bdi, + __entry->write_bw, /* write 
bandwidth */ + __entry->avg_bw, /* avg write bandwidth */ + __entry->dirty_bw, /* dirty bandwidth */ + __entry->base_bw, /* dirty ratelimit on each task */ + __entry->pos_bw, /* position control ratelimit */ + __entry->ref_bw, /* the reference ratelimit */ + __entry->avg_ref_bw /* smoothed reference ratelimit */ + ) +); + +TRACE_EVENT(balance_dirty_pages, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long base_bw, + unsigned long task_bw, + unsigned long dirtied, + unsigned long period, + long pause, + unsigned long start_time), + + TP_ARGS(bdi, thresh, dirty, bdi_dirty, + base_bw, task_bw, dirtied, period, pause, start_time), + + TP_STRUCT__entry( + __array( char, bdi, 32) + __field(unsigned long, limit) + __field(unsigned long, goal) + __field(unsigned long, dirty) + __field(unsigned long, bdi_goal) + __field(unsigned long, bdi_dirty) + __field(unsigned long, avg_dirty) + __field(unsigned long, base_bw) + __field(unsigned long, task_bw) + __field(unsigned int, dirtied) + __field(unsigned int, dirtied_pause) + __field(unsigned long, period) + __field( long, think) + __field( long, pause) + __field(unsigned long, paused) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + + __entry->limit = default_backing_dev_info.dirty_threshold; + __entry->goal = thresh - thresh / DIRTY_SCOPE; + __entry->dirty = dirty; + __entry->bdi_goal = bdi->dirty_threshold - + bdi->dirty_threshold / DIRTY_SCOPE; + __entry->bdi_dirty = bdi_dirty; + __entry->avg_dirty = bdi->avg_dirty; + __entry->base_bw = KBps(base_bw); + __entry->task_bw = KBps(task_bw); + __entry->dirtied = dirtied; + __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->paused_when == 0 ? 
0 : + (long)(jiffies - current->paused_when) * 1000 / HZ; + __entry->period = period * 1000 / HZ; + __entry->pause = pause * 1000 / HZ; + __entry->paused = (jiffies - start_time) * 1000 / HZ; + ), + + + TP_printk("bdi %s: " + "limit=%lu goal=%lu dirty=%lu " + "bdi_goal=%lu bdi_dirty=%lu avg_dirty=%lu " + "base_bw=%lu task_bw=%lu " + "dirtied=%u dirtied_pause=%u " + "period=%lu think=%ld pause=%ld paused=%lu", + __entry->bdi, + __entry->limit, + __entry->goal, + __entry->dirty, + __entry->bdi_goal, + __entry->bdi_dirty, + __entry->avg_dirty, + __entry->base_bw, /* base throttle bandwidth */ + __entry->task_bw, /* task throttle bandwidth */ + __entry->dirtied, + __entry->dirtied_pause, + __entry->period, /* ms */ + __entry->think, /* ms */ + __entry->pause, /* ms */ + __entry->paused /* ms */ + ) +); + +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, dirty_limit) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->dirty_limit = default_backing_dev_info.dirty_threshold; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu limit=%lu gap=%ld " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + __entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_limit, + __entry->dirty_thresh - __entry->nr_dirty - + __entry->nr_writeback - __entry->nr_unstable, + __entry->nr_dirtied, + __entry->nr_written + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),