--- linux-next.orig/mm/page-writeback.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-03-02 14:12:04.000000000 +0800 @@ -37,24 +37,9 @@ #include /* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. + * Don't sleep more than 200ms at a time in balance_dirty_pages(). */ -static long ratelimit_pages = 32; - -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than dirtied pages to ensure that reasonably - * large amounts of I/O are submitted. - */ -static inline long sync_writeback_pages(unsigned long dirtied) -{ - if (dirtied < ratelimit_pages) - dirtied = ratelimit_pages; - - return dirtied + dirtied / 2; -} +#define MAX_PAUSE max(HZ/5, 1) /* The following parameters are exported via /proc/sys/vm */ @@ -111,7 +96,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -145,7 +129,7 @@ static int calc_period_shift(void) else dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; - return 2 + ilog2(dirty_total - 1); + return ilog2(dirty_total - 1); } /* @@ -219,6 +203,7 @@ int dirty_bytes_handler(struct ctl_table */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } @@ -241,19 +226,14 @@ void task_dirty_inc(struct task_struct * /* * Obtain an accurate fraction of the BDI's portion. */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, +void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } } -static inline void task_dirties_fraction(struct task_struct *tsk, +void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { prop_fraction_single(&vm_dirties, &tsk->dirties, @@ -261,36 +241,6 @@ static inline void task_dirties_fraction } /* - * task_dirty_limit - scale down dirty throttling threshold for one task - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - * - * To protect light/slow dirtying tasks from heavier/fast ones, we start - * throttling individual tasks before reaching the bdi dirty limit. - * Relatively low thresholds will be allocated to heavy dirtiers. So when - * dirty pages grow large, heavy dirtiers will be throttled first, which will - * effectively curb the growth of dirty pages. Light dirtiers with high enough - * dirty threshold may never get throttled. 
- */ -static unsigned long task_dirty_limit(struct task_struct *tsk, - unsigned long bdi_dirty) -{ - long numerator, denominator; - unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - - return max(dirty, bdi_dirty/2); -} - -/* * */ static unsigned int bdi_min_ratio; @@ -403,8 +353,6 @@ unsigned long determine_dirtyable_memory * Calculate the dirty thresholds based on sysctl parameters * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { @@ -426,28 +374,45 @@ void global_dirty_limits(unsigned long * else background = (dirty_background_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + /* + * Ensure at least 1/4 gap between background and dirty thresholds, so + * that when dirty throttling starts at (background + dirty)/2, it's + * below or at the entrance of the soft dirty throttle scope. + */ + if (background > dirty - dirty / (DIRTY_SCOPE / 2)) + background = dirty - dirty / (DIRTY_SCOPE / 2); + tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } +EXPORT_SYMBOL_GPL(global_dirty_limits); -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * @dirty_pages: current number of dirty pages * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The bdi's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * There is a chicken and egg problem: when bdi A (eg. /pub) is heavy dirtied + * and bdi B (eg. /) is light dirtied hence has 0 dirty limit, tasks writing to + * B always get heavily throttled and bdi B's dirty limit might never be able + * to grow up from 0. So we do tricks to reserve some global margin and honour + * it to the bdi's that run low. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty) { u64 bdi_dirty; long numerator, denominator; @@ -462,6 +427,7 @@ unsigned long bdi_dirty_limit(struct bac do_div(bdi_dirty, denominator); bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) bdi_dirty = dirty * bdi->max_ratio / 100; @@ -469,6 +435,719 @@ unsigned long bdi_dirty_limit(struct bac } /* + * If we can dirty N more pages globally, honour N/8 to the bdi that + * runs low, so as to help it ramp up. 
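To put numbers on the background/dirty gap enforced above in global_dirty_limits(), here is a small standalone sketch. It assumes DIRTY_SCOPE = 8 from the include/linux/writeback.h hunk further down; clamp_background() is an illustration-only helper and the page counts are made up.

#include <stdio.h>

#define DIRTY_SCOPE     8       /* from the include/linux/writeback.h hunk */

/* illustration-only model of the background clamp in global_dirty_limits() */
static unsigned long clamp_background(unsigned long background,
                                      unsigned long dirty)
{
        /* keep background at least dirty/4 below the dirty threshold */
        if (background > dirty - dirty / (DIRTY_SCOPE / 2))
                background = dirty - dirty / (DIRTY_SCOPE / 2);
        return background;
}

int main(void)
{
        /* made-up numbers: the dirty limit maps to 20000 pages and the
         * background ratio was set almost as high by the admin */
        unsigned long dirty = 20000, background = 19000;

        background = clamp_background(background, dirty);
        /* prints "background=15000 dirty=20000": throttling then starts
         * at (15000 + 20000) / 2 = 17500 pages */
        printf("background=%lu dirty=%lu\n", background, dirty);
        return 0;
}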
+ */ +static unsigned long dirty_rampup_size(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty + MIN_WRITEBACK_PAGES) + return min(MIN_WRITEBACK_PAGES * 2, (thresh - dirty) / 8); + + return MIN_WRITEBACK_PAGES / 8; +} + +/* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * will look to see if it needs to start dirty throttling. + * + * If ratelimit_pages is too low then big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long ratelimit_pages(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* + * last time exceeded (limit - limit/DIRTY_MARGIN) + */ +static bool dirty_exceeded_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_exceed_time <= time_window; +} + +/* + * last time dropped below (thresh - 2*thresh/DIRTY_SCOPE + thresh/DIRTY_MARGIN) + */ +static bool dirty_free_run_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_free_run <= time_window; +} + +/* + * Position based bandwidth control. + * + * (1) hard dirty limiting areas + * + * The block area is required to stop large number of slow dirtiers, because + * the max throttle area is only able to throttle a task at 1page/200ms=20KB/s. + * + * The max throttle area is sufficient for normal workloads, and has the virtue + * of bounded latency for light dirtiers. + * + * The brake area is typically enough to hold off the dirtiers as long as the + * dirtyable memory is not so tight. + * + * The block area and max throttle area are enforced inside the loop of + * balance_dirty_pages(). Others can be found in dirty_throttle_bandwidth(). + * + * block area, loop until drop below the area -------------------|<=== + * max throttle area, sleep(max_pause) and return -----------|<=====>| + * brake area, bw scaled from 1 down to 0 ---|<=====>| + * --------------------------------------------------------o-------o-------o---- + * ^ ^ ^ + * limit - limit/DIRTY_MARGIN ---' | | + * limit -----------' | + * limit + limit/DIRTY_MARGIN -------------------' + * + * (2) global control scope + * + * The rampup area is for ramping up the base bandwidth whereas the above brake + * area is for scaling down the base bandwidth. + * + * The global thresh is typically equal to the above global limit. The + * difference is, @thresh is real-time computed from global_dirty_limits() and + * @limit is tracking @thresh at 100ms intervals in update_dirty_limit(). The + * point is to track @thresh slowly if it dropped below the number of dirty + * pages, so as to avoid unnecessarily entering the three areas in (1). + * + *rampup area setpoint/goal + *|<=======>| v + * [-------------------------------*-------------------------------]------------ + * ^ ^ ^ + * thresh - 2*thresh/DIRTY_SCOPE thresh - thresh/DIRTY_SCOPE thresh + * + * (3) bdi control scope + * + * The bdi reserve area tries to keep a reasonable number of dirty pages for + * preventing block queue underrun. 
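The near-sqrt scaling of ratelimit_pages() above can be checked with a standalone sketch; ilog2_approx() and ratelimit_pages_model() are illustration-only stand-ins for the kernel's ilog2() and the helper above, and the margins are made-up page counts.

#include <stdio.h>

/* crude stand-in for the kernel's ilog2() */
static unsigned int ilog2_approx(unsigned long v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

/* mirror of ratelimit_pages(): near-sqrt of the safety margin */
static unsigned long ratelimit_pages_model(unsigned long dirty,
                                           unsigned long thresh)
{
        if (thresh > dirty)
                return 1UL << (ilog2_approx(thresh - dirty) >> 1);
        return 1;
}

int main(void)
{
        /* 64MB of margin (16384 x 4KB pages) -> poll every 128 pages;
         * 256KB of margin (64 pages)         -> poll every 8 pages */
        printf("%lu\n", ratelimit_pages_model(0, 16384));
        printf("%lu\n", ratelimit_pages_model(0, 64));
        return 0;
}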
+ * + * reserve area, scale up bw as dirty pages drop low bdi_setpoint + * |<=============================================>| v + * |-------------------------------------------------------*-------|---------- + * 0 bdi_thresh - bdi_thresh/DIRTY_SCOPE^ ^bdi_thresh + * + * (4) global/bdi control lines + * + * dirty_throttle_bandwidth() applies 2 main and 3 regional control lines for + * scaling up/down the base bandwidth based on the position of dirty pages. + * + * The two main control lines for the global/bdi control scopes do not end at + * thresh/bdi_thresh. They are centered at setpoint/bdi_setpoint and cover the + * whole [0, limit]. If the control line drops below 0 before reaching @limit, + * an auxiliary line will be setup to connect them. The below figure illustrates + * the main bdi control line with an auxiliary line extending it to @limit. + * + * This allows smoothly throttling down bdi_dirty back to normal if it starts + * high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's bdi_dirty may rush to 5 times higher than bdi_setpoint. + * - the global/bdi dirty thresh/goal may be knocked down suddenly either on + * user request or on increased memory consumption. + * + * o + * o + * o [o] main control line + * o [*] auxiliary control line + * o + * o + * o + * o + * o + * o + * o--------------------- balance point, bw scale = 1 + * | o + * | o + * | o + * | o + * | o + * | o + * | o------- connect point, bw scale = 1/2 + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * [--------------------*-----------------------------.--------------------*] + * 0 bdi_setpoint bdi_origin limit + * + * The bdi control line: if (bdi_origin < limit), an auxiliary control line (*) + * will be setup to extend the main control line (o) to @limit. + */ +static unsigned long dirty_throttle_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + struct task_struct *tsk) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long origin; + unsigned long goal; + unsigned long long span; + unsigned long long bw; + + if (unlikely(dirty >= limit)) + return 0; + + /* + * global setpoint + */ + origin = 2 * thresh; + goal = thresh - thresh / DIRTY_SCOPE; + + if (unlikely(origin < limit && dirty > (goal + origin) / 2)) { + origin = limit; + goal = (goal + origin) / 2; + bw >>= 1; + } + bw = origin - dirty; + bw <<= BASE_BW_SHIFT; + do_div(bw, origin - goal + 1); + + /* + * brake area to prevent global dirty exceeding + */ + if (dirty > limit - limit / DIRTY_MARGIN) { + bw *= limit - dirty; + do_div(bw, limit / DIRTY_MARGIN + 1); + } + + /* + * rampup area, immediately above the unthrottled free-run region. + * It's setup mainly to get an estimation of ref_bw for reliably + * ramping up the base bandwidth. 
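As a rough illustration of the main global control line used by dirty_throttle_bandwidth(), the sketch below computes the position scale in BASE_BW_SHIFT fixed point. The auxiliary-line branch, the brake area and the per-bdi scaling are deliberately omitted; global_pos_ratio() is an illustration-only name and the threshold is a made-up page count.

#include <stdio.h>

#define DIRTY_SCOPE     8
#define BASE_BW_SHIFT   24

/* scale of the main global control line, in BASE_BW_SHIFT fixed point */
static unsigned long long global_pos_ratio(unsigned long thresh,
                                           unsigned long dirty)
{
        unsigned long origin = 2 * thresh;
        unsigned long goal = thresh - thresh / DIRTY_SCOPE;
        unsigned long long bw;

        if (dirty >= origin)
                return 0;
        bw = (unsigned long long)(origin - dirty) << BASE_BW_SHIFT;
        return bw / (origin - goal + 1);
}

int main(void)
{
        unsigned long thresh = 80000;   /* pages, illustrative */

        /* at the setpoint the scale is ~1.0; at thresh it drops to ~8/9 */
        printf("%.3f\n", global_pos_ratio(thresh, 70000) /
                         (double)(1 << BASE_BW_SHIFT));
        printf("%.3f\n", global_pos_ratio(thresh, 80000) /
                         (double)(1 << BASE_BW_SHIFT));
        return 0;
}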
+ */ + dirty = default_backing_dev_info.avg_dirty; + origin = thresh - thresh / (DIRTY_SCOPE/2) + thresh / DIRTY_MARGIN; + if (dirty < origin) { + span = (origin - dirty) * bw; + do_div(span, thresh / (8 * DIRTY_MARGIN) + 1); + bw += span; + } + + /* + * bdi setpoint + */ + if (unlikely(bdi_thresh > thresh)) + bdi_thresh = thresh; + goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + span = (u64) bdi_thresh * (thresh - bdi_thresh) + + (2 * bdi->avg_bandwidth) * bdi_thresh; + do_div(span, thresh + 1); + origin = goal + 2 * span; + + dirty = bdi->avg_dirty; + if (unlikely(dirty > goal + span)) { + if (dirty > limit) + return 0; + if (origin < limit) { + origin = limit; + goal += span; + bw >>= 1; + } + } + bw *= origin - dirty; + do_div(bw, origin - goal + 1); + + /* + * bdi reserve area, safeguard against bdi dirty underflow and disk idle + */ + origin = bdi_thresh - bdi_thresh / (DIRTY_SCOPE / 2); + if (bdi_dirty < origin) + bw = bw * origin / (bdi_dirty | 1); + + /* + * honour light dirtiers higher bandwidth: + * + * bw *= sqrt(1 / task_dirty_weight); + */ + if (tsk) { + unsigned long numerator, denominator; + + task_dirties_fraction(tsk, &numerator, &denominator); + bw *= int_sqrt((denominator << 10) / (numerator + 1)); + bw >>= 5 + BASE_BW_SHIFT / 2; + bw = (unsigned long)bw * bdi->throttle_bandwidth; + bw >>= 2 * BASE_BW_SHIFT - BASE_BW_SHIFT / 2; + + /* + * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + * + * The avg_bandwidth bound is necessary because + * bdi_update_throttle_bandwidth() blindly sets base bandwidth + * to avg_bandwidth for more stable estimation, when it + * believes the current task is the only dirtier. + */ + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + return min(2 * (unsigned long)bw, bdi->avg_bandwidth); + } + + return bw; +} + +static void bdi_update_dirty_smooth(struct backing_dev_info *bdi, + unsigned long dirty) +{ + unsigned long avg = bdi->avg_dirty; + unsigned long old = bdi->old_dirty; + + if (unlikely(!avg)) { + avg = dirty; + goto update; + } + + /* + * dirty pages are departing upwards, follow up + */ + if (avg < old && old <= dirty) { + avg += (old - avg) >> 3; + goto update; + } + + /* + * dirty pages are departing downwards, follow down + */ + if (avg > old && old >= dirty) { + avg -= (avg - old) >> 3; + goto update; + } + + /* + * This can filter out one half unnecessary updates when bdi_dirty is + * fluctuating around the balance point, and is most effective on XFS, + * whose theoretic pattern is + * . + * [.] dirty [-] avg . . + * . . + * . . . . . . + * --------------------------------------- . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . + * . . . . (flucuated) + * . . . . + * . . . . + * + * @avg will remain flat at the cost of being biased towards high. In + * practice the error tend to be much smaller: thanks to more coarse + * grained fluctuations, @avg becomes the real average number for the + * last two rising lines of @dirty. 
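The sqrt(1 / task_dirty_weight) boost in dirty_throttle_bandwidth() above can be pictured with a standalone sketch; int_sqrt_model() is a crude stand-in for the kernel's int_sqrt(), the 1/4 task weight is an arbitrary example, and the surrounding fixed-point shifts are left out.

#include <stdio.h>
#include <math.h>

/* rough stand-in for the kernel's int_sqrt() */
static unsigned long int_sqrt_model(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        /* a task doing 1/4 of the recent dirtying: weight = 1/4 */
        unsigned long numerator = 250, denominator = 1000;
        unsigned long factor_x32 = int_sqrt_model((denominator << 10) /
                                                  (numerator + 1));

        /* ~63/32 ~= 2.0: the lighter dirtier gets about sqrt(4) = 2x
         * the bandwidth of an average dirtier */
        printf("x%.2f (exact sqrt: %.2f)\n", factor_x32 / 32.0,
               sqrt((double)denominator / numerator));
        return 0;
}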
+ */ + goto out; + +update: + bdi->avg_dirty = avg; +out: + bdi->old_dirty = dirty; +} + +static void __bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_bandwidth; + unsigned long old = bdi->write_bandwidth; + unsigned long cur; + u64 bw; + + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period / 2)) { + do_div(bw, elapsed); + elapsed = period / 2; + bw *= elapsed; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + cur = bw >> ilog2(period); + bdi->write_bandwidth = cur; + + /* + * one more level of smoothing + */ + if (avg > old && old > cur) + avg -= (avg - old) >> 5; + + if (avg < old && old < cur) + avg += (old - avg) >> 5; + + bdi->avg_bandwidth = avg; +} + +static void update_dirty_limit(unsigned long thresh, + unsigned long dirty) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long min = dirty + limit / DIRTY_MARGIN; + + if (limit < thresh) { + limit = thresh; + goto out; + } + + /* take care not to follow into the brake area */ + if (limit > thresh + thresh / (DIRTY_MARGIN * 8) && + limit > min) { + limit -= (limit - max(thresh, min)) >> 3; + goto out; + } + + return; +out: + default_backing_dev_info.dirty_threshold = limit; +} + +static void bdi_update_dirty_threshold(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty) +{ + unsigned long old = bdi->old_dirty_threshold; + unsigned long avg = bdi->dirty_threshold; + unsigned long min; + + min = dirty_rampup_size(dirty, thresh); + thresh = bdi_dirty_limit(bdi, thresh); + + if (avg > old && old >= thresh) + avg -= (avg - old) >> 4; + + if (avg < old && old <= thresh) + avg += (old - avg) >> 4; + + bdi->dirty_threshold = max(avg, min); + bdi->old_dirty_threshold = thresh; +} + +/* + * ref_bw typically fluctuates within a small range, with large isolated points + * from time to time. The smoothed reference_bandwidth can effectively filter + * out 1 such standalone point. When there comes 2+ isolated points together -- + * observed in ext4 on sudden redirty -- reference_bandwidth may surge high and + * take long time to return to normal, which can mostly be counteracted by + * xref_bw and other update restrictions in bdi_update_throttle_bandwidth(). + */ +static void bdi_update_reference_bandwidth(struct backing_dev_info *bdi, + unsigned long ref_bw) +{ + unsigned long old = bdi->old_ref_bandwidth; + unsigned long avg = bdi->reference_bandwidth; + + if (avg > old && old >= ref_bw && avg - old >= old - ref_bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= ref_bw && old - avg >= ref_bw - old) + avg += (old - avg) >> 3; + + bdi->reference_bandwidth = avg; + bdi->old_ref_bandwidth = ref_bw; +} + +/* + * Base throttle bandwidth. 
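Before moving on to the base throttle bandwidth below: the write bandwidth estimation in __bdi_update_write_bandwidth() above is a weighted average over a ~3 second window. A standalone sketch of the common (elapsed <= period/2) path, assuming HZ = 100 and 4KB pages; update_write_bw() is an illustration-only reduction and the extra avg_bandwidth smoothing is omitted.

#include <stdio.h>

#define HZ      100
#define PERIOD  512     /* roundup_pow_of_two(3 * HZ) for HZ = 100 */

/* model of __bdi_update_write_bandwidth(), elapsed <= PERIOD/2 path */
static unsigned long update_write_bw(unsigned long old_bw,      /* pages/s */
                                     unsigned long elapsed,     /* jiffies */
                                     unsigned long written)     /* pages since stamp */
{
        unsigned long long bw = (unsigned long long)written * HZ;

        bw += (unsigned long long)old_bw * (PERIOD - elapsed);
        return bw / PERIOD;
}

int main(void)
{
        /* start from INIT_BW = 50MB/s = 12800 pages/s, then observe
         * 10MB written in 250ms (an instantaneous 40MB/s) */
        unsigned long bw = 12800;

        bw = update_write_bw(bw, 25, 2560);
        printf("%lu pages/s (~%lu MB/s)\n", bw, bw / 256); /* ~12675, ~49 */
        return 0;
}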
+ */ +static void bdi_update_throttle_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long margin = limit / DIRTY_MARGIN; + unsigned long goal = thresh - thresh / DIRTY_SCOPE; + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long bdi_goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + unsigned long long bw = bdi->throttle_bandwidth; + unsigned long long dirty_bw; + unsigned long long pos_bw; + unsigned long long delta; + unsigned long long ref_bw = 0; + unsigned long long xref_bw; + unsigned long pos_ratio; + unsigned long spread; + + if (dirty > limit - margin) + bdi->dirty_exceed_time = jiffies; + + if (dirty < thresh - thresh / (DIRTY_SCOPE/2) + margin) + bdi->dirty_free_run = jiffies; + + /* + * The dirty rate should match the writeback rate exactly, except when + * dirty pages are truncated before IO submission. The mismatches are + * hopefully small and hence ignored. So a continuous stream of dirty + * page trucates will result in errors in ref_bw, fortunately pos_bw + * can effectively stop the base bw from being driven away endlessly + * by the errors. + * + * It'd be nicer for the filesystems to not redirty too much pages + * either on IO or lock contention, or on sub-page writes. ext4 is + * known to redirty pages in big bursts, leading to + * - surges of dirty_bw, which can be mostly safeguarded by the + * min/max'ed xref_bw + * - the temporary drop of task weight and hence surge of task bw + * It could possibly be fixed in the FS. + */ + dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + + pos_ratio = dirty_throttle_bandwidth(bdi, thresh, dirty, + bdi_dirty, NULL); + /* + * pos_bw = task_bw, assuming 100% task dirty weight + * + * (pos_bw > bw) means the position of the number of dirty pages is + * lower than the global and/or bdi setpoints. It does not necessarily + * mean the base throttle bandwidth is larger than its balanced value. + * The latter is likely only when + * - (position) the dirty pages are at some distance from the setpoint, + * - (speed) and either stands still or is departing from the setpoint. + */ + pos_bw = (bw >> (BASE_BW_SHIFT/2)) * pos_ratio >> + (BASE_BW_SHIFT/2); + + /* + * A typical desktop has only 1 task writing to 1 disk, in which case + * the dirtier task should be throttled at the disk's write bandwidth. + * Note that we ignore minor dirty/writeback mismatches such as + * redirties and truncated dirty pages. + */ + if (bdi_thresh > thresh - thresh / 16) { + unsigned long numerator, denominator; + + task_dirties_fraction(current, &numerator, &denominator); + if (numerator > denominator - denominator / 16) + ref_bw = bdi->avg_bandwidth << BASE_BW_SHIFT; + } + /* + * Otherwise there may be + * 1) N dd tasks writing to the current disk, or + * 2) X dd tasks and Y "rsync --bwlimit" tasks. + * The below estimation is accurate enough for (1). For (2), where not + * all task's dirty rate can be changed proportionally by adjusting the + * base throttle bandwidth, it would require multiple adjust-reestimate + * cycles to approach the rate matching point. Which is not a big + * concern as we always do small steps to approach the target. The + * un-controllable tasks may only slow down the progress. 
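The rate-matching estimate taken in the !ref_bw branch just below can be sketched as follows; ref_bw_model() is an illustration-only reduction, the 50MB/s disk and two equally heavy dirtiers are assumptions, and the smoothing and safeguards applied afterwards are omitted.

#include <stdio.h>

#define BASE_BW_SHIFT   24

/* scale the base bandwidth by how far the observed dirty rate
 * overshoots the device write bandwidth */
static unsigned long long ref_bw_model(unsigned long long base_bw,   /* fixed point */
                                       unsigned long long pos_ratio, /* fixed point */
                                       unsigned long avg_bw,         /* pages/s */
                                       unsigned long dirty_bw)       /* pages/s */
{
        unsigned long long ref = pos_ratio * avg_bw / (dirty_bw | 1);

        return (base_bw >> (BASE_BW_SHIFT / 2)) * ref >> (BASE_BW_SHIFT / 2);
}

int main(void)
{
        /* two equal dirtiers on a 50MB/s disk dirty at 100MB/s together,
         * so the balanced per-task bandwidth is about half the base */
        unsigned long long base = 12800ULL << BASE_BW_SHIFT;    /* 50MB/s */
        unsigned long long pos = 1ULL << BASE_BW_SHIFT;         /* at setpoint */
        unsigned long long ref = ref_bw_model(base, pos, 12800, 25600);

        /* prints "12800 -> ~6399 pages/s": roughly halved */
        printf("%llu -> %llu pages/s\n",
               base >> BASE_BW_SHIFT, ref >> BASE_BW_SHIFT);
        return 0;
}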
+ */ + if (!ref_bw) { + ref_bw = pos_ratio * bdi->avg_bandwidth; + do_div(ref_bw, dirty_bw | 1); + ref_bw = (bw >> (BASE_BW_SHIFT/2)) * (unsigned long)ref_bw >> + (BASE_BW_SHIFT/2); + } + + /* + * The average dirty pages typically fluctuates within this scope. + */ + spread = min(bdi->write_bandwidth / 8, bdi_thresh / DIRTY_MARGIN); + + /* + * Update the base throttle bandwidth rigidly: eg. only try lowering it + * when both the global/bdi dirty pages are away from their setpoints, + * and are either standing still or continue departing away. + * + * The "+ avg_dirty / 256" tricks mainly help btrfs, which behaves + * amazingly smoothly. Its average dirty pages simply tracks more and + * more close to the number of dirty pages without any overshooting, + * thus its dirty pages may be ever moving towards the setpoint and + * @avg_dirty ever approaching @dirty, slower and slower, but very hard + * to cross it to trigger a base bandwidth update. What the trick does + * is "when @avg_dirty is _close enough_ to @dirty, it indicates slowed + * down @dirty change rate, hence the other inequalities are now a good + * indication of something unbalanced in the current bdi". + * + * In the cases of hitting the upper/lower margins, it's obviously + * necessary to adjust the (possibly very unbalanced) base bandwidth, + * unless the opposite margin was also been hit recently, which + * indicates that the dirty control scope may be smaller than the bdi + * write bandwidth and hence the dirty pages are quickly fluctuating + * between the upper/lower margins. + */ + if (bw < pos_bw) { + if (dirty < goal && + dirty <= default_backing_dev_info.avg_dirty + + (default_backing_dev_info.avg_dirty >> 8) && + bdi->avg_dirty + spread < bdi_goal && + bdi_dirty <= bdi->avg_dirty + (bdi->avg_dirty >> 8) && + bdi_dirty <= bdi->old_dirty) + goto adjust; + if (dirty < thresh - thresh / (DIRTY_SCOPE/2) + margin && + !dirty_exceeded_recently(bdi, HZ)) + goto adjust; + } + + if (bw > pos_bw) { + if (dirty > goal && + dirty >= default_backing_dev_info.avg_dirty - + (default_backing_dev_info.avg_dirty >> 8) && + bdi->avg_dirty > bdi_goal + spread && + bdi_dirty >= bdi->avg_dirty - (bdi->avg_dirty >> 8) && + bdi_dirty >= bdi->old_dirty) + goto adjust; + if (dirty > limit - margin && + !dirty_free_run_recently(bdi, HZ)) + goto adjust; + } + + goto out; + +adjust: + /* + * The min/max'ed xref_bw is an effective safeguard. The most dangerous + * case that could unnecessarily disturb the base bandwith is: when the + * control scope is roughly equal to the write bandwidth, the dirty + * pages may rush into the upper/lower margins regularly. It typically + * hits the upper margin in a blink, making a sudden drop of pos_bw and + * ref_bw. Assume 5 points A, b, c, D, E, where b, c have the dropped + * down number of pages, and A, D, E are at normal level. At point b, + * the xref_bw will be the good A; at c, the xref_bw will be the + * dragged-down-by-b reference_bandwidth which is bad; at D and E, the + * still-low reference_bandwidth will no longer bring the base + * bandwidth down, as xref_bw will take the larger values from D and E. 
+ */ + if (pos_bw > bw) { + xref_bw = min(ref_bw, bdi->old_ref_bandwidth); + xref_bw = min(xref_bw, bdi->reference_bandwidth); + if (xref_bw > bw) + delta = xref_bw - bw; + else + delta = 0; + } else { + xref_bw = max(ref_bw, bdi->reference_bandwidth); + xref_bw = max(xref_bw, bdi->reference_bandwidth); + if (xref_bw < bw) + delta = bw - xref_bw; + else + delta = 0; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. Also limit the step size in + * various ways to avoid overshooting. + */ + delta >>= bw / (2 * delta + 1); + delta = min(delta, (u64)abs64(pos_bw - bw)); + delta >>= 1; + delta = min(delta, bw / 8); + + if (pos_bw > bw) + bw += delta; + else + bw -= delta; + + bdi->throttle_bandwidth = bw; +out: + bdi_update_reference_bandwidth(bdi, ref_bw); + trace_throttle_bandwidth(bdi, dirty_bw, pos_bw, ref_bw); +} + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time) +{ + static DEFINE_SPINLOCK(dirty_lock); + unsigned long now = jiffies; + unsigned long elapsed; + unsigned long dirtied; + unsigned long written; + + if (!spin_trylock(&dirty_lock)) + return; + + elapsed = now - bdi->bw_time_stamp; + dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* skip quiet periods when disk bandwidth is under-utilized */ + if (elapsed > HZ/2 && + elapsed > now - start_time) + goto snapshot; + + /* + * rate-limit, only update once every 100ms. Demand higher threshold + * on the flusher so that the throttled task(s) can do most updates. + */ + if (!thresh && elapsed <= HZ/4) + goto unlock; + if (elapsed <= HZ/10) + goto unlock; + + if (thresh) { + update_dirty_limit(thresh, dirty); + bdi_update_dirty_threshold(bdi, thresh, dirty); + bdi_update_throttle_bandwidth(bdi, thresh, dirty, + bdi_dirty, dirtied, elapsed); + } + __bdi_update_write_bandwidth(bdi, elapsed, written); + if (thresh) { + bdi_update_dirty_smooth(bdi, bdi_dirty); + bdi_update_dirty_smooth(&default_backing_dev_info, dirty); + } + +snapshot: + bdi->dirtied_stamp = dirtied; + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +unlock: + spin_unlock(&dirty_lock); +} + +/* + * Limit pause time for small memory systems. If sleeping for too long time, + * the small pool of dirty/writeback pages may go empty and disk go idle. + */ +static unsigned long max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) +{ + unsigned long t; /* jiffies */ + + /* 1ms for every 1MB; may further consider bdi bandwidth */ + t = bdi_dirty >> (30 - PAGE_CACHE_SHIFT - ilog2(HZ)); + t += 2; + + return min_t(unsigned long, t, MAX_PAUSE); +} + +/* + * Scale up pause time for concurrent dirtiers in order to reduce CPU overheads. + * But ensure reasonably large [min_pause, max_pause] range size, so that + * nr_dirtied_pause (and hence future pause time) can stay reasonably stable. 
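For max_pause() above, a worked sketch assuming HZ = 100 and 4KB pages (so ilog2(HZ) = 6 and the shift is 30 - 12 - 6 = 12); max_pause_model() is an illustration-only reduction that ignores the possible bdi bandwidth refinement mentioned in the comment.

#include <stdio.h>

#define HZ              100
#define MAX_PAUSE       (HZ / 5)        /* 200ms, as in the patch */
#define PAUSE_SHIFT     (30 - 12 - 6)   /* PAGE_CACHE_SHIFT = 12, ilog2(HZ) = 6 */

/* model of max_pause(): scale the allowed sleep with the bdi dirty pool */
static unsigned long max_pause_model(unsigned long bdi_dirty)   /* pages */
{
        unsigned long t = (bdi_dirty >> PAUSE_SHIFT) + 2;

        return t < MAX_PAUSE ? t : MAX_PAUSE;
}

int main(void)
{
        /* a 10MB dirty pool only tolerates ~20ms sleeps, while a 1GB
         * pool is clamped to the global 200ms cap */
        printf("%lu jiffies\n", max_pause_model(2560));         /* 2  -> 20ms  */
        printf("%lu jiffies\n", max_pause_model(262144));       /* 20 -> 200ms */
        return 0;
}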
+ */ +static unsigned long min_pause(struct backing_dev_info *bdi, + unsigned long max) +{ + unsigned long hi = ilog2(bdi->write_bandwidth); + unsigned long lo = ilog2(bdi->throttle_bandwidth) - BASE_BW_SHIFT; + unsigned long t = 1 + max / 8; /* jiffies */ + + if (lo >= hi) + return t; + + /* (N * 10ms) on 2^N concurrent tasks */ + t += (hi - lo) * (10 * HZ) / 1024; + + return min(t, max / 2); +} + +/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. @@ -476,45 +1155,34 @@ unsigned long bdi_dirty_limit(struct bac * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, - unsigned long write_chunk) + unsigned long pages_dirtied) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirty; + unsigned long bdi_dirty; /* = file_dirty + writeback + unstable_nfs */ unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; - unsigned long pages_written = 0; - unsigned long pause = 1; - bool dirty_exceeded = false; + unsigned long bw; + unsigned long period; + unsigned long pause = 0; + unsigned long pause_max; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - + /* + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. - */ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) - break; - - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); - - /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. @@ -524,62 +1192,107 @@ static void balance_dirty_pages(struct a * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + if (bdi->dirty_threshold < 2*bdi_stat_error(bdi)) { + bdi_dirty = bdi_stat_sum(bdi, BDI_RECLAIMABLE) + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); } /* - * The bdi thresh is somehow "soft" limit derived from the - * global "hard" limit. The former helps to prevent heavy IO - * bdi or process from holding back light ones; The latter is - * the last resort safeguard. + * Throttle it only when the background writeback cannot + * catch-up. 
This avoids (excessively) small writeouts + * when the bdi limits are ramping up. */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); - - if (!dirty_exceeded) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; break; + } - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, start_time); - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. + if (unlikely(!writeback_in_progress(bdi))) + bdi_start_background_writeback(bdi); + + pause_max = max_pause(bdi, bdi_dirty); + + bw = dirty_throttle_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, current); + if (unlikely(bw == 0)) { + period = pause_max; + pause = pause_max; + goto pause; + } + period = (HZ * pages_dirtied + bw / 2) / bw; + pause = current->paused_when + period - jiffies; + /* + * Take it as long think time if pause falls into (-10s, 0). + * If it's less than 500ms (ext2 blocks the dirtier task for + * up to 400ms from time to time on 1-HDD; so does xfs, however + * at much less frequency), try to compensate it in future by + * updating the virtual time; otherwise just reset the time, as + * it may be a light dirtier. */ - trace_wbc_balance_dirty_start(&wbc, bdi); - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (unlikely(-pause < HZ*10)) { + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + bw, + pages_dirtied, + period, + pause, + start_time); + if (-pause > HZ/2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; + pause = 0; + } else if (period) { + current->paused_when += period; + current->nr_dirtied = 0; + pause = 1; + } else + current->nr_dirtied_pause <<= 1; + break; } - trace_wbc_balance_dirty_wait(&wbc, bdi); + if (pause > pause_max) + pause = pause_max; + +pause: + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + bw, + pages_dirtied, + period, + pause, + start_time); + current->paused_when = jiffies; __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + current->paused_when += pause; + current->nr_dirtied = 0; - /* - * Increase the delay for each loop, up to our previous - * default of taking a 100ms nap. - */ - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + if (nr_dirty < default_backing_dev_info.dirty_threshold + + default_backing_dev_info.dirty_threshold / DIRTY_MARGIN) + break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (pause == 0) + current->nr_dirtied_pause = + ratelimit_pages(nr_dirty, dirty_thresh); + else if (pause <= min_pause(bdi, pause_max)) + current->nr_dirtied_pause += current->nr_dirtied_pause / 32 + 1; + else if (pause >= pause_max) + /* + * when repeated, writing 1 page per 100ms on slow devices, + * i-(i+2)/4 will be able to reach 1 but never reduce to 0. 
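Two quick standalone checks of the arithmetic above, assuming HZ = 100 and 4KB pages: the pause period computed from the task's throttle bandwidth, and the nr_dirtied_pause shrink rule i -= (i+2)/4 described in the comment just above; all numbers are made up.

#include <stdio.h>

#define HZ      100

int main(void)
{
        /* pause length for one throttle round: a task allowed 2560 pages/s
         * (10MB/s) that has dirtied 256 pages sleeps for ~100ms */
        unsigned long bw = 2560, pages_dirtied = 256;
        unsigned long period = (HZ * pages_dirtied + bw / 2) / bw;
        int i;

        printf("period=%lu jiffies\n", period);         /* 10 -> 100ms */

        /* the shrink rule converges to 1 and never reaches 0 */
        for (i = 64; i > 0; i -= (i + 2) >> 2) {
                printf("%d ", i);
                if (i == 1)
                        break;
        }
        printf("\n");   /* 64 48 36 27 20 15 11 8 6 4 3 2 1 */
        return 0;
}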
+ */ + current->nr_dirtied_pause -= (current->nr_dirtied_pause+2) >> 2; if (writeback_in_progress(bdi)) return; @@ -592,8 +1305,10 @@ static void balance_dirty_pages(struct a * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + if (laptop_mode) + return; + + if (nr_reclaimable > background_thresh) bdi_start_background_writeback(bdi); } @@ -607,8 +1322,6 @@ void set_page_dirty_balance(struct page } } -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; - /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -618,36 +1331,35 @@ static DEFINE_PER_CPU(unsigned long, bdp * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid + * On really big machines, global_page_state() is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * limit we disable the ratelimiting, to prevent individual processes from + * overshooting the limit by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - unsigned long ratelimit; - unsigned long *p; + struct backing_dev_info *bdi = mapping->backing_dev_info; - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) - ratelimit = 8; + if (!bdi_cap_account_dirty(bdi)) + return; + + current->nr_dirtied += nr_pages_dirtied; + + if (dirty_exceeded_recently(bdi, MAX_PAUSE)) { + unsigned long max = current->nr_dirtied + + (128 >> (PAGE_SHIFT - 10)); + + if (current->nr_dirtied_pause > max) + current->nr_dirtied_pause = max; + } /* * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - preempt_disable(); - p = &__get_cpu_var(bdp_ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - ratelimit = sync_writeback_pages(*p); - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping, ratelimit); - return; - } - preempt_enable(); + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause)) + balance_dirty_pages(mapping, current->nr_dirtied); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); @@ -735,44 +1447,6 @@ void laptop_sync_completion(void) #endif /* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. 
- */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory @@ -794,9 +1468,6 @@ void __init page_writeback_init(void) { int shift; - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); prop_descriptor_init(&vm_dirties, shift); @@ -1120,6 +1791,7 @@ void account_page_dirtied(struct page *p __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1134,7 +1806,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1341,8 +2012,10 @@ int test_clear_page_writeback(struct pag } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } --- linux-next.orig/mm/backing-dev.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-03-02 10:45:58.000000000 +0800 @@ -87,20 +87,26 @@ static int bdi_debug_stats_show(struct s #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), K(dirty_thresh), K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -637,6 +643,11 @@ static void bdi_wb_init(struct bdi_write setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * initial write bandwidth: 50 MB/s + */ +#define INIT_BW (50 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -658,7 +669,17 @@ int bdi_init(struct backing_dev_info *bd goto err; } - bdi->dirty_exceeded = 0; + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_bandwidth = INIT_BW; 
+ bdi->throttle_bandwidth = (u64)INIT_BW << BASE_BW_SHIFT; + + bdi->avg_dirty = 0; + bdi->old_dirty = 0; + bdi->dirty_threshold = MIN_WRITEBACK_PAGES; + err = prop_local_init_percpu(&bdi->completions); if (err) { --- linux-next.orig/include/linux/writeback.h 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/include/linux/writeback.h 2011-03-02 10:45:58.000000000 +0800 @@ -12,6 +12,47 @@ struct backing_dev_info; extern spinlock_t inode_lock; /* + * The 1/4 region under the global dirty thresh is for elastic dirty throttling: + * + * (thresh - 2*thresh/DIRTY_SCOPE, thresh) + * + * The 1/32 region under the global dirty limit will be more rigidly throttled: + * + * (limit - limit/DIRTY_MARGIN, limit) + * + * The 1/32 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MARGIN) + * + * Further beyond, the dirtier task will enter a loop waiting (possibly long + * time) for the dirty pages to drop below (limit + limit/DIRTY_MARGIN). + * + * The last case may happen lightly when memory is very tight or at sudden + * workload rampup. Or under DoS situations such as a fork bomb where every new + * task dirties some more pages, or creating 10,000 tasks each writing to a USB + * key slowly in 4KB/s. + * + * The global dirty threshold is normally equal to global dirty limit, except + * when the system suddenly allocates a lot of anonymous memory and knocks down + * the global dirty threshold quickly, in which case the global dirty limit + * will follow down slowly to prevent livelocking all dirtier tasks. + */ +#define DIRTY_SCOPE 8 +#define DIRTY_MARGIN (DIRTY_SCOPE * 4) + +/* + * The base throttle bandwidth will be 1000 times smaller than write bandwidth + * when there are 100 concurrent heavy dirtiers. This shift can work with up to + * 40 bits dirty size and 2^16 concurrent dirtiers. + */ +#define BASE_BW_SHIFT 24 + +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* * fs/fs-writeback.c */ enum writeback_sync_modes { @@ -33,6 +74,7 @@ struct writeback_control { extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ + long per_file_limit; /* Write this many pages for one file */ long pages_skipped; /* Pages which were not written */ /* @@ -126,7 +168,23 @@ int dirty_writeback_centisecs_handler(st void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); + unsigned long dirty); + +void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator); +void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator); + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time); +static inline void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + bdi_update_bandwidth(bdi, 0, 0, 0, start_time); +} void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, --- linux-next.orig/mm/filemap.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/mm/filemap.c 2011-03-02 10:45:57.000000000 +0800 @@ -2253,6 +2253,7 @@ static ssize_t generic_perform_write(str long status = 0; ssize_t written = 0; unsigned int flags = 0; + unsigned int dirty; /* * Copies from kernel address space cannot fail (NFSD is a big user). 
@@ -2301,6 +2302,7 @@ again: pagefault_enable(); flush_dcache_page(page); + dirty = PageDirty(page); mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); @@ -2327,7 +2329,8 @@ again: pos += copied; written += copied; - balance_dirty_pages_ratelimited(mapping); + if (!dirty) + balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); --- linux-next.orig/include/linux/sched.h 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/include/linux/sched.h 2011-03-02 10:45:57.000000000 +0800 @@ -1487,6 +1487,14 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + /* + * when (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for some dirty throttling pause + */ + int nr_dirtied; + int nr_dirtied_pause; + unsigned long paused_when; /* start of a write-and-pause period */ + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; --- linux-next.orig/mm/memory_hotplug.c 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/mm/memory_hotplug.c 2011-03-02 10:45:57.000000000 +0800 @@ -468,8 +468,6 @@ int online_pages(unsigned long pfn, unsi vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); - if (onlined_pages) memory_notify(MEM_ONLINE, &arg); unlock_memory_hotplug(); @@ -901,7 +899,6 @@ repeat: } vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); unlock_memory_hotplug(); --- linux-next.orig/include/linux/backing-dev.h 2011-03-02 10:45:47.000000000 +0800 +++ linux-next/include/linux/backing-dev.h 2011-03-02 10:45:58.000000000 +0800 @@ -39,7 +39,9 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, + BDI_DIRTIED, BDI_WRITEBACK, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; @@ -73,8 +75,25 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; + unsigned long dirtied_stamp; + unsigned long written_stamp; + unsigned long write_bandwidth; + unsigned long avg_bandwidth; + unsigned long long throttle_bandwidth; + unsigned long long reference_bandwidth; + unsigned long long old_ref_bandwidth; + unsigned long avg_dirty; + unsigned long old_dirty; + unsigned long dirty_threshold; + unsigned long old_dirty_threshold; + struct prop_local_percpu completions; - int dirty_exceeded; + + /* last time exceeded (limit - limit/DIRTY_MARGIN) */ + unsigned long dirty_exceed_time; + /* last time dropped below (background_thresh + dirty_thresh) / 2 */ + unsigned long dirty_free_run; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; --- linux-next.orig/fs/fs-writeback.c 2011-03-02 10:45:48.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-03-02 10:45:58.000000000 +0800 @@ -330,6 +330,8 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long per_file_limit = wbc->per_file_limit; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; @@ -349,7 +351,8 @@ writeback_single_inode(struct inode *ino */ if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); - return 0; + ret = 0; + goto out; } /* @@ -365,8 +368,14 @@ writeback_single_inode(struct inode *ino inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); + if (per_file_limit) + wbc->nr_to_write = per_file_limit; + ret = do_writepages(mapping, wbc); + if (per_file_limit) + wbc->nr_to_write += nr_to_write - per_file_limit; + /* * Make sure to wait on 
the data before writing out the metadata. * This is important for filesystems that modify metadata on data @@ -436,6 +445,9 @@ writeback_single_inode(struct inode *ino } } inode_sync_complete(inode); +out: + trace_writeback_single_inode(inode, wbc, + nr_to_write - wbc->nr_to_write); return ret; } @@ -584,15 +596,6 @@ static void __writeback_inodes_sb(struct spin_unlock(&inode_lock); } -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - static inline bool over_bground_thresh(void) { unsigned long background_thresh, dirty_thresh; @@ -604,6 +607,39 @@ static inline bool over_bground_thresh(v } /* + * Give each inode a nr_to_write that can complete within 1 second. + */ +static unsigned long writeback_chunk_size(struct backing_dev_info *bdi, + int sync_mode) +{ + unsigned long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (sync_mode == WB_SYNC_ALL) + return LONG_MAX; + + pages = min(bdi->avg_bandwidth, + bdi->dirty_threshold / DIRTY_SCOPE); + + if (pages <= MIN_WRITEBACK_PAGES) + return MIN_WRITEBACK_PAGES; + + return rounddown_pow_of_two(pages); +} + +/* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the @@ -643,25 +679,9 @@ static long wb_writeback(struct bdi_writ wbc.range_end = LLONG_MAX; } - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * __writeback_inodes_sb() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else - write_chunk = LONG_MAX; - wbc.wb_start = jiffies; /* livelock avoidance */ + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -687,7 +707,9 @@ static long wb_writeback(struct bdi_writ break; wbc.more_io = 0; + write_chunk = writeback_chunk_size(wb->bdi, wbc.sync_mode); wbc.nr_to_write = write_chunk; + wbc.per_file_limit = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -697,6 +719,8 @@ static long wb_writeback(struct bdi_writ writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; @@ -720,6 +744,12 @@ static long wb_writeback(struct bdi_writ * become available for writeback. Otherwise * we'll just busyloop. 
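The writeback_chunk_size() helper above sizes nr_to_write so that one inode's writeout completes in roughly one second. A standalone sketch, assuming 4KB pages (MIN_WRITEBACK_PAGES = 1024) and illustration-only helper names.

#include <stdio.h>

#define DIRTY_SCOPE             8
#define MIN_WRITEBACK_PAGES     1024UL  /* 4MB with 4KB pages */

static unsigned long rounddown_pow_of_two_model(unsigned long v)
{
        unsigned long r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

/* model of writeback_chunk_size() for WB_SYNC_NONE writeback */
static unsigned long chunk_size(unsigned long avg_bw,           /* pages/s */
                                unsigned long dirty_thresh)     /* pages   */
{
        unsigned long pages = avg_bw < dirty_thresh / DIRTY_SCOPE ?
                              avg_bw : dirty_thresh / DIRTY_SCOPE;

        if (pages <= MIN_WRITEBACK_PAGES)
                return MIN_WRITEBACK_PAGES;
        return rounddown_pow_of_two_model(pages);
}

int main(void)
{
        /* ~100MB/s disk with a 256MB bdi threshold gets 8192-page (32MB)
         * chunks; a slow USB key falls back to the 4MB minimum */
        printf("%lu\n", chunk_size(25600, 65536));      /* 8192 */
        printf("%lu\n", chunk_size(200, 65536));        /* 1024 */
        return 0;
}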
*/ + if (list_empty(&wb->b_more_io)) { + trace_wbc_writeback_wait(&wbc, wb->bdi); + __set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(max(HZ/100, 1)); + continue; + } spin_lock(&inode_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); --- linux-next.orig/include/trace/events/writeback.h 2011-03-02 10:45:45.000000000 +0800 +++ linux-next/include/trace/events/writeback.h 2011-03-02 10:45:58.000000000 +0800 @@ -7,9 +7,23 @@ #include #include #include +#include struct wb_writeback_work; +#define show_inode_state(state) \ + __print_flags(state, "|", \ + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ + {I_NEW, "I_NEW"}, \ + {I_WILL_FREE, "I_WILL_FREE"}, \ + {I_FREEING, "I_FREEING"}, \ + {I_CLEAR, "I_CLEAR"}, \ + {I_SYNC, "I_SYNC"}, \ + {I_REFERENCED, "I_REFERENCED"} \ + ) + DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), TP_ARGS(bdi, work), @@ -147,11 +161,238 @@ DEFINE_EVENT(wbc_class, name, \ DEFINE_WBC_EVENT(wbc_writeback_start); DEFINE_WBC_EVENT(wbc_writeback_written); DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +TRACE_EVENT(writeback_single_inode, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long wrote + ), + + TP_ARGS(inode, wbc, wrote), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned long, state) + __field(unsigned long, age) + __field(unsigned long, wrote) + __field(long, nr_to_write) + __field(unsigned long, writeback_index) + ), + + TP_fast_assign( + strncpy(__entry->name, + dev_name(inode->i_mapping->backing_dev_info->dev), 32); + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->age = (jiffies - inode->dirtied_when) * + 1000 / HZ; + __entry->wrote = wrote; + __entry->nr_to_write = wbc->nr_to_write; + __entry->writeback_index = inode->i_mapping->writeback_index; + ), + + TP_printk("bdi %s: ino=%lu state=%s age=%lu " + "wrote=%lu to_write=%ld index=%lu", + __entry->name, + __entry->ino, + show_inode_state(__entry->state), + __entry->age, + __entry->wrote, + __entry->nr_to_write, + __entry->writeback_index + ) +); + +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, poll_thresh) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->poll_thresh = current->nr_dirtied_pause; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu gap=%ld poll=%ld " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + 
__entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_thresh - __entry->nr_dirty - + __entry->nr_writeback - __entry->nr_unstable, + __entry->poll_thresh, + __entry->nr_dirtied, + __entry->nr_written + ) +); + +#define KBps(x) ((x) << (PAGE_SHIFT - 10)) +#define Bps(x) ((x) >> (BASE_BW_SHIFT - PAGE_SHIFT)) + +TRACE_EVENT(throttle_bandwidth, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long dirty_bw, + unsigned long long pos_bw, + unsigned long long ref_bw), + + TP_ARGS(bdi, dirty_bw, pos_bw, ref_bw), + + TP_STRUCT__entry( + __array(char, bdi, 32) + __field(unsigned long, write_bw) + __field(unsigned long, avg_bw) + __field(unsigned long, dirty_bw) + __field(unsigned long long, base_bw) + __field(unsigned long long, pos_bw) + __field(unsigned long long, ref_bw) + __field(unsigned long long, avg_ref_bw) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + __entry->write_bw = KBps(bdi->write_bandwidth); + __entry->avg_bw = KBps(bdi->avg_bandwidth); + __entry->dirty_bw = KBps(dirty_bw); + __entry->base_bw = Bps(bdi->throttle_bandwidth); + __entry->pos_bw = Bps(pos_bw); + __entry->ref_bw = Bps(ref_bw); + __entry->avg_ref_bw = Bps(bdi->reference_bandwidth); + ), + + + TP_printk("bdi %s: " + "write_bw=%lu avg_bw=%lu dirty_bw=%lu " + "base_bw=%llu pos_bw=%llu ref_bw=%llu aref_bw=%lu", + __entry->bdi, + __entry->write_bw, /* bdi write bandwidth */ + __entry->avg_bw, /* bdi avg write bandwidth */ + __entry->dirty_bw, /* bdi dirty bandwidth */ + __entry->base_bw, /* base throttle bandwidth */ + __entry->pos_bw, /* position control bandwidth */ + __entry->ref_bw, /* reference throttle bandwidth */ + __entry->avg_ref_bw /* smoothed reference bandwidth */ + ) +); + + +TRACE_EVENT(balance_dirty_pages, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long task_bw, + unsigned long dirtied, + unsigned long period, + long pause, + unsigned long start_time), + + TP_ARGS(bdi, thresh, dirty, bdi_dirty, + task_bw, dirtied, period, pause, start_time), + + TP_STRUCT__entry( + __array( char, bdi, 32) + __field(unsigned long, bdi_weight) + __field(unsigned long, task_weight) + __field(unsigned long, limit) + __field(unsigned long, goal) + __field(unsigned long, dirty) + __field(unsigned long, bdi_goal) + __field(unsigned long, bdi_dirty) + __field(unsigned long, avg_dirty) + __field(unsigned long, base_bw) + __field(unsigned long, task_bw) + __field(unsigned long, dirtied) + __field(unsigned long, period) + __field( long, think) + __field( long, pause) + __field(unsigned long, paused) + ), + + TP_fast_assign( + long numerator; + long denominator; + + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + + bdi_writeout_fraction(bdi, &numerator, &denominator); + __entry->bdi_weight = 1000 * numerator / denominator; + task_dirties_fraction(current, &numerator, &denominator); + __entry->task_weight = 1000 * numerator / denominator; + + __entry->limit = default_backing_dev_info.dirty_threshold; + __entry->goal = thresh - thresh / DIRTY_SCOPE; + __entry->dirty = dirty; + __entry->bdi_goal = bdi->dirty_threshold - + bdi->dirty_threshold / DIRTY_SCOPE; + __entry->bdi_dirty = bdi_dirty; + __entry->avg_dirty = bdi->avg_dirty; + __entry->base_bw = KBps(bdi->throttle_bandwidth) >> + BASE_BW_SHIFT; + __entry->task_bw = KBps(task_bw); + __entry->dirtied = dirtied; + __entry->think = current->paused_when == 0 ? 
0 : + (long)(jiffies - current->paused_when) * 1000 / HZ; + __entry->period = period * 1000 / HZ; + __entry->pause = pause * 1000 / HZ; + __entry->paused = (jiffies - start_time) * 1000 / HZ; + ), + + + TP_printk("bdi %s: bdi_weight=%lu task_weight=%lu " + "limit=%lu goal=%lu dirty=%lu " + "bdi_goal=%lu bdi_dirty=%lu avg_dirty=%lu " + "base_bw=%lu task_bw=%lu " + "dirtied=%lu " + "period=%lu think=%ld pause=%ld paused=%lu", + __entry->bdi, + __entry->bdi_weight, + __entry->task_weight, + __entry->limit, + __entry->goal, + __entry->dirty, + __entry->bdi_goal, + __entry->bdi_dirty, + __entry->avg_dirty, + __entry->base_bw, /* base throttle bandwidth */ + __entry->task_bw, /* task throttle bandwidth */ + __entry->dirtied, + __entry->period, /* ms */ + __entry->think, /* ms */ + __entry->pause, /* ms */ + __entry->paused /* ms */ + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), --- linux-next.orig/fs/btrfs/file.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/file.c 2011-03-02 10:45:57.000000000 +0800 @@ -770,7 +770,8 @@ out: static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, loff_t pos, unsigned long first_index, - unsigned long last_index, size_t write_bytes) + unsigned long last_index, size_t write_bytes, + int *nr_dirtied) { struct extent_state *cached_state = NULL; int i; @@ -837,7 +838,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (!clear_page_dirty_for_io(pages[i])) + (*nr_dirtied)++; set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } @@ -947,9 +949,9 @@ static ssize_t btrfs_file_aio_write(stru } iov_iter_init(&i, iov, nr_segs, count, num_written); - nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(&i), PAGE_CACHE_SIZE), + min(PAGE_CACHE_SIZE / sizeof(struct page *), + current->nr_dirtied_pause)); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; @@ -989,6 +991,7 @@ static ssize_t btrfs_file_aio_write(stru } while (iov_iter_count(&i) > 0) { + int nr_dirtied = 0; size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), nrptrs * (size_t)PAGE_CACHE_SIZE - @@ -1015,7 +1018,7 @@ static ssize_t btrfs_file_aio_write(stru ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, - write_bytes); + write_bytes, &nr_dirtied); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); @@ -1050,7 +1053,7 @@ static ssize_t btrfs_file_aio_write(stru } else { balance_dirty_pages_ratelimited_nr( inode->i_mapping, - dirty_pages); + nr_dirtied); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); --- linux-next.orig/fs/btrfs/ioctl.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/ioctl.c 2011-03-02 10:45:57.000000000 +0800 @@ -654,6 +654,7 @@ static int btrfs_defrag_file(struct file u64 skip = 0; u64 defrag_end = 0; unsigned long i; + int dirtied; int ret; int compress_type = BTRFS_COMPRESS_ZLIB; @@ -766,7 +767,7 @@ again: btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); ClearPageChecked(page); - set_page_dirty(page); + dirtied = set_page_dirty(page); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); loop_unlock: @@ -774,7 +775,8 @@ loop_unlock: page_cache_release(page); 
mutex_unlock(&inode->i_mutex); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + if (dirtied) + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } --- linux-next.orig/fs/btrfs/relocation.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/relocation.c 2011-03-02 10:45:57.000000000 +0800 @@ -2902,6 +2902,7 @@ static int relocate_file_extent_cluster( struct file_ra_state *ra; int nr = 0; int ret = 0; + int dirtied; if (!cluster->nr) return 0; @@ -2978,7 +2979,7 @@ static int relocate_file_extent_cluster( } btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); - set_page_dirty(page); + dirtied = set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); @@ -2986,7 +2987,8 @@ static int relocate_file_extent_cluster( page_cache_release(page); index++; - balance_dirty_pages_ratelimited(inode->i_mapping); + if (dirtied) + balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); --- linux-next.orig/fs/btrfs/disk-io.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/btrfs/disk-io.c 2011-03-02 10:45:57.000000000 +0800 @@ -616,6 +616,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_ extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; + int limit; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) @@ -643,6 +644,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_ btrfs_queue_worker(&fs_info->workers, &async->work); + limit = btrfs_async_submit_limit(fs_info); + + if (atomic_read(&fs_info->nr_async_bios) > limit) + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit)); + while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, --- linux-next.orig/fs/nfs/file.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/file.c 2011-03-02 10:45:57.000000000 +0800 @@ -392,15 +392,6 @@ static int nfs_write_begin(struct file * IOMODE_RW); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; --- linux-next.orig/fs/nfs/write.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/write.c 2011-03-02 10:45:58.000000000 +0800 @@ -29,6 +29,9 @@ #include "nfs4_fs.h" #include "fscache.h" +#define CREATE_TRACE_POINTS +#include + #define NFSDBG_FACILITY NFSDBG_PAGECACHE #define MIN_POOL_WRITE (32) @@ -185,11 +188,68 @@ static int wb_priority(struct writeback_ * NFS congestion control */ +#define NFS_WAIT_PAGES (1024L >> (PAGE_SHIFT - 10)) int nfs_congestion_kb; -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) +/* + * SYNC requests will block on (2*limit) and wakeup on (2*limit-NFS_WAIT_PAGES) + * ASYNC requests will block on (limit) and wakeup on (limit - NFS_WAIT_PAGES) + * In this way SYNC writes will never be blocked by ASYNC ones. 
+ */ + +static void nfs_set_congested(long nr, struct backing_dev_info *bdi) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr > limit && !test_bit(BDI_async_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_ASYNC); + else if (nr > 2 * limit && !test_bit(BDI_sync_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_SYNC); +} + +static void nfs_wait_contested(int is_sync, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + int waitbit = is_sync ? BDI_sync_congested : BDI_async_congested; + DEFINE_WAIT(wait); + + if (!test_bit(waitbit, &bdi->state)) + return; + + for (;;) { + prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(waitbit, &bdi->state)) + break; + + io_schedule(); + } + finish_wait(&wqh[is_sync], &wait); +} + +static void nfs_wakeup_congested(long nr, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr < 2 * limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_sync_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_SYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_SYNC])) + wake_up(&wqh[BLK_RW_SYNC]); + } + if (nr < limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_async_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_ASYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_ASYNC])) + wake_up(&wqh[BLK_RW_ASYNC]); + } +} static int nfs_set_page_writeback(struct page *page) { @@ -200,11 +260,8 @@ static int nfs_set_page_writeback(struct struct nfs_server *nfss = NFS_SERVER(inode); page_cache_get(page); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + nfs_set_congested(atomic_long_inc_return(&nfss->writeback), + &nfss->backing_dev_info); } return ret; } @@ -216,8 +273,10 @@ static void nfs_end_page_writeback(struc end_page_writeback(page); page_cache_release(page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_wakeup_congested(atomic_long_dec_return(&nfss->writeback), + &nfss->backing_dev_info, + nfss->writeback_wait); } static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) @@ -318,45 +377,49 @@ static int nfs_writepage_locked(struct p int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_writepage_locked(page, wbc); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_do_writepage(page, wbc, data); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = 
wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; @@ -1244,7 +1307,7 @@ static void nfs_commitdata_release(void */ static int nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, - int how) + int how, pgoff_t offset, pgoff_t count) { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->path.dentry->d_inode; @@ -1276,8 +1339,8 @@ static int nfs_commit_rpcsetup(struct li data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ - data->args.offset = 0; - data->args.count = 0; + data->args.offset = offset; + data->args.count = count; data->args.context = get_nfs_open_context(first->wb_context); data->res.count = 0; data->res.fattr = &data->fattr; @@ -1300,7 +1363,8 @@ static int nfs_commit_rpcsetup(struct li * Commit dirty pages */ static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, + pgoff_t offset, pgoff_t count) { struct nfs_write_data *data; struct nfs_page *req; @@ -1311,7 +1375,7 @@ nfs_commit_list(struct inode *inode, str goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + return nfs_commit_rpcsetup(head, data, how, offset, count); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -1379,6 +1443,9 @@ static void nfs_commit_release(void *cal nfs_clear_page_tag_locked(req); } nfs_commit_clear_lock(NFS_I(data->inode)); + trace_nfs_commit_release(data->inode, + data->args.offset, + data->args.count); nfs_commitdata_release(calldata); } @@ -1393,6 +1460,8 @@ static const struct rpc_call_ops nfs_com int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); + pgoff_t first_index; + pgoff_t last_index; int may_wait = how & FLUSH_SYNC; int res = 0; @@ -1400,9 +1469,14 @@ int nfs_commit_inode(struct inode *inode goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); + if (res) { + first_index = nfs_list_entry(head.next)->wb_index; + last_index = nfs_list_entry(head.prev)->wb_index; + } spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error = nfs_commit_list(inode, &head, how, first_index, + last_index - first_index + 1); if (error < 0) return error; if (may_wait) @@ -1432,9 +1506,10 @@ static int nfs_commit_unstable_pages(str if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there - * are a lot of outstanding writes for this mapping. + * are a lot of outstanding writes for this mapping, until + * collected enough pages to commit. 
*/ - if (nfsi->ncommit <= (nfsi->npages >> 1)) + if (nfsi->ncommit <= nfsi->npages / DIRTY_MARGIN) goto out_mark_dirty; /* don't wait for the COMMIT response */ @@ -1443,17 +1518,15 @@ static int nfs_commit_unstable_pages(str ret = nfs_commit_inode(inode, flags); if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; + wbc->nr_to_write -= ret; + goto out; } + out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; +out: + trace_nfs_commit_unstable_pages(inode, wbc, flags, ret); + return ret >= 0 ? 0 : ret; } #else static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) @@ -1582,6 +1655,9 @@ out: int __init nfs_init_writepagecache(void) { + unsigned long background_thresh; + unsigned long dirty_thresh; + nfs_wdata_cachep = kmem_cache_create("nfs_write_data", sizeof(struct nfs_write_data), 0, SLAB_HWCACHE_ALIGN, @@ -1619,6 +1695,16 @@ int __init nfs_init_writepagecache(void) if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; + /* + * Limit to 1/8 dirty threshold, so that writeback+in_commit pages + * won't overnumber dirty+to_commit pages. + */ + global_dirty_limits(&background_thresh, &dirty_thresh); + dirty_thresh <<= PAGE_SHIFT - 10; + + if (nfs_congestion_kb > dirty_thresh / 8) + nfs_congestion_kb = dirty_thresh / 8; + return 0; } --- linux-next.orig/include/linux/nfs_fs.h 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/include/linux/nfs_fs.h 2011-03-02 10:45:57.000000000 +0800 @@ -215,7 +215,6 @@ struct nfs_inode { #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ --- linux-next.orig/include/linux/nfs_fs_sb.h 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/include/linux/nfs_fs_sb.h 2011-03-02 10:45:57.000000000 +0800 @@ -102,6 +102,7 @@ struct nfs_server { struct nfs_iostats __percpu *io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ + wait_queue_head_t writeback_wait[2]; int flags; /* various flags */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ --- linux-next.orig/fs/nfs/client.c 2011-03-02 10:45:44.000000000 +0800 +++ linux-next/fs/nfs/client.c 2011-03-02 10:45:57.000000000 +0800 @@ -1042,6 +1042,8 @@ static struct nfs_server *nfs_alloc_serv INIT_LIST_HEAD(&server->delegations); atomic_set(&server->active, 0); + init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]); + init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-next/include/trace/events/nfs.h 2011-03-02 10:45:58.000000000 +0800 @@ -0,0 +1,88 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include + + +TRACE_EVENT(nfs_commit_unstable_pages, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + int sync, + int ret + ), + + TP_ARGS(inode, wbc, sync, ret), + + TP_STRUCT__entry( + __array(char, name, 32) + 
__field(unsigned long, ino)
+	__field(unsigned long, npages)
+	__field(unsigned long, to_commit)
+	__field(unsigned long, write_chunk)
+	__field(int, sync)
+	__field(int, ret)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino = inode->i_ino;
+		__entry->npages = NFS_I(inode)->npages;
+		__entry->to_commit = NFS_I(inode)->ncommit;
+		__entry->write_chunk = wbc->per_file_limit;
+		__entry->sync = sync;
+		__entry->ret = ret;
+	),
+
+	TP_printk("bdi %s: ino=%lu npages=%lu tocommit=%lu "
+		  "write_chunk=%lu sync=%d ret=%d",
+		  __entry->name,
+		  __entry->ino,
+		  __entry->npages,
+		  __entry->to_commit,
+		  __entry->write_chunk,
+		  __entry->sync,
+		  __entry->ret
+	)
+);
+
+TRACE_EVENT(nfs_commit_release,
+
+	TP_PROTO(struct inode *inode,
+		 unsigned long offset,
+		 unsigned long len),
+
+	TP_ARGS(inode, offset, len),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, offset)
+		__field(unsigned long, len)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino = inode->i_ino;
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+
+	TP_printk("bdi %s: ino=%lu offset=%lu len=%lu",
+		  __entry->name,
+		  __entry->ino,
+		  __entry->offset,
+		  __entry->len
+	)
+);
+
+
+#endif /* _TRACE_NFS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
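
A note on the NFS congestion thresholds added in fs/nfs/write.c above: nfs_set_congested() marks the bdi async-congested once the per-server writeback page count exceeds limit (nfs_congestion_kb converted to pages) and sync-congested above 2*limit, while nfs_wakeup_congested() clears each state a little below its set point, so WB_SYNC_ALL writers are only ever throttled after WB_SYNC_NONE ones. The following is a minimal userspace sketch of that threshold arithmetic only, not kernel code; congestion_thresholds(), min_long() and the 4k PAGE_SHIFT / sample nfs_congestion_kb values are illustrative assumptions:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumes 4k pages */
#define NFS_WAIT_PAGES	(1024L >> (PAGE_SHIFT - 10))	/* 1MB worth of pages */

struct thresholds {
	long async_block, async_wake;	/* WB_SYNC_NONE writers */
	long sync_block, sync_wake;	/* WB_SYNC_ALL writers */
};

static long min_long(long a, long b)
{
	return a < b ? a : b;
}

/*
 * Mirror of the block/wakeup points used by nfs_set_congested() and
 * nfs_wakeup_congested(): block above limit (async) or 2*limit (sync),
 * wake up min(limit/8, NFS_WAIT_PAGES) pages below each block point.
 */
static struct thresholds congestion_thresholds(long nfs_congestion_kb)
{
	long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10);	/* KB -> pages */
	long gap = min_long(limit / 8, NFS_WAIT_PAGES);

	return (struct thresholds) {
		.async_block	= limit,
		.async_wake	= limit - gap,
		.sync_block	= 2 * limit,
		.sync_wake	= 2 * limit - gap,
	};
}

int main(void)
{
	struct thresholds t = congestion_thresholds(64 * 1024);	/* 64MB sample */

	printf("async: block above %ld pages, wake below %ld\n",
	       t.async_block, t.async_wake);
	printf("sync:  block above %ld pages, wake below %ld\n",
	       t.sync_block, t.sync_wake);
	return 0;
}

With numbers like these, background writeback stalls well before a data-integrity flush does, which is what the "SYNC writes will never be blocked by ASYNC ones" comment in the hunk is promising.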
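
Along the same lines, the WB_SYNC_NONE path of nfs_commit_unstable_pages() now defers the COMMIT until the inode has accumulated a meaningful share of to-commit pages (the nfsi->ncommit <= nfsi->npages / DIRTY_MARGIN test). A small sketch of that check follows; DIRTY_MARGIN comes from elsewhere in this series, so the value used here is only a placeholder:

#include <stdbool.h>
#include <stdio.h>

#define DIRTY_MARGIN	8	/* placeholder; the real value is defined elsewhere in the series */

/*
 * Mirror of the batching test: skip the COMMIT on a non-blocking flush
 * until the pages awaiting commit exceed npages / DIRTY_MARGIN.
 */
static bool commit_worthwhile(unsigned long ncommit, unsigned long npages)
{
	return ncommit > npages / DIRTY_MARGIN;
}

int main(void)
{
	printf("%d\n", commit_worthwhile(100, 1000));	/* 100 <= 125 -> 0, keep collecting */
	printf("%d\n", commit_worthwhile(200, 1000));	/* 200 >  125 -> 1, send the COMMIT  */
	return 0;
}

Per the updated comment in that hunk, the point is simply not to fire a COMMIT for a handful of pages on every non-blocking flush.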