--- linux-next.orig/mm/page-writeback.c 2011-04-13 17:18:06.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-04-15 13:48:58.000000000 +0800 @@ -37,24 +37,11 @@ #include /* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. + * Sleep at most 200ms at a time in balance_dirty_pages(). */ -static long ratelimit_pages = 32; +#define MAX_PAUSE max(HZ/5, 1) -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than dirtied pages to ensure that reasonably - * large amounts of I/O are submitted. - */ -static inline long sync_writeback_pages(unsigned long dirtied) -{ - if (dirtied < ratelimit_pages) - dirtied = ratelimit_pages; - - return dirtied + dirtied / 2; -} +#define RATIO_SHIFT 10 /* The following parameters are exported via /proc/sys/vm */ @@ -219,6 +206,7 @@ int dirty_bytes_handler(struct ctl_table */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } @@ -244,53 +232,11 @@ void task_dirty_inc(struct task_struct * static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, - numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } -} - -static inline void task_dirties_fraction(struct task_struct *tsk, - long *numerator, long *denominator) -{ - prop_fraction_single(&vm_dirties, &tsk->dirties, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); } /* - * task_dirty_limit - scale down dirty throttling threshold for one task - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - * - * To protect light/slow dirtying tasks from heavier/fast ones, we start - * throttling individual tasks before reaching the bdi dirty limit. - * Relatively low thresholds will be allocated to heavy dirtiers. So when - * dirty pages grow large, heavy dirtiers will be throttled first, which will - * effectively curb the growth of dirty pages. Light dirtiers with high enough - * dirty threshold may never get throttled. - */ -static unsigned long task_dirty_limit(struct task_struct *tsk, - unsigned long bdi_dirty) -{ - long numerator, denominator; - unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - - return max(dirty, bdi_dirty/2); -} - -/* * */ static unsigned int bdi_min_ratio; @@ -397,14 +343,18 @@ unsigned long determine_dirtyable_memory return x + 1; /* Ensure that we never return 0 */ } +static unsigned long hard_dirty_limit(unsigned long thresh) +{ + return max(thresh + thresh / DIRTY_BRAKE, + default_backing_dev_info.dirty_threshold); +} + /* * global_dirty_limits - background-writeback and dirty-throttling thresholds * * Calculate the dirty thresholds based on sysctl parameters * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. 
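To make the threshold arithmetic concrete, here is a minimal standalone C sketch (not kernel code) of what global_dirty_limits() below ends up computing, assuming DIRTY_SCOPE = 8 / DIRTY_FULL_SCOPE = 4 from this patch and made-up memory and sysctl numbers. It shows how the enforced 1/4 gap keeps the point where throttling starts, (background + dirty)/2, at or below the global setpoint.

#include <stdio.h>

#define DIRTY_SCOPE             8
#define DIRTY_FULL_SCOPE        (DIRTY_SCOPE / 2)

int main(void)
{
        unsigned long available_memory = 1000000;       /* pages, hypothetical */
        unsigned long dirty_ratio = 20;                 /* vm.dirty_ratio */
        unsigned long dirty_background_ratio = 18;      /* deliberately too close */
        unsigned long dirty, background;

        dirty = dirty_ratio * available_memory / 100;
        background = dirty_background_ratio * available_memory / 100;

        /* keep at least a dirty/4 gap below the dirty threshold */
        if (background > dirty - dirty / DIRTY_FULL_SCOPE)
                background = dirty - dirty / DIRTY_FULL_SCOPE;

        printf("dirty thresh       = %lu pages\n", dirty);
        printf("background thresh  = %lu pages\n", background);
        printf("throttling starts  = %lu pages\n", (background + dirty) / 2);
        printf("global setpoint    = %lu pages\n", dirty - dirty / DIRTY_SCOPE);
        return 0;
}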
*/ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { @@ -426,21 +376,31 @@ void global_dirty_limits(unsigned long * else background = (dirty_background_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + /* + * Ensure at least 1/4 gap between background and dirty thresholds, so + * that when dirty throttling starts at (background + dirty)/2, it's + * below or at the entrance of the soft dirty throttle scope. + */ + if (background > dirty - dirty / DIRTY_FULL_SCOPE) + background = dirty - dirty / DIRTY_FULL_SCOPE; + tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * And the "limit" in the name is not seriously taken as hard limit in + * balance_dirty_pages(). * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * @@ -469,6 +429,643 @@ unsigned long bdi_dirty_limit(struct bac } /* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * will look to see if it needs to start dirty throttling. + * + * If ratelimit_pages is too low then big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long ratelimit_pages(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* + * last time exceeded (limit - limit/DIRTY_BRAKE) + */ +static bool dirty_exceeded_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_exceed_time <= time_window; +} + +/* + * last time dropped below (thresh - 2*thresh/DIRTY_SCOPE + thresh/DIRTY_RAMPUP) + */ +static bool dirty_free_run_recently(struct backing_dev_info *bdi, + unsigned long time_window) +{ + return jiffies - bdi->dirty_free_run <= time_window; +} + +/* + * Position based bandwidth control. + * + * (1) boundary guarding areas + * + * The loop area is required to stop large number of slow dirtiers, because + * the max-pause area is only able to throttle a task at 1page/200ms=20KB/s. + * + * The pass-good region can stop a slow UKEY with 100+ slow dirtiers, while + * still avoid looping for the other good disk, so that their performance won't + * be impacted. + * + * The max-pause area can safeguard unknown bugs in the control algorithms + * as well as the possible surges in small memory boxes. + * + * The brake area is a good leeway for holding off the dirty pages in sudden + * workload change, or when some bdi dirty goal is excessively exceeded. + * + * The loop, pass-good and max-pause areas are enforced inside the loop of + * balance_dirty_pages(). Others can be found in bdi_position_ratio(). 
+ * + * loop area, loop until drop below the area -----------------------|<=== + * pass-good area, dirty exceeded bdi's will loop -----------------|<===>| + * max-pause area, sleep(max_pause) and return -----------|<===>| + * brake area, bw scaled from 1 down to 0 ---|<=====>| + * ----------------------------------------------------o-------o-----o-----o---- + * ^ ^ ^ ^ + * limit - limit/DIRTY_BRAKE ---' | | | + * limit -----------' | | + * limit + limit/DIRTY_MAXPAUSE -----------------' | + * limit + limit/DIRTY_PASSGOOD -----------------------' + * + * (2) global control areas + * + * The rampup area is for ramping up the base bandwidth whereas the above brake + * area is for scaling down the base bandwidth. + * + * The global thresh typically lies at the bottom of the brake area. @thresh + * is real-time computed from global_dirty_limits() and @limit is tracking + * (thresh + thresh/DIRTY_BRAKE) at 200ms intervals in update_dirty_limit(). + * + *rampup area setpoint/goal + *|<=======>| v + * |-------------------------------*-------------------------------|------------ + * ^ ^ ^ + * thresh - 2*thresh/DIRTY_SCOPE thresh - thresh/DIRTY_SCOPE thresh + * + * (3) bdi control areas + * + * The bdi reserve area tries to keep a reasonable number of dirty pages for + * preventing block queue underrun. + * + * reserve area, scale up bw as dirty pages drop low bdi_setpoint + * |<=============================================>| v + * |-------------------------------------------------------*-------|---------- + * 0 bdi_thresh - bdi_thresh/DIRTY_SCOPE^ ^bdi_thresh + * + * (4) global/bdi control lines + * + * bdi_position_ratio() applies 2 main and 3 regional control lines for + * scaling up/down the base bandwidth based on the position of dirty pages. + * + * The two main control lines for the global/bdi control scopes do not end at + * thresh/bdi_thresh. They are centered at setpoint/bdi_setpoint and cover the + * whole [0, limit]. If the control line drops below 0 before reaching @limit, + * an auxiliary line will be setup to connect them. The below figure illustrates + * the main bdi control line with an auxiliary line extending it to @limit. + * + * This allows smoothly throttling down bdi_dirty back to normal if it starts + * high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's bdi_dirty may rush to 5 times higher than bdi_setpoint. + * - the global/bdi dirty thresh/goal may be knocked down suddenly either on + * user request or on increased memory consumption. + * + * o + * o + * o [o] main control line + * o [*] auxiliary control line + * o + * o + * o + * o + * o + * o + * o--------------------- balance point, bw scale = 1 + * | o + * | o + * | o + * | o + * | o + * | o + * | o------- connect point, bw scale = 1/2 + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * [--------------------*-----------------------------.--------------------*] + * 0 bdi_setpoint bdi_origin limit + * + * The bdi control line: if (bdi_origin < limit), an auxiliary control line (*) + * will be setup to extend the main control line (o) to @limit. 
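The following standalone C sketch illustrates just the main global control line described above, using DIRTY_SCOPE = 8 and RATIO_SHIFT = 10 as defined elsewhere in this patch; the brake, rampup, bdi-reserve and auxiliary-line refinements applied by bdi_position_ratio() are left out. The ratio is 1024/1024 (i.e. 1.0) at the setpoint and falls linearly to 0 at the origin.

#include <stdio.h>

#define DIRTY_SCOPE     8
#define RATIO_SHIFT     10

/* 1 << RATIO_SHIFT at the setpoint, 0 at the origin, linear in between */
static unsigned long pos_ratio(unsigned long thresh, unsigned long dirty)
{
        unsigned long goal = thresh - thresh / DIRTY_SCOPE;     /* global setpoint */
        unsigned long origin = goal + 2 * thresh;               /* ratio reaches 0 here */

        if (dirty >= origin)
                return 0;
        return ((origin - dirty) << RATIO_SHIFT) / (origin - goal + 1);
}

int main(void)
{
        unsigned long thresh = 1000;    /* hypothetical global dirty thresh, pages */
        unsigned long dirty;

        for (dirty = 0; dirty <= 2 * thresh; dirty += 250)
                printf("dirty=%4lu  pos_ratio=%4lu/1024\n",
                       dirty, pos_ratio(thresh, dirty));
        return 0;
}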
+ */ +static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty) +{ + unsigned long limit = hard_dirty_limit(thresh); + unsigned long bdi_thresh = bdi->dirty_threshold; + unsigned long origin; + unsigned long goal; + unsigned long long span; + unsigned long long bw; + + if (unlikely(dirty >= limit)) + return 0; + + /* + * global setpoint + */ + goal = thresh - thresh / DIRTY_SCOPE; + origin = goal + 2 * thresh; + + if (unlikely(origin < limit && dirty > (goal + origin) / 2)) { + origin = limit; + goal = (goal + origin) / 2; + bw >>= 1; + } + bw = origin - dirty; + bw <<= RATIO_SHIFT; + do_div(bw, origin - goal + 1); + + /* + * brake area, hold off dirtiers when the estimated dirty_ratelimit + * and/or write_bandwidth are adapting to sudden workload changes. + * It also balances the pressure to push global pages high when some + * bdi dirty pages are over-committed (eg. a UKEY's bdi goal could be + * exceeded a lot in the free run area; an unresponsing server may make + * an NFS bdi's dirty goal drop much lower than its dirty pages). + */ + if (dirty > limit - limit / DIRTY_BRAKE) { + bw *= limit - dirty; + do_div(bw, limit / DIRTY_BRAKE + 1); + } + + /* + * rampup area, immediately above the unthrottled free-run region. + * It's setup mainly to get an estimation of ref_bw for reliably + * ramping up the base bandwidth. + */ + dirty = default_backing_dev_info.avg_dirty; + origin = thresh - thresh / DIRTY_FULL_SCOPE + thresh / DIRTY_RAMPUP; + if (dirty < origin) { + span = (origin - dirty) * bw; + do_div(span, thresh / (4 * DIRTY_RAMPUP) + 1); + bw += min(span, 4 * bw); + } + + /* + * bdi reserve area, safeguard against bdi dirty underflow and disk idle + */ + origin = bdi->avg_write_bandwidth / 2 + 2 * MIN_WRITEBACK_PAGES; + origin = min(origin, thresh - thresh / DIRTY_FULL_SCOPE); + if (bdi_dirty < origin) { + if (bdi_dirty > origin / 4) + bw = bw * origin / bdi_dirty; + else + bw = bw * 4; + } + + /* + * bdi setpoint + */ + if (unlikely(bdi_thresh > thresh)) + bdi_thresh = thresh; + goal = bdi_thresh - bdi_thresh / DIRTY_SCOPE; + /* + * In JBOD case, bdi_thresh could fluctuate proportional to its own + * size. Otherwise the bdi write bandwidth is good for limiting the + * floating area, to compensate for the global control line being too + * flat in large memory systems. + */ + span = (u64) bdi_thresh * (thresh - bdi_thresh) + + (2 * bdi->avg_write_bandwidth) * bdi_thresh; + do_div(span, thresh + 1); + origin = goal + 2 * span; + + if (likely(bdi->avg_dirty)) + bdi_dirty = bdi->avg_dirty; + if (unlikely(bdi_dirty > goal + span)) { + if (bdi_dirty > limit) + return 0; + if (origin < limit) { + origin = limit; + goal += span; + bw >>= 1; + } + } + bw *= origin - bdi_dirty; + do_div(bw, origin - goal + 1); + + return bw; +} + +static void bdi_update_dirty_smooth(struct backing_dev_info *bdi, + unsigned long dirty) +{ + unsigned long avg = bdi->avg_dirty; + unsigned long old = bdi->old_dirty; + + if (unlikely(!avg)) { + avg = dirty; + goto update; + } + + /* + * dirty pages are departing upwards, follow up + */ + if (avg < old && old <= dirty) { + avg += (old - avg) >> 2; + goto update; + } + + /* + * dirty pages are departing downwards, follow down + */ + if (avg > old && old >= dirty) { + avg -= (avg - old) >> 2; + goto update; + } + + /* + * This can filter out one half unnecessary updates when bdi_dirty is + * fluctuating around the balance point, and is most effective on XFS, + * whose pattern is + * . 
+ * [.] dirty [-] avg . . + * . . + * . . . . . . + * --------------------------------------- . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . . . + * . . . . + * . . . . (fluctuated) + * . . . . + * . . . . + * + * @avg will remain flat at the cost of being biased towards high. In + * practice the error tend to be much smaller: thanks to more coarse + * grained fluctuations, @avg becomes the real average number for the + * last two rising lines of @dirty. + */ + goto out; + +update: + bdi->avg_dirty = avg; +out: + bdi->old_dirty = dirty; +} + +static void __bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + unsigned long cur; + u64 bw; + + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period / 2)) { + do_div(bw, elapsed); + elapsed = period / 2; + bw *= elapsed; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + cur = bw >> ilog2(period); + bdi->write_bandwidth = cur; + + /* + * one more level of smoothing + */ + if (avg > old && old > cur) + avg -= (avg - old) >> 3; + + if (avg < old && old < cur) + avg += (old - avg) >> 3; + + bdi->avg_write_bandwidth = avg; +} + +static void update_dirty_limit(unsigned long thresh, + unsigned long dirty) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long min = dirty + limit / DIRTY_BRAKE; + + thresh += thresh / DIRTY_BRAKE; + + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* take care not to follow into the brake area */ + if (limit > thresh && + limit > min) { + limit -= (limit - max(thresh, min)) >> 5; + goto update; + } + return; +update: + default_backing_dev_info.dirty_threshold = limit; +} + +static void bdi_update_dirty_threshold(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty) +{ + unsigned long old = bdi->old_dirty_threshold; + unsigned long avg = bdi->dirty_threshold; + + thresh = bdi_dirty_limit(bdi, thresh); + + if (avg > old && old >= thresh) + avg -= (avg - old) >> 3; + + if (avg < old && old <= thresh) + avg += (old - avg) >> 3; + + bdi->dirty_threshold = avg; + bdi->old_dirty_threshold = thresh; +} + +/* + * ref_bw typically fluctuates within a small range, with large isolated points + * from time to time. The smoothed reference_ratelimit can effectively filter + * out 1 such standalone point. When there comes 2+ isolated points together -- + * observed in ext4 on sudden redirty -- reference_ratelimit may surge high and + * take long time to return to normal, which can mostly be counteracted by + * xref_bw and other update restrictions in bdi_update_dirty_ratelimit(). + */ +static void bdi_update_reference_ratelimit(struct backing_dev_info *bdi, + unsigned long ref_bw) +{ + unsigned long old = bdi->old_ref_ratelimit; + unsigned long avg = bdi->reference_ratelimit; + + if (avg > old && old >= ref_bw && avg - old >= old - ref_bw) + avg -= (avg - old) >> 2; + + if (avg < old && old <= ref_bw && old - avg >= ref_bw - old) + avg += (old - avg) >> 2; + + bdi->reference_ratelimit = avg; + bdi->old_ref_ratelimit = ref_bw; +} + +/* + * Base throttle bandwidth. 
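The write bandwidth estimator above can be read as a period-weighted running average: the new sample is weighted by elapsed/period, the previous estimate by the remainder, and the sample weight is capped at period/2 so one long interval cannot wipe out the history. The small userspace sketch below reproduces only that weighting; HZ = 1000 and the numbers fed in are assumptions of the sketch, not values from the patch.

#include <stdio.h>
#include <stdint.h>

#define HZ      1000u
#define PERIOD  4096u                   /* ~ roundup_pow_of_two(3 * HZ) */

static unsigned long update_write_bw(unsigned long old_bw,      /* pages/s */
                                     unsigned long written,     /* pages this interval */
                                     unsigned long elapsed)     /* jiffies */
{
        uint64_t bw = (uint64_t)written * HZ;   /* == rate * elapsed */

        if (elapsed > PERIOD / 2) {             /* cap the sample weight */
                bw /= elapsed;
                elapsed = PERIOD / 2;
                bw *= elapsed;
        }
        bw += (uint64_t)old_bw * (PERIOD - elapsed);
        return (unsigned long)(bw / PERIOD);
}

int main(void)
{
        unsigned long bw = 25600;       /* initial guess: 100 MB/s in 4k pages */
        int i;

        /* feed ten 200ms intervals of a disk really writing ~5000 pages/s */
        for (i = 0; i < 10; i++) {
                bw = update_write_bw(bw, 1000, HZ / 5);
                printf("estimate after %2d intervals: %lu pages/s\n", i + 1, bw);
        }
        return 0;
}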
+ */ +static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) +{ + unsigned long limit = default_backing_dev_info.dirty_threshold; + unsigned long goal = thresh - thresh / DIRTY_SCOPE; + unsigned long bw = bdi->dirty_ratelimit; + unsigned long dirty_bw; + unsigned long pos_bw; + unsigned long delta; + unsigned long ref_bw; + unsigned long xref_bw; + unsigned long long pos_ratio; + + if (dirty > limit - limit / DIRTY_BRAKE) + bdi->dirty_exceed_time = jiffies; + + if (dirty < thresh - thresh / DIRTY_FULL_SCOPE + thresh / DIRTY_RAMPUP) + bdi->dirty_free_run = jiffies; + + /* + * The dirty rate will match the writeback rate in long term, except + * when dirty pages are truncated by userspace before IO submission, or + * re-dirtied when the FS finds it not suitable to do IO at the time. + */ + dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + + pos_ratio = bdi_position_ratio(bdi, thresh, dirty, bdi_dirty); + /* + * (pos_bw > bw) means the position of the number of dirty pages is + * lower than the global and/or bdi setpoints. It does not necessarily + * mean the base throttle bandwidth is larger than its balanced value. + * The latter is likely only when + * - (position) the dirty pages are at some distance from the setpoint, + * - (speed) and either stands still or is departing from the setpoint. + */ + pos_bw = bw * pos_ratio >> RATIO_SHIFT; + + /* + * There may be + * 1) X dd tasks writing to the current disk, and/or + * 2) Y "rsync --bwlimit" tasks. + * The below estimation is accurate enough for (1). For (2), where not + * all task's dirty rate can be changed proportionally by adjusting the + * base throttle bandwidth, it would require multiple adjust-reestimate + * cycles to approach the rate balance point. That is not a big concern + * as we do small steps anyway for the sake of other unknown noises. + * The un-controllable tasks may only slow down the approximating + * progress and is harmless otherwise. + */ + pos_ratio *= bdi->avg_write_bandwidth; + do_div(pos_ratio, dirty_bw | 1); + ref_bw = bw * pos_ratio >> RATIO_SHIFT; + ref_bw = min(ref_bw, bdi->avg_write_bandwidth); + + /* + * Update the base throttle bandwidth rigidly: eg. only try lowering it + * when both the global/bdi dirty pages are away from their setpoints, + * and are either standing still or continue departing away. + * + * The "+ (avg_dirty >> 8)" margin mainly help btrfs, which behaves + * amazingly smoothly. Its @avg_dirty is ever approaching @dirty, + * slower and slower, but very hard to cross it to trigger a base + * bandwidth update. The added margin says "when @avg_dirty is _close + * enough_ to @dirty, it indicates slowed down @dirty change rate, + * hence the other inequalities are now a good indication of something + * unbalanced in the current bdi". + * + * In the cases of hitting the upper/lower margins, it's obviously + * necessary to adjust the (possibly very unbalanced) base bandwidth, + * unless the opposite margin was also been hit recently, which + * indicates that the dirty control scope may be smaller than the bdi + * write bandwidth and hence the dirty pages are quickly fluctuating + * between the upper/lower margins. 
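The balanced-rate estimate ref_bw above boils down to scaling the current base bandwidth by write_bw/dirty_bw (and by the position ratio). The standalone sketch below walks one hypothetical case: two dd tasks throttled at 10000 pages/s each while the disk writes back 12000 pages/s, with dirty pages sitting at the setpoint, giving the expected ~6000 pages/s per task. RATIO_SHIFT = 10 as in the patch; everything else is made up.

#include <stdio.h>
#include <stdint.h>

#define RATIO_SHIFT 10

static unsigned long ref_bw(unsigned long base_bw,   /* current per-task limit, pages/s */
                            unsigned long dirty_bw,  /* observed dirtying rate, pages/s */
                            unsigned long write_bw,  /* observed writeback rate, pages/s */
                            unsigned long pos_ratio) /* from position control, <<RATIO_SHIFT */
{
        uint64_t bw = (uint64_t)pos_ratio * write_bw;

        bw /= (dirty_bw | 1);                   /* mirrors do_div(..., dirty_bw | 1) */
        bw = bw * base_bw >> RATIO_SHIFT;
        if (bw > write_bw)                      /* never aim above the disk itself */
                bw = write_bw;
        return (unsigned long)bw;
}

int main(void)
{
        /* two dd's at 10000 pages/s each dirty 20000 pages/s in total, the disk
         * writes 12000 pages/s, dirty pages are at the setpoint (pos_ratio ~ 1.0) */
        printf("ref_bw = %lu pages/s\n",
               ref_bw(10000, 20000, 12000, 1 << RATIO_SHIFT));
        return 0;
}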
+ */ + if (bw < pos_bw) { + if (dirty < goal && + dirty <= default_backing_dev_info.avg_dirty + + (default_backing_dev_info.avg_dirty >> 8) && + bdi_dirty <= bdi->avg_dirty + (bdi->avg_dirty >> 8) && + bdi_dirty <= bdi->old_dirty) + goto adjust; + if (dirty < thresh - thresh / DIRTY_FULL_SCOPE + + thresh / DIRTY_RAMPUP && + !dirty_exceeded_recently(bdi, HZ)) + goto adjust; + } + + if (bw > pos_bw) { + if (dirty > goal && + dirty >= default_backing_dev_info.avg_dirty - + (default_backing_dev_info.avg_dirty >> 8) && + bdi_dirty >= bdi->avg_dirty - (bdi->avg_dirty >> 8) && + bdi_dirty >= bdi->old_dirty) + goto adjust; + if (dirty > limit - limit / DIRTY_BRAKE && + !dirty_free_run_recently(bdi, HZ)) + goto adjust; + } + + goto out; + +adjust: + /* + * The min/max'ed xref_bw is an effective safeguard against transient + * large deviations. By considering not only the current ref_bw value, + * but also the old/avg values, the sudden drop can be filtered out. + */ + if (pos_bw > bw) { + xref_bw = min(ref_bw, bdi->old_ref_ratelimit); + xref_bw = min(xref_bw, bdi->reference_ratelimit); + if (xref_bw > bw) + delta = xref_bw - bw; + else + delta = 0; + } else { + xref_bw = max(ref_bw, bdi->old_ref_ratelimit); + xref_bw = max(xref_bw, bdi->reference_ratelimit); + if (xref_bw < bw) + delta = bw - xref_bw; + else + delta = 0; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. This avoids possible oscillations. + * Also limit the step size to avoid overshooting. + */ + delta >>= bw / (8 * delta + 1); + + if (pos_bw > bw) + bw += min(delta, pos_bw - bw) >> 2; + else + bw -= min(delta, bw - pos_bw) >> 2; + + bdi->dirty_ratelimit = bw; +out: + bdi_update_reference_ratelimit(bdi, ref_bw); + trace_dirty_ratelimit(bdi, dirty_bw, pos_bw, ref_bw); +} + +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time) +{ + static DEFINE_SPINLOCK(dirty_lock); + unsigned long now = jiffies; + unsigned long elapsed; + unsigned long dirtied; + unsigned long written; + + if (!spin_trylock(&dirty_lock)) + return; + + elapsed = now - bdi->bw_time_stamp; + dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* skip quiet periods when disk bandwidth is under-utilized */ + if (elapsed > 4 * MAX_PAUSE && + elapsed > now - start_time) + goto snapshot; + + /* + * rate-limit, only update once every 200ms. Demand higher threshold + * on the flusher so that the throttled task(s) can do most updates. 
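The step-size limiting applied above ("delta >>= bw / (8 * delta + 1)" plus the quarter step) can be seen in isolation in the sketch below: the tracked value approaches a fixed target quickly at first, then in ever smaller steps, and finally stalls just short of it instead of oscillating. This is a simplified model with made-up numbers; the kernel additionally bounds the step by the pos_bw gap.

#include <stdio.h>

static unsigned long step_toward(unsigned long bw, unsigned long target)
{
        unsigned long gap = (target > bw) ? target - bw : bw - target;
        unsigned long step = gap;
        unsigned long shift;

        if (!gap)
                return bw;
        /* shrink the step exponentially as bw closes in on the target... */
        shift = bw / (8 * gap + 1);
        step = shift < 8 * sizeof(step) ? step >> shift : 0;
        /* ...and apply only a quarter of it per update */
        if (target > bw)
                return bw + step / 4;
        return bw - step / 4;
}

int main(void)
{
        unsigned long bw = 2000, target = 6000;         /* pages/s, hypothetical */
        int i;

        for (i = 0; i < 12; i++) {
                bw = step_toward(bw, target);
                printf("update %2d: dirty_ratelimit = %lu\n", i + 1, bw);
        }
        return 0;
}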
+ */ + if (!thresh && elapsed <= 2 * MAX_PAUSE) + goto unlock; + if (elapsed <= MAX_PAUSE) + goto unlock; + + if (thresh && + now - default_backing_dev_info.bw_time_stamp >= MAX_PAUSE) { + default_backing_dev_info.bw_time_stamp = now; + update_dirty_limit(thresh, dirty); + bdi_update_dirty_smooth(&default_backing_dev_info, dirty); + } + if (thresh) { + bdi_update_dirty_ratelimit(bdi, thresh, dirty, + bdi_dirty, dirtied, elapsed); + bdi_update_dirty_threshold(bdi, thresh, dirty); + bdi_update_dirty_smooth(bdi, bdi_dirty); + } + __bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->dirtied_stamp = dirtied; + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +unlock: + spin_unlock(&dirty_lock); +} + +static unsigned long max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) +{ + unsigned long hi = ilog2(bdi->write_bandwidth); + unsigned long lo = ilog2(bdi->dirty_ratelimit); + unsigned long t; + + /* target for 10ms pause on 1-dd case */ + t = HZ / 50; + + /* + * Scale up pause time for concurrent dirtiers in order to reduce CPU + * overheads. + * + * (N * 20ms) on 2^N concurrent tasks. + */ + if (hi > lo) + t += (hi - lo) * (20 * HZ) / 1024; + + /* + * Limit pause time for small memory systems. If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 1ms for every 1MB; may further consider bdi bandwidth. + */ + if (bdi_dirty) + t = min(t, bdi_dirty >> (30 - PAGE_CACHE_SHIFT - ilog2(HZ))); + + return clamp_val(t, 4, MAX_PAUSE); +} + +/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. @@ -476,29 +1073,32 @@ unsigned long bdi_dirty_limit(struct bac * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, - unsigned long write_chunk) + unsigned long pages_dirtied) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ + unsigned long bdi_dirty; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; - unsigned long pages_written = 0; - unsigned long pause = 1; - bool dirty_exceeded = false; + unsigned long bw; + unsigned long base_bw; + unsigned long period; + unsigned long pause = 0; + unsigned long pause_max; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - + unsigned long now = jiffies; + /* + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -507,12 +1107,11 @@ static void balance_dirty_pages(struct a * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. 
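A userspace rendering of the max_pause() heuristic above: start from a 10ms target, add roughly 20ms per power-of-two of concurrent dirtiers (estimated from write_bandwidth vs dirty_ratelimit), and cap the result by about 1ms per dirty megabyte so a small dirty pool cannot be slept empty. HZ = 1000 and 4k pages are assumptions of the sketch.

#include <stdio.h>

#define HZ              1000
#define MAX_PAUSE       (HZ / 5)        /* 200ms, matching the patch */
#define PAGE_SHIFT      12              /* 4k pages assumed */
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

static unsigned int ilog2_ul(unsigned long v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static unsigned long max_pause(unsigned long write_bw,          /* pages/s */
                               unsigned long dirty_ratelimit,   /* pages/s */
                               unsigned long bdi_dirty)         /* pages */
{
        unsigned long hi = ilog2_ul(write_bw);
        unsigned long lo = ilog2_ul(dirty_ratelimit);
        unsigned long t = HZ / 50;              /* 10ms base for the 1-dd case */

        /* roughly +20ms per doubling of concurrent dirtiers */
        if (hi > lo)
                t += (hi - lo) * (20 * HZ) / 1024;

        /* don't sleep a small dirty pool empty: ~1ms per dirty MB */
        if (bdi_dirty)
                t = MIN(t, bdi_dirty >> (30 - PAGE_SHIFT - ilog2_ul(HZ)));

        if (t < 4)
                t = 4;
        return MIN(t, (unsigned long)MAX_PAUSE);
}

int main(void)
{
        /* 100MB/s disk shared by ~16 dd's, plenty of dirty pages: ~100ms */
        printf("max_pause = %lu jiffies\n", max_pause(25600, 1600, 100000));
        /* same disk, but only ~8MB of bdi dirty pages: a few jiffies */
        printf("max_pause = %lu jiffies\n", max_pause(25600, 1600, 2048));
        return 0;
}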
*/ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) { + current->paused_when = jiffies; + current->nr_dirtied = 0; break; - - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); + } /* * In order to avoid the stacked BDI deadlock we need @@ -524,62 +1123,107 @@ static void balance_dirty_pages(struct a * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + if (bdi->dirty_threshold < 2*bdi_stat_error(bdi)) { + bdi_dirty = bdi_stat_sum(bdi, BDI_RECLAIMABLE) + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); } - /* - * The bdi thresh is somehow "soft" limit derived from the - * global "hard" limit. The former helps to prevent heavy IO - * bdi or process from holding back light ones; The latter is - * the last resort safeguard. - */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_dirty, start_time); - if (!dirty_exceeded) - break; + if (unlikely(!writeback_in_progress(bdi))) + bdi_start_background_writeback(bdi); - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + pause_max = max_pause(bdi, bdi_dirty); - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - * Only move pages to writeback if this bdi is over its - * threshold otherwise wait until the disk writes catch - * up. + base_bw = bdi->dirty_ratelimit; + /* + * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. */ - trace_wbc_balance_dirty_start(&wbc, bdi); - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); - if (pages_written >= write_chunk) - break; /* We've done our duty */ + if (current->flags & PF_LESS_THROTTLE || rt_task(current)) + base_bw *= 2; + bw = bdi_position_ratio(bdi, dirty_thresh, nr_dirty, bdi_dirty); + if (unlikely(bw == 0)) { + period = pause_max; + pause = pause_max; + goto pause; } - trace_wbc_balance_dirty_wait(&wbc, bdi); + bw = (u64)base_bw * bw >> RATIO_SHIFT; + period = (HZ * pages_dirtied + bw / 2) / (bw | 1); + pause = current->paused_when + period - now; + /* + * Take it as long think time if pause falls into (-10s, 0). + * If it's less than 500ms (ext2 blocks the dirtier task for + * up to 400ms from time to time on 1-HDD; so does xfs, however + * at much less frequency), try to compensate it in future by + * updating the virtual time; otherwise just reset the time, as + * it may be a light dirtier. 
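The period/pause arithmetic above, reduced to two cases in a standalone sketch (HZ = 1000 assumed, so jiffies read as milliseconds): a task that dirtied its pages back-to-back sleeps for the full period, while a task that spent part of that period thinking is charged only the remainder.

#include <stdio.h>

#define HZ 1000         /* one jiffy is 1ms in this sketch */

int main(void)
{
        long task_bw = 5000;            /* task throttle bandwidth, pages/s */
        long pages_dirtied = 250;       /* pages dirtied since the last pause */
        long now = 100000;              /* current time, jiffies */
        long paused_when;               /* end of the task's previous sleep */
        long period, pause;

        /* charge 250 pages at 5000 pages/s: a 50ms period */
        period = (HZ * pages_dirtied + task_bw / 2) / task_bw;

        /* case 1: the pages were dirtied back-to-back - sleep the full period */
        paused_when = now;
        pause = paused_when + period - now;
        printf("no think time:   pause = %ld ms\n", pause);

        /* case 2: the task spent 30ms computing in between - sleep only the rest */
        paused_when = now - 30;
        pause = paused_when + period - now;
        printf("30ms think time: pause = %ld ms\n", pause);

        return 0;
}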
+ */ + if (unlikely(-pause < HZ*10)) { + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + base_bw, + bw, + pages_dirtied, + period, + pause, + start_time); + if (-pause > HZ/2) { + current->paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->paused_when += period; + current->nr_dirtied = 0; + } + pause = 1; + break; + } + pause = min(pause, pause_max); + +pause: + trace_balance_dirty_pages(bdi, + dirty_thresh, + nr_dirty, + bdi_dirty, + base_bw, + bw, + pages_dirtied, + period, + pause, + start_time); + current->paused_when = now; __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + current->paused_when += pause; + current->nr_dirtied = 0; - /* - * Increase the delay for each loop, up to our previous - * default of taking a 100ms nap. - */ - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + dirty_thresh = hard_dirty_limit(dirty_thresh); + if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_MAXPAUSE) + break; + if (nr_dirty < dirty_thresh + dirty_thresh / DIRTY_PASSGOOD && + bdi_dirty < bdi->dirty_threshold) + break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (pause == 0) + current->nr_dirtied_pause = + ratelimit_pages(nr_dirty, dirty_thresh); + else if (period <= pause_max / 4) + current->nr_dirtied_pause = clamp_val( + base_bw * (pause_max/2) / HZ, + pages_dirtied + pages_dirtied/8, + pages_dirtied * 4); + else if (pause >= pause_max) + current->nr_dirtied_pause = 1 | clamp_val( + base_bw * (pause_max*3/8) / HZ, + current->nr_dirtied_pause / 4, + current->nr_dirtied_pause*7/8); if (writeback_in_progress(bdi)) return; @@ -592,8 +1236,10 @@ static void balance_dirty_pages(struct a * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + if (laptop_mode) + return; + + if (nr_reclaimable > background_thresh) bdi_start_background_writeback(bdi); } @@ -607,8 +1253,6 @@ void set_page_dirty_balance(struct page } } -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; - /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -618,36 +1262,35 @@ static DEFINE_PER_CPU(unsigned long, bdp * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid + * On really big machines, global_page_state() is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * limit we disable the ratelimiting, to prevent individual processes from + * overshooting the limit by (ratelimit_pages) each. 
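A minimal model of the per-task ratelimiting described above (plain userspace C, not the kernel code): nr_dirtied accumulates cheaply on every write and the expensive balancing path runs only once nr_dirtied_pause pages have been dirtied.

#include <stdio.h>

struct task {
        int nr_dirtied;
        int nr_dirtied_pause;
};

/* stand-in for balance_dirty_pages(); a real task would sleep here */
static void balance(struct task *t)
{
        printf("throttle after %d pages\n", t->nr_dirtied);
        t->nr_dirtied = 0;
}

static void dirtied(struct task *t, int nr_pages)
{
        t->nr_dirtied += nr_pages;
        if (t->nr_dirtied >= t->nr_dirtied_pause)
                balance(t);
}

int main(void)
{
        struct task t = { .nr_dirtied = 0, .nr_dirtied_pause = 32 };
        int i;

        for (i = 0; i < 100; i++)       /* write 100 pages, one at a time */
                dirtied(&t, 1);
        return 0;
}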
*/ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - unsigned long ratelimit; - unsigned long *p; + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_account_dirty(bdi)) + return; + + current->nr_dirtied += nr_pages_dirtied; - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) - ratelimit = 8; + if (dirty_exceeded_recently(bdi, MAX_PAUSE)) { + unsigned long max = current->nr_dirtied + + (128 >> (PAGE_SHIFT - 10)); + + if (current->nr_dirtied_pause > max) + current->nr_dirtied_pause = max; + } /* * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - preempt_disable(); - p = &__get_cpu_var(bdp_ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - ratelimit = sync_writeback_pages(*p); - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping, ratelimit); - return; - } - preempt_enable(); + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause)) + balance_dirty_pages(mapping, current->nr_dirtied); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); @@ -735,44 +1378,6 @@ void laptop_sync_completion(void) #endif /* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. - */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* * Called early on to tune the page writeback dirty limits. 
* * We used to scale dirty pages according to how total memory @@ -794,9 +1399,6 @@ void __init page_writeback_init(void) { int shift; - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); prop_descriptor_init(&vm_dirties, shift); @@ -1127,6 +1729,7 @@ void account_page_dirtied(struct page *p __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } --- linux-next.orig/mm/filemap.c 2011-04-13 17:18:06.000000000 +0800 +++ linux-next/mm/filemap.c 2011-04-13 17:18:10.000000000 +0800 @@ -2313,6 +2313,7 @@ static ssize_t generic_perform_write(str long status = 0; ssize_t written = 0; unsigned int flags = 0; + unsigned int dirty; /* * Copies from kernel address space cannot fail (NFSD is a big user). @@ -2361,6 +2362,7 @@ again: pagefault_enable(); flush_dcache_page(page); + dirty = PageDirty(page); mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); @@ -2387,7 +2389,8 @@ again: pos += copied; written += copied; - balance_dirty_pages_ratelimited(mapping); + if (!dirty) + balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); --- linux-next.orig/include/linux/backing-dev.h 2011-04-14 09:20:56.000000000 +0800 +++ linux-next/include/linux/backing-dev.h 2011-04-15 13:41:32.000000000 +0800 @@ -40,6 +40,8 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, BDI_WRITEBACK, + BDI_DIRTIED, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; @@ -71,8 +73,27 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; + unsigned long dirtied_stamp; + unsigned long written_stamp; + unsigned long write_bandwidth; + unsigned long avg_write_bandwidth; + /* the base bandwidth, the task's dirty rate will be curbed under it */ + unsigned long dirty_ratelimit; + /* the estimated balance point, base bw will follow it step by step */ + unsigned long reference_ratelimit; + unsigned long old_ref_ratelimit; + unsigned long avg_dirty; + unsigned long old_dirty; + unsigned long dirty_threshold; + unsigned long old_dirty_threshold; + struct prop_local_percpu completions; - int dirty_exceeded; + + /* last time exceeded (limit - limit/DIRTY_BRAKE) */ + unsigned long dirty_exceed_time; + /* last time dropped to the rampup area or even the unthrottled area */ + unsigned long dirty_free_run; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; --- linux-next.orig/mm/backing-dev.c 2011-04-14 09:20:56.000000000 +0800 +++ linux-next/mm/backing-dev.c 2011-04-15 13:42:45.000000000 +0800 @@ -81,20 +81,30 @@ static int bdi_debug_stats_show(struct s #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) 
K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -631,6 +641,11 @@ static void bdi_wb_init(struct bdi_write setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -652,7 +667,17 @@ int bdi_init(struct backing_dev_info *bd goto err; } - bdi->dirty_exceeded = 0; + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + bdi->dirty_ratelimit = INIT_BW; + + bdi->avg_dirty = 0; + bdi->old_dirty = 0; + bdi->dirty_threshold = MIN_WRITEBACK_PAGES; + err = prop_local_init_percpu(&bdi->completions); if (err) { --- linux-next.orig/fs/fs-writeback.c 2011-04-14 21:51:23.000000000 +0800 +++ linux-next/fs/fs-writeback.c 2011-04-15 13:48:58.000000000 +0800 @@ -689,6 +689,7 @@ static long wb_writeback(struct bdi_writ write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -724,6 +725,8 @@ static long wb_writeback(struct bdi_writ writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); + bdi_update_write_bandwidth(wb->bdi, wbc.wb_start); + work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; --- linux-next.orig/include/linux/writeback.h 2011-04-14 21:51:23.000000000 +0800 +++ linux-next/include/linux/writeback.h 2011-04-15 13:48:58.000000000 +0800 @@ -12,6 +12,44 @@ struct backing_dev_info; extern spinlock_t inode_wb_list_lock; /* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* + * The 1/4 region under the global dirty thresh is for smooth dirty throttling: + * + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) + * + * The 1/4 region under the global dirty limit will be more rigidly throttled: + * + * (limit - limit/DIRTY_BRAKE, limit) + * + * The 1/32 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MAXPAUSE) + * + * The 1/16 region above the global dirty limit, dirty exceeded bdi's will be + * put to loops: + * + * (limit, limit + limit/DIRTY_PASSGOOD) + * + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long + * time) for the dirty pages to drop. + * + * The global dirty threshold is normally at the lower bound of the brake + * region, except when the system suddenly allocates a lot of anonymous memory + * and knocks down the global dirty threshold quickly, in which case the global + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. 
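For a sense of scale, the sketch below prints the region boundaries described above for a hypothetical global thresh of 1000 pages, using the DIRTY_* values defined right below and hard_dirty_limit()'s thresh + thresh/DIRTY_BRAKE as the limit. Standalone userspace C only.

#include <stdio.h>

#define DIRTY_RAMPUP            32
#define DIRTY_SCOPE             8
#define DIRTY_FULL_SCOPE        (DIRTY_SCOPE / 2)
#define DIRTY_BRAKE             8
#define DIRTY_MAXPAUSE          32
#define DIRTY_PASSGOOD          16

int main(void)
{
        unsigned long thresh = 1000;                            /* pages, hypothetical */
        unsigned long limit = thresh + thresh / DIRTY_BRAKE;    /* hard_dirty_limit() */

        printf("throttle scope starts  %lu\n", thresh - thresh / DIRTY_FULL_SCOPE);
        printf("rampup area ends       %lu\n", thresh - thresh / DIRTY_FULL_SCOPE
                                               + thresh / DIRTY_RAMPUP);
        printf("global setpoint        %lu\n", thresh - thresh / DIRTY_SCOPE);
        printf("global thresh          %lu\n", thresh);
        printf("brake area             %lu .. %lu\n",
               limit - limit / DIRTY_BRAKE, limit);
        printf("max-pause area ends    %lu\n", limit + limit / DIRTY_MAXPAUSE);
        printf("pass-good area ends    %lu\n", limit + limit / DIRTY_PASSGOOD);
        return 0;
}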
+ */ +#define DIRTY_RAMPUP 32 +#define DIRTY_SCOPE 8 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) +#define DIRTY_BRAKE 8 +#define DIRTY_MAXPAUSE 32 +#define DIRTY_PASSGOOD 16 + +/* * fs/fs-writeback.c */ enum writeback_sync_modes { @@ -128,6 +166,17 @@ void global_dirty_limits(unsigned long * unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long start_time); +static inline void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + bdi_update_bandwidth(bdi, 0, 0, 0, start_time); +} + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); --- linux-next.orig/include/linux/sched.h 2011-04-15 13:41:32.000000000 +0800 +++ linux-next/include/linux/sched.h 2011-04-15 13:41:32.000000000 +0800 @@ -1493,6 +1493,14 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + /* + * when (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for some dirty throttling pause + */ + int nr_dirtied; + int nr_dirtied_pause; + unsigned long paused_when; /* start of a write-and-pause period */ + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; --- linux-next.orig/mm/memory_hotplug.c 2011-04-15 13:41:32.000000000 +0800 +++ linux-next/mm/memory_hotplug.c 2011-04-15 13:41:32.000000000 +0800 @@ -468,8 +468,6 @@ int online_pages(unsigned long pfn, unsi vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); - if (onlined_pages) memory_notify(MEM_ONLINE, &arg); unlock_memory_hotplug(); @@ -901,7 +899,6 @@ repeat: } vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); unlock_memory_hotplug(); --- linux-next.orig/include/trace/events/writeback.h 2011-04-15 13:41:31.000000000 +0800 +++ linux-next/include/trace/events/writeback.h 2011-04-15 13:48:58.000000000 +0800 @@ -147,11 +147,184 @@ DEFINE_EVENT(wbc_class, name, \ DEFINE_WBC_EVENT(wbc_writeback_start); DEFINE_WBC_EVENT(wbc_writeback_written); DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +#define KBps(x) ((x) << (PAGE_SHIFT - 10)) + +TRACE_EVENT(dirty_ratelimit, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long dirty_bw, + unsigned long pos_bw, + unsigned long ref_bw), + + TP_ARGS(bdi, dirty_bw, pos_bw, ref_bw), + + TP_STRUCT__entry( + __array(char, bdi, 32) + __field(unsigned long, write_bw) + __field(unsigned long, avg_bw) + __field(unsigned long, dirty_bw) + __field(unsigned long, base_bw) + __field(unsigned long, pos_bw) + __field(unsigned long, ref_bw) + __field(unsigned long, avg_ref_bw) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + __entry->write_bw = KBps(bdi->write_bandwidth); + __entry->avg_bw = KBps(bdi->avg_write_bandwidth); + __entry->dirty_bw = KBps(dirty_bw); + __entry->base_bw = KBps(bdi->dirty_ratelimit); + __entry->pos_bw = KBps(pos_bw); + __entry->ref_bw = KBps(ref_bw); + __entry->avg_ref_bw = KBps(bdi->reference_ratelimit); + ), + + + TP_printk("bdi %s: " + "write_bw=%lu awrite_bw=%lu dirty_bw=%lu " + "base_bw=%lu pos_bw=%lu ref_bw=%lu aref_bw=%lu", + __entry->bdi, + __entry->write_bw, /* write 
bandwidth */ + __entry->avg_bw, /* avg write bandwidth */ + __entry->dirty_bw, /* dirty bandwidth */ + __entry->base_bw, /* dirty ratelimit on each task */ + __entry->pos_bw, /* position control ratelimit */ + __entry->ref_bw, /* the reference ratelimit */ + __entry->avg_ref_bw /* smoothed reference ratelimit */ + ) +); + +TRACE_EVENT(balance_dirty_pages, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long base_bw, + unsigned long task_bw, + unsigned long dirtied, + unsigned long period, + long pause, + unsigned long start_time), + + TP_ARGS(bdi, thresh, dirty, bdi_dirty, + base_bw, task_bw, dirtied, period, pause, start_time), + + TP_STRUCT__entry( + __array( char, bdi, 32) + __field(unsigned long, limit) + __field(unsigned long, goal) + __field(unsigned long, dirty) + __field(unsigned long, bdi_goal) + __field(unsigned long, bdi_dirty) + __field(unsigned long, avg_dirty) + __field(unsigned long, base_bw) + __field(unsigned long, task_bw) + __field(unsigned int, dirtied) + __field(unsigned int, dirtied_pause) + __field(unsigned long, period) + __field( long, think) + __field( long, pause) + __field(unsigned long, paused) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + + __entry->limit = default_backing_dev_info.dirty_threshold; + __entry->goal = thresh - thresh / DIRTY_SCOPE; + __entry->dirty = dirty; + __entry->bdi_goal = bdi->dirty_threshold - + bdi->dirty_threshold / DIRTY_SCOPE; + __entry->bdi_dirty = bdi_dirty; + __entry->avg_dirty = bdi->avg_dirty; + __entry->base_bw = KBps(base_bw); + __entry->task_bw = KBps(task_bw); + __entry->dirtied = dirtied; + __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->paused_when == 0 ? 
0 : + (long)(jiffies - current->paused_when) * 1000 / HZ; + __entry->period = period * 1000 / HZ; + __entry->pause = pause * 1000 / HZ; + __entry->paused = (jiffies - start_time) * 1000 / HZ; + ), + + + TP_printk("bdi %s: " + "limit=%lu goal=%lu dirty=%lu " + "bdi_goal=%lu bdi_dirty=%lu avg_dirty=%lu " + "base_bw=%lu task_bw=%lu " + "dirtied=%u dirtied_pause=%u " + "period=%lu think=%ld pause=%ld paused=%lu", + __entry->bdi, + __entry->limit, + __entry->goal, + __entry->dirty, + __entry->bdi_goal, + __entry->bdi_dirty, + __entry->avg_dirty, + __entry->base_bw, /* base throttle bandwidth */ + __entry->task_bw, /* task throttle bandwidth */ + __entry->dirtied, + __entry->dirtied_pause, + __entry->period, /* ms */ + __entry->think, /* ms */ + __entry->pause, /* ms */ + __entry->paused /* ms */ + ) +); + +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, dirty_limit) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->dirty_limit = default_backing_dev_info.dirty_threshold; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu limit=%lu gap=%ld " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + __entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_limit, + __entry->dirty_thresh - __entry->nr_dirty - + __entry->nr_writeback - __entry->nr_unstable, + __entry->nr_dirtied, + __entry->nr_written + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),