Subject: writeback: trace balance_dirty_pages Date: Sun Aug 29 23:33:20 CST 2010 It would be useful for analyzing the dynamics of the throttling algorithms, and helpful for debugging user reported problems. Here is an interesting test to verify the theory with balance_dirty_pages() tracing. On a partition that can do ~60MB/s, a sparse file is created and 4 rsync tasks with different write bandwidth started: dd if=/dev/zero of=/mnt/1T bs=1M count=1 seek=1024000 echo 1 > /debug/tracing/events/writeback/balance_dirty_pages/enable rsync localhost:/mnt/1T /mnt/a --bwlimit 10000& rsync localhost:/mnt/1T /mnt/A --bwlimit 10000& rsync localhost:/mnt/1T /mnt/b --bwlimit 20000& rsync localhost:/mnt/1T /mnt/c --bwlimit 30000& Trace outputs within 0.1 second, grouped by tasks: rsync-3824 [004] 15002.076447: balance_dirty_pages: bdi=btrfs-2 weight=15% limit=130876 gap=5340 dirtied=192 pause=20 rsync-3822 [003] 15002.091701: balance_dirty_pages: bdi=btrfs-2 weight=15% limit=130777 gap=5113 dirtied=192 pause=20 rsync-3821 [006] 15002.004667: balance_dirty_pages: bdi=btrfs-2 weight=30% limit=129570 gap=3714 dirtied=64 pause=8 rsync-3821 [006] 15002.012654: balance_dirty_pages: bdi=btrfs-2 weight=30% limit=129589 gap=3733 dirtied=64 pause=8 rsync-3821 [006] 15002.021838: balance_dirty_pages: bdi=btrfs-2 weight=30% limit=129604 gap=3748 dirtied=64 pause=8 rsync-3821 [004] 15002.091193: balance_dirty_pages: bdi=btrfs-2 weight=29% limit=129583 gap=3983 dirtied=64 pause=8 rsync-3821 [004] 15002.102729: balance_dirty_pages: bdi=btrfs-2 weight=29% limit=129594 gap=3802 dirtied=64 pause=8 rsync-3821 [000] 15002.109252: balance_dirty_pages: bdi=btrfs-2 weight=29% limit=129619 gap=3827 dirtied=64 pause=8 rsync-3823 [002] 15002.009029: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128762 gap=2842 dirtied=64 pause=12 rsync-3823 [002] 15002.021598: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128813 gap=3021 dirtied=64 pause=12 rsync-3823 [003] 15002.032973: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128805 gap=2885 dirtied=64 pause=12 rsync-3823 [003] 15002.048800: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128823 gap=2967 dirtied=64 pause=12 rsync-3823 [003] 15002.060728: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128821 gap=3221 dirtied=64 pause=12 rsync-3823 [000] 15002.073152: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128825 gap=3225 dirtied=64 pause=12 rsync-3823 [005] 15002.090111: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128782 gap=3214 dirtied=64 pause=12 rsync-3823 [004] 15002.102520: balance_dirty_pages: bdi=btrfs-2 weight=39% limit=128764 gap=3036 dirtied=64 pause=12 The data vividly show that - the heaviest writer is throttled a bit (weight=39%) - the lighter writers run at full speed (weight=15%,15%,30%) Signed-off-by: Wu Fengguang --- include/linux/sched.h | 1 include/linux/writeback.h | 5 + include/trace/events/writeback.h | 89 +++++++++++++++++++++++++++++ mm/page-writeback.c | 29 ++++++++- 4 files changed, 122 insertions(+), 2 deletions(-) --- linux-2.6.orig/include/trace/events/writeback.h 2011-03-08 00:19:02.000000000 -0800 +++ linux-2.6/include/trace/events/writeback.h 2011-03-08 09:32:14.000000000 -0800 @@ -236,6 +236,95 @@ TRACE_EVENT(writeback_distribute_page_co ) ); +#define KBps(x) ((x) << (PAGE_SHIFT - 10)) + +TRACE_EVENT(balance_dirty_pages, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long bdi_thresh, + unsigned long dirty, + unsigned long bdi_dirty, + unsigned long task_bw, + unsigned long dirtied, + unsigned long period, + long pause, + unsigned long start_time), + + TP_ARGS(bdi, thresh, bdi_thresh, dirty, bdi_dirty, + task_bw, dirtied, period, pause, start_time), + + TP_STRUCT__entry( + __array( char, bdi, 32) + __field(unsigned long, bdi_weight) + __field(unsigned long, task_weight) + __field(unsigned long, limit) + __field(unsigned long, goal) + __field(unsigned long, dirty) + __field(unsigned long, bdi_goal) + __field(unsigned long, bdi_dirty) + __field(unsigned long, avg_dirty) + __field(unsigned long, base_bw) + __field(unsigned long, task_bw) + __field(unsigned long, dirtied) + __field(unsigned long, period) + __field( long, think) + __field( long, pause) + __field(unsigned long, paused) + ), + + TP_fast_assign( + long numerator; + long denominator; + + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + + bdi_writeout_fraction(bdi, &numerator, &denominator); + __entry->bdi_weight = 1000 * numerator / denominator; + task_dirties_fraction(current, &numerator, &denominator); + __entry->task_weight = 1000 * numerator / denominator; + + __entry->limit = thresh; + __entry->goal = thresh - thresh / 8; + __entry->dirty = dirty; + __entry->bdi_goal = bdi_thresh - bdi_thresh / 8; + __entry->bdi_dirty = bdi_dirty; + __entry->avg_dirty = dirty; + __entry->base_bw = KBps(bdi->pages_per_s); + __entry->task_bw = task_bw << 2; /* KB write_chunk */ + __entry->dirtied = dirtied; + __entry->think = 0; + __entry->period = 0; + __entry->pause = pause * 1000 / HZ; + __entry->paused = (jiffies - start_time) * 1000 / HZ; + ), + + + TP_printk("bdi %s: bdi_weight=%lu task_weight=%lu " + "limit=%lu goal=%lu dirty=%lu " + "bdi_goal=%lu bdi_dirty=%lu avg_dirty=%lu " + "base_bw=%lu task_bw=%lu " + "dirtied=%lu " + "period=%lu think=%ld pause=%ld paused=%lu", + __entry->bdi, + __entry->bdi_weight, + __entry->task_weight, + __entry->limit, + __entry->goal, + __entry->dirty, + __entry->bdi_goal, + __entry->bdi_dirty, + __entry->avg_dirty, + __entry->base_bw, /* base throttle bandwidth */ + __entry->task_bw, /* task throttle bandwidth */ + __entry->dirtied, + __entry->period, /* ms */ + __entry->think, /* ms */ + __entry->pause, /* ms */ + __entry->paused /* ms */ + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), --- linux-2.6.orig/mm/page-writeback.c 2011-03-08 00:19:02.000000000 -0800 +++ linux-2.6/mm/page-writeback.c 2011-03-08 09:34:28.000000000 -0800 @@ -248,7 +248,7 @@ void task_dirty_inc(struct task_struct * /* * Obtain an accurate fraction of the BDI's portion. */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, +void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { if (bdi_cap_writeback_dirty(bdi)) { @@ -260,7 +260,7 @@ static void bdi_writeout_fraction(struct } } -static inline void task_dirties_fraction(struct task_struct *tsk, +void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { prop_fraction_single(&vm_dirties, &tsk->dirties, @@ -727,6 +727,7 @@ static void balance_dirty_pages(struct a struct balance_waiter bw; struct dirty_limit_state st; int dirty_exceeded = check_dirty_limits(bdi, &st); + unsigned long start_time = jiffies; if (dirty_exceeded < DIRTY_MAY_EXCEED_LIMIT || (dirty_exceeded == DIRTY_MAY_EXCEED_LIMIT && @@ -745,6 +746,18 @@ static void balance_dirty_pages(struct a */ if (!laptop_mode && dirty_exceeded == DIRTY_EXCEED_BACKGROUND) bdi_start_background_writeback(bdi); + if (dirty_exceeded == DIRTY_MAY_EXCEED_LIMIT) + trace_balance_dirty_pages(bdi, + st.dirty_thresh, + st.bdi_thresh, + st.nr_reclaimable + st.nr_writeback, + st.bdi_nr_reclaimable + st.bdi_nr_writeback, + write_chunk, /* show as task_bw */ + current->nr_dirtied, + bdi->balance_waiters, /* show as period */ + jiffies - start_time, + start_time); + current->nr_dirtied = 0; return; } @@ -788,6 +801,17 @@ static void balance_dirty_pages(struct a */ WARN_ON(!list_empty(&bw.bw_list)); trace_writeback_balance_dirty_pages_woken(bdi); + trace_balance_dirty_pages(bdi, + st.dirty_thresh, + st.bdi_thresh, + st.nr_reclaimable + st.nr_writeback, + st.bdi_nr_reclaimable + st.bdi_nr_writeback, + write_chunk, /* show as task_bw */ + current->nr_dirtied, + bdi->balance_waiters, /* show as period */ + jiffies - start_time, + start_time); + current->nr_dirtied = 0; } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -833,6 +857,7 @@ void balance_dirty_pages_ratelimited_nr( preempt_disable(); p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { ratelimit = sync_writeback_pages(*p); *p = 0; --- linux-2.6.orig/include/linux/writeback.h 2011-03-08 00:19:02.000000000 -0800 +++ linux-2.6/include/linux/writeback.h 2011-03-08 05:01:03.000000000 -0800 @@ -128,6 +128,11 @@ void global_dirty_limits(unsigned long * unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator); +void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator); + void page_writeback_init(void); void distribute_page_completions(struct work_struct *work); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, --- linux-2.6.orig/include/linux/sched.h 2011-03-08 00:11:22.000000000 -0800 +++ linux-2.6/include/linux/sched.h 2011-03-08 05:01:03.000000000 -0800 @@ -1487,6 +1487,7 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + int nr_dirtied; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT];