From: Wu Fengguang The read-ahead logic is called when the reading hits - a PG_readahead marked page; - a non-present page. ra.prev_page should be properly setup on entrance, and readahead_cache_hit() should be called on every page reference as a feedback. This call scheme achieves the following goals: - makes all stateful/stateless methods happy; - eliminates the cache hit problem naturally; - lives in harmony with application managed read-aheads via fadvise/madvise. [efault@gmx.de: fix leak encountered with rpm -qaV] Signed-off-by: Wu Fengguang Signed-off-by: Mike Galbraith Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++ mm/filemap.c | 42 +++++++++++- mm/readahead.c | 143 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 4 deletions(-) diff -puN include/linux/mm.h~readahead-call-scheme include/linux/mm.h --- a/include/linux/mm.h~readahead-call-scheme +++ a/include/linux/mm.h @@ -1069,6 +1069,22 @@ unsigned long page_cache_readahead(struc void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, + struct file *filp, + struct page *page, + pgoff_t offset, + unsigned long size); + +#if defined(CONFIG_DEBUG_READAHEAD) +void readahead_cache_hit(struct file_ra_state *ra, struct page *page); +#else +static inline void readahead_cache_hit(struct file_ra_state *ra, + struct page *page) +{ +} +#endif #ifdef CONFIG_ADAPTIVE_READAHEAD extern int readahead_ratio; diff -puN mm/filemap.c~readahead-call-scheme mm/filemap.c --- a/mm/filemap.c~readahead-call-scheme +++ a/mm/filemap.c @@ -945,16 +945,33 @@ void do_generic_mapping_read(struct addr nr = nr - offset; cond_resched(); - if (index == next_index) + + if (!prefer_adaptive_readahead() && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); 
find_page: page = find_get_page(mapping, index); + if (prefer_adaptive_readahead()) { + if (unlikely(page == NULL)) { + ra.prev_page = prev_index; + page_cache_readahead_adaptive(mapping, + &ra, filp, NULL, + index, last_index - index); + page = find_get_page(mapping, index); + } else if (PageReadahead(page)) { + ra.prev_page = prev_index; + page_cache_readahead_adaptive(mapping, + &ra, filp, page, + index, last_index - index); + } + } if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, &ra, index); goto no_cached_page; } + readahead_cache_hit(&ra, page); if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -1102,6 +1119,8 @@ no_cached_page: out: *_ra = ra; + if (prefer_adaptive_readahead()) + _ra->prev_page = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) @@ -1372,6 +1391,7 @@ struct page *filemap_nopage(struct vm_ar unsigned long size, pgoff; int did_readaround = 0, majmin = VM_FAULT_MINOR; + ra->flags |= RA_FLAG_MMAP; pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: @@ -1389,7 +1409,7 @@ retry_all: * * For sequential accesses, we use the generic readahead logic. 
*/ - if (VM_SequentialReadHint(area)) + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area)) page_cache_readahead(mapping, ra, file, pgoff, 1); /* @@ -1397,11 +1417,22 @@ retry_all: */ retry_find: page = find_get_page(mapping, pgoff); + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) { + if (!page) { + page_cache_readahead_adaptive(mapping, ra, file, NULL, + pgoff, 1); + page = find_get_page(mapping, pgoff); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, ra, file, page, + pgoff, 1); + } + } if (!page) { unsigned long ra_pages; if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, ra, pgoff); goto no_cached_page; } ra->mmap_miss++; @@ -1437,6 +1468,7 @@ retry_find: if (!did_readaround) ra->mmap_hit++; + readahead_cache_hit(ra, page); /* * Ok, found a page in the page cache, now we need to check @@ -1452,6 +1484,8 @@ success: mark_page_accessed(page); if (type) *type = majmin; + if (prefer_adaptive_readahead()) + ra->prev_page = page->index; return page; outside_data_content: diff -puN mm/readahead.c~readahead-call-scheme mm/readahead.c --- a/mm/readahead.c~readahead-call-scheme +++ a/mm/readahead.c @@ -1559,6 +1559,149 @@ static inline void get_readahead_bounds( #endif /* CONFIG_ADAPTIVE_READAHEAD */ +/** + * page_cache_readahead_adaptive - thrashing safe adaptive read-ahead + * @mapping, @ra, @filp, @offset, @req_size: the same as page_cache_readahead() + * @page: the page at @offset, or NULL if non-present + * + * page_cache_readahead_adaptive() is the entry point of the adaptive + * read-ahead logic. It tries a set of methods in turn to determine the + * appropriate readahead action and submits the readahead I/O. + * + * This function is expected to be called on two conditions: + * 1. @page == NULL + * A cache miss happened, some pages have to be read in + * 2. 
@page != NULL && PageReadahead(@page) + * A look-ahead mark encountered, this is set by a previous read-ahead + * invocation to instruct the caller to give the function a chance to + * check up and do next read-ahead in advance. + */ +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *page, + pgoff_t offset, unsigned long req_size) +{ + unsigned long ra_size; + unsigned long ra_min; + unsigned long ra_max; + int ret; + + if (page) { + ClearPageReadahead(page); + + /* + * Defer read-ahead on IO congestion. + */ + if (bdi_read_congested(mapping->backing_dev_info)) { + ra_account(ra, RA_EVENT_IO_CONGESTION, req_size); + return 0; + } + } + + if (page) + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, ra_lookahead_size(ra)); + else if (offset) + ra_account(ra, RA_EVENT_CACHE_MISS, req_size); + + get_readahead_bounds(ra, &ra_min, &ra_max); + + /* read-ahead disabled? */ + if (unlikely(!ra_max || !readahead_ratio)) { + ra_size = max_sane_readahead(req_size); + goto readit; + } + + /* + * Start of file. + */ + if (offset == 0) + return initial_readahead(mapping, filp, ra, req_size); + + /* + * State based sequential read-ahead. + */ + if (offset == ra->prev_page + 1 && + offset == ra->lookahead_index && + !debug_option(disable_stateful_method)) + return state_based_readahead(mapping, filp, ra, page, + offset, req_size, ra_max); + + /* + * Recover from possible thrashing. + */ + if (!page && offset == ra->prev_page + 1 && ra_has_index(ra, offset)) + return thrashing_recovery_readahead(mapping, filp, ra, + offset, ra_max); + + /* + * Backward read-ahead. + */ + if (!page && try_backward_prefetching(ra, offset, req_size, ra_max)) + return ra_submit(ra, mapping, filp); + + /* + * Context based sequential read-ahead. 
+ */ + ret = try_context_based_readahead(mapping, ra, page, + offset, ra_min, ra_max); + if (ret > 0) + return ra_submit(ra, mapping, filp); + if (ret < 0) + return 0; + + /* No action on look-ahead time? */ + if (page) { + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION, + ra->readahead_index - offset); + return 0; + } + + /* + * Random read. + */ + ra_size = min(req_size, ra_max); +readit: + ra_size = __do_page_cache_readahead(mapping, filp, offset, ra_size, 0); + + ra_account(ra, RA_EVENT_RANDOM_READ, ra_size); + dprintk("random_read(ino=%lu, req=%lu+%lu) = %lu\n", + mapping->host->i_ino, offset, req_size, ra_size); + + return ra_size; +} +EXPORT_SYMBOL_GPL(page_cache_readahead_adaptive); + +#if defined(CONFIG_DEBUG_READAHEAD) +/** + * readahead_cache_hit - adaptive read-ahead feedback function + * @ra: file_ra_state which holds the readahead state + * @page: the page just accessed + * + * This is the optional feedback route of the adaptive read-ahead logic. + * It must be called on every access on the read-ahead pages. + */ +void readahead_cache_hit(struct file_ra_state *ra, struct page *page) +{ + if (!prefer_adaptive_readahead()) + return; + + if (PageActive(page) || PageReferenced(page)) + return; + + if (!PageUptodate(page)) + ra_account(ra, RA_EVENT_IO_BLOCK, 1); + + if (!ra_has_index(ra, page->index)) + return; + + if (page->index >= ra->ra_index) + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); + else + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); +} +#endif /* CONFIG_DEBUG_READAHEAD */ + /* * Read-ahead events accounting. */ _