Readahead support for the variable order page cache

Readahead is now dependent on the page size: for larger page sizes we
want less readahead.

Add a parameter to max_sane_readahead() specifying the page order and
update the code in mm/readahead.c to be aware of variable page sizes.

Mark the 2M readahead constant as a potential future problem.

Signed-off-by: Christoph Lameter

---
 include/linux/mm.h |    2 +-
 mm/fadvise.c       |    5 +++--
 mm/filemap.c       |    5 +++--
 mm/madvise.c       |    4 +++-
 mm/readahead.c     |   27 +++++++++++++++++----------
 5 files changed, 27 insertions(+), 16 deletions(-)

Index: linux-2.6.21-rc7-mm2/include/linux/mm.h
===================================================================
--- linux-2.6.21-rc7-mm2.orig/include/linux/mm.h	2007-05-01 10:34:04.000000000 -0700
+++ linux-2.6.21-rc7-mm2/include/linux/mm.h	2007-05-01 15:48:19.000000000 -0700
@@ -1184,7 +1184,7 @@ unsigned long page_cache_readahead(struc
 			unsigned long size);
 void handle_ra_miss(struct address_space *mapping,
 		    struct file_ra_state *ra, pgoff_t offset);
-unsigned long max_sane_readahead(unsigned long nr);
+unsigned long max_sane_readahead(unsigned long nr, int order);
 
 #ifdef CONFIG_ADAPTIVE_READAHEAD
 unsigned long
Index: linux-2.6.21-rc7-mm2/mm/fadvise.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/fadvise.c	2007-05-01 15:46:40.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/fadvise.c	2007-05-01 15:48:19.000000000 -0700
@@ -86,10 +86,11 @@ asmlinkage long sys_fadvise64_64(int fd,
 		nrpages = end_index - start_index + 1;
 		if (!nrpages)
 			nrpages = ~0UL;
-		
+
 		ret = force_page_cache_readahead(mapping, file,
 						start_index,
-						max_sane_readahead(nrpages));
+						max_sane_readahead(nrpages,
+							mapping_order(mapping)));
 		if (ret > 0)
 			ret = 0;
 		break;
Index: linux-2.6.21-rc7-mm2/mm/filemap.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/filemap.c	2007-05-01 15:47:35.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/filemap.c	2007-05-01 15:48:19.000000000 -0700
@@ -1328,7 +1328,7 @@ do_readahead(struct address_space *mappi
 		return -EINVAL;
 
 	force_page_cache_readahead(mapping, filp, index,
-					max_sane_readahead(nr));
+			max_sane_readahead(nr, mapping_order(mapping)));
 	return 0;
 }
 
@@ -1470,7 +1470,8 @@ retry_find:
 			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
-		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
+		ra_pages = max_sane_readahead(file->f_ra.ra_pages,
+						mapping_order(mapping));
 		if (ra_pages) {
 			pgoff_t start = 0;
Index: linux-2.6.21-rc7-mm2/mm/madvise.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/madvise.c	2007-04-27 22:51:29.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/madvise.c	2007-05-01 15:48:19.000000000 -0700
@@ -124,7 +124,9 @@ static long madvise_willneed(struct vm_a
 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
 	force_page_cache_readahead(file->f_mapping,
-			file, start, max_sane_readahead(end - start));
+			file, start,
+			max_sane_readahead(end - start,
+				mapping_order(file->f_mapping)));
 	return 0;
 }
 
Index: linux-2.6.21-rc7-mm2/mm/readahead.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/readahead.c	2007-04-27 22:51:29.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/readahead.c	2007-05-01 15:48:19.000000000 -0700
@@ -143,7 +143,7 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
  */
 static inline unsigned long get_max_readahead(struct file_ra_state *ra)
 {
-	return max_sane_readahead(ra->ra_pages);
+	return max_sane_readahead(ra->ra_pages, 0);
 }
 
 static inline unsigned long get_min_readahead(struct file_ra_state *ra)
@@ -251,7 +251,7 @@ int read_cache_pages(struct address_spac
 			put_pages_list(pages);
 			break;
 		}
-		task_io_account_read(PAGE_CACHE_SIZE);
+		task_io_account_read(page_cache_size(mapping));
 	}
 	pagevec_lru_add(&lru_pvec);
 	return ret;
@@ -380,7 +380,7 @@ __do_page_cache_readahead(struct address
 	if (isize == 0)
 		goto out;
 
-	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+	end_index = page_cache_index(mapping, isize - 1);
 
 	/*
 	 * Preallocate as many pages as we will need.
@@ -437,7 +437,11 @@ int force_page_cache_readahead(struct ad
 	while (nr_to_read) {
 		int err;
 
-		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+		/*
+		 * FIXME: The 2M constant here may prove to be a problem
+		 * if page sizes become bigger than one megabyte.
+		 */
+		unsigned long this_chunk = page_cache_index(mapping, 2 * 1024 * 1024);
 
 		if (this_chunk > nr_to_read)
 			this_chunk = nr_to_read;
@@ -695,13 +699,15 @@ void handle_ra_miss(struct address_space
 }
 
 /*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * Given a desired number of readahead pages of the given order, return a
  * sensible upper limit.
  */
-unsigned long max_sane_readahead(unsigned long nr)
+unsigned long max_sane_readahead(unsigned long nr, int order)
 {
-	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
-		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+	unsigned long base_pages = node_page_state(numa_node_id(), NR_INACTIVE)
+			+ node_page_state(numa_node_id(), NR_FREE_PAGES);
+
+	return min(nr, (base_pages / 2) >> order);
 }
 
 /*
@@ -941,7 +947,8 @@ static unsigned long ra_submit(struct fi
 	int actual;
 
 	eof = /* it's a past-the-end index! */
-		DIV_ROUND_UP(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+		DIV_ROUND_UP(i_size_read(mapping->host),
+				page_cache_size(mapping));
 
 	if (unlikely(ra->ra_index >= eof))
 		return 0;
@@ -1421,7 +1428,7 @@ try_context_based_readahead(struct addre
 	 * select a conservative initial size, plus user prefered agressiveness.
 	 */
 	ra_min = min(req_size, MIN_RA_PAGES) +
-		readahead_hit_rate * 8192 / PAGE_CACHE_SIZE;
+		readahead_hit_rate * 8192 / page_cache_size(mapping);
 
 	/*
	 * Case r1: the same context info as s2, but not that obvious.
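
For reference, a minimal stand-alone sketch (not part of the patch) of the
scaling the new order parameter introduces in max_sane_readahead(): the cap
of half the node's inactive plus free base pages is shifted right by the
page order, so each increment of order halves the limit expressed in pages.
The model_ function and the sample page counts below are made up for
illustration; in the kernel the counts come from node_page_state().

	#include <stdio.h>

	/*
	 * Userspace model of max_sane_readahead() as changed by this
	 * patch. nr_inactive/nr_free stand in for the NR_INACTIVE and
	 * NR_FREE_PAGES node statistics and are counted in base
	 * (order-0) pages; the result is in pages of the given order.
	 */
	static unsigned long model_max_sane_readahead(unsigned long nr,
					int order,
					unsigned long nr_inactive,
					unsigned long nr_free)
	{
		unsigned long base_pages = nr_inactive + nr_free;
		unsigned long limit = (base_pages / 2) >> order;

		return nr < limit ? nr : limit;
	}

	int main(void)
	{
		/* Sample node: 100000 inactive + 60000 free base pages. */
		unsigned long inactive = 100000, free_pages = 60000;
		int order;

		/* An unbounded request (~0UL) is clipped to the cap. */
		for (order = 0; order <= 4; order++)
			printf("order %d: limit %lu pages\n", order,
			       model_max_sane_readahead(~0UL, order,
							inactive,
							free_pages));
		return 0;
	}

With these sample numbers the cap goes 80000, 40000, 20000, 10000, 5000
pages for orders 0 through 4, matching the stated intent that mappings
with larger pages get correspondingly less readahead when measured in
pages (the same amount measured in bytes).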