Message-ID: <20140113020132.GO1992@bbox>
Date:	Mon, 13 Jan 2014 11:01:32 +0900
From:	Minchan Kim <minchan@...nel.org>
To:	Johannes Weiner <hannes@...xchg.org>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Andi Kleen <andi@...stfloor.org>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Bob Liu <bob.liu@...cle.com>,
	Christoph Hellwig <hch@...radead.org>,
	Dave Chinner <david@...morbit.com>,
	Greg Thelen <gthelen@...gle.com>,
	Hugh Dickins <hughd@...gle.com>, Jan Kara <jack@...e.cz>,
	KOSAKI Motohiro <kosaki.motohiro@...fujitsu.com>,
	Luigi Semenzato <semenzato@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Metin Doslu <metin@...usdata.com>,
	Michel Lespinasse <walken@...gle.com>,
	Ozgun Erdogan <ozgun@...usdata.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Rik van Riel <riel@...hat.com>,
	Roman Gushchin <klamm@...dex-team.ru>,
	Ryan Mallon <rmallon@...il.com>, Tejun Heo <tj@...nel.org>,
	Vlastimil Babka <vbabka@...e.cz>, linux-mm@...ck.org,
	linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 5/9] mm + fs: prepare for non-page entries in page cache
 radix trees

On Fri, Jan 10, 2014 at 01:10:39PM -0500, Johannes Weiner wrote:
> shmem mappings already contain exceptional entries where swap slot
> information is remembered.
> 
> To be able to store eviction information for regular page cache,
> prepare every site dealing with the radix trees directly to handle
> entries other than pages.
> 
> The common lookup functions will filter out non-page entries and
> return NULL for page cache holes, just as before.  But provide a raw
> version of the API which returns non-page entries as well, and switch
> shmem over to use it.
> 
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
Reviewed-by: Minchan Kim <minchan@...nel.org>

Below are just nitpicks.

> ---
>  fs/btrfs/compression.c   |   2 +-
>  include/linux/mm.h       |   8 ++
>  include/linux/pagemap.h  |  15 ++--
>  include/linux/pagevec.h  |   3 +
>  include/linux/shmem_fs.h |   1 +
>  mm/filemap.c             | 196 +++++++++++++++++++++++++++++++++++++++++------
>  mm/mincore.c             |  20 +++--
>  mm/readahead.c           |   2 +-
>  mm/shmem.c               |  97 +++++------------------
>  mm/swap.c                |  47 ++++++++++++
>  mm/truncate.c            |  73 ++++++++++++++----
>  11 files changed, 336 insertions(+), 128 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 6aad98cb343f..c88316587900 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  		rcu_read_lock();
>  		page = radix_tree_lookup(&mapping->page_tree, pg_index);
>  		rcu_read_unlock();
> -		if (page) {
> +		if (page && !radix_tree_exceptional_entry(page)) {
>  			misses++;
>  			if (misses > 4)
>  				break;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8b6e55ee8855..c09ef3ae55bc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -906,6 +906,14 @@ extern void show_free_areas(unsigned int flags);
>  extern bool skip_free_areas_node(unsigned int flags, int nid);
>  
>  int shmem_zero_setup(struct vm_area_struct *);
> +#ifdef CONFIG_SHMEM
> +bool shmem_mapping(struct address_space *mapping);
> +#else
> +static inline bool shmem_mapping(struct address_space *mapping)
> +{
> +	return false;
> +}
> +#endif
>  
>  extern int can_do_mlock(void);
>  extern int user_shm_lock(size_t, struct user_struct *);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c73130c607c4..b6854b7c58cb 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
>  pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  			     pgoff_t index, unsigned long max_scan);
>  
> -extern struct page * find_get_page(struct address_space *mapping,
> -				pgoff_t index);
> -extern struct page * find_lock_page(struct address_space *mapping,
> -				pgoff_t index);
> -extern struct page * find_or_create_page(struct address_space *mapping,
> -				pgoff_t index, gfp_t gfp_mask);
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_or_create_page(struct address_space *mapping, pgoff_t index,
> +				 gfp_t gfp_mask);
> +unsigned __find_get_pages(struct address_space *mapping, pgoff_t start,
> +			  unsigned int nr_pages, struct page **pages,
> +			  pgoff_t *indices);
>  unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>  			unsigned int nr_pages, struct page **pages);
>  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
> diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
> index e4dbfab37729..3c6b8b1e945b 100644
> --- a/include/linux/pagevec.h
> +++ b/include/linux/pagevec.h
> @@ -22,6 +22,9 @@ struct pagevec {
>  
>  void __pagevec_release(struct pagevec *pvec);
>  void __pagevec_lru_add(struct pagevec *pvec);
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
> +			  pgoff_t start, unsigned nr_pages, pgoff_t *indices);
> +void pagevec_remove_exceptionals(struct pagevec *pvec);
>  unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
>  		pgoff_t start, unsigned nr_pages);
>  unsigned pagevec_lookup_tag(struct pagevec *pvec,
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 30aa0dc60d75..deb49609cd36 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
>  					loff_t size, unsigned long flags);
>  extern int shmem_zero_setup(struct vm_area_struct *);
>  extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
> +extern bool shmem_mapping(struct address_space *mapping);
>  extern void shmem_unlock_mapping(struct address_space *mapping);
>  extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
>  					pgoff_t index, gfp_t gfp_mask);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 0746b7a4658f..23eb3be27205 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -446,6 +446,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL_GPL(replace_page_cache_page);
>  
> +static int page_cache_tree_insert(struct address_space *mapping,
> +				  struct page *page)
> +{
> +	void **slot;
> +	int error;
> +
> +	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
> +	if (slot) {
> +		void *p;
> +
> +		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
> +		if (!radix_tree_exceptional_entry(p))
> +		    return -EEXIST;
> +		radix_tree_replace_slot(slot, page);
> +		mapping->nrpages++;
> +		return 0;
> +	}
> +	error = radix_tree_insert(&mapping->page_tree, page->index, page);
> +	if (!error)
> +		mapping->nrpages++;
> +	return error;
> +}
> +
>  /**
>   * add_to_page_cache_locked - add a locked page to the pagecache
>   * @page:	page to add
> @@ -480,11 +503,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
>  	page->index = offset;
>  
>  	spin_lock_irq(&mapping->tree_lock);
> -	error = radix_tree_insert(&mapping->page_tree, offset, page);
> +	error = page_cache_tree_insert(mapping, page);
>  	radix_tree_preload_end();
>  	if (unlikely(error))
>  		goto err_insert;
> -	mapping->nrpages++;
>  	__inc_zone_page_state(page, NR_FILE_PAGES);
>  	spin_unlock_irq(&mapping->tree_lock);
>  	trace_mm_filemap_add_to_page_cache(page);
> @@ -712,7 +734,10 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
>  	unsigned long i;
>  
>  	for (i = 0; i < max_scan; i++) {
> -		if (!radix_tree_lookup(&mapping->page_tree, index))
> +		struct page *page;
> +
> +		page = radix_tree_lookup(&mapping->page_tree, index);
> +		if (!page || radix_tree_exceptional_entry(page))
>  			break;
>  		index++;
>  		if (index == 0)
> @@ -750,7 +775,10 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  	unsigned long i;
>  
>  	for (i = 0; i < max_scan; i++) {
> -		if (!radix_tree_lookup(&mapping->page_tree, index))
> +		struct page *page;
> +
> +		page = radix_tree_lookup(&mapping->page_tree, index);
> +		if (!page || radix_tree_exceptional_entry(page))
>  			break;
>  		index--;
>  		if (index == ULONG_MAX)
> @@ -762,14 +790,19 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  EXPORT_SYMBOL(page_cache_prev_hole);
>  
>  /**
> - * find_get_page - find and get a page reference
> + * __find_get_page - find and get a page reference
>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Is there a pagecache struct page at the given (mapping, offset) tuple?
> - * If yes, increment its refcount and return it; if no, return NULL.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *
> + * Otherwise, %NULL is returned.
>   */
> -struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset)
>  {
>  	void **pagep;
>  	struct page *page;
> @@ -810,24 +843,49 @@ out:
>  
>  	return page;
>  }
> +EXPORT_SYMBOL(__find_get_page);
> +
> +/**
> + * find_get_page - find and get a page reference
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * Otherwise, %NULL is returned.
> + */
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +{
> +	struct page *page = __find_get_page(mapping, offset);
> +
> +	if (radix_tree_exceptional_entry(page))
> +		page = NULL;
> +	return page;
> +}
>  EXPORT_SYMBOL(find_get_page);
>  
>  /**
> - * find_lock_page - locate, pin and lock a pagecache page
> + * __find_lock_page - locate, pin and lock a pagecache page
>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Locates the desired pagecache page, locks it, increments its reference
> - * count and returns its address.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *
> + * Otherwise, %NULL is returned.
>   *
> - * Returns zero if the page was not present. find_lock_page() may sleep.
> + * __find_lock_page() may sleep.
>   */
> -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset)
>  {
>  	struct page *page;
> -
>  repeat:
> -	page = find_get_page(mapping, offset);
> +	page = __find_get_page(mapping, offset);
>  	if (page && !radix_tree_exception(page)) {
>  		lock_page(page);
>  		/* Has the page been truncated? */
> @@ -840,6 +898,29 @@ repeat:
>  	}
>  	return page;
>  }
> +EXPORT_SYMBOL(__find_lock_page);
> +
> +/**
> + * find_lock_page - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * Otherwise, %NULL is returned.
> + *
> + * find_lock_page() may sleep.
> + */
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +{
> +	struct page *page = __find_lock_page(mapping, offset);
> +
> +	if (radix_tree_exceptional_entry(page))
> +		page = NULL;
> +	return page;
> +}
>  EXPORT_SYMBOL(find_lock_page);
>  
>  /**
> @@ -848,16 +929,18 @@ EXPORT_SYMBOL(find_lock_page);
>   * @index: the page's index into the mapping
>   * @gfp_mask: page allocation mode
>   *
> - * Locates a page in the pagecache.  If the page is not present, a new page
> - * is allocated using @gfp_mask and is added to the pagecache and to the VM's
> - * LRU list.  The returned page is locked and has its reference count
> - * incremented.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
>   *
> - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
> - * allocation!
> + * If the page is not present, a new page is allocated using @gfp_mask
> + * and added to the page cache and the VM's LRU list.  The page is
> + * returned locked and with an increased refcount.
>   *
> - * find_or_create_page() returns the desired page's address, or zero on
> - * memory exhaustion.
> + * On memory exhaustion, %NULL is returned.
> + *
> + * find_or_create_page() may sleep, even if @gfp_flags specifies an
> + * atomic allocation!
>   */
>  struct page *find_or_create_page(struct address_space *mapping,
>  		pgoff_t index, gfp_t gfp_mask)
> @@ -890,6 +973,73 @@ repeat:
>  EXPORT_SYMBOL(find_or_create_page);
>  
>  /**
> + * __find_get_pages - gang pagecache lookup
> + * @mapping:	The address_space to search
> + * @start:	The starting page index
> + * @nr_pages:	The maximum number of pages
> + * @pages:	Where the resulting pages are placed

where is @indices?
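Something like this seems to be what's missing (the exact wording is just my guess):

	 * @indices:	Where the indices of the resulting entries are placed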

> + *
> + * __find_get_pages() will search for and return a group of up to
> + * @nr_pages pages in the mapping.  The pages are placed at @pages.
> + * __find_get_pages() takes a reference against the returned pages.
> + *
> + * The search returns a group of mapping-contiguous pages with ascending
> + * indexes.  There may be holes in the indices due to not-present pages.
> + *
> + * Any shadow entries of evicted pages are included in the returned
> + * array.
> + *
> + * __find_get_pages() returns the number of pages and shadow entries
> + * which were found.
> + */
> +unsigned __find_get_pages(struct address_space *mapping,
> +			  pgoff_t start, unsigned int nr_pages,
> +			  struct page **pages, pgoff_t *indices)
> +{
> +	void **slot;
> +	unsigned int ret = 0;
> +	struct radix_tree_iter iter;
> +
> +	if (!nr_pages)
> +		return 0;
> +
> +	rcu_read_lock();
> +restart:
> +	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> +		struct page *page;
> +repeat:
> +		page = radix_tree_deref_slot(slot);
> +		if (unlikely(!page))
> +			continue;
> +		if (radix_tree_exception(page)) {
> +			if (radix_tree_deref_retry(page))
> +				goto restart;
> +			/*
> +			 * Otherwise, we must be storing a swap entry
> +			 * here as an exceptional entry: so return it
> +			 * without attempting to raise page count.
> +			 */
> +			goto export;
> +		}
> +		if (!page_cache_get_speculative(page))
> +			goto repeat;
> +
> +		/* Has the page moved? */
> +		if (unlikely(page != *slot)) {
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +export:
> +		indices[ret] = iter.index;
> +		pages[ret] = page;
> +		if (++ret == nr_pages)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +/**
>   * find_get_pages - gang pagecache lookup
>   * @mapping:	The address_space to search
>   * @start:	The starting page index
> diff --git a/mm/mincore.c b/mm/mincore.c
> index da2be56a7b8f..ad411ec86a55 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
>  	 * any other file mapping (ie. marked !present and faulted in with
>  	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
>  	 */
> -	page = find_get_page(mapping, pgoff);
>  #ifdef CONFIG_SWAP
> -	/* shmem/tmpfs may return swap: account for swapcache page too. */
> -	if (radix_tree_exceptional_entry(page)) {
> -		swp_entry_t swap = radix_to_swp_entry(page);
> -		page = find_get_page(swap_address_space(swap), swap.val);
> -	}
> +	if (shmem_mapping(mapping)) {
> +		page = __find_get_page(mapping, pgoff);
> +		/*
> +		 * shmem/tmpfs may return swap: account for swapcache
> +		 * page too.
> +		 */
> +		if (radix_tree_exceptional_entry(page)) {
> +			swp_entry_t swp = radix_to_swp_entry(page);
> +			page = find_get_page(swap_address_space(swp), swp.val);
> +		}
> +	} else
> +		page = find_get_page(mapping, pgoff);
> +#else
> +	page = find_get_page(mapping, pgoff);
>  #endif
>  	if (page) {
>  		present = PageUptodate(page);
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 9eeeeda4ac0e..912c00358112 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -179,7 +179,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>  		rcu_read_lock();
>  		page = radix_tree_lookup(&mapping->page_tree, page_offset);
>  		rcu_read_unlock();
> -		if (page)
> +		if (page && !radix_tree_exceptional_entry(page))
>  			continue;
>  
>  		page = page_cache_alloc_readahead(mapping);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7c67249d6f28..1f4b65f7b831 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -329,56 +329,6 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
>  }
>  
>  /*
> - * Like find_get_pages, but collecting swap entries as well as pages.
> - */
> -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
> -					pgoff_t start, unsigned int nr_pages,
> -					struct page **pages, pgoff_t *indices)
> -{
> -	void **slot;
> -	unsigned int ret = 0;
> -	struct radix_tree_iter iter;
> -
> -	if (!nr_pages)
> -		return 0;
> -
> -	rcu_read_lock();
> -restart:
> -	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> -		struct page *page;
> -repeat:
> -		page = radix_tree_deref_slot(slot);
> -		if (unlikely(!page))
> -			continue;
> -		if (radix_tree_exception(page)) {
> -			if (radix_tree_deref_retry(page))
> -				goto restart;
> -			/*
> -			 * Otherwise, we must be storing a swap entry
> -			 * here as an exceptional entry: so return it
> -			 * without attempting to raise page count.
> -			 */
> -			goto export;
> -		}
> -		if (!page_cache_get_speculative(page))
> -			goto repeat;
> -
> -		/* Has the page moved? */
> -		if (unlikely(page != *slot)) {
> -			page_cache_release(page);
> -			goto repeat;
> -		}
> -export:
> -		indices[ret] = iter.index;
> -		pages[ret] = page;
> -		if (++ret == nr_pages)
> -			break;
> -	}
> -	rcu_read_unlock();
> -	return ret;
> -}
> -
> -/*
>   * Remove swap entry from radix tree, free the swap and its page cache.
>   */
>  static int shmem_free_swap(struct address_space *mapping,
> @@ -396,21 +346,6 @@ static int shmem_free_swap(struct address_space *mapping,
>  }
>  
>  /*
> - * Pagevec may contain swap entries, so shuffle up pages before releasing.
> - */
> -static void shmem_deswap_pagevec(struct pagevec *pvec)
> -{
> -	int i, j;
> -
> -	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> -		struct page *page = pvec->pages[i];
> -		if (!radix_tree_exceptional_entry(page))
> -			pvec->pages[j++] = page;
> -	}
> -	pvec->nr = j;
> -}
> -
> -/*
>   * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
>   */
>  void shmem_unlock_mapping(struct address_space *mapping)
> @@ -428,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
>  		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
>  		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
>  		 */
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +		pvec.nr = __find_get_pages(mapping, index,
>  					PAGEVEC_SIZE, pvec.pages, indices);
>  		if (!pvec.nr)
>  			break;
>  		index = indices[pvec.nr - 1] + 1;
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		check_move_unevictable_pages(pvec.pages, pvec.nr);
>  		pagevec_release(&pvec);
>  		cond_resched();
> @@ -465,9 +400,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  	pagevec_init(&pvec, 0);
>  	index = start;
>  	while (index < end) {
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> -				min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -							pvec.pages, indices);
> +		pvec.nr = __find_get_pages(mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			pvec.pages, indices);
>  		if (!pvec.nr)
>  			break;
>  		mem_cgroup_uncharge_start();
> @@ -496,7 +431,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			}
>  			unlock_page(page);
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -534,9 +469,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  	index = start;
>  	for ( ; ; ) {
>  		cond_resched();
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +
> +		pvec.nr = __find_get_pages(mapping, index,
>  				min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -							pvec.pages, indices);
> +				pvec.pages, indices);
>  		if (!pvec.nr) {
>  			if (index == start || unfalloc)
>  				break;
> @@ -544,7 +480,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			continue;
>  		}
>  		if ((index == start || unfalloc) && indices[0] >= end) {
> -			shmem_deswap_pagevec(&pvec);
> +			pagevec_remove_exceptionals(&pvec);
>  			pagevec_release(&pvec);
>  			break;
>  		}
> @@ -573,7 +509,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			}
>  			unlock_page(page);
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		index++;
> @@ -1081,7 +1017,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>  		return -EFBIG;
>  repeat:
>  	swap.val = 0;
> -	page = find_lock_page(mapping, index);
> +	page = __find_lock_page(mapping, index);
>  	if (radix_tree_exceptional_entry(page)) {
>  		swap = radix_to_swp_entry(page);
>  		page = NULL;
> @@ -1418,6 +1354,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>  	return inode;
>  }
>  
> +bool shmem_mapping(struct address_space *mapping)
> +{
> +	return mapping->backing_dev_info == &shmem_backing_dev_info;
> +}
> +
>  #ifdef CONFIG_TMPFS
>  static const struct inode_operations shmem_symlink_inode_operations;
>  static const struct inode_operations shmem_short_symlink_operations;
> @@ -1730,7 +1671,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
>  	pagevec_init(&pvec, 0);
>  	pvec.nr = 1;		/* start small: we may be there already */
>  	while (!done) {
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +		pvec.nr = __find_get_pages(mapping, index,
>  					pvec.nr, pvec.pages, indices);
>  		if (!pvec.nr) {
>  			if (whence == SEEK_DATA)
> @@ -1757,7 +1698,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
>  				break;
>  			}
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		pvec.nr = PAGEVEC_SIZE;
>  		cond_resched();
> diff --git a/mm/swap.c b/mm/swap.c
> index 759c3caf44bd..f624e5b4b724 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -894,6 +894,53 @@ EXPORT_SYMBOL(__pagevec_lru_add);
>  
>  /**
>   * pagevec_lookup - gang pagecache lookup

      __pagevec_lookup?

> + * @pvec:	Where the resulting entries are placed
> + * @mapping:	The address_space to search
> + * @start:	The starting entry index
> + * @nr_pages:	The maximum number of entries

      missing @indices?
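Same here; presumably something like:

	 * @indices:	Where the indices of the found entries are placed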

> + *
> + * pagevec_lookup() will search for and return a group of up to
> + * @nr_pages pages and shadow entries in the mapping.  All entries are
> + * placed in @pvec.  pagevec_lookup() takes a reference against actual
> + * pages in @pvec.
> + *
> + * The search returns a group of mapping-contiguous entries with
> + * ascending indexes.  There may be holes in the indices due to
> + * not-present entries.
> + *
> + * pagevec_lookup() returns the number of entries which were found.

      __pagevec_lookup

> + */
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
> +			  pgoff_t start, unsigned nr_pages, pgoff_t *indices)
> +{
> +	pvec->nr = __find_get_pages(mapping, start, nr_pages,
> +				    pvec->pages, indices);
> +	return pagevec_count(pvec);
> +}
> +
> +/**
> + * pagevec_remove_exceptionals - pagevec exceptionals pruning
> + * @pvec:	The pagevec to prune
> + *
> + * __pagevec_lookup() fills both pages and exceptional radix tree
> + * entries into the pagevec.  This function prunes all exceptionals
> + * from @pvec without leaving holes, so that it can be passed on to
> + * other pagevec operations.
> + */
> +void pagevec_remove_exceptionals(struct pagevec *pvec)
> +{
> +	int i, j;
> +
> +	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> +		struct page *page = pvec->pages[i];
> +		if (!radix_tree_exceptional_entry(page))
> +			pvec->pages[j++] = page;
> +	}
> +	pvec->nr = j;
> +}
> +
> +/**
> + * pagevec_lookup - gang pagecache lookup
>   * @pvec:	Where the resulting pages are placed
>   * @mapping:	The address_space to search
>   * @start:	The starting page index
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 353b683afd6e..b0f4d4bee8ab 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -22,6 +22,22 @@
>  #include <linux/cleancache.h>
>  #include "internal.h"
>  
> +static void clear_exceptional_entry(struct address_space *mapping,
> +				    pgoff_t index, void *entry)
> +{
> +	/* Handled by shmem itself */
> +	if (shmem_mapping(mapping))
> +		return;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */

Could you explain why the repeated spin_lock with irqs disabled isn't a problem
in the truncation path? With a pagevec full of exceptional entries,
clear_exceptional_entry() takes and drops mapping->tree_lock once per entry.

> +	radix_tree_delete_item(&mapping->page_tree, index, entry);
> +	spin_unlock_irq(&mapping->tree_lock);
> +}
>  
>  /**
>   * do_invalidatepage - invalidate part or all of a page
> @@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  	unsigned int	partial_start;	/* inclusive */
>  	unsigned int	partial_end;	/* exclusive */
>  	struct pagevec	pvec;
> +	pgoff_t		indices[PAGEVEC_SIZE];
>  	pgoff_t		index;
>  	int		i;
>  
> @@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  
>  	pagevec_init(&pvec, 0);
>  	index = start;
> -	while (index < end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +	while (index < end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index >= end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			if (!trylock_page(page))
>  				continue;
>  			WARN_ON(page->index != index);
> @@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  			truncate_inode_page(mapping, page);
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -307,14 +331,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  	index = start;
>  	for ( ; ; ) {
>  		cond_resched();
> -		if (!pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +		if (!__pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			indices)) {
>  			if (index == start)
>  				break;
>  			index = start;
>  			continue;
>  		}
> -		if (index == start && pvec.pages[0]->index >= end) {
> +		if (index == start && indices[0] >= end) {
>  			pagevec_release(&pvec);
>  			break;
>  		}
> @@ -323,16 +348,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index >= end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			lock_page(page);
>  			WARN_ON(page->index != index);
>  			wait_on_page_writeback(page);
>  			truncate_inode_page(mapping, page);
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		index++;
> @@ -375,6 +406,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
>  unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  		pgoff_t start, pgoff_t end)
>  {
> +	pgoff_t indices[PAGEVEC_SIZE];
>  	struct pagevec pvec;
>  	pgoff_t index = start;
>  	unsigned long ret;
> @@ -390,17 +422,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  	 */
>  
>  	pagevec_init(&pvec, 0);
> -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index > end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			if (!trylock_page(page))
>  				continue;
>  			WARN_ON(page->index != index);
> @@ -414,6 +452,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  				deactivate_page(page);
>  			count += ret;
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -481,6 +520,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
>  int invalidate_inode_pages2_range(struct address_space *mapping,
>  				  pgoff_t start, pgoff_t end)
>  {
> +	pgoff_t indices[PAGEVEC_SIZE];
>  	struct pagevec pvec;
>  	pgoff_t index;
>  	int i;
> @@ -491,17 +531,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>  	cleancache_invalidate_inode(mapping);
>  	pagevec_init(&pvec, 0);
>  	index = start;
> -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index > end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			lock_page(page);
>  			WARN_ON(page->index != index);
>  			if (page->mapping != mapping) {
> @@ -539,6 +585,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>  				ret = ret2;
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> -- 
> 1.8.4.2
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@...ck.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@...ck.org"> email@...ck.org </a>

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
