linux-kernel - Re: [rfc] possible page manipulation simplifications?

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20061204144005.GA22233@skynet.ie>
Date:	Mon, 4 Dec 2006 14:40:05 +0000
From:	mel@...net.ie (Mel Gorman)
To:	Nick Piggin <npiggin@...e.de>
Cc:	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Linux Memory Management List <linux-mm@...ck.org>
Subject: Re: [rfc] possible page manipulation simplifications?

On (02/12/06 13:15), Nick Piggin didst pronounce:
> Hi,
> 
> While working in this area, I noticed a few things we do that may not
> have a positive payoff under the most common conditions. Untested yet,
> and probably needs a bit of instrumentation, but it saves about half a
> K of code, lots of branches, and makes things look nicer. Any thoughts?
> 
> Quite a bit of code is used in maintaining these "cached pages" that are
> probably pretty unlikely to get used.
> 

I think you might be leaking now though. More comments below.

> Also, buffered write path (and others) uses its own LRU pagevec when we should
> be just using the per-CPU LRU pagevec (which will cut down on both data and
> code size cacheline footprint).
> 

Splitting the patch into two could be nice but it's grand for the
moment.

> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -686,26 +686,18 @@ EXPORT_SYMBOL(find_lock_page);
>  struct page *find_or_create_page(struct address_space *mapping,
>  		unsigned long index, gfp_t gfp_mask)
>  {
> -	struct page *page, *cached_page = NULL;
> +	struct page *page;
>  	int err;
>  repeat:
>  	page = find_lock_page(mapping, index);
>  	if (!page) {
> -		if (!cached_page) {
> -			cached_page = alloc_page(gfp_mask);
> -			if (!cached_page)
> -				return NULL;
> -		}
> -		err = add_to_page_cache_lru(cached_page, mapping,
> -					index, gfp_mask);
> -		if (!err) {
> -			page = cached_page;
> -			cached_page = NULL;
> -		} else if (err == -EEXIST)
> +		page = alloc_page(gfp_mask);
> +		if (!page)
> +			return NULL;
> +		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
> +		if (err == -EEXIST)
>  			goto repeat;

Lets say the alloc_page() succeeds, but add_to_page_cache_lru() returns
-EEXIST, we'll jump back to the repeat label which calls page =
find_lock_page(). We've lost the page at that point, right? If
add_to_page_cache_lru() returns any error in fact, the page leaks.

You appear to do the right thing later when you call
page_cache_release() on error adding to the page cache. This is probably
safe to do here. Sure, for non-EEXIST errors, you'll allocate the page
twice but you probably don't care if this path is very rarely executed.

>  	}
> -	if (cached_page)
> -		page_cache_release(cached_page);
>  	return page;
>  }
>  EXPORT_SYMBOL(find_or_create_page);
> @@ -891,11 +883,9 @@ void do_generic_mapping_read(struct addr
>  	unsigned long next_index;
>  	unsigned long prev_index;
>  	loff_t isize;
> -	struct page *cached_page;
>  	int error;
>  	struct file_ra_state ra = *_ra;
>  
> -	cached_page = NULL;
>  	index = *ppos >> PAGE_CACHE_SHIFT;
>  	next_index = index;
>  	prev_index = ra.prev_page;
> @@ -1059,14 +1049,12 @@ no_cached_page:
>  		 * Ok, it wasn't cached, so we need to create a new
>  		 * page..
>  		 */
> -		if (!cached_page) {
> -			cached_page = page_cache_alloc_cold(mapping);
> -			if (!cached_page) {
> -				desc->error = -ENOMEM;
> -				goto out;
> -			}
> +		page = page_cache_alloc_cold(mapping);
> +		if (!page) {
> +			desc->error = -ENOMEM;
> +			goto out;
>  		}
> -		error = add_to_page_cache_lru(cached_page, mapping,
> +		error = add_to_page_cache_lru(page, mapping,
>  						index, GFP_KERNEL);
>  		if (error) {
>  			if (error == -EEXIST)

I think this might be suffering similar problems with leaking pages.

> @@ -1074,8 +1062,6 @@ no_cached_page:
>  			desc->error = error;
>  			goto out;
>  		}
> -		page = cached_page;
> -		cached_page = NULL;
>  		goto readpage;
>  	}
>  
> @@ -1083,8 +1069,6 @@ out:
>  	*_ra = ra;
>  
>  	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
> -	if (cached_page)
> -		page_cache_release(cached_page);
>  	if (filp)
>  		file_accessed(filp);
>  }
> @@ -1545,35 +1529,28 @@ static inline struct page *__read_cache_
>  				int (*filler)(void *,struct page*),
>  				void *data)
>  {
> -	struct page *page, *cached_page = NULL;
> +	struct page *page;
>  	int err;
>  repeat:
>  	page = find_get_page(mapping, index);
>  	if (!page) {
> -		if (!cached_page) {
> -			cached_page = page_cache_alloc_cold(mapping);
> -			if (!cached_page)
> -				return ERR_PTR(-ENOMEM);
> -		}
> -		err = add_to_page_cache_lru(cached_page, mapping,
> -					index, GFP_KERNEL);
> +		page = page_cache_alloc_cold(mapping);
> +		if (!page)
> +			return ERR_PTR(-ENOMEM);
> +		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
>  		if (err == -EEXIST)
>  			goto repeat;
>  		if (err < 0) {
>  			/* Presumably ENOMEM for radix tree node */
> -			page_cache_release(cached_page);
> +			page_cache_release(page);
>  			return ERR_PTR(err);
>  		}
> -		page = cached_page;
> -		cached_page = NULL;
>  		err = filler(data, page);
>  		if (err < 0) {
>  			page_cache_release(page);
>  			page = ERR_PTR(err);
>  		}
>  	}
> -	if (cached_page)
> -		page_cache_release(cached_page);

I didn't look carefully here but it looks like more of the same.

>  	return page;
>  }
>  
> @@ -1624,40 +1601,6 @@ retry:
>  EXPORT_SYMBOL(read_cache_page);
>  
>  /*
> - * If the page was newly created, increment its refcount and add it to the
> - * caller's lru-buffering pagevec.  This function is specifically for
> - * generic_file_write().
> - */
> -static inline struct page *
> -__grab_cache_page(struct address_space *mapping, unsigned long index,
> -			struct page **cached_page, struct pagevec *lru_pvec)
> -{
> -	int err;
> -	struct page *page;
> -repeat:
> -	page = find_lock_page(mapping, index);
> -	if (!page) {
> -		if (!*cached_page) {
> -			*cached_page = page_cache_alloc(mapping);
> -			if (!*cached_page)
> -				return NULL;
> -		}
> -		err = add_to_page_cache(*cached_page, mapping,
> -					index, GFP_KERNEL);
> -		if (err == -EEXIST)
> -			goto repeat;
> -		if (err == 0) {
> -			page = *cached_page;
> -			page_cache_get(page);
> -			if (!pagevec_add(lru_pvec, page))
> -				__pagevec_lru_add(lru_pvec);
> -			*cached_page = NULL;
> -		}
> -	}
> -	return page;
> -}
> -
> -/*
>   * The logic we want is
>   *
>   *	if suid or (sgid and xgrp)
> @@ -1862,14 +1805,10 @@ generic_file_buffered_write(struct kiocb
>  	struct inode 	*inode = mapping->host;
>  	long		status = 0;
>  	struct page	*page;
> -	struct page	*cached_page = NULL;
> -	struct pagevec	lru_pvec;
>  	const struct iovec *cur_iov = iov; /* current iovec */
>  	size_t		iov_offset = 0;	   /* offset in the current iovec */
>  	char __user	*buf;
>  
> -	pagevec_init(&lru_pvec, 0);
> -
>  	/*
>  	 * handle partial DIO write.  Adjust cur_iov if needed.
>  	 */
> @@ -1911,10 +1850,23 @@ retry_noprogress:
>  			break;
>  #endif
>  
> -		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> -		if (!page) {
> -			status = -ENOMEM;
> -			break;
> +
> +repeat:
> +		page = find_lock_page(mapping, index);
> +		if (unlikely(!page)) {
> +			page = page_cache_alloc(mapping);
> +			if (!page) {
> +				status = -ENOMEM;
> +				break;
> +			}
> +			status = add_to_page_cache_lru(page, mapping,
> +						index, GFP_KERNEL);
> +			if (status) {
> +				page_cache_release(page);

We don't leak here but might allocate twice.

> +				if (status == -EEXIST)
> +					goto repeat;
> +				break;
> +			}
>  		}
>  

Otherwise, this seems to be a reasonable cleanup.

>  		status = a_ops->prepare_write(file, page, offset, offset+bytes);
> @@ -2027,9 +1979,6 @@ retry_noprogress:
>  	} while (count);
>  	*ppos = pos;
>  
> -	if (cached_page)
> -		page_cache_release(cached_page);
> -
>  	/*
>  	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
>  	 */
> @@ -2049,7 +1998,6 @@ retry_noprogress:
>  	if (unlikely(file->f_flags & O_DIRECT) && written)
>  		status = filemap_write_and_wait(mapping);
>  
> -	pagevec_lru_add(&lru_pvec);
>  	return written ? written : status;
>  }
>  EXPORT_SYMBOL(generic_file_buffered_write);
> Index: linux-2.6/fs/mpage.c
> ===================================================================
> --- linux-2.6.orig/fs/mpage.c
> +++ linux-2.6/fs/mpage.c
> @@ -389,31 +389,26 @@ mpage_readpages(struct address_space *ma
>  	struct bio *bio = NULL;
>  	unsigned page_idx;
>  	sector_t last_block_in_bio = 0;
> -	struct pagevec lru_pvec;
>  	struct buffer_head map_bh;
>  	unsigned long first_logical_block = 0;
>  
>  	clear_buffer_mapped(&map_bh);
> -	pagevec_init(&lru_pvec, 0);
>  	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
>  		struct page *page = list_entry(pages->prev, struct page, lru);
>  
>  		prefetchw(&page->flags);
>  		list_del(&page->lru);
> -		if (!add_to_page_cache(page, mapping,
> +		if (!add_to_page_cache_lru(page, mapping,
>  					page->index, GFP_KERNEL)) {
>  			bio = do_mpage_readpage(bio, page,
>  					nr_pages - page_idx,
>  					&last_block_in_bio, &map_bh,
>  					&first_logical_block,
>  					get_block);
> -			if (!pagevec_add(&lru_pvec, page))
> -				__pagevec_lru_add(&lru_pvec);
>  		} else {
>  			page_cache_release(page);
>  		}
>  	}
> -	pagevec_lru_add(&lru_pvec);
>  	BUG_ON(!list_empty(pages));
>  	if (bio)
>  		mpage_bio_submit(READ, bio);
> Index: linux-2.6/mm/readahead.c
> ===================================================================
> --- linux-2.6.orig/mm/readahead.c
> +++ linux-2.6/mm/readahead.c
> @@ -132,22 +132,18 @@ int read_cache_pages(struct address_spac
>  			int (*filler)(void *, struct page *), void *data)
>  {
>  	struct page *page;
> -	struct pagevec lru_pvec;
>  	int ret = 0;
>  
> -	pagevec_init(&lru_pvec, 0);
> -
>  	while (!list_empty(pages)) {
>  		page = list_to_page(pages);
>  		list_del(&page->lru);
> -		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
> +		if (add_to_page_cache_lru(page, mapping,
> +						page->index, GFP_KERNEL)) {
>  			page_cache_release(page);
>  			continue;
>  		}
>  		ret = filler(data, page);
> -		if (!pagevec_add(&lru_pvec, page))
> -			__pagevec_lru_add(&lru_pvec);
> -		if (ret) {
> +		if (unlikely(ret)) {
>  			while (!list_empty(pages)) {
>  				struct page *victim;
>  
> @@ -158,7 +154,6 @@ int read_cache_pages(struct address_spac
>  			break;
>  		}
>  	}
> -	pagevec_lru_add(&lru_pvec);
>  	return ret;
>  }
>  
> @@ -168,7 +163,6 @@ static int read_pages(struct address_spa
>  		struct list_head *pages, unsigned nr_pages)
>  {
>  	unsigned page_idx;
> -	struct pagevec lru_pvec;
>  	int ret;
>  
>  	if (mapping->a_ops->readpages) {
> @@ -178,19 +172,15 @@ static int read_pages(struct address_spa
>  		goto out;
>  	}
>  
> -	pagevec_init(&lru_pvec, 0);
>  	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
>  		struct page *page = list_to_page(pages);
>  		list_del(&page->lru);
> -		if (!add_to_page_cache(page, mapping,
> +		if (!add_to_page_cache_lru(page, mapping,
>  					page->index, GFP_KERNEL)) {
>  			mapping->a_ops->readpage(filp, page);
> -			if (!pagevec_add(&lru_pvec, page))
> -				__pagevec_lru_add(&lru_pvec);
>  		} else
>  			page_cache_release(page);
>  	}
> -	pagevec_lru_add(&lru_pvec);
>  	ret = 0;
>  out:
>  	return ret;
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/