Message-ID: <20170621155941.GA5708@htj.duckdns.org>
Date:   Wed, 21 Jun 2017 11:59:41 -0400
From:   Tejun Heo <tj@...nel.org>
To:     Andrew Morton <akpm@...ux-foundation.org>,
        Jan Kara <jack@...e.com>, Nikolay Borisov <nborisov@...e.com>
Cc:     jbacik@...com, linux-kernel@...r.kernel.org,
        mgorman@...hsingularity.net
Subject: Re: [PATCH v3] writeback: Rework wb_[dec|inc]_stat family of
 functions

Hello,

cc'ing Andrew and Jan and quoting the whole body.  The original patch
posting can be found at

 https://marc.info/?l=linux-kernel&m=149802995611259&q=raw

On Wed, Jun 21, 2017 at 10:25:37AM +0300, Nikolay Borisov wrote:
> Currently the writeback statistics code uses percpu counters to hold
> various statistics.  Furthermore, we have 2 families of functions: those
> which disable local irqs, and those which don't and whose names begin
> with a double underscore.  However, they both end up calling
> __add_wb_stat, which in turn calls percpu_counter_add_batch, which is
> already irq-safe.
> 
> Exploiting this fact allows us to eliminate the __wb_* functions, since
> they don't add any protection beyond what we already have.  Furthermore,
> refactor the wb_* functions to call __add_wb_stat directly, without the
> irq-disabling dance.  This will likely result in better runtime for code
> which modifies the stat counters.
> 
> While at it, also document why percpu_counter_add_batch is in fact
> preempt- and irq-safe, since at least 3 people have been confused by it.
> 
> Signed-off-by: Nikolay Borisov <nborisov@...e.com>

  Acked-by: Tejun Heo <tj@...nel.org>

Andrew, this looks good to me.  If you're okay with it, can you take
it through -mm?  If not, I can take it through percpu, although that'd
be a bit of a stretch.
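
For the record, the reason no irq protection is needed in the callers:
percpu_counter_add_batch() already covers both paths itself.  A rough
sketch of its current shape in lib/percpu_counter.c (paraphrased from
mainline, not part of this patch; details may differ by kernel version):

	void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
				      s32 batch)
	{
		s64 count;

		preempt_disable();
		count = __this_cpu_read(*fbc->counters) + amount;
		if (count >= batch || count <= -batch) {
			unsigned long flags;

			/* slow path: fold the local delta into fbc->count
			 * under an irq-safe spinlock */
			raw_spin_lock_irqsave(&fbc->lock, flags);
			fbc->count += count;
			__this_cpu_sub(*fbc->counters, count - amount);
			raw_spin_unlock_irqrestore(&fbc->lock, flags);
		} else {
			/* fast path: this_cpu_add() is irq-safe on its own */
			this_cpu_add(*fbc->counters, amount);
		}
		preempt_enable();
	}

So callers like inc_wb_stat()/dec_wb_stat() can drop the
local_irq_save() dance entirely, whether they run with irqs on or off.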

Thanks.

> ---
> 
> Changes since v2: 
>     * Fixed build failure reported by kbuild test robot
>     * Explicitly document that percpu_counter_add_batch is preempt/irq safe
>  fs/fs-writeback.c           |  8 ++++----
>  include/linux/backing-dev.h | 24 ++----------------------
>  lib/percpu_counter.c        |  7 +++++++
>  mm/page-writeback.c         | 10 +++++-----
>  4 files changed, 18 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 63ee2940775c..309364aab2a5 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -380,8 +380,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
>  		struct page *page = radix_tree_deref_slot_protected(slot,
>  							&mapping->tree_lock);
>  		if (likely(page) && PageDirty(page)) {
> -			__dec_wb_stat(old_wb, WB_RECLAIMABLE);
> -			__inc_wb_stat(new_wb, WB_RECLAIMABLE);
> +			dec_wb_stat(old_wb, WB_RECLAIMABLE);
> +			inc_wb_stat(new_wb, WB_RECLAIMABLE);
>  		}
>  	}
>  
> @@ -391,8 +391,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
>  							&mapping->tree_lock);
>  		if (likely(page)) {
>  			WARN_ON_ONCE(!PageWriteback(page));
> -			__dec_wb_stat(old_wb, WB_WRITEBACK);
> -			__inc_wb_stat(new_wb, WB_WRITEBACK);
> +			dec_wb_stat(old_wb, WB_WRITEBACK);
> +			inc_wb_stat(new_wb, WB_WRITEBACK);
>  		}
>  	}
>  
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index ace73f96eb1e..e9c967b86054 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -69,34 +69,14 @@ static inline void __add_wb_stat(struct bdi_writeback *wb,
>  	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
>  }
>  
> -static inline void __inc_wb_stat(struct bdi_writeback *wb,
> -				 enum wb_stat_item item)
> -{
> -	__add_wb_stat(wb, item, 1);
> -}
> -
>  static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
>  {
> -	unsigned long flags;
> -
> -	local_irq_save(flags);
> -	__inc_wb_stat(wb, item);
> -	local_irq_restore(flags);
> -}
> -
> -static inline void __dec_wb_stat(struct bdi_writeback *wb,
> -				 enum wb_stat_item item)
> -{
> -	__add_wb_stat(wb, item, -1);
> +	__add_wb_stat(wb, item, 1);
>  }
>  
>  static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
>  {
> -	unsigned long flags;
> -
> -	local_irq_save(flags);
> -	__dec_wb_stat(wb, item);
> -	local_irq_restore(flags);
> +	__add_wb_stat(wb, item, -1);
>  }
>  
>  static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 8ee7e5ec21be..3bf4a9984f4c 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -72,6 +72,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
>  }
>  EXPORT_SYMBOL(percpu_counter_set);
>  
> +/**
> + * This function is both preempt- and irq-safe.  The former is due to the
> + * explicit preemption disable.  The latter is guaranteed by the fact that
> + * the slow path is explicitly protected by an irq-safe spinlock, whereas
> + * the fast path uses this_cpu_add(), which is irq-safe by definition.
> + * Hence there is no need to muck with irq state before calling this one.
> + */
>  void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
>  {
>  	s64 count;
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 143c1c25d680..b7451891959a 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb)
>  {
>  	struct wb_domain *cgdom;
>  
> -	__inc_wb_stat(wb, WB_WRITTEN);
> +	inc_wb_stat(wb, WB_WRITTEN);
>  	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
>  			       wb->bdi->max_prop_frac);
>  
> @@ -2437,8 +2437,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
>  		__inc_node_page_state(page, NR_FILE_DIRTY);
>  		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
>  		__inc_node_page_state(page, NR_DIRTIED);
> -		__inc_wb_stat(wb, WB_RECLAIMABLE);
> -		__inc_wb_stat(wb, WB_DIRTIED);
> +		inc_wb_stat(wb, WB_RECLAIMABLE);
> +		inc_wb_stat(wb, WB_DIRTIED);
>  		task_io_account_write(PAGE_SIZE);
>  		current->nr_dirtied++;
>  		this_cpu_inc(bdp_ratelimits);
> @@ -2745,7 +2745,7 @@ int test_clear_page_writeback(struct page *page)
>  			if (bdi_cap_account_writeback(bdi)) {
>  				struct bdi_writeback *wb = inode_to_wb(inode);
>  
> -				__dec_wb_stat(wb, WB_WRITEBACK);
> +				dec_wb_stat(wb, WB_WRITEBACK);
>  				__wb_writeout_inc(wb);
>  			}
>  		}
> @@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
>  						page_index(page),
>  						PAGECACHE_TAG_WRITEBACK);
>  			if (bdi_cap_account_writeback(bdi))
> -				__inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
> +				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
>  
>  			/*
>  			 * We can come through here when swapping anonymous
> -- 
> 2.7.4
> 
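
One footnote on the new comment above, since the "irq-safe by
definition" part of it is what tripped people up: on x86,
this_cpu_add() compiles down to a single gs-relative add instruction,
so an interrupt can only observe the counter before or after the
update, never mid-update; architectures without such an instruction
fall back to a generic this_cpu_add() that disables irqs internally,
giving the same guarantee.  That's why (sketch, mine, not from the
patch) a caller in any context can simply do

	inc_wb_stat(wb, WB_WRITEBACK);	/* now just __add_wb_stat(wb, WB_WRITEBACK, 1) */

with no local_irq_save()/restore() around it.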

-- 
tejun
