lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0afed890-7c5a-93ee-cdb9-e30775bd9cf1@redhat.com>
Date:   Mon, 29 Oct 2018 15:35:03 -0400
From:   Waiman Long <longman@...hat.com>
To:     Davidlohr Bueso <dave@...olabs.net>, akpm@...ux-foundation.org
Cc:     linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        Davidlohr Bueso <dbueso@...e.de>
Subject: Re: [PATCH] fs/proc: introduce /proc/stat2 file

On 10/29/2018 03:25 PM, Davidlohr Bueso wrote:
> A recent report from a large database vendor which I shall not name
> shows concerns about poor performance when consuming /proc/stat info.
> Particularly kstat_irq() pops up in the profiles and most time is
> being spent there. The overall system is under a lot of irqs and
> has almost 1k cores, thus this comes as little surprise.
>
> Granted, procfs in general is not known for its performance,
> nor designed for it, for that matter. Some users, however, may be able
> to overcome this performance limitation; others may not. Therefore it isn't
> bad having a kernel option for users that don't want any hard irq info
> -- and care enough about this.
>
> This patch introduces a new /proc/stat2 file that is identical to the
> regular 'stat' except that it zeroes all hard irq statistics. The new
> file is a drop-in replacement for stat for users that need performance.
>
> The stat file is not touched, of course -- this was also previously
> suggested by Waiman:
> https://lore.kernel.org/lkml/1524166562-5644-1-git-send-email-longman@redhat.com/
>
> Signed-off-by: Davidlohr Bueso <dbueso@...e.de>

I am wondering whether /proc/stat_noirqs would be a more descriptive name
for the intent of this new procfs file, or whether we should just go with
the more generic stat2 name.

Cheers,
Longman

> ---
>  Documentation/filesystems/proc.txt | 12 +++++++---
>  fs/proc/stat.c                     | 45 ++++++++++++++++++++++++++++++++------
>  2 files changed, 47 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
> index 12a5e6e693b6..563b01decb1e 100644
> --- a/Documentation/filesystems/proc.txt
> +++ b/Documentation/filesystems/proc.txt
> @@ -27,7 +27,7 @@ Table of Contents
>    1.5	SCSI info
>    1.6	Parallel port info in /proc/parport
>    1.7	TTY info in /proc/tty
> -  1.8	Miscellaneous kernel statistics in /proc/stat
> +  1.8	Miscellaneous kernel statistics in /proc/stat and /proc/stat2
>    1.9	Ext4 file system parameters
>  
>    2	Modifying System Parameters
> @@ -140,6 +140,7 @@ Table 1-1: Process specific entries in /proc
>   mem		Memory held by this process
>   root		Link to the root directory of this process
>   stat		Process status
> + stat2		Process status without irq information
>   statm		Process memory status information
>   status		Process status in human readable form
>   wchan		Present with CONFIG_KALLSYMS=y: it shows the kernel function
> @@ -1301,8 +1302,8 @@ To see  which  tty's  are  currently in use, you can simply look into the file
>    unknown              /dev/tty        4    1-63 console 
>  
>  
> -1.8 Miscellaneous kernel statistics in /proc/stat
> --------------------------------------------------
> +1.8 Miscellaneous kernel statistics in /proc/stat and /proc/stat2
> +-----------------------------------------------------------------
>  
>  Various pieces   of  information about  kernel activity  are  available in the
>  /proc/stat file.  All  of  the numbers reported  in  this file are  aggregates
> @@ -1371,6 +1372,11 @@ of the possible system softirqs. The first column is the total of all
>  softirqs serviced; each subsequent column is the total for that particular
>  softirq.
>  
> +The stat2 file acts as a performance alternative to /proc/stat for workloads
> +and systems that care and are under heavy irq load. In order to be completely
> +compatible, /proc/stat and /proc/stat2 are identical with the exception that the
> +latter will show 0 for any (hard)irq-related fields. This refers particularly
> +to the "intr" line and 'irq' column for that aggregate in the cpu line.
>  
>  1.9 Ext4 file system parameters
>  -------------------------------
> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 535eda7857cf..349040270003 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -79,7 +79,7 @@ static u64 get_iowait_time(int cpu)
>  
>  #endif
>  
> -static int show_stat(struct seq_file *p, void *v)
> +static int __show_stat(struct seq_file *p, void *v, bool irq_stats)
>  {
>  	int i, j;
>  	u64 user, nice, system, idle, iowait, irq, softirq, steal;
> @@ -100,13 +100,17 @@ static int show_stat(struct seq_file *p, void *v)
>  		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
>  		idle += get_idle_time(i);
>  		iowait += get_iowait_time(i);
> -		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
>  		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
>  		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
>  		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
>  		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
> -		sum += kstat_cpu_irqs_sum(i);
> -		sum += arch_irq_stat_cpu(i);
> +
> +		if (irq_stats) {
> +			irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
> +
> +			sum += kstat_cpu_irqs_sum(i);
> +			sum += arch_irq_stat_cpu(i);
> +		}
>  
>  		for (j = 0; j < NR_SOFTIRQS; j++) {
>  			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
> @@ -115,7 +119,9 @@ static int show_stat(struct seq_file *p, void *v)
>  			sum_softirq += softirq_stat;
>  		}
>  	}
> -	sum += arch_irq_stat();
> +
> +	if (irq_stats)
> +		sum += arch_irq_stat();
>  
>  	seq_put_decimal_ull(p, "cpu  ", nsec_to_clock_t(user));
>  	seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
> @@ -136,7 +142,8 @@ static int show_stat(struct seq_file *p, void *v)
>  		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
>  		idle = get_idle_time(i);
>  		iowait = get_iowait_time(i);
> -		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
> +		if (irq_stats)
> +			irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
>  		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
>  		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
>  		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
> @@ -158,7 +165,7 @@ static int show_stat(struct seq_file *p, void *v)
>  
>  	/* sum again ? it could be updated? */
>  	for_each_irq_nr(j)
> -		seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
> +		seq_put_decimal_ull(p, " ", irq_stats ? kstat_irqs_usr(j) : 0);
>  
>  	seq_printf(p,
>  		"\nctxt %llu\n"
> @@ -181,6 +188,16 @@ static int show_stat(struct seq_file *p, void *v)
>  	return 0;
>  }
>  
> +static int show_stat(struct seq_file *p, void *v)
> +{
> +	return __show_stat(p, v, true);
> +}
> +
> +static int show_stat2(struct seq_file *p, void *v)
> +{
> +	return __show_stat(p, v, false);
> +}
> +
>  static int stat_open(struct inode *inode, struct file *file)
>  {
>  	unsigned int size = 1024 + 128 * num_online_cpus();
> @@ -190,6 +207,12 @@ static int stat_open(struct inode *inode, struct file *file)
>  	return single_open_size(file, show_stat, NULL, size);
>  }
>  
> +static int stat2_open(struct inode *inode, struct file *file)
> +{
> +	unsigned int size = 1024 + 128 * num_online_cpus();
> +	return single_open_size(file, show_stat2, NULL, size);
> +}
> +
>  static const struct file_operations proc_stat_operations = {
>  	.open		= stat_open,
>  	.read		= seq_read,
> @@ -197,9 +220,17 @@ static const struct file_operations proc_stat_operations = {
>  	.release	= single_release,
>  };
>  
> +static const struct file_operations proc_stat2_operations = {
> +	.open		= stat2_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= single_release,
> +};
> +
>  static int __init proc_stat_init(void)
>  {
>  	proc_create("stat", 0, NULL, &proc_stat_operations);
> +	proc_create("stat2", 0, NULL, &proc_stat2_operations);
>  	return 0;
>  }
>  fs_initcall(proc_stat_init);



Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ