linux-ext4 - Re: PATCH: Making mb_history length a dynamic tunable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6601abe90904201153q5e66cb30na22a49767245e53f@mail.gmail.com>
Date:	Mon, 20 Apr 2009 11:53:12 -0700
From:	Curt Wohlgemuth <curtw@...gle.com>
To:	Andreas Dilger <adilger@....com>
Cc:	ext4 development <linux-ext4@...r.kernel.org>
Subject: Re: PATCH: Making mb_history length a dynamic tunable

Hi Andreas:

> On Apr 07, 2009  10:20 -0700, Curt Wohlgemuth wrote:
> > Since we frequently run in memory-constrained systems with many partitions,
> > the ~68K for each partition for the mb_history buffer can be excessive.  The
> > following creates a new proc file under /proc/fs/ext4/ to control the number
> > of entries at mount time.
>
> Sorry for the delay, it's been a hectic 2 weeks.  I'm totally OK with this
> patch.

Thanks for the detailed reply.

> > If the notion of a history length tunable is okay, but the location should
> > be under /sys/fs/ext4/ instead of /proc/fs/ext4/, I can change this.  The
> > leftover files under /proc/fs/ext4/<partition>/ are a bit confusing to me.
>
> Complex /proc files like this cannot be moved into /sys because it
> does not support the seqfile interface for having output span more
> than one page.

Got it.

> >  static void ext4_mb_generate_from_freelist(struct super_block *sb,
> > void *bitmap,
> > @@ -2417,31 +2421,36 @@ static struct file_operations ext4_mb_se
> >  static void ext4_mb_history_release(struct super_block *sb)
> >  {
> >  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> > +	int delete_mbhist = sbi->s_mb_history_max != 0;
> >
> >  	if (sbi->s_proc != NULL) {
> >  		remove_proc_entry("mb_groups", sbi->s_proc);
> > -		remove_proc_entry("mb_history", sbi->s_proc);
> > +		if (delete_mbhist)
> > +			remove_proc_entry("mb_history", sbi->s_proc);
> >  	}
> > -	kfree(sbi->s_mb_history);
> > +	if (delete_mbhist)
> > +		kfree(sbi->s_mb_history);
>
> If sbi->s_mb_history == NULL (as opposed to garbage) it is OK to
> just call kfree() without the extra check.

Done.

> Since the s_mb_history buffers are allocated at mount time only,
> and later writing to mb_hist_size does not affect their size,
> it means that mb_hist_size must be set before the filesystems are
> mounted.  However, that makes it impossible to tune the root
> filesystem history size, and at best it is difficult to tune the
> other filesystems because the value has to be set by an init
> script after root mounts but before other filesystems mount.
>
> Another possible option is to have a module parameter that can be
> set when the ext4 module is inserted, so that it is sure to be
> available for all filesystems, but this won't help if ext4 is
> built into the kernel.

Which is certainly our case.

> The final (though more complex) option is to add a per-fs
> tunable that allows changing the history size at runtime for
> each filesystem.

Right, I thought about this.  But avoiding the race when the tunable is
being changed at the same time that history is being updated just didn't
seem worth it to me.

I see the mb_history mostly as a debug issue; having to umount/remount a FS
to change history doesn't seem too onerous -- with the exception, of course,
of the root FS.  Do you think this is too big a burden, and I should look at
handling it while a FS is mounted?

> > @@ -2894,6 +2903,49 @@ static void release_blocks_on_commit(jou
> >  	mb_debug("freed %u blocks in %u structures\n", count, count2);
> >  }
> >
> > +static ssize_t read_mb_hist_size(struct file *file, char __user *buf,
> > +				size_t count, loff_t *ppos)
> > +static ssize_t write_mb_hist_size(struct file *file, const char __user *buf,
> > +				   size_t count, loff_t *ppos)
>
> Please add an ext4_ prefix to these function names.

Done.

> > +{
> > +	/* This allows for 99999 entries, at 68 bytes each */
> > +#define MAX_BUFSIZE 6
> > +	char kbuf[MAX_BUFSIZE + 1];
> > +	int value;
> > +
> > +	if (count) {
> > +		if (count > MAX_BUFSIZE)
> > +			return -EINVAL;
> > +		if (copy_from_user(&kbuf, buf, count))
> > +			return -EFAULT;
> > +		kbuf[min(count, sizeof(kbuf))-1] = '\0';
>
> This could probably be avoided.  Simply initializing "kbuf" at
> declaration time will ensure that there is a trailing NUL because
> at most MAX_BUFSIZE bytes are copied into it.
>
> 	char kbuf[MAX_BUFSIZE + 1] = "";

Ah, right, thanks; done.

Second patch follows.

       Signed-off-by: Curt Wohlgemuth <curtw@...gle.com>
---

--- ext4/fs/ext4/mballoc.c.orig	2009-04-07 09:13:12.000000000 -0700
+++ ext4/fs/ext4/mballoc.c	2009-04-20 11:22:25.000000000 -0700
@@ -334,6 +334,10 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+#define DEFAULT_HIST_SIZE 1000
+static int ext4_mbhist_size = DEFAULT_HIST_SIZE;
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb,
void *bitmap,
@@ -2417,10 +2421,12 @@
 static void ext4_mb_history_release(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int delete_mbhist = sbi->s_mb_history_max != 0;

 	if (sbi->s_proc != NULL) {
 		remove_proc_entry("mb_groups", sbi->s_proc);
-		remove_proc_entry("mb_history", sbi->s_proc);
+		if (delete_mbhist)
+			remove_proc_entry("mb_history", sbi->s_proc);
 	}
 	kfree(sbi->s_mb_history);
 }
@@ -2429,19 +2435,21 @@
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int i;
+	int create_mbhist = ext4_mbhist_size != 0;

 	if (sbi->s_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
-				 &ext4_mb_seq_history_fops, sb);
+		if (create_mbhist)
+			proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+					 &ext4_mb_seq_history_fops, sb);
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}

-	sbi->s_mb_history_max = 1000;
+	sbi->s_mb_history_max = ext4_mbhist_size;
 	sbi->s_mb_history_cur = 0;
 	spin_lock_init(&sbi->s_mb_history_lock);
 	i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-	sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+	sbi->s_mb_history = create_mbhist ? kzalloc(i, GFP_KERNEL) : NULL;
 	/* if we can't allocate history, then we simple won't use it */
 }

@@ -2894,6 +2902,48 @@
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 }

+static ssize_t ext4_read_mb_hist_size(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	char buffer[20];
+	size_t len;
+
+	len = snprintf(buffer, sizeof(buffer), "%i\n", ext4_mbhist_size);
+
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t ext4_write_mb_hist_size(struct file *file,
+					const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	/* This allows for 99999 entries, at 68 bytes each */
+#define MAX_BUFSIZE 6
+	char kbuf[MAX_BUFSIZE + 1] = "";
+	int value;
+
+	if (count) {
+		if (count > MAX_BUFSIZE)
+			return -EINVAL;
+		if (copy_from_user(&kbuf, buf, count))
+			return -EFAULT;
+
+		value = simple_strtol(kbuf, NULL, 0);
+
+		if (value < 0)
+			return -EINVAL;
+
+		ext4_mbhist_size = value;
+	}
+	return count;
+#undef MAX_BUFSIZE
+}
+
+static struct file_operations ext4_mb_size_fops = {
+	.read		= ext4_read_mb_hist_size,
+	.write		= ext4_write_mb_hist_size,
+};
+
 int __init init_ext4_mballoc(void)
 {
 	ext4_pspace_cachep =
@@ -2912,6 +2962,9 @@
 		return -ENOMEM;
 	}

+	proc_create("mb_hist_size", S_IRUGO | S_IWUGO, ext4_proc_root,
+			 &ext4_mb_size_fops);
+
 	ext4_free_ext_cachep =
 		kmem_cache_create("ext4_free_block_extents",
 				     sizeof(struct ext4_free_data),
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html