linux-kernel - Re: [PATCH] binderfs: implement "max" mount option

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20181223135755.sqnv5ave62jtj4se@brauner.io>
Date:   Sun, 23 Dec 2018 14:57:56 +0100
From:   Christian Brauner <christian@...uner.io>
To:     gregkh@...uxfoundation.org, tkjos@...roid.com,
        devel@...verdev.osuosl.org, linux-kernel@...r.kernel.org
Cc:     arve@...roid.com, maco@...roid.com, joel@...lfernandes.org,
        Todd Kjos <tkjos@...gle.com>
Subject: Re: [PATCH] binderfs: implement "max" mount option

On Sat, Dec 22, 2018 at 10:18:06PM +0100, Christian Brauner wrote:
> Since binderfs can be mounted by userns root in non-initial user namespaces
> some precautions are in order. First, a way to set a maximum on the number
> of binder devices that can be allocated per binderfs instance and second, a
> way to reserve a reasonable chunk of binderfs devices for the initial ipc
> namespace.
> A first approach as seen in [1] used sysctls similiar to devpts but was
> shown to be flawed (cf. [2] and [3]) since some aspects were unneeded. This
> is an alternative approach which avoids sysctls completely and instead
> switches to a single mount option.
> 
> Starting with this commit binderfs instances can be mounted with a limit on
> the number of binder devices that can be allocated. The max=<count> mount
> option serves as a per-instance limit. If max=<count> is set then only
> <count> number of binder devices can be allocated in this binderfs
> instance.
> Additionally, the binderfs instance in the initial ipc namespace will
> always have a reserve of at least 1024 binder devices unless explicitly
> capped via max=<count>.
> 
> This allows to safely bind-mount binderfs instances into unprivileged user
> namespaces since userns root in a non-initial user namespace cannot change
> the mount option as long as it does not own the mount namespace the
> binderfs mount was created in and hence cannot drain the host of minor
> device numbers
> 
> [1]: https://lore.kernel.org/lkml/20181221133909.18794-1-christian@brauner.io/
> [2]; https://lore.kernel.org/lkml/20181221163316.GA8517@kroah.com/
> [3]: https://lore.kernel.org/lkml/CAHRSSEx+gDVW4fKKK8oZNAir9G5icJLyodO8hykv3O0O1jt2FQ@mail.gmail.com/
> [4]: https://lore.kernel.org/lkml/20181221192044.5yvfnuri7gdop4rs@brauner.io/
> 
> Cc: Todd Kjos <tkjos@...gle.com>
> Cc: Greg Kroah-Hartman <gregkh@...uxfoundation.org>
> Signed-off-by: Christian Brauner <christian.brauner@...ntu.com>
> ---
>  drivers/android/binderfs.c | 109 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 104 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
> index 7496b10532aa..93aee00994d4 100644
> --- a/drivers/android/binderfs.c
> +++ b/drivers/android/binderfs.c
> @@ -20,6 +20,7 @@
>  #include <linux/parser.h>
>  #include <linux/radix-tree.h>
>  #include <linux/sched.h>
> +#include <linux/seq_file.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock_types.h>
>  #include <linux/stddef.h>
> @@ -39,6 +40,11 @@
>  #define INODE_OFFSET 3
>  #define INTSTRLEN 21
>  #define BINDERFS_MAX_MINOR (1U << MINORBITS)
> +/*
> + * Ensure that the initial ipc namespace always has a good chunk of devices
> + * available.
> + */
> +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 1024)
>  
>  static struct vfsmount *binderfs_mnt;
>  
> @@ -46,6 +52,24 @@ static dev_t binderfs_dev;
>  static DEFINE_MUTEX(binderfs_minors_mutex);
>  static DEFINE_IDA(binderfs_minors);
>  
> +/**
> + * binderfs_mount_opts - mount options for binderfs
> + * @max: maximum number of allocatable binderfs binder devices
> + */
> +struct binderfs_mount_opts {
> +	int max;
> +};
> +
> +enum {
> +	Opt_max,
> +	Opt_err
> +};
> +
> +static const match_table_t tokens = {
> +	{ Opt_max, "max=%d" },
> +	{ Opt_err, NULL     }
> +};
> +
>  /**
>   * binderfs_info - information about a binderfs mount
>   * @ipc_ns:         The ipc namespace the binderfs mount belongs to.
> @@ -55,13 +79,16 @@ static DEFINE_IDA(binderfs_minors);
>   *                  created.
>   * @root_gid:       gid that needs to be used when a new binder device is
>   *                  created.
> + * @mount_opts:     The mount options in use.
> + * @device_count:   The current number of allocated binder devices.
>   */
>  struct binderfs_info {
>  	struct ipc_namespace *ipc_ns;
>  	struct dentry *control_dentry;
>  	kuid_t root_uid;
>  	kgid_t root_gid;
> -
> +	struct binderfs_mount_opts mount_opts;
> +	atomic_t device_count;
>  };
>  
>  static inline struct binderfs_info *BINDERFS_I(const struct inode *inode)
> @@ -107,13 +134,22 @@ static int binderfs_binder_device_create(struct inode *ref_inode,
>  	struct inode *inode = NULL;
>  	struct super_block *sb = ref_inode->i_sb;
>  	struct binderfs_info *info = sb->s_fs_info;
> +	bool use_reserve = (info->ipc_ns == &init_ipc_ns);
>  
>  	/* Reserve new minor number for the new device. */
>  	mutex_lock(&binderfs_minors_mutex);
> -	minor = ida_alloc_max(&binderfs_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
> +	if (atomic_inc_return(&info->device_count) < info->mount_opts.max)
> +		minor = ida_alloc_max(&binderfs_minors,
> +				      use_reserve ? BINDERFS_MAX_MINOR :
> +						    BINDERFS_MAX_MINOR_CAPPED,
> +				      GFP_KERNEL);
> +	else
> +		minor = -ENOSPC;
>  	mutex_unlock(&binderfs_minors_mutex);
> -	if (minor < 0)
> +	if (minor < 0) {
> +		atomic_dec(&info->device_count);
>  		return minor;
> +	}
>  
>  	ret = -ENOMEM;
>  	device = kzalloc(sizeof(*device), GFP_KERNEL);
> @@ -187,6 +223,7 @@ static int binderfs_binder_device_create(struct inode *ref_inode,
>  	kfree(name);
>  	kfree(device);
>  	mutex_lock(&binderfs_minors_mutex);
> +	atomic_dec(&info->device_count);
>  	ida_free(&binderfs_minors, minor);
>  	mutex_unlock(&binderfs_minors_mutex);
>  	iput(inode);
> @@ -232,6 +269,7 @@ static long binder_ctl_ioctl(struct file *file, unsigned int cmd,
>  static void binderfs_evict_inode(struct inode *inode)
>  {
>  	struct binder_device *device = inode->i_private;
> +	struct binderfs_info *info = BINDERFS_I(inode);
>  
>  	clear_inode(inode);
>  
> @@ -239,6 +277,7 @@ static void binderfs_evict_inode(struct inode *inode)
>  		return;
>  
>  	mutex_lock(&binderfs_minors_mutex);
> +	atomic_dec(&info->device_count);
>  	ida_free(&binderfs_minors, device->miscdev.minor);
>  	mutex_unlock(&binderfs_minors_mutex);
>  
> @@ -246,9 +285,64 @@ static void binderfs_evict_inode(struct inode *inode)
>  	kfree(device);
>  }
>  
> +/**
> + * binderfs_parse_mount_opts - parse binderfs mount options
> + * @data: options to set (can be NULL in which case defaults are used)
> + */
> +static int binderfs_parse_mount_opts(char *data,
> +				     struct binderfs_mount_opts *opts)
> +{
> +	char *p;
> +	opts->max = BINDERFS_MAX_MINOR;
> +
> +	while ((p = strsep(&data, ",")) != NULL) {
> +		substring_t args[MAX_OPT_ARGS];
> +		int token;
> +		int max_devices;
> +
> +		if (!*p)
> +			continue;
> +
> +		token = match_token(p, tokens, args);
> +		switch (token) {
> +		case Opt_max:
> +			max_devices = match_int(&args[0], &max_devices);
> +			if (max_devices < 0 || (max_devices > BINDERFS_MAX_MINOR))
> +				return -EINVAL;

One should always account for their own stupid mistakes and here's one
I made: the ways match_int() is used here is nonsense since it
overwrites the parsed max_devices value. This should be:

if (match_int(&args[0], &max_devices) ||
    (max_devices < 0 ||
     (max_devices > BINDERFS_MAX_MINOR)))
        return -EINVAL;

Good that we have to have another version anyway. Sorry about that.

Christian

> +
> +			opts->max = max_devices;
> +			break;
> +		default:
> +			pr_err("Invalid mount options\n");
> +			return -EINVAL;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int binderfs_remount(struct super_block *sb, int *flags, char *data)
> +{
> +	struct binderfs_info *info = sb->s_fs_info;
> +	return binderfs_parse_mount_opts(data, &info->mount_opts);
> +}
> +
> +static int binderfs_show_mount_opts(struct seq_file *seq, struct dentry *root)
> +{
> +	struct binderfs_info *info;
> +
> +	info = root->d_sb->s_fs_info;
> +	if (info->mount_opts.max < BINDERFS_MAX_MINOR)
> +		seq_printf(seq, ",max=%d", info->mount_opts.max);
> +
> +	return 0;
> +}
> +
>  static const struct super_operations binderfs_super_ops = {
> -	.statfs = simple_statfs,
> -	.evict_inode = binderfs_evict_inode,
> +	.evict_inode    = binderfs_evict_inode,
> +	.remount_fs	= binderfs_remount,
> +	.show_options	= binderfs_show_mount_opts,
> +	.statfs         = simple_statfs,
>  };
>  
>  static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
> @@ -409,6 +503,10 @@ static int binderfs_fill_super(struct super_block *sb, void *data, int silent)
>  	if (!info)
>  		goto err_without_dentry;
>  
> +	ret = binderfs_parse_mount_opts(data, &info->mount_opts);
> +	if (ret)
> +		goto err_without_dentry;
> +
>  	info->ipc_ns = ipc_ns;
>  	info->root_gid = make_kgid(sb->s_user_ns, 0);
>  	if (!gid_valid(info->root_gid))
> @@ -416,6 +514,7 @@ static int binderfs_fill_super(struct super_block *sb, void *data, int silent)
>  	info->root_uid = make_kuid(sb->s_user_ns, 0);
>  	if (!uid_valid(info->root_uid))
>  		info->root_uid = GLOBAL_ROOT_UID;
> +	atomic_set(&info->device_count, 0);
>  
>  	sb->s_fs_info = info;
>  
> -- 
> 2.19.1
>