lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <Pine.LNX.4.64.0712141716370.28689@sbz-30.cs.Helsinki.FI>
Date:	Fri, 14 Dec 2007 17:16:54 +0200 (EET)
From:	Pekka J Enberg <penberg@...helsinki.fi>
To:	alan@...hat.com, viro@...iv.linux.org.uk, hch@...radead.org,
	peterz@...radead.org, linux-kernel@...r.kernel.org,
	linux-fsdevel@...r.kernel.org
Subject: [RFC/PATCH 4/8] revoke: core code V7

From: Pekka Enberg <penberg@...helsinki.fi>

The revokeat(2) system call ensures that after successful revocation you can
only access an inode via a file descriptor that is obtained from a subsequent
open(2) call.  The open(2) system call can be blocked by the caller with
chmod(2) and chown(2) prior to calling revokeat(2) to gain exclusive access to
an inode.

After an successful revocation, operations on file descriptors fail with the
EBADF or ENXIO error code for regular and device files, respectively.
Attempting to read from or write to a revoked mapping causes SIGBUS.  The
revokeat(2) system call guarantees that:

  (1) open file descriptors are revoked,

  (2) file descriptors created by fork(2) and dup(2) during
      the operation are revoked,

  (3) file descriptors obtained via a SCM_RIGHTS datagram during or
      after the revoke operation are revoked,

  (4) in-flight read(2) and write(2) operations are either completed
      or aborted before revokeat(2) returns successfully,

  (5) attempting to read from or write to a shared memory mapping
      raises SIGBUS, and

  (6) copy-on-write to a private memory mapping after successful
      revokeat(2) call does not reveal any data written after the
      system call has returned.

TODO:

  - I/O requests that are in-flight
  - Breaking of private mapping COW races with fork

Cc: Alan Cox <alan@...hat.com>
Cc: Al Viro <viro@...iv.linux.org.uk>
Cc: Christoph Hellwig <hch@...radead.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Pekka Enberg <penberg@...helsinki.fi>
---
 fs/revoke.c           |  450 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h    |   10 +
 include/linux/magic.h |    2 
 include/linux/mm.h    |    1 
 mm/mmap.c             |   11 +
 5 files changed, 473 insertions(+), 1 deletion(-)

Index: 2.6/fs/revoke.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ 2.6/fs/revoke.c	2007-12-14 16:40:55.000000000 +0200
@@ -0,0 +1,450 @@
+/*
+ * Invalidate all current open file descriptors of an inode.
+ *
+ * Copyright (C) 2006-2007  Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/preempt.h>
+#include <linux/bit_spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+
+static void revoke_aliases(struct inode *inode)
+{
+	struct dentry *dentry;
+restart:
+	spin_lock(&dcache_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		spin_lock(&dentry->d_lock);
+		if (!d_unhashed(dentry)) {
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_lock);
+			dput(dentry);
+			goto restart;
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	spin_unlock(&dcache_lock);
+}
+
+static int revoke_files(struct inode *inode)
+{
+	struct super_block *sb;
+	struct file *file;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (!sb)
+		return -EINVAL;
+
+restart:
+	file_list_lock();
+	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+		struct dentry *dentry = file->f_path.dentry;
+
+		if (dentry->d_inode != inode)
+			continue;
+
+		if (file->f_op != inode->i_fop)
+			continue;
+
+		get_file(file);
+
+		/*
+		 * inode->i_mutex cannot be acquired under files_lock
+		 */
+		file_list_unlock();
+
+		err = file->f_op->revoke(file);
+		make_revoked_file(inode, file);
+		fput(file);
+
+		if (err)
+			goto out;
+
+		if (signal_pending(current)) {
+			err = -EINTR;
+			goto out;
+		}
+		cond_resched();
+		goto restart;
+	}
+	file_list_unlock();
+out:
+	return err;
+}
+
+static inline bool vma_matches(struct vm_area_struct *vma, struct inode *inode)
+{
+	struct file *file = vma->vm_file;
+
+	return file && file->f_path.dentry->d_inode == inode;
+}
+
+/*
+ *	LOCKING: read_lock(&tasklist_lock)
+ */
+static unsigned long nr_tasks_with_mm(void)
+{
+	struct task_struct *g, *p;
+	int ret = 0;
+
+	do_each_thread(g, p) {
+		if (!p->mm)
+			continue;
+		ret++;
+	}
+	while_each_thread(g, p);
+	return ret;
+}
+
+static int task_break_cow(struct task_struct *tsk, struct inode *inode)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		int err;
+
+		if (vma->vm_flags & VM_SHARED)
+			continue;
+
+		if (!vma_matches(vma, inode))
+			continue;
+
+		err = get_user_pages(tsk, tsk->mm, vma->vm_start,
+				     vma_pages(vma), 1, 1, NULL, NULL);
+		if (err < 0) {
+			ret = err;
+			break;
+		}
+		if (err != vma_pages(vma)) {
+			ret = -ENOMEM;
+			break;
+		}
+		unlink_file_vma(vma);
+		fput(vma->vm_file);
+		vma->vm_file = NULL;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return ret;
+}
+
+static int revoke_break_cow(struct inode *inode)
+{
+	struct task_struct **tsk_array;
+	struct task_struct *g, *p;
+	unsigned long nr, i;
+	int err = 0;
+
+restart:
+	read_lock(&tasklist_lock);
+	nr = nr_tasks_with_mm();
+	read_unlock(&tasklist_lock);
+
+	tsk_array = kcalloc(nr, sizeof(struct task_struct *), GFP_KERNEL);
+	if (!tsk_array)
+		return -ENOMEM;
+
+	read_lock(&tasklist_lock);
+
+	if (nr != nr_tasks_with_mm()) {
+		read_unlock(&tasklist_lock);
+		kfree(tsk_array);
+		cond_resched();
+		goto restart;
+	}
+
+	i = 0;
+	do_each_thread(g, p) {
+		if (i >= nr) {
+			read_unlock(&tasklist_lock);
+			err = -EAGAIN;
+			goto out;
+		}
+
+		if (!p->mm)
+			continue;
+
+		get_task_struct(p);
+		tsk_array[i++] = p;
+	}
+	while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+
+	for (i = 0; i < nr; i++) {
+		struct task_struct *tsk = tsk_array[i];
+
+		err = task_break_cow(tsk, inode);
+		if (err)
+			break;
+	}
+
+	for (i = 0; i < nr; i++) {
+		struct task_struct *tsk = tsk_array[i];
+
+		put_task_struct(tsk);
+	}
+out:
+	kfree(tsk_array);
+	return err;
+}
+
+/*
+ *	 LOCKING: down_write(&mm->mmap_sem)
+ *	 	    -> spin_lock(&mapping->i_mmap_lock)
+ */
+static int revoke_vma(struct vm_area_struct *vma, struct zap_details *details)
+{
+	unsigned long restart_addr, start_addr, end_addr;
+	int need_break;
+
+	start_addr = vma->vm_start;
+	end_addr = vma->vm_end;
+
+again:
+	restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr,
+				      details);
+
+	need_break = need_resched() || need_lockbreak(details->i_mmap_lock);
+	if (need_break)
+		goto out_need_break;
+
+	if (restart_addr < end_addr) {
+		start_addr = restart_addr;
+		goto again;
+	}
+	vma->vm_flags |= VM_REVOKED;
+	return 0;
+
+out_need_break:
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;
+}
+
+static inline bool vma_is_revocable(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & VM_SHARED) && !(vma->vm_flags & VM_REVOKED);
+}
+
+/*
+ *	LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static int revoke_mm(struct mm_struct *mm, struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct zap_details details;
+	int err = 0;
+
+	details.i_mmap_lock = &mapping->i_mmap_lock;
+
+	/*
+	 * If ->mmap_sem is under contention, we continue scanning other
+	 * mms and try again later.
+	 */
+	if (!down_write_trylock(&mm->mmap_sem)) {
+		err = -EAGAIN;
+		goto out;
+	}
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_vma(vma, &details);
+		if (err)
+			break;
+
+		__unlink_file_vma(vma);
+		fput(vma->vm_file);
+		vma->vm_file = NULL;
+	}
+	up_write(&mm->mmap_sem);
+out:
+	return err;
+}
+
+/*
+ *	LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static void revoke_mapping_tree(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int try_again;
+
+restart:
+	try_again = 0;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
+		int err;
+
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_mm(vma->vm_mm, mapping);
+		if (err == -EAGAIN)
+			try_again = 1;
+
+		goto restart;
+	}
+	if (try_again) {
+		cond_resched();
+		goto restart;
+	}
+}
+
+/*
+ *	LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static void revoke_mapping_list(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	int try_again;
+
+restart:
+	try_again = 0;
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
+		int err;
+
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_mm(vma->vm_mm, mapping);
+		if (err == -EAGAIN) {
+			try_again = 1;
+			continue;
+		}
+		if (err == -EINTR)
+			goto restart;
+	}
+	if (try_again) {
+		cond_resched();
+		goto restart;
+	}
+}
+
+static void revoke_mapping(struct address_space *mapping)
+{
+	spin_lock(&mapping->i_mmap_lock);
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+		revoke_mapping_tree(mapping);
+	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
+		revoke_mapping_list(mapping);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+static inline void revoke_unlock(struct inode *inode)
+{
+	mutex_lock(&inode->i_mutex);
+	inode->i_flags &= ~S_REVOKE_LOCK;
+	mutex_unlock(&inode->i_mutex);
+}
+
+/*
+ * 	Returns true if revoke lock was acquired
+ */
+static inline bool revoke_trylock(struct inode *inode)
+{
+	bool ret = false;
+
+	mutex_lock(&inode->i_mutex);
+	if (!IS_REVOKE_LOCKED(inode)) {
+		inode->i_flags |= S_REVOKE_LOCK;
+		ret = true;
+	}
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+static int do_revoke(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	int err = 0;
+
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (!inode->i_sb->s_bdev || !inode->i_fop->revoke)
+		return -EOPNOTSUPP;
+
+	/*
+	 * Take the S_REVOKE_LOCK to avoid concurrent revoke operations on the
+	 * same inode.
+	 */
+	if (!revoke_trylock(inode))
+		return -EBUSY;
+
+	revoke_mapping(mapping);
+
+	err = revoke_break_cow(inode);
+	if (err)
+		goto failed;
+
+	err = revoke_files(inode);
+	if (err)
+		goto failed;
+
+	/*
+	 * Make pending reads fail.
+	 */
+	err = invalidate_inode_pages2(inode->i_mapping);
+	if (err)
+		goto failed;
+
+	make_revoked_inode(inode);
+	remove_inode_hash(inode);
+	revoke_aliases(inode);
+failed:
+	revoke_unlock(inode);
+	wake_up(&inode->i_revoke_wait);
+	return err;
+}
+
+asmlinkage long sys_revokeat(int dfd, const char __user *filename)
+{
+	struct nameidata nd;
+	int err;
+
+	err = __user_walk_fd(dfd, filename, 0, &nd);
+	if (!err) {
+		err = do_revoke(nd.dentry->d_inode);
+		path_release(&nd);
+	}
+	return err;
+}
+
+int generic_file_revoke(struct file *file)
+{
+	return do_fsync(file, 1);
+}
+EXPORT_SYMBOL(generic_file_revoke);
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h	2007-12-14 16:40:50.000000000 +0200
+++ 2.6/include/linux/fs.h	2007-12-14 16:40:55.000000000 +0200
@@ -1191,6 +1191,7 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	int (*revoke) (struct file *);
 };
 
 struct inode_operations {
@@ -1824,6 +1825,15 @@ extern ssize_t generic_splice_sendpage(s
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		size_t len, unsigned int flags);
 
+/* fs/revoke.c */
+#ifdef CONFIG_MMU
+extern void make_revoked_file(struct inode *, struct file *);
+extern void make_revoked_inode(struct inode *);
+extern int generic_file_revoke(struct file *);
+#else
+#define generic_file_revoke NULL
+#endif
+
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
Index: 2.6/include/linux/mm.h
===================================================================
--- 2.6.orig/include/linux/mm.h	2007-12-14 16:40:48.000000000 +0200
+++ 2.6/include/linux/mm.h	2007-12-14 16:40:55.000000000 +0200
@@ -986,6 +986,7 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern void __unlink_file_vma(struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
Index: 2.6/mm/mmap.c
===================================================================
--- 2.6.orig/mm/mmap.c	2007-12-14 16:40:49.000000000 +0200
+++ 2.6/mm/mmap.c	2007-12-14 16:40:55.000000000 +0200
@@ -201,6 +201,17 @@ static void __remove_shared_vm_struct(st
 }
 
 /*
+ * Requires inode->i_mapping->i_mmap_lock
+ */
+void __unlink_file_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+
+	__remove_shared_vm_struct(vma, file, mapping);
+}
+
+/*
  * Unlink a file-based vm structure from its prio_tree, to hide
  * vma from rmap and vmtruncate before freeing its page tables.
  */
Index: 2.6/include/linux/magic.h
===================================================================
--- 2.6.orig/include/linux/magic.h	2007-11-23 09:58:11.000000000 +0200
+++ 2.6/include/linux/magic.h	2007-12-14 16:40:55.000000000 +0200
@@ -34,7 +34,7 @@ #define REISERFS_SUPER_MAGIC	0x52654973	
 #define REISERFS_SUPER_MAGIC_STRING	"ReIsErFs"
 #define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
-
+#define REVOKEFS_MAGIC		0x5245564B /* REVK */
 #define SMB_SUPER_MAGIC		0x517B
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
 #define CGROUP_SUPER_MAGIC	0x27e0eb
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ