lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20070307112759.GA3796@2ka.mipt.ru>
Date:	Wed, 7 Mar 2007 14:27:59 +0300
From:	Evgeniy Polyakov <johnpol@....mipt.ru>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	Davide Libenzi <davidel@...ilserver.org>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Arjan van de Ven <arjan@...radead.org>,
	Christoph Hellwig <hch@...radead.org>,
	Andrew Morton <akpm@....com.au>,
	Alan Cox <alan@...rguk.ukuu.org.uk>,
	Ulrich Drepper <drepper@...hat.com>,
	Zach Brown <zach.brown@...cle.com>,
	"David S. Miller" <davem@...emloft.net>,
	Suparna Bhattacharya <suparna@...ibm.com>,
	Jens Axboe <jens.axboe@...cle.com>
Subject: [1/1] eventfs: pseudo fs which allows to bind events to file descriptors.

Hello.

This pseudo filesystem allows binding a file descriptor to different kinds
of events, so that they can be polled using epoll().

This particular morning hack supports signals only.

If the idea is considered sound, I can cook up POSIX timers support.

Signal delivery note.
If a special flag is set in signalfd(signo, flag), then signals are _not_
delivered through the pending mask update but only through the epoll queue.
(Copied from kevent.)

Userspace signal code and patch itself can be found at:
http://tservice.net.ru/~s0mbre/archive/eventfs/

signal.c is also attached for the interested reader.

Signed-off-by: Evgeniy Polyakov <johnpol@....mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 2697e92..b14ee54 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,4 @@ ENTRY(sys_call_table)
 	.long sys_move_pages
 	.long sys_getcpu
 	.long sys_epoll_pwait
+	.long sys_signalfd		/* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index eda7a0d..bc6336c 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -719,4 +719,5 @@ ia32_sys_call_table:
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
 	.quad sys_epoll_pwait
+	.quad sys_signalfd
 ia32_syscall_end:		
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b..09803ad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1032,6 +1032,15 @@ config CONFIGFS_FS
 	  Both sysfs and configfs can and should exist together on the
 	  same system. One is not a replacement for the other.
 
+config EVENTFS
+	bool "Enable eventpoll filesystem support" if EMBEDDED
+	depends on EPOLL
+	default y
+	help
+	  Allows binding file descriptors to different kinds of objects,
+	  such as signals and timers, and working with them using the
+	  epoll family of system calls.
+
 endmenu
 
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 9edf411..185bcb1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,7 @@ endif
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_EVENTFS)		+= eventfs.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/eventfs.c b/fs/eventfs.c
new file mode 100644
index 0000000..dae108c
--- /dev/null
+++ b/fs/eventfs.c
@@ -0,0 +1,221 @@
+/*
+ * 2007 Copyright (c) Evgeniy Polyakov <johnpol@....mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <asm/io.h>
+
+static inline void eventfs_set_signal_file(int sig, struct file *file)
+{
+	spin_lock_irq(&current->sighand->siglock);	/* slot updates happen under siglock */
+	current->signal_file[sig-1] = file;		/* bind @file (NULL unbinds) to 1-based @sig of current */
+	spin_unlock_irq(&current->sighand->siglock);
+}
+	
+static int eventfs_signal_release(struct inode *inode, struct file *file)
+{
+	int sig = (int)((unsigned long)(file->private_data) & 0x0fffffff);	/* low 28 bits hold the signal number */
+	eventfs_set_signal_file(sig, NULL);	/* NOTE(review): clears the slot of whichever task is 'current' at release time */
+	return 0;
+}
+
+static unsigned int eventfs_signal_poll(struct file *file, struct poll_table_struct *wait)
+{
+	int sig = (int)((unsigned long)(file->private_data) & 0x0fffffff);	/* low 28 bits: signal number */
+	unsigned int mask = 0;
+	unsigned long flags;
+	/* Poll method: report the file readable once after send_signal() has set bit 0x40000000. */
+	poll_wait(file, &current->signal_wait, wait);
+	/* Test and clear the bit under siglock; nothing is reported while the signal is blocked. */
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	if (!sigismember(&current->blocked, sig) && (((unsigned long)(file->private_data)) & 0x40000000)) {
+		mask = POLLIN | POLLRDNORM;
+		file->private_data = (void *)(((unsigned long)(file->private_data)) & ~0x40000000);
+	}
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+	return mask;
+}
+
+struct file_operations eventfs_signal_fops = {	/* passed to eventfs_init() by sys_signalfd() */
+	.release	= eventfs_signal_release,	/* unbinds the signal slot */
+	.poll		= eventfs_signal_poll,		/* readiness reporting for poll/epoll */
+	.owner		= THIS_MODULE,
+};
+
+static struct vfsmount *eventfs_mnt __read_mostly;	/* internal mount, set by eventfs_sys_init() via kern_mount() */
+
+static int eventfs_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "eventfs", NULL, 0x193748dd, mnt);	/* 0x193748dd: presumably the superblock magic - confirm */
+}
+
+static struct file_system_type eventfs_fs_type = {	/* registered and kern_mount()ed by eventfs_sys_init() */
+	.name		= "eventfs",
+	.get_sb		= eventfs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int eventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;	/* presumably asks the dcache not to keep unused eventfs dentries - confirm against d_delete semantics */
+}
+
+static struct dentry_operations eventfs_dentry_operations = {	/* attached to every dentry created in eventfs_init() */
+	.d_delete	= eventfs_delete_dentry,
+};
+
+/*
+ * Create an anonymous eventfs file, install it in a new fd of the current task
+ * and store it in *filp.  Returns the fd, or a negative errno on failure.
+ */
+static int eventfs_init(struct file **filp, struct file_operations *fops, unsigned long priv)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+	inode = new_inode(eventfs_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+	inode->i_fop = fops;
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	snprintf(name, sizeof(name), "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	err = -ENOMEM;	/* err still holds the fd; a d_alloc() failure must return an errno, not the released fd */
+	dentry = d_alloc(eventfs_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_put_fd;
+	dentry->d_op = &eventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(eventfs_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = (void *)priv;	/* cookie read back by the fops handlers */
+	fd_install(fd, file);
+	*filp = file;
+	return fd;
+
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+asmlinkage long sys_signalfd(int sig, int flags)
+{
+	int fd, err = 0;
+	struct file *file;
+	unsigned long priv = sig;	/* private_data word: low bits carry the signal number */
+	/* Create an fd that eventfs_signal_poll() reports readable when @sig is sent to this task. */
+	if (!valid_signal(sig) || sig < 1/* || sig_kernel_only(sig) */)
+		return -EINVAL;
+	/* Reserve the per-task slot; only one file per signal is allowed (-EEXIST otherwise). */
+	spin_lock_irq(&current->sighand->siglock);
+	file = current->signal_file[sig-1];
+	if (file)
+		err = -EEXIST;
+	else
+		current->signal_file[sig-1] = (void *)1;	/* placeholder, not a real file; NOTE(review): send_signal() dereferences any non-NULL slot */
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (err)
+		return err;
+
+	file = NULL;
+	if (flags)
+		priv |= 0x80000000;	/* any non-zero @flags: send_signal() then returns 1 before its normal delivery path */
+	fd = eventfs_init(&file, &eventfs_signal_fops, priv);
+	if (fd < 0)
+		goto err_out_clean_file;
+	
+	eventfs_set_signal_file(sig, file);	/* replace the placeholder with the real file */
+
+	return fd;
+
+err_out_clean_file:
+	eventfs_set_signal_file(sig, NULL);	/* drop the reservation taken above */
+	return fd;
+}
+
+/*
+ * Eventfs subsystem initialization - register the filesystem and create the
+ * internal mount (eventfs_mnt) that eventfs_init() takes its superblock from.
+ */
+static int __init eventfs_sys_init(void)
+{
+	int err;
+
+	err = register_filesystem(&eventfs_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	eventfs_mnt = kern_mount(&eventfs_fs_type);
+	err = PTR_ERR(eventfs_mnt);	/* only used if IS_ERR() is true below */
+	if (IS_ERR(eventfs_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "Eventfs subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&eventfs_fs_type);
+err_out_exit:
+	return err;
+}
+
+module_init(eventfs_sys_init);
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 833fa17..c72a568 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -325,10 +325,11 @@
 #define __NR_move_pages		317
 #define __NR_getcpu		318
 #define __NR_epoll_pwait	319
+#define __NR_signalfd		320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 320
+#define NR_syscalls 321
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index c5f596e..62a21f3 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_signalfd		280
+__SYSCALL(__NR_signalfd, sys_signalfd)
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_signalfd
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe299..22c1412 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -935,6 +935,10 @@ struct task_struct {
 /* signal handlers */
 	struct signal_struct *signal;
 	struct sighand_struct *sighand;
+#ifdef CONFIG_EVENTFS
+	struct file *signal_file[_NSIG];
+	wait_queue_head_t signal_wait;
+#endif
 
 	sigset_t blocked, real_blocked;
 	sigset_t saved_sigmask;		/* To be restored with TIF_RESTORE_SIGMASK */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6c..b34f4e6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -605,4 +605,6 @@ asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct g
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage long sys_signalfd(int sig, int flags);
+
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index d154cc7..1b318da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1128,6 +1128,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
 
+#ifdef CONFIG_EVENTFS
+	memset(p->signal_file, 0, sizeof(p->signal_file));	/* size in bytes: ARRAY_SIZE() would clear only _NSIG bytes, not _NSIG pointers */
+	init_waitqueue_head(&p->signal_wait);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/signal.c b/kernel/signal.c
index 3670225..059977c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -739,6 +739,16 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	struct sigqueue * q = NULL;
 	int ret = 0;
 
+#ifdef CONFIG_EVENTFS
+	if (t->signal_file[sig-1]) {
+		struct file *file = t->signal_file[sig-1];
+		file->private_data = (void *)(((unsigned long)(file->private_data)) | 0x40000000);
+		wake_up(&t->signal_wait);
+		if (((unsigned long)(file->private_data)) & 0x80000000)
+			return 1;
+	}
+#endif
+
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
 	 * or SIGKILL.
@@ -817,6 +827,18 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_EVENTFS
+	/*
+	 * Eventfs allows signals to be delivered through the epoll queue.
+	 * A signalfd can be set up so that the signal is not delivered
+	 * in the usual way; in that case send_signal() returns 1 and the
+	 * signal is delivered only through the epoll queue.
+	 * We simulate a successful delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+
+#endif
 out:
 	return ret;
 }
@@ -1006,6 +1028,18 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_EVENTFS
+	/*
+	 * Eventfs allows signals to be delivered through the epoll queue.
+	 * A signalfd can be set up so that the signal is not delivered
+	 * in the usual way; in that case send_signal() returns 1 and the
+	 * signal is delivered only through the epoll queue.
+	 * We simulate a successful delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+
+#endif
 	if (unlikely(ret))
 		return ret;
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7306d0..c131d20 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -113,6 +113,8 @@ cond_syscall(sys_vm86);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
 
+cond_syscall(sys_signalfd);
+
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
 cond_syscall(sys_pciconfig_write);

-- 
	Evgeniy Polyakov

View attachment "signal.c" of type "text/plain" (1863 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ