linux-kernel - [patch 04/12] syslets: core code

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20070228214149.GD2305@elte.hu>
Date:	Wed, 28 Feb 2007 22:41:49 +0100
From:	Ingo Molnar <mingo@...e.hu>
To:	linux-kernel@...r.kernel.org
Cc:	Linus Torvalds <torvalds@...ux-foundation.org>,
	Arjan van de Ven <arjan@...radead.org>,
	Christoph Hellwig <hch@...radead.org>,
	Andrew Morton <akpm@....com.au>,
	Alan Cox <alan@...rguk.ukuu.org.uk>,
	Ulrich Drepper <drepper@...hat.com>,
	Zach Brown <zach.brown@...cle.com>,
	Evgeniy Polyakov <johnpol@....mipt.ru>,
	"David S. Miller" <davem@...emloft.net>,
	Suparna Bhattacharya <suparna@...ibm.com>,
	Davide Libenzi <davidel@...ilserver.org>,
	Jens Axboe <jens.axboe@...cle.com>,
	Thomas Gleixner <tglx@...utronix.de>
Subject: [patch 04/12] syslets: core code

From: Ingo Molnar <mingo@...e.hu>

the core syslet / async system calls infrastructure code.

Is built only if CONFIG_ASYNC_SUPPORT is enabled.

Signed-off-by: Ingo Molnar <mingo@...e.hu>
Signed-off-by: Arjan van de Ven <arjan@...ux.intel.com>
---
 kernel/Makefile |    1 
 kernel/async.c  |  989 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 990 insertions(+)

Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
 
+obj-$(CONFIG_ASYNC_SUPPORT) += async.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
Index: linux/kernel/async.c
===================================================================
--- /dev/null
+++ linux/kernel/async.c
@@ -0,0 +1,989 @@
+/*
+ * kernel/async.c
+ *
+ * The syslet and threadlet subsystem - asynchronous syscall and
+ * user-space code execution support.
+ *
+ * Started by Ingo Molnar:
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@...hat.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This code implements asynchronous syscalls via 'syslets'.
+ *
+ * Syslets consist of a set of 'syslet atoms' which are residing
+ * purely in user-space memory and have no kernel-space resource
+ * attached to them. These atoms can be linked to each other via
+ * pointers. Besides the fundamental ability to execute system
+ * calls, syslet atoms can also implement branches, loops and
+ * arithmetics.
+ *
+ * Thus syslets can be used to build small autonomous programs that
+ * the kernel can execute purely from kernel-space, without having
+ * to return to any user-space context. Syslets can be run by any
+ * unprivileged user-space application - they are executed safely
+ * by the kernel.
+ *
+ * "Threadlets" are the user-space equivalent of syslets: small
+ * functions of execution that user-space attempts/expects to execute
+ * without scheduling. If the threadlet nevertheless blocks, the kernel
+ * creates a real thread from it, and that thread is put aside sleeping.
+ * The 'head' context (the context that never blocks) returns to the
+ * original function that called the threadlet. Once the sleeping thread
+ * wakes up again (after it got for whatever it was waiting - IO, timeout,
+ * etc.) the function continues executing asynchronously, as a thread.
+ * A user-space completion ring connects these asynchronous function calls
+ * back to the head context.
+ */
+#include <linux/syscalls.h>
+#include <linux/syslet.h>
+#include <linux/delay.h>
+#include <linux/async.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+/*
+ * An async 'cachemiss context' is either busy, or it is ready.
+ * If it is ready, the 'head' might switch its user-space context
+ * to that ready thread anytime - so that if the ex-head blocks,
+ * one ready thread can become the next head and can continue to
+ * execute user-space code.
+ */
+static void
+__mark_async_thread_ready(struct async_thread *at, struct async_head *ah)
+{
+	list_del(&at->entry);
+	list_add_tail(&at->entry, &ah->ready_async_threads);
+	if (list_empty(&ah->busy_async_threads))
+		wake_up(&ah->wait);
+}
+
+static void
+mark_async_thread_ready(struct async_thread *at, struct async_head *ah)
+{
+	spin_lock(&ah->lock);
+	__mark_async_thread_ready(at, ah);
+	spin_unlock(&ah->lock);
+}
+
+static void
+__mark_async_thread_busy(struct async_thread *at, struct async_head *ah)
+{
+	list_del(&at->entry);
+	list_add_tail(&at->entry, &ah->busy_async_threads);
+}
+
+static void
+mark_async_thread_busy(struct async_thread *at, struct async_head *ah)
+{
+	spin_lock(&ah->lock);
+	__mark_async_thread_busy(at, ah);
+	spin_unlock(&ah->lock);
+}
+
+static void
+__async_thread_init(struct task_struct *t, struct async_thread *at,
+		    struct async_head *ah)
+{
+	INIT_LIST_HEAD(&at->entry);
+	at->exit = 0;
+	at->task = t;
+	at->ah = ah;
+
+	t->at = at;
+}
+
+static void
+async_thread_init(struct task_struct *t, struct async_thread *at,
+		  struct async_head *ah)
+{
+	spin_lock(&ah->lock);
+	__async_thread_init(t, at, ah);
+	__mark_async_thread_ready(at, ah);
+	spin_unlock(&ah->lock);
+}
+
+static void
+async_thread_exit(struct async_thread *at, struct task_struct *t)
+{
+	struct async_head *ah = at->ah;
+
+	spin_lock(&ah->lock);
+	list_del_init(&at->entry);
+	if (at->exit)
+		complete(&ah->exit_done);
+	t->at = NULL;
+	at->task = NULL;
+	spin_unlock(&ah->lock);
+}
+
+static struct async_thread *
+pick_ready_cachemiss_thread(struct async_head *ah)
+{
+	struct list_head *head = &ah->ready_async_threads;
+
+	if (list_empty(head))
+		return NULL;
+
+	return list_entry(head->next, struct async_thread, entry);
+}
+
+void __async_schedule(struct task_struct *t)
+{
+	struct async_thread *new_async_thread;
+	struct async_thread *async_ready;
+	struct async_head *ah = t->ah;
+	struct task_struct *new_task;
+
+	WARN_ON(!ah);
+	spin_lock(&ah->lock);
+
+	new_async_thread = pick_ready_cachemiss_thread(ah);
+	if (!new_async_thread)
+		goto out_unlock;
+
+	async_ready = t->async_ready;
+	WARN_ON(!async_ready);
+	t->async_ready = NULL;
+
+	new_task = new_async_thread->task;
+
+	move_user_context(new_task, t);
+	if (ah->restore_stack) {
+		set_task_stack_reg(new_task, ah->restore_stack);
+		WARN_ON(!ah->restore_ip);
+		task_ip_reg(new_task) = ah->restore_ip;
+		/*
+		 * The return code 0 is needed to tell the
+		 * head user-context that the threadlet went async:
+		 */
+		task_ret_reg(new_task) = 0;
+	}
+
+	new_task->at = NULL;
+	t->ah = NULL;
+	new_task->ah = ah;
+	ah->user_task = new_task;
+
+	wake_up_process(new_task);
+
+	__async_thread_init(t, async_ready, ah);
+	__mark_async_thread_busy(t->at, ah);
+
+ out_unlock:
+	spin_unlock(&ah->lock);
+}
+
+static void async_schedule(struct task_struct *t)
+{
+	if (t->async_ready)
+		__async_schedule(t);
+}
+
+static long __exec_atom(struct task_struct *t, struct syslet_atom *atom)
+{
+	struct async_thread *async_ready_save;
+	long ret;
+
+	/*
+	 * If user-space expects the syscall to schedule then
+	 * (try to) switch user-space to another thread straight
+	 * away and execute the syscall asynchronously:
+	 */
+	if (unlikely(atom->flags & SYSLET_ASYNC))
+		async_schedule(t);
+	/*
+	 * Does user-space want synchronous execution for this atom?:
+	 */
+	async_ready_save = t->async_ready;
+	if (unlikely(atom->flags & SYSLET_SYNC))
+		t->async_ready = NULL;
+
+	if (unlikely(atom->nr >= atom->nr_syscalls))
+		return -ENOSYS;
+
+	ret = atom->call_table[atom->nr](atom->args[0], atom->args[1],
+					 atom->args[2], atom->args[3],
+					 atom->args[4], atom->args[5]);
+
+	if (atom->ret_ptr && put_user(ret, atom->ret_ptr))
+		return -EFAULT;
+
+	if (t->ah)
+		t->async_ready = async_ready_save;
+
+	return ret;
+}
+
+/*
+ * Arithmetics syscall, add a value to a user-space memory location.
+ *
+ * Generic C version - in case the architecture has not implemented it
+ * in assembly.
+ */
+asmlinkage __attribute__((weak)) long
+sys_umem_add(unsigned long __user *uptr, unsigned long inc)
+{
+	unsigned long val, new_val;
+
+	if (get_user(val, uptr))
+		return -EFAULT;
+	/*
+	 * inc == 0 means 'read memory value':
+	 */
+	if (!inc)
+		return val;
+
+	new_val = val + inc;
+	if (__put_user(new_val, uptr))
+		return -EFAULT;
+
+	return new_val;
+}
+
+/*
+ * Open-coded because this is a very hot codepath during syslet
+ * execution and every cycle counts ...
+ *
+ * [ NOTE: it's an explicit fastcall because optimized assembly code
+ *   might depend on this. There are some kernels that disable regparm,
+ *   so lets not break those if possible. ]
+ */
+fastcall __attribute__((weak)) long
+copy_uatom(struct syslet_atom *atom, struct syslet_uatom __user *uatom)
+{
+	unsigned long __user *arg_ptr;
+	long ret = 0;
+
+	if (!access_ok(VERIFY_READ, uatom, sizeof(*uatom)))
+		return -EFAULT;
+
+	ret = __get_user(atom->nr, &uatom->nr);
+	ret |= __get_user(atom->ret_ptr, (long __user **)&uatom->ret_ptr);
+	ret |= __get_user(atom->flags, (unsigned long __user *)&uatom->flags);
+	ret |= __get_user(atom->next,
+			(struct syslet_uatom  __user **)&uatom->next);
+
+	memset(atom->args, 0, sizeof(atom->args));
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[0]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[0], arg_ptr);
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[1]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[1], arg_ptr);
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[2]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[2], arg_ptr);
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[3]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[3], arg_ptr);
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[4]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[4], arg_ptr);
+
+	ret |= __get_user(arg_ptr, (unsigned long __user **)&uatom->arg_ptr[5]);
+	if (!arg_ptr)
+		return ret;
+	if (!access_ok(VERIFY_READ, arg_ptr, sizeof(*arg_ptr)))
+		return -EFAULT;
+	ret |= __get_user(atom->args[5], arg_ptr);
+
+	return ret;
+}
+
+/*
+ * Should the next atom run, depending on the return value of
+ * the current atom - or should we stop execution?
+ */
+static int run_next_atom(struct syslet_atom *atom, long ret)
+{
+	switch (atom->flags & SYSLET_STOP_MASK) {
+		case SYSLET_STOP_ON_NONZERO:
+			if (!ret)
+				return 1;
+			return 0;
+		case SYSLET_STOP_ON_ZERO:
+			if (ret)
+				return 1;
+			return 0;
+		case SYSLET_STOP_ON_NEGATIVE:
+			if (ret >= 0)
+				return 1;
+			return 0;
+		case SYSLET_STOP_ON_NON_POSITIVE:
+			if (ret > 0)
+				return 1;
+			return 0;
+	}
+	return 1;
+}
+
+static struct syslet_uatom __user *
+next_uatom(struct syslet_atom *atom, struct syslet_uatom *uatom, long ret)
+{
+	/*
+	 * If the stop condition is false then continue
+	 * to atom->next:
+	 */
+	if (run_next_atom(atom, ret))
+		return atom->next;
+	/*
+	 * Special-case: if the stop condition is true and the atom
+	 * has SKIP_TO_NEXT_ON_STOP set, then instead of
+	 * stopping we skip to the atom directly after this atom
+	 * (in linear address-space).
+	 *
+	 * This, combined with the atom->next pointer and the
+	 * stop condition flags is what allows true branches and
+	 * loops in syslets:
+	 */
+	if (atom->flags & SYSLET_SKIP_TO_NEXT_ON_STOP)
+		return uatom + 1;
+
+	return NULL;
+}
+
+/*
+ * If user-space requested a completion event then put the last
+ * executed uatom into the completion ring:
+ */
+static long
+completion_event(struct async_head *ah, struct task_struct *t,
+		 void __user *event, struct async_head_user __user *ahu)
+{
+	unsigned long ring_size_bytes, max_ring_idx, kernel_ring_idx;
+	struct syslet_uatom __user *slot_val = NULL;
+	u64 __user *completion_ring, *ring_slot;
+
+	WARN_ON(!t->at);
+	WARN_ON(t->ah);
+
+	if (!access_ok(VERIFY_WRITE, ahu, sizeof(*ahu)))
+		return -EFAULT;
+
+	if (__get_user(completion_ring,
+				(u64 __user **)&ahu->completion_ring_ptr))
+		return -EFAULT;
+	if (__get_user(ring_size_bytes,
+			(unsigned long __user *)&ahu->ring_size_bytes))
+		return -EFAULT;
+	if (!ring_size_bytes)
+		return -EINVAL;
+
+	max_ring_idx = ring_size_bytes / sizeof(u64);
+	if (ring_size_bytes != max_ring_idx * sizeof(u64))
+		return -EINVAL;
+	/*
+	 * We pre-check the ring pointer, so that in the fastpath
+	 * we can use __get_user():
+	 */
+	if (!access_ok(VERIFY_WRITE, completion_ring, ring_size_bytes))
+		return -EFAULT;
+
+	mutex_lock(&ah->completion_lock);
+	/*
+	 * Asynchron threads can complete in parallel so use the
+	 * head-lock to serialize:
+	 */
+	if (__get_user(kernel_ring_idx,
+			(unsigned long __user *)&ahu->kernel_ring_idx))
+		goto fault_unlock;
+	if (kernel_ring_idx >= max_ring_idx)
+		goto err_unlock;
+
+	ring_slot = completion_ring + kernel_ring_idx;
+	if (__get_user(slot_val, (struct syslet_uatom __user **)ring_slot))
+		goto fault_unlock;
+	/*
+	 * User-space submitted more work than what fits into the
+	 * completion ring - do not stomp over it silently and signal
+	 * the error condition:
+	 */
+	if (slot_val)
+		goto err_unlock;
+
+	slot_val = event;
+	if (__put_user(slot_val, (struct syslet_uatom __user **)ring_slot))
+		goto fault_unlock;
+	/*
+	 * Update the ring index:
+	 */
+	kernel_ring_idx++;
+	if (kernel_ring_idx == max_ring_idx)
+		kernel_ring_idx = 0;
+
+	if (__put_user(kernel_ring_idx, &ahu->kernel_ring_idx))
+		goto fault_unlock;
+
+	/*
+	 * See whether the async-head is waiting and needs a wakeup:
+	 */
+	if (ah->events_left) {
+		if (!--ah->events_left) {
+			/*
+			 * We first unlock the mutex - to reduce the size
+			 * of the critical section. We have a safe
+			 * reference to 'ah':
+			 */
+			mutex_unlock(&ah->completion_lock);
+			wake_up(&ah->wait);
+			goto out;
+		}
+	}
+
+	mutex_unlock(&ah->completion_lock);
+ out:
+	return 0;
+
+ fault_unlock:
+	mutex_unlock(&ah->completion_lock);
+
+	return -EFAULT;
+
+ err_unlock:
+	mutex_unlock(&ah->completion_lock);
+
+	return -EINVAL;
+}
+
+/*
+ * This is the main syslet atom execution loop. This fetches atoms
+ * and executes them until it runs out of atoms or until the
+ * exit condition becomes false:
+ */
+static struct syslet_uatom __user *
+exec_atom(struct async_head *ah, struct task_struct *t,
+	  struct syslet_uatom __user *uatom,
+	  struct async_head_user __user *ahu,
+	  syscall_fn_t *call_table,
+	  unsigned int nr_syscalls)
+{
+	struct syslet_uatom __user *last_uatom;
+	struct syslet_atom atom;
+	long ret;
+
+	atom.call_table = call_table;
+	atom.nr_syscalls = nr_syscalls;
+
+ run_next:
+	if (unlikely(copy_uatom(&atom, uatom)))
+		return ERR_PTR(-EFAULT);
+
+	last_uatom = uatom;
+	ret = __exec_atom(t, &atom);
+	if (unlikely(signal_pending(t)))
+		goto stop;
+	if (need_resched())
+		cond_resched();
+
+	uatom = next_uatom(&atom, uatom, ret);
+	if (uatom)
+		goto run_next;
+ stop:
+	/*
+	 * We do completion only in async context:
+	 */
+	if (t->at && !(atom.flags & SYSLET_NO_COMPLETE)) {
+		if (completion_event(ah, t, last_uatom, ahu))
+			return ERR_PTR(-EFAULT);
+	}
+
+	return last_uatom;
+}
+
+static long
+cachemiss_loop(struct async_thread *at, struct async_head *ah,
+	       struct task_struct *t)
+{
+	for (;;) {
+		mark_async_thread_busy(at, ah);
+		set_task_state(t, TASK_INTERRUPTIBLE);
+		if (unlikely(t->ah || at->exit || signal_pending(t)))
+			break;
+		mark_async_thread_ready(at, ah);
+		schedule();
+	}
+	t->state = TASK_RUNNING;
+
+	async_thread_exit(at, t);
+
+	if (at->exit)
+		do_exit(0);
+
+	if (!t->ah) {
+		/*
+		 * Cachemiss threads return to one given
+		 * user-space instruction address and stack
+		 * pointer:
+		 */
+		set_task_stack_reg(t, at->user_stack);
+		task_ip_reg(t) = at->user_ip;
+
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * This is what a newly created cachemiss thread executes for the
+ * first time: initialize, pick up the user stack/IP addresses from
+ * the head and then execute the cachemiss loop. If the cachemiss
+ * loop returns then we return back to user-space:
+ */
+static long cachemiss_thread(void *data)
+{
+	struct pt_regs *head_regs, *regs;
+	struct task_struct *t = current;
+	struct async_head *ah = data;
+	struct async_thread *at;
+	int ret;
+
+	at = &t->__at;
+	async_thread_init(t, at, ah);
+
+	/*
+	 * Clone the head thread's user-space ptregs over,
+	 * now that we are in kernel-space:
+	 */
+	head_regs = task_pt_regs(ah->user_task);
+	regs = task_pt_regs(t);
+
+	*regs = *head_regs;
+	ret = get_user(at->user_stack, ah->new_stackp);
+	WARN_ON(ret);
+	/*
+	 * Clear the stack pointer, signalling to user-space that
+	 * this thread stack has been used up:
+	 */
+	ret = put_user(0, ah->new_stackp);
+	WARN_ON(ret);
+
+	complete(&ah->start_done);
+
+	return cachemiss_loop(at, ah, t);
+}
+
+/**
+ * sys_async_thread - do work as an async cachemiss thread again
+ *
+ * @event: completion event
+ * @ahu: async head
+ *
+ * If an async thread has returned back to user-space (due to say
+ * a signal) then it is a 'busy' thread during that period. It
+ * can again offer itself into the cachemiss pool by calling this
+ * syscall:
+ */
+asmlinkage long
+sys_async_thread(void __user *event, struct async_head_user __user *ahu)
+{
+	struct task_struct *t = current;
+	struct async_thread *at = t->at;
+	struct async_head *ah = t->__at.ah;
+
+	/*
+	 * Only async threads are allowed to do this:
+	 */
+	if (!ah || t->ah)
+		return -EINVAL;
+
+	/*
+	 * A threadlet might want to signal a completion event:
+	 */
+	if (event) {
+		/*
+		 * threadlet - make sure the stack is never used
+		 * again by this thread:
+		 */
+		set_task_stack_reg(t, 0x11111111);
+		task_ip_reg(t) = 0x22222222;
+
+		if (completion_event(ah, t, event, ahu))
+			return -EFAULT;
+	}
+	/*
+	 * If a cachemiss threadlet calls sys_async_thread()
+	 * then we first have to mark it ready:
+	 */
+	if (at) {
+		mark_async_thread_ready(at, ah);
+	} else {
+		at = &t->__at;
+		WARN_ON(!at->ah);
+
+		async_thread_init(t, at, ah);
+	}
+
+	return cachemiss_loop(at, at->ah, t);
+}
+
+/*
+ * Initialize the in-kernel async head, based on the user-space async
+ * head:
+ */
+static long
+async_head_init(struct task_struct *t, struct async_head_user __user *ahu)
+{
+	struct async_head *ah;
+
+	ah = &t->__ah;
+
+	spin_lock_init(&ah->lock);
+	INIT_LIST_HEAD(&ah->ready_async_threads);
+	INIT_LIST_HEAD(&ah->busy_async_threads);
+	init_waitqueue_head(&ah->wait);
+	mutex_init(&ah->completion_lock);
+	ah->events_left = 0;
+	ah->ahu = NULL;
+	ah->new_stackp = NULL;
+	ah->new_ip = 0;
+	ah->restore_stack = 0;
+	ah->restore_ip = 0;
+	ah->user_task = t;
+	t->ah = ah;
+
+	return 0;
+}
+
+/*
+ * If the head cache-misses then it will become a cachemiss
+ * thread after having finished its current syslet. If it
+ * returns to user-space after that point (to handle a signal
+ * for example) then it will need a thread stack of its own:
+ */
+static long init_head(struct async_head *ah, struct task_struct *t,
+		      struct async_head_user __user *ahu)
+{
+	unsigned long head_stack, head_ip;
+
+	if (get_user(head_stack, (unsigned long __user *)&ahu->head_stack))
+		return -EFAULT;
+	if (get_user(head_ip, (unsigned long __user *)&ahu->head_ip))
+		return -EFAULT;
+	t->__at.user_stack = head_stack;
+	t->__at.user_ip = head_ip;
+
+	return async_head_init(t, ahu);
+}
+
+/*
+ * Simple limit and pool management mechanism for now:
+ */
+static long
+refill_cachemiss_pool(struct async_head *ah, struct task_struct *t,
+		      struct async_head_user __user *ahu)
+{
+	unsigned long new_ip;
+	int pid, ret;
+
+	init_completion(&ah->start_done);
+	ah->new_stackp = (unsigned long __user *)&ahu->new_thread_stack;
+	ret = get_user(new_ip, (unsigned long __user *)&ahu->new_thread_ip);
+	WARN_ON(ret);
+	ah->new_ip = new_ip;
+
+	pid = create_async_thread(cachemiss_thread, (void *)ah,
+			   CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
+			   CLONE_THREAD | CLONE_SYSVSEM);
+	if (pid < 0)
+		return pid;
+
+	wait_for_completion(&ah->start_done);
+	ah->new_stackp = NULL;
+	ah->new_ip = 0;
+
+	return 0;
+}
+
+/**
+ * sys_async_exec - execute a syslet.
+ *
+ * returns the uatom that was last executed, if the kernel was able to
+ * execute the syslet synchronously, or NULL if the syslet became
+ * asynchronous. (in the latter case syslet completion will be notified
+ * via the completion ring)
+ *
+ * (Various errors might also be returned via the usual negative numbers.)
+ */
+static struct syslet_uatom __user *
+__sys_async_exec(struct syslet_uatom __user *uatom,
+		 struct async_head_user __user *ahu,
+		 syscall_fn_t *call_table,
+		 unsigned int nr_syscalls)
+{
+	struct syslet_uatom __user *ret;
+	struct task_struct *t = current;
+	struct async_head *ah = t->ah;
+	struct async_thread *at = &t->__at;
+
+	/*
+	 * Do not allow recursive calls of sys_async_exec():
+	 */
+	if (async_syscall(t))
+		return ERR_PTR(-ENOSYS);
+
+	if (!uatom || !ahu || !ahu->new_thread_stack)
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(!ah)) {
+		ret = (void *)init_head(ah, t, ahu);
+		if (ret)
+			return ret;
+		ah = t->ah;
+	}
+
+	if (unlikely(list_empty(&ah->ready_async_threads))) {
+		ret = (void *)refill_cachemiss_pool(ah, t, ahu);
+		if (ret)
+			return ret;
+	}
+
+	t->async_ready = at;
+	ah->ahu = ahu;
+
+	ret = exec_atom(ah, t, uatom, ahu, call_table, nr_syscalls);
+
+	/*
+	 * Are we still executing as head?
+	 */
+	if (t->ah) {
+		t->async_ready = NULL;
+
+		return ret;
+	}
+
+	/*
+	 * We got turned into a cachemiss thread,
+	 * enter the cachemiss loop:
+	 */
+	set_task_state(t, TASK_INTERRUPTIBLE);
+	mark_async_thread_ready(at, ah);
+
+	return ERR_PTR(cachemiss_loop(at, ah, t));
+}
+
+asmlinkage struct syslet_uatom __user *
+sys_async_exec(struct syslet_uatom __user *uatom,
+	       struct async_head_user __user *ahu)
+{
+	return __sys_async_exec(uatom, ahu, sys_call_table, NR_syscalls);
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage struct syslet_uatom __user *
+compat_sys_async_exec(struct syslet_uatom __user *uatom,
+		      struct async_head_user __user *ahu)
+{
+	return __sys_async_exec(uatom, ahu, compat_sys_call_table,
+				compat_NR_syscalls);
+}
+
+#endif
+
+/**
+ * sys_async_wait - wait for async completion events
+ *
+ * This syscall waits for @min_wait_events syslet completion events
+ * to finish or for all async processing to finish (whichever
+ * comes first).
+ */
+asmlinkage long
+sys_async_wait(unsigned long min_wait_events, unsigned long user_ring_idx,
+	       struct async_head_user __user *ahu)
+{
+	struct task_struct *t = current;
+	struct async_head *ah = t->ah;
+	unsigned long kernel_ring_idx;
+
+	/*
+	 * Do not allow async waiting:
+	 */
+	if (async_syscall(t))
+		return -ENOSYS;
+	if (!ah)
+		return -EINVAL;
+
+	mutex_lock(&ah->completion_lock);
+	if (get_user(kernel_ring_idx,
+			(unsigned long __user *)&ahu->kernel_ring_idx))
+		goto err_unlock;
+	/*
+	 * Account any completions that happened since user-space
+	 * checked the ring:
+ 	 */
+	ah->events_left = min_wait_events - (kernel_ring_idx - user_ring_idx);
+	mutex_unlock(&ah->completion_lock);
+
+	return wait_event_interruptible(ah->wait,
+		list_empty(&ah->busy_async_threads) || ah->events_left <= 0);
+
+ err_unlock:
+	mutex_unlock(&ah->completion_lock);
+	return -EFAULT;
+}
+
+asmlinkage long
+sys_threadlet_on(unsigned long restore_stack,
+		 unsigned long restore_ip,
+		 struct async_head_user __user *ahu)
+{
+	struct task_struct *t = current;
+	struct async_head *ah = t->ah;
+	struct async_thread *at = &t->__at;
+	long ret;
+
+	/*
+	 * Do not allow recursive calls of sys_threadlet_on():
+	 */
+	if (t->async_ready || t->at)
+		return -EINVAL;
+
+	if (unlikely(!ah)) {
+		ret = init_head(ah, t, ahu);
+		if (ret)
+			return ret;
+		ah = t->ah;
+	}
+
+	if (unlikely(list_empty(&ah->ready_async_threads))) {
+		ret = refill_cachemiss_pool(ah, t, ahu);
+		if (ret)
+			return ret;
+	}
+
+	t->async_ready = at;
+	ah->restore_stack = restore_stack;
+	ah->restore_ip = restore_ip;
+
+	ah->ahu = ahu;
+
+	return 0;
+}
+
+asmlinkage long sys_threadlet_off(void)
+{
+	struct task_struct *t = current;
+	struct async_head *ah = t->ah;
+
+	/*
+	 * Are we still executing as head?
+	 */
+	if (ah) {
+		t->async_ready = NULL;
+
+		return 1;
+	}
+
+	/*
+	 * We got turned into a cachemiss thread,
+	 * return to user-space, which can do
+	 * the notification, etc:
+	 */
+	return 0;
+}
+
+static void __notify_async_thread_exit(struct async_thread *at,
+				       struct async_head *ah)
+{
+	list_del_init(&at->entry);
+	at->exit = 1;
+	init_completion(&ah->exit_done);
+	wake_up_process(at->task);
+}
+
+static void stop_cachemiss_threads(struct async_head *ah)
+{
+	struct async_thread *at;
+
+repeat:
+	spin_lock(&ah->lock);
+	list_for_each_entry(at, &ah->ready_async_threads, entry) {
+
+		__notify_async_thread_exit(at, ah);
+		spin_unlock(&ah->lock);
+
+		wait_for_completion(&ah->exit_done);
+
+		goto repeat;
+	}
+
+	list_for_each_entry(at, &ah->busy_async_threads, entry) {
+
+		__notify_async_thread_exit(at, ah);
+		spin_unlock(&ah->lock);
+
+		wait_for_completion(&ah->exit_done);
+
+		goto repeat;
+	}
+	spin_unlock(&ah->lock);
+}
+
+static void async_head_exit(struct async_head *ah, struct task_struct *t)
+{
+	stop_cachemiss_threads(ah);
+	WARN_ON(!list_empty(&ah->ready_async_threads));
+	WARN_ON(!list_empty(&ah->busy_async_threads));
+	WARN_ON(spin_is_locked(&ah->lock));
+
+	t->ah = NULL;
+}
+
+/*
+ * fork()-time initialization:
+ */
+void async_init(struct task_struct *t)
+{
+	t->at		= NULL;
+	t->async_ready	= NULL;
+	t->ah		= NULL;
+	t->__at.ah	= NULL;
+	t->__at.user_stack = 0;
+}
+
+/*
+ * do_exit()-time cleanup:
+ */
+void async_exit(struct task_struct *t)
+{
+	struct async_thread *at = t->at;
+	struct async_head *ah = t->ah;
+
+	/*
+	 * If head does a sys_exit() then the final schedule() must
+	 * not be passed on to another cachemiss thread:
+	 */
+	t->async_ready = NULL;
+
+	if (unlikely(at))
+		async_thread_exit(at, t);
+
+	if (unlikely(ah))
+		async_head_exit(ah, t);
+}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/