netdev - Re: [PATCH UPDATED 1/3] vhost: replace vhost

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100722155840.GA1743@redhat.com>
Date:	Thu, 22 Jul 2010 18:58:40 +0300
From:	"Michael S. Tsirkin" <mst@...hat.com>
To:	Tejun Heo <tj@...nel.org>
Cc:	Oleg Nesterov <oleg@...hat.com>,
	Sridhar Samudrala <sri@...ibm.com>,
	netdev <netdev@...r.kernel.org>,
	lkml <linux-kernel@...r.kernel.org>,
	"kvm@...r.kernel.org" <kvm@...r.kernel.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Dmitri Vorobiev <dmitri.vorobiev@...ial.com>,
	Jiri Kosina <jkosina@...e.cz>,
	Thomas Gleixner <tglx@...utronix.de>,
	Ingo Molnar <mingo@...e.hu>, Andi Kleen <ak@...ux.intel.com>
Subject: Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with
 per-vhost kthread

On Wed, Jun 02, 2010 at 08:40:00PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
> 
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
> 
> Partially based on Sridhar Samudrala's patch.
> 
> * Updated to use sub structure vhost_work instead of directly using
>   vhost_poll at Michael's suggestion.
> 
> * Added flusher wake_up() optimization at Michael's suggestion.
> 
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Cc: Michael S. Tsirkin <mst@...hat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@...il.com>

All the tricky barrier pairing made me uncomfortable.  So I came up with
this on top (untested): if we do all operations under the spinlock, we
can get by without barriers and atomics.  And since we need the lock for
list operations anyway, this should have no paerformance impact.

What do you think?


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0c6b533..7730a30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -73,7 +73,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 	INIT_LIST_HEAD(&work->node);
 	work->fn = fn;
 	init_waitqueue_head(&work->done);
-	atomic_set(&work->flushing, 0);
+	work->flushing = 0;
 	work->queue_seq = work->done_seq = 0;
 }
 
@@ -99,13 +99,23 @@ void vhost_poll_stop(struct vhost_poll *poll)
 void vhost_poll_flush(struct vhost_poll *poll)
 {
 	struct vhost_work *work = &poll->work;
-	int seq = work->queue_seq;
+	unsigned seq, left;
+	int flushing;
 
-	atomic_inc(&work->flushing);
-	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
-	wait_event(work->done, seq - work->done_seq <= 0);
-	atomic_dec(&work->flushing);
-	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
+	spin_lock_irq(&dev->work_lock);
+	seq = work->queue_seq;
+	work->flushing++;
+	spin_unlock_irq(&dev->work_lock);
+	wait_event(work->done, {
+		   spin_lock_irq(&dev->work_lock);
+		   left = work->done_seq - seq;
+		   spin_unlock_irq(&dev->work_lock);
+		   left < UINT_MAX / 2;
+	});
+	spin_lock_irq(&dev->work_lock);
+	flushing = --work->flushing;
+	spin_unlock_irq(&dev->work_lock);
+	BUG_ON(flushing < 0);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
@@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 static int vhost_worker(void *data)
 {
 	struct vhost_dev *dev = data;
-	struct vhost_work *work;
+	struct vhost_work *work = NULL;
 
-repeat:
-	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
 
-	if (kthread_should_stop()) {
-		__set_current_state(TASK_RUNNING);
-		return 0;
-	}
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
 
-	work = NULL;
-	spin_lock_irq(&dev->work_lock);
-	if (!list_empty(&dev->work_list)) {
-		work = list_first_entry(&dev->work_list,
-					struct vhost_work, node);
-		list_del_init(&work->node);
-	}
-	spin_unlock_irq(&dev->work_lock);
+		spin_lock_irq(&dev->work_lock);
+		if (work) {
+			work->done_seq = work->queue_seq;
+			if (work->flushing)
+				wake_up_all(&work->done);
+		}
+		if (!list_empty(&dev->work_list)) {
+			work = list_first_entry(&dev->work_list,
+						struct vhost_work, node);
+			list_del_init(&work->node);
+		} else
+			work = NULL;
+		spin_unlock_irq(&dev->work_lock);
+
+		if (work) {
+			__set_current_state(TASK_RUNNING);
+			work->fn(work);
+		} else
+			schedule();
 
-	if (work) {
-		__set_current_state(TASK_RUNNING);
-		work->fn(work);
-		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
-		work->done_seq = work->queue_seq;
-		smp_mb();	/* mb worker-b1 paired with flush-b0 */
-		if (atomic_read(&work->flushing))
-			wake_up_all(&work->done);
-	} else
-		schedule();
-
-	goto repeat;
+	}
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0e63091..3693327 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -27,9 +27,9 @@ struct vhost_work {
 	struct list_head	  node;
 	vhost_work_fn_t		  fn;
 	wait_queue_head_t	  done;
-	atomic_t		  flushing;
-	int			  queue_seq;
-	int			  done_seq;
+	int			  flushing;
+	unsigned		  queue_seq;
+	unsigned		  done_seq;
 };
 
 /* Poll a file (eventfd or socket) */
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html