Date:	Mon, 06 Oct 2008 23:55:13 -0500
From:	Tom Zanussi <zanussi@...cast.net>
To:	Jens Axboe <jens.axboe@...cle.com>
Cc:	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Mathieu Desnoyers <compudj@...stal.dyndns.org>,
	Martin Bligh <mbligh@...gle.com>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	prasad@...ux.vnet.ibm.com,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	Steven Rostedt <rostedt@...dmis.org>, od@...e.com,
	"Frank Ch. Eigler" <fche@...hat.com>,
	Andrew Morton <akpm@...ux-foundation.org>, hch@....de,
	David Wilder <dwilder@...ibm.com>
Subject: Re: [RFC PATCH 1/1] relay revamp v5


On Mon, 2008-10-06 at 09:40 +0200, Jens Axboe wrote:
> On Mon, Oct 06 2008, Tom Zanussi wrote:
> > The full relay patch.
> > 
> > Basically it includes the changes from the previous 11 that I posted and
> > in addition completely separates the reading part of relay from the
> > writing part.  With the new changes, relay really does become just what
> > its name says and nothing more - it accepts pages from tracers, and
> > relays the data to userspace via read(2) or splice(2) (and therefore
> > sendfile(2)).  It doesn't allocate any buffer space and provides no
> > write functions - those are expected to be supplied by some other
> > component such as the unified ring-buffer or any other tracer that might
> > want to relay pages of trace data to userspace.
> > 
> > Includes original relay write functions and buffers (the no-vmap
> > page-based versions of the previous patchset), which have been split out
> > into a new file called relay_pagewriter.c and provide one means of
> > writing into pages and feeding them into relay.  blktrace and kvmtrace
> > have been 'ported' over to using pagewriter instead of relay directly.
> > 
> > Signed-off-by: Tom Zanussi <zanussi@...cast.net>
> > 
> > diff --git a/block/blktrace.c b/block/blktrace.c
> > index eb9651c..8ba7094 100644
> > --- a/block/blktrace.c
> > +++ b/block/blktrace.c
> > @@ -35,7 +35,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
> >  {
> >  	struct blk_io_trace *t;
> >  
> > -	t = relay_reserve(bt->rchan, sizeof(*t) + len);
> > +	t = kmalloc(sizeof(*t) + len, GFP_KERNEL);
> >  	if (t) {
> >  		const int cpu = smp_processor_id();
> >  
> 
> Ugh, that's no good - it's both way too expensive, and also requires an
> atomic allocation.

This was only to keep things working until I could add reserve()
back.

> 
> > @@ -166,7 +168,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
> >  	if (unlikely(tsk->btrace_seq != blktrace_seq))
> >  		trace_note_tsk(bt, tsk);
> >  
> > -	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
> > +	t = kmalloc(sizeof(*t) + pdu_len, GFP_KERNEL);
> >  	if (t) {
> >  		cpu = smp_processor_id();
> >  		sequence = per_cpu_ptr(bt->sequence, cpu);
> 
> Ditto - I don't think this approach is viable at all, sorry!
> 

The patch below adds reserve() back and changes blktrace back to using
it.  Adding it back also meant adding padding back into the equation,
but now there's a way to write a padding 'event' as part of the event
stream rather than as metadata.  For blktrace, I added a blktrace_notify
'padding message', which I'm sure isn't really what you'd want, but it
seems to do the trick for now, and didn't even require any changes in
blkparse - it happily skips over the padding as intended.
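
For reference, stepping over the new records is trivial for any raw
consumer, since each record is a blk_io_trace header followed by
pdu_len payload bytes.  A minimal userspace sketch (not blkparse
itself; handle_event() is just a placeholder, and it assumes the
struct blk_io_trace layout and BLK_TN_* defines from blktrace_api.h):

/* assumes blktrace_api.h definitions (struct blk_io_trace, BLK_TN_PADDING) */
extern void handle_event(const struct blk_io_trace *t);	/* placeholder */

static void parse_chunk(const void *chunk, size_t len)
{
	const char *p = chunk, *end = p + len;

	while (p + sizeof(struct blk_io_trace) <= end) {
		const struct blk_io_trace *t = (const void *)p;

		/* records store BLK_IO_TRACE_MAGIC | version in ->magic */
		if ((t->magic & 0xffffff00) != BLK_IO_TRACE_MAGIC)
			break;

		/* each record is the header plus ->pdu_len payload bytes */
		p += sizeof(*t) + t->pdu_len;

		if (t->action == BLK_TN_PADDING)
			continue;	/* end-of-page filler, skip it */

		handle_event(t);	/* placeholder for real parsing */
	}
}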

Tom

    Add pagewriter_reserve().
    
    Also added is a callback named write_padding() which can be used to
    write padding events at the end of the page if an event won't fit in
    the remaining space.
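
To make the usage pattern explicit in one place: a client reserves,
fills the slot in place, and lets its write_padding() callback close
out any page an event can't fit into.  The sketch below mirrors the
blktrace changes that follow; only the pagewriter calls are from this
patch, everything mytrace_* is made up for illustration:

/* made-up example event header */
struct mytrace_event {
	u32 type;		/* MYTRACE_PADDING marks end-of-page filler */
	u16 pdu_len;		/* payload (or padding) bytes that follow */
};

#define MYTRACE_PADDING	0xffffffff

static void mytrace_write_padding(struct pagewriter_buf *buf,
				  size_t length, void *reserved)
{
	struct mytrace_event *e = reserved;

	e->type = MYTRACE_PADDING;
	e->pdu_len = length - sizeof(*e);
}

static struct pagewriter_callbacks mytrace_callbacks = {
	.write_padding	= mytrace_write_padding,
};

static void mytrace_log(struct pagewriter *pw, u32 type,
			const void *pdu, size_t pdu_len)
{
	struct mytrace_event *e;

	/* keep at least sizeof(*e) free so a padding header always fits */
	e = pagewriter_reserve(pw, sizeof(*e) + pdu_len, sizeof(*e));
	if (!e)
		return;				/* buffer full, event dropped */

	e->type = type;
	e->pdu_len = pdu_len;
	memcpy(e + 1, pdu, pdu_len);		/* no write() or kfree() pass */
}

The channel itself would be opened the same way do_blk_trace_setup()
does below, passing &mytrace_callbacks and PAGEWRITER_PAD_WRITES to
pagewriter_open().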

diff --git a/block/blktrace.c b/block/blktrace.c
index 8ba7094..f5b745d 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -35,7 +35,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 {
 	struct blk_io_trace *t;
 
-	t = kmalloc(sizeof(*t) + len, GFP_KERNEL);
+	t = pagewriter_reserve(bt->pagewriter, sizeof(*t) + len, sizeof(*t));
 	if (t) {
 		const int cpu = smp_processor_id();
 
@@ -47,8 +47,6 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 		t->cpu = cpu;
 		t->pdu_len = len;
 		memcpy((void *) t + sizeof(*t), data, len);
-		pagewriter_write(bt->pagewriter, t, sizeof(*t) + len);
-		kfree(t);
 	}
 }
 
@@ -168,7 +166,8 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	if (unlikely(tsk->btrace_seq != blktrace_seq))
 		trace_note_tsk(bt, tsk);
 
-	t = kmalloc(sizeof(*t) + pdu_len, GFP_KERNEL);
+	t = pagewriter_reserve(bt->pagewriter, sizeof(*t) + pdu_len,
+			       sizeof(*t));
 	if (t) {
 		cpu = smp_processor_id();
 		sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -187,8 +186,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
-		pagewriter_write(bt->pagewriter, t, sizeof(*t) + pdu_len);
-		kfree(t);
 	}
 
 	local_irq_restore(flags);
@@ -335,6 +332,21 @@ static const struct file_operations blk_msg_fops = {
 	.write =	blk_msg_write,
 };
 
+static void blk_write_padding_callback(struct pagewriter_buf *buf,
+				       size_t length,
+				       void *reserved)
+{
+	struct blk_io_trace *t = reserved;
+
+	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+	t->action = BLK_TN_PADDING;
+	t->pdu_len = length - sizeof(*t);
+}
+
+static struct pagewriter_callbacks blk_pagewriter_callbacks = {
+	.write_padding           = blk_write_padding_callback,
+};
+
 /*
  * Setup everything required to start tracing
  */
@@ -392,7 +404,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	n_pages = (buts->buf_size * buts->buf_nr) / PAGE_SIZE;
 	n_pages_wakeup = buts->buf_size / PAGE_SIZE;
 	bt->pagewriter = pagewriter_open("trace", dir, n_pages, n_pages_wakeup,
-					 NULL, bt, 0UL);
+					 &blk_pagewriter_callbacks, bt,
+					 PAGEWRITER_PAD_WRITES);
 	if (!bt->pagewriter)
 		goto err;
 
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 59461f2..c9857f1 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -56,6 +56,7 @@ enum blktrace_notify {
 	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
 	__BLK_TN_TIMESTAMP,		/* include system clock */
 	__BLK_TN_MESSAGE,		/* Character string message */
+	__BLK_TN_PADDING,		/* Padding message */
 };
 
 
@@ -81,6 +82,7 @@ enum blktrace_notify {
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))
+#define BLK_TN_PADDING		(__BLK_TN_PADDING | BLK_TC_ACT(BLK_TC_NOTIFY))
 
 #define BLK_IO_TRACE_MAGIC	0x65617400
 #define BLK_IO_TRACE_VERSION	0x07
diff --git a/include/linux/relay_pagewriter.h b/include/linux/relay_pagewriter.h
index a056d13..42730c9 100644
--- a/include/linux/relay_pagewriter.h
+++ b/include/linux/relay_pagewriter.h
@@ -22,6 +22,11 @@
 #include <linux/relay.h>
 
 /*
+ * pagewriter flags
+ */
+#define PAGEWRITER_PAD_WRITES		0x00010000	/* don't cross pages */
+
+/*
  * Per-cpu pagewriter buffer
  */
 struct pagewriter_buf {
@@ -48,6 +53,7 @@ struct pagewriter {
 	struct pagewriter_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
 	struct list_head list;		/* for channel list */
 	atomic_t dropped;		/* dropped events due to buffer-full */
+	unsigned long flags;		/* pagewriter flags for this channel */
 };
 
 extern size_t pagewriter_switch_page_default_callback(struct pagewriter_buf *b,
@@ -106,6 +112,21 @@ struct pagewriter_callbacks {
 	size_t (*switch_page)(struct pagewriter_buf *buf,
 			      size_t length,
 			      void **reserved);
+
+	/*
+	 * write_padding - callback for writing padding events
+	 * @buf: the channel buffer
+	 * @length: the length of the padding
+	 * @reserved: a pointer to the start of padding
+	 *
+	 * This callback can be used to write a padding event when
+	 * pagewriter_reserve can't write a complete event.  The
+	 * length of the padding is guaranteed to be at least as large
+	 * as the end_reserve size passed into pagewriter_reserve().
+	 */
+	void (*write_padding)(struct pagewriter_buf *buf,
+			      size_t length,
+			      void *reserved);
 };
 
 /**
@@ -189,6 +210,54 @@ static inline void __pagewriter_write(struct pagewriter *pagewriter,
 }
 
 /**
+ *	pagewriter_reserve - reserve slot in channel buffer
+ *	@pagewriter: pagewriter
+ *	@length: number of bytes to reserve
+ *	@end_reserve: reserve at least this much for a padding event, if needed
+ *
+ *	Returns pointer to reserved slot, NULL if full.
+ *
+ *	Reserves a slot in the current cpu's channel buffer.
+ *	Does not protect the buffer at all - caller must provide
+ *	appropriate synchronization.
+ *
+ *	If the event won't fit, at least end_reserve bytes are
+ *	reserved for a padding event, and the write_padding() callback
+ *	function is called to allow the client to write the padding
+ *	event before switching to the next page.  The write_padding()
+ *	callback is passed a pointer to the start of the padding along
+ *	with its length.
+ */
+
+static inline void *pagewriter_reserve(struct pagewriter *pagewriter,
+				       size_t length,
+				       size_t end_reserve)
+{
+	struct pagewriter_buf *buf;
+	unsigned long flags;
+	void *reserved;
+
+	local_irq_save(flags);
+	buf = pagewriter->buf[smp_processor_id()];
+	reserved = buf->data + buf->offset;
+	if (unlikely(buf->offset + length + end_reserve > PAGE_SIZE)) {
+		if (likely(buf->offset + length != PAGE_SIZE)) {
+			size_t padding = PAGE_SIZE - buf->offset;
+			pagewriter->cb->write_padding(buf, padding, reserved);
+			pagewriter->cb->switch_page(buf, length, &reserved);
+			if (unlikely(!reserved)) {
+				local_irq_restore(flags);
+				return NULL;
+			}
+		}
+	}
+	buf->offset += length;
+	local_irq_restore(flags);
+
+	return reserved;
+}
+
+/**
  *	page_start_reserve - reserve bytes at the start of a page
  *	@buf: pagewriter channel buffer
  *	@length: number of bytes to reserve
diff --git a/kernel/relay_pagewriter.c b/kernel/relay_pagewriter.c
index 4b79274..7eb23e9 100644
--- a/kernel/relay_pagewriter.c
+++ b/kernel/relay_pagewriter.c
@@ -54,7 +54,7 @@ static void __pagewriter_reset(struct pagewriter_buf *buf, unsigned int init);
  *	@n_pages_wakeup: wakeup readers after this many pages, 0 means never
  *	@cb: client callback functions
  *	@private_data: user-defined data
- *	@rchan_flags: relay flags, passed on to relay
+ *	@flags: channel flags, top half for pagewriter, bottom half for relay
  *
  *	Returns pagewriter pointer if successful, %NULL otherwise.
  *
@@ -67,7 +67,7 @@ struct pagewriter *pagewriter_open(const char *base_filename,
 				   size_t n_pages_wakeup,
 				   struct pagewriter_callbacks *cb,
 				   void *private_data,
-				   unsigned long rchan_flags)
+				   unsigned long flags)
 {
 	unsigned int i;
 	struct pagewriter *pagewriter;
@@ -77,7 +77,7 @@ struct pagewriter *pagewriter_open(const char *base_filename,
 		return NULL;
 
 	rchan = relay_open(base_filename, parent, n_pages_wakeup, NULL,
-			   private_data, rchan_flags);
+			   private_data, flags);
 	if (!rchan)
 		return NULL;
 
@@ -88,6 +88,7 @@ struct pagewriter *pagewriter_open(const char *base_filename,
 	}
 
 	pagewriter->rchan = rchan;
+	pagewriter->flags = flags;
 	pagewriter->n_pages = n_pages;
 	atomic_set(&pagewriter->dropped, 0);
 
@@ -414,10 +415,20 @@ static void new_page_default_callback(struct pagewriter_buf *buf,
 {
 }
 
+/*
+ * write_padding() default callback.
+ */
+void pagewriter_write_padding_default_callback(struct pagewriter_buf *buf,
+					       size_t length,
+					       void *reserved)
+{
+}
+
 /* pagewriter default callbacks */
 static struct pagewriter_callbacks default_pagewriter_callbacks = {
 	.new_page = new_page_default_callback,
 	.switch_page = pagewriter_switch_page_default_callback,
+	.write_padding = pagewriter_write_padding_default_callback,
 };
 
 static void setup_callbacks(struct pagewriter *pagewriter,
@@ -432,6 +443,9 @@ static void setup_callbacks(struct pagewriter *pagewriter,
 		cb->new_page = new_page_default_callback;
 	if (!cb->switch_page)
 		cb->switch_page = pagewriter_switch_page_default_callback;
+	if (!cb->write_padding)
+		cb->write_padding = pagewriter_write_padding_default_callback;
+
 	pagewriter->cb = cb;
 }
 
@@ -502,7 +516,7 @@ size_t pagewriter_switch_page_default_callback(struct pagewriter_buf *buf,
 					       size_t length,
 					       void **reserved)
 {
-	size_t remainder;
+	size_t remainder = length;
 	struct relay_page *new_page;
 
 	if (unlikely(pagewriter_event_toobig(buf, length)))
@@ -517,7 +531,8 @@ size_t pagewriter_switch_page_default_callback(struct pagewriter_buf *buf,
 		return 0;
 	}
 
-	remainder = length - (PAGE_SIZE - buf->offset);
+	if (buf->pagewriter->flags & PAGEWRITER_PAD_WRITES)
+		remainder = length - (PAGE_SIZE - buf->offset);
 
 	relay_add_page(buf->pagewriter->rchan, buf->page->page,
 		       &pagewriter_relay_page_callbacks, (void *)buf);



