The functions in ltt-relay-alloc.c take care of writing data into the buffer
pages. Those pages are allocated from the page allocator and no virtual
mapping is done, so we save precious TLB entries. ltt-relay-alloc.c is the
abstraction layer that makes the buffers "look" like a contiguous memory
area, although they are made of physically discontiguous pages chained in a
linked list. A caching mechanism makes sure we never walk over more than 1-2
entries of the list. We use a linked list rather than a table so that we do
not depend on vmalloc to allocate large pointer arrays.

I did a bit of profiling with oprofile on LTTng and found that the write
functions in ltt-relay-alloc.c were taking a lot of CPU time, so I thought it
would be good to improve them a bit.

Benchmark: compiling a 2.6.25 kernel using make -j10 on an 8-core x86_64
running a 2.6.29-rc3 kernel (all tests are cache-hot).

With a vanilla 2.6.29-rc3 kernel:
real    1m22.103s

With dormant instrumentation:
real    1m24.667s
(note: this 2s regression should eventually be tracked down by bisecting the
LTTng tree.)

ltt-armall
Without modification, with flight recorder tracing active:
real    1m31.135s

Replacing the memcpy call with a specialized call for 1, 2, 4 and 8 bytes:
real    1m30.440s

Inlining the fast path of the write function (v2):
real    1m29.349s

* KOSAKI Motohiro (kosaki.motohiro@jp.fujitsu.com) wrote:
> Hi
>
> > +static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
> > +{
> > +	switch (len) {
> > +	case 1:	*(u8 *)dest = *(const u8 *)src;
> > +		break;
> > +	case 2:	*(u16 *)dest = *(const u16 *)src;
> > +		break;
> > +	case 4:	*(u32 *)dest = *(const u32 *)src;
> > +		break;
> > +#if (BITS_PER_LONG == 64)
> > +	case 8:	*(u64 *)dest = *(const u64 *)src;
> > +		break;
> > +#endif
> > +	default:
> > +		memcpy(dest, src, len);
> > +	}
> > +}
>
> hm, interesting.
>
> IIRC, a few months ago, Linus said this optimization is not an
> optimization. The latest gcc does this inlining automatically.
> (but I can't point to its URL, sorry)
>
> Is this result gcc version independent? and can you send
> the difference of the gcc assembly output?

Here we go :

x86_64
gcc (Debian 4.3.2-1) 4.3.2 (haven't tried other compiler versions)
kernel 2.6.29-rc3
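For reference, the listings below can be reproduced in user space with
scaffolding along these lines. This is my reconstruction; the typedefs and
the exact compiler flags are assumptions, not part of the original build
(in the kernel, u8..u64 come from <linux/types.h>):

/* standalone-test.c: user-space scaffolding so the snippets below build.
 * Generate assembly with, e.g.: gcc -O2 -S standalone-test.c */
#include <string.h>
#include <stdint.h>

typedef uint8_t  u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
#define BITS_PER_LONG 64	/* assuming x86_64 */

static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
{
	switch (len) {
	case 1:	*(u8 *)dest = *(const u8 *)src;
		break;
	case 2:	*(u16 *)dest = *(const u16 *)src;
		break;
	case 4:	*(u32 *)dest = *(const u32 *)src;
		break;
#if (BITS_PER_LONG == 64)
	case 8:	*(u64 *)dest = *(const u64 *)src;
		break;
#endif
	default:
		memcpy(dest, src, len);
	}
}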
char dataout[100];
char datain[100];
int sizea = 8;

void testfct_ltt(void)
{
	asm ("/* begin */");
	ltt_relay_do_copy(dataout, datain, sizea);
	asm ("/* end */");
}

Turns into a jump table :

	movslq	sizea(%rip),%rdx
	cmpq	$8, %rdx
	jbe	.L15
.L6:
	movq	$datain, %rsi
	movq	$dataout, %rdi
	call	memcpy
	.p2align 4,,10
	.p2align 3
.L7:
	[...]
.L15:
	jmp	*.L12(,%rdx,8)
	.section	.rodata
	.align 8
	.align 4
.L12:
	.quad	.L7
	.quad	.L8
	.quad	.L9
	.quad	.L6
	.quad	.L10
	.quad	.L6
	.quad	.L6
	.quad	.L6
	.quad	.L11
	.text
	.p2align 4,,10
	.p2align 3
.L11:
	movq	datain(%rip), %rax
	movq	%rax, dataout(%rip)
	jmp	.L7
	.p2align 4,,10
	.p2align 3
.L8:
	movzbl	datain(%rip), %eax
	movb	%al, dataout(%rip)
	jmp	.L7
	.p2align 4,,10
	.p2align 3
.L9:
	movzwl	datain(%rip), %eax
	movw	%ax, dataout(%rip)
	jmp	.L7
	.p2align 4,,10
	.p2align 3
.L10:
	movl	datain(%rip), %eax
	movl	%eax, dataout(%rip)
	jmp	.L7
	.size	testfct_ltt, .-testfct_ltt
	.p2align 4,,15

void testfct_memcpy(void)
{
	asm ("/* begin */");
	memcpy(dataout, datain, sizea);
	asm ("/* end */");
}

Turns into a function call because the size is not statically known :

	movslq	sizea(%rip),%rdx
	movq	$datain, %rsi
	movq	$dataout, %rdi
	call	memcpy

Below, when a constant is passed, both behave similarly :

void testfct_ltt_const(void)
{
	asm ("/* begin */");
	ltt_relay_do_copy(dataout, datain, 8);
	asm ("/* end */");
}

	movq	datain(%rip), %rax
	movq	%rax, dataout(%rip)

void testfct_memcpy_const(void)
{
	asm ("/* begin */");
	memcpy(dataout, datain, 8);
	asm ("/* end */");
}

	movq	datain(%rip), %rax
	movq	%rax, dataout(%rip)

Therefore, I agree that when memcpy is passed a constant size, it generates
the same code as my ltt_relay_do_copy. However, when we usually expect sizes
of 1, 2, 4 or 8 bytes that are unknown at compile time, the jump table saves
the costly function call to memcpy.
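To double-check this claim outside the kernel, a rough micro-benchmark of
the two paths can be written along these lines. This is a minimal sketch,
not the measurement method used above: the function name, iteration count
and build flags are mine, and absolute numbers will vary by CPU and gcc
version. Build with something like: gcc -O2 bench.c -lrt

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

/* same switch as ltt_relay_do_copy above, with user-space types */
static inline void do_copy(void *dest, const void *src, size_t len)
{
	switch (len) {
	case 1:	*(uint8_t *)dest = *(const uint8_t *)src; break;
	case 2:	*(uint16_t *)dest = *(const uint16_t *)src; break;
	case 4:	*(uint32_t *)dest = *(const uint32_t *)src; break;
	case 8:	*(uint64_t *)dest = *(const uint64_t *)src; break;
	default: memcpy(dest, src, len);
	}
}

char dst[16], srcbuf[16];
volatile size_t runtime_len = 8;	/* hides the size from the compiler */

static long elapsed_ns(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec);
}

int main(void)
{
	struct timespec t0, t1;
	long i, n = 100000000;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < n; i++)
		do_copy(dst, srcbuf, runtime_len);	/* jump table path */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("switch copy: %ld ns\n", elapsed_ns(t0, t1));

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < n; i++)
		memcpy(dst, srcbuf, runtime_len);	/* function call path */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("memcpy     : %ld ns\n", elapsed_ns(t0, t1));
	return 0;
}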
Signed-off-by: Mathieu Desnoyers
CC: Martin Bligh
CC: Zhaolei
---
 include/linux/ltt-relay.h |   91 ++++++++++++++++++++++++++++++++++++++++++++--
 ltt/ltt-relay-alloc.c     |   80 +++++++++------------------------------
 2 files changed, 107 insertions(+), 64 deletions(-)

Index: linux-2.6-lttng/ltt/ltt-relay-alloc.c
===================================================================
--- linux-2.6-lttng.orig/ltt/ltt-relay-alloc.c	2009-03-05 15:22:47.000000000 -0500
+++ linux-2.6-lttng/ltt/ltt-relay-alloc.c	2009-03-05 15:22:49.000000000 -0500
@@ -428,7 +428,7 @@ EXPORT_SYMBOL_GPL(ltt_relay_close);
 /*
  * Start iteration at the previous element. Skip the real list head.
  */
-static struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
+struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
 		struct buf_page *page, size_t offset, ssize_t diff_offset)
 {
 	struct buf_page *iter;
@@ -460,13 +460,15 @@ static struct buf_page *ltt_relay_find_p
 			return iter;
 		}
 	}
+	WARN_ON(1);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(ltt_relay_find_prev_page);
 
 /*
  * Start iteration at the next element. Skip the real list head.
  */
-static struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
+struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
 		struct buf_page *page, size_t offset, ssize_t diff_offset)
 {
 	struct buf_page *iter;
@@ -498,48 +500,10 @@ static struct buf_page *ltt_relay_find_n
 			return iter;
 		}
 	}
+	WARN_ON(1);
 	return NULL;
 }
-
-/*
- * Find the page containing "offset". Cache it if it is after the currently
- * cached page.
- */
-static struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
-		struct buf_page **page_cache,
-		struct buf_page *page, size_t offset)
-{
-	ssize_t diff_offset;
-	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
-
-	/*
-	 * Make sure this is the page we want to write into. The current
-	 * page is changed concurrently by other writers. [wrh]page are
-	 * used as a cache remembering the last page written
-	 * to/read/looked up for header address. No synchronization;
-	 * could have to find the previous page is a nested write
-	 * occured. Finding the right page is done by comparing the
-	 * dest_offset with the buf_page offsets.
-	 * When at the exact opposite of the buffer, bias towards forward search
-	 * because it will be cached.
-	 */
-
-	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
-	if (diff_offset <= -(ssize_t)half_buf_size)
-		diff_offset += buf->chan->alloc_size;
-	else if (diff_offset > half_buf_size)
-		diff_offset -= buf->chan->alloc_size;
-
-	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
-		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
-		WARN_ON(!page);
-		*page_cache = page;
-	} else if (unlikely(diff_offset < 0)) {
-		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
-		WARN_ON(!page);
-	}
-	return page;
-}
+EXPORT_SYMBOL_GPL(ltt_relay_find_next_page);
 
 /**
  * ltt_relay_write - write data to a ltt_relay buffer.
@@ -547,26 +511,14 @@ static struct buf_page *ltt_relay_cache_
  * @offset : offset within the buffer
  * @src : source address
  * @len : length to write
+ * @page : cached buffer page
+ * @pagecpy : page size copied so far
  */
-int ltt_relay_write(struct rchan_buf *buf, size_t offset,
-	const void *src, size_t len)
+void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
+	const void *src, size_t len, struct buf_page *page, ssize_t pagecpy)
 {
-	struct buf_page *page;
-	ssize_t pagecpy, orig_len;
-
-	orig_len = len;
-	offset &= buf->chan->alloc_size - 1;
-	page = buf->wpage;
-	if (unlikely(!len))
-		return 0;
-	for (;;) {
-		page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
-		pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
-		memcpy(page_address(page->page)
-			+ (offset & ~PAGE_MASK), src, pagecpy);
+	do {
 		len -= pagecpy;
-		if (likely(!len))
-			break;
 		src += pagecpy;
 		offset += pagecpy;
 		/*
@@ -574,10 +526,14 @@ int ltt_relay_write(struct rchan_buf *bu
 		 * subbuffers.
 		 */
 		WARN_ON(offset >= buf->chan->alloc_size);
-	}
-	return orig_len;
+
+		page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
+		pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+		ltt_relay_do_copy(page_address(page->page)
+			+ (offset & ~PAGE_MASK), src, pagecpy);
+	} while (unlikely(len != pagecpy));
 }
-EXPORT_SYMBOL_GPL(ltt_relay_write);
+EXPORT_SYMBOL_GPL(_ltt_relay_write);
 
 /**
  * ltt_relay_read - read data from ltt_relay_buffer.
Index: linux-2.6-lttng/include/linux/ltt-relay.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/ltt-relay.h	2009-03-05 15:22:47.000000000 -0500
+++ linux-2.6-lttng/include/linux/ltt-relay.h	2009-03-05 15:23:47.000000000 -0500
@@ -137,8 +137,14 @@ struct rchan_callbacks {
 	int (*remove_buf_file)(struct dentry *dentry);
 };
 
-extern int ltt_relay_write(struct rchan_buf *buf, size_t offset,
-	const void *src, size_t len);
+extern struct buf_page *ltt_relay_find_prev_page(struct rchan_buf *buf,
+	struct buf_page *page, size_t offset, ssize_t diff_offset);
+
+extern struct buf_page *ltt_relay_find_next_page(struct rchan_buf *buf,
+	struct buf_page *page, size_t offset, ssize_t diff_offset);
+
+extern void _ltt_relay_write(struct rchan_buf *buf, size_t offset,
+	const void *src, size_t len, struct buf_page *page, ssize_t pagecpy);
 
 extern int ltt_relay_read(struct rchan_buf *buf, size_t offset,
 	void *dest, size_t len);
@@ -156,6 +162,87 @@ extern void *ltt_relay_offset_address(st
 	size_t offset);
 
 /*
+ * Find the page containing "offset". Cache it if it is after the currently
+ * cached page.
+ */
+static inline struct buf_page *ltt_relay_cache_page(struct rchan_buf *buf,
+		struct buf_page **page_cache,
+		struct buf_page *page, size_t offset)
+{
+	ssize_t diff_offset;
+	ssize_t half_buf_size = buf->chan->alloc_size >> 1;
+
+	/*
+	 * Make sure this is the page we want to write into. The current
+	 * page is changed concurrently by other writers. [wrh]page are
+	 * used as a cache remembering the last page written
+	 * to/read/looked up for header address. No synchronization;
+	 * could have to find the previous page is a nested write
+	 * occured. Finding the right page is done by comparing the
+	 * dest_offset with the buf_page offsets.
+	 * When at the exact opposite of the buffer, bias towards forward search
+	 * because it will be cached.
+	 */
+
+	diff_offset = (ssize_t)offset - (ssize_t)page->offset;
+	if (diff_offset <= -(ssize_t)half_buf_size)
+		diff_offset += buf->chan->alloc_size;
+	else if (diff_offset > half_buf_size)
+		diff_offset -= buf->chan->alloc_size;
+
+	if (unlikely(diff_offset >= (ssize_t)PAGE_SIZE)) {
+		page = ltt_relay_find_next_page(buf, page, offset, diff_offset);
+		*page_cache = page;
+	} else if (unlikely(diff_offset < 0)) {
+		page = ltt_relay_find_prev_page(buf, page, offset, diff_offset);
+	}
+	return page;
+}
+
+static inline void ltt_relay_do_copy(void *dest, const void *src, size_t len)
+{
+	switch (len) {
+	case 0:
+		break;
+	case 1:
+		*(u8 *)dest = *(const u8 *)src;
+		break;
+	case 2:
+		*(u16 *)dest = *(const u16 *)src;
+		break;
+	case 4:
+		*(u32 *)dest = *(const u32 *)src;
+		break;
+#if (BITS_PER_LONG == 64)
+	case 8:
+		*(u64 *)dest = *(const u64 *)src;
+		break;
+#endif
+	default:
+		memcpy(dest, src, len);
+	}
+}
+
+static inline int ltt_relay_write(struct rchan_buf *buf, size_t offset,
+	const void *src, size_t len)
+{
+	struct buf_page *page;
+	ssize_t pagecpy;
+
+	offset &= buf->chan->alloc_size - 1;
+	page = buf->wpage;
+
+	page = ltt_relay_cache_page(buf, &buf->wpage, page, offset);
+	pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+	ltt_relay_do_copy(page_address(page->page)
+		+ (offset & ~PAGE_MASK), src, pagecpy);
+
+	if (unlikely(len != pagecpy))
+		_ltt_relay_write(buf, offset, src, len, page, pagecpy);
+	return len;
+}
+
+/*
  * CONFIG_LTT_RELAY kernel API, ltt/ltt-relay-alloc.c
  */
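To make the wraparound logic in ltt_relay_cache_page concrete, here is the
offset normalization in isolation, with hypothetical sizes (an 8-page,
32768-byte buffer; the numbers are only for illustration, not requirements):

#include <stdio.h>

#define ALLOC_SIZE 32768	/* hypothetical: 8 pages of 4096 bytes */
#define PAGE_SZ     4096

int main(void)
{
	/* cached page starts at 28672 (last page); a nested write wrapped
	 * around and targets offset 512 (first page). */
	long page_offset = 28672, offset = 512;
	long half = ALLOC_SIZE >> 1;
	long diff = offset - page_offset;	/* -28160 */

	if (diff <= -half)
		diff += ALLOC_SIZE;		/* -> 4608: bias forward */
	else if (diff > half)
		diff -= ALLOC_SIZE;

	if (diff >= PAGE_SZ)
		printf("forward search, diff=%ld\n", diff);	/* taken */
	else if (diff < 0)
		printf("backward search, diff=%ld\n", diff);
	else
		printf("cached page hit\n");
	return 0;
}

The normalized diff of 4608 is barely more than one page, so the forward
walk only touches 1-2 list entries, which is the property the caching
mechanism described at the top relies on.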
-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68