Message-ID: <2f79fc9f-b0e4-a528-b56b-3e7001d9ef89@huawei.com>
Date:   Fri, 17 May 2019 14:35:26 +0800
From:   Gao Xiang <gaoxiang25@...wei.com>
To:     Chenxi Mao <chenxi.mao@...y.com>
CC:     <akpm@...ux-foundation.org>, <linux-kernel@...r.kernel.org>,
        <roy.feng@...y.com>, <yuanli.xu@...y.com>, <robert.alm@...y.com>,
        <masaya.a.takahashi@...y.com>, <yann.collet.73@...il.com>,
        <miaoxie@...wei.com>
Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on
 x86 and ARM64

Hi Chenxi,

A few words about the patch format; nothing critical, though.

One suggestion: the subject line is better written as
"[PATCH v2/v3/...] title", since that makes it clear which of
these emails carries the latest version of the patch.
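
For instance, something like this (just an illustration of the
convention; keep whatever title you prefer):

    [PATCH v2 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it
    on x86 and ARM64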

On 2019/5/17 13:56, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced in LZ4 1.9.0 [1].
> According to the LZ4 benchmark results on x86 devices, this change
> improves decompression by about 10%.
> LZ4 with FAST_DEC_LOOP also shows improvements on ARM64; however,
> clang-built code regresses when FAST_DEC_LOOP is enabled.
>
> So FAST_DEC_LOOP is only enabled on x86/x86-64, or on ARM64 when
> building with GCC.
>
> The LZ4 FAST_DEC_LOOP bug fixes are included as well:
> 1. Fix read-after-input in LZ4_decompress_safe() (issue 681)
> 2. Fix out-of-bounds read in LZ4_decompress_fast() (issue 676)
>
> PS2:
> 1. Move the common APIs to lz4defs.h.
> 2. Add the PPC-related inline macro definitions.
> 3. Force-inline the new static APIs.
> 
> Here is the test result on ARM64 (Cortex-A53),
> benchmarked with fio via zram:
> 
> Test case:
> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
> --buffer_compress_percentage=75  --size=700M \
> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
> --filename=/data/test/test --name=seq-read --rw=read --stonewall
> 
> Patched:
>     READ: bw=150MiB/s (157MB/s)
> Vanilla:
>     READ: bw=135MiB/s (142MB/s)
> 
> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
> 
> Signed-off-by: chenxi.mao <chenxi.mao@...y.com>
> ---

It's preferred to move all the changelogs here (below the "---" marker)
if you don't want them to become part of the commit message, and there
are plenty of patches on the list which can be referenced:
https://lore.kernel.org/lkml/
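
For example, roughly like this (just a sketch, with the v2 notes taken
from your PS2 list above; anything after the "---" marker is not
included in the final commit message when the patch is applied with
git am):

    Signed-off-by: chenxi.mao <chenxi.mao@...y.com>
    ---
    v2: move the common APIs to lz4defs.h, add the PPC-related inline
        macro definitions, force-inline the new static functions

     lib/lz4/lz4_compress.c   |   4 +-
     ...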

Thanks,
Gao Xiang

>  lib/lz4/lz4_compress.c   |   4 +-
>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>  lib/lz4/lz4defs.h        |  60 +++++-
>  lib/lz4/lz4hc_compress.c |   2 +-
>  4 files changed, 392 insertions(+), 71 deletions(-)
> 
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4cc7c7..b703ed1ca57d 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>  				*token = (BYTE)(litLength << ML_BITS);
>  
>  			/* Copy Literals */
> -			LZ4_wildCopy(op, anchor, op + litLength);
> +			LZ4_wildCopy8(op, anchor, op + litLength);
>  			op += litLength;
>  		}
>  
> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>  				*token = (BYTE)(litLength << ML_BITS);
>  
>  			/* Copy Literals */
> -			LZ4_wildCopy(op, anchor, op + litLength);
> +			LZ4_wildCopy8(op, anchor, op + litLength);
>  			op += litLength;
>  		}
>  
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 0c9d3ad17e0f..8622922304c3 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,96 @@
>  #define assert(condition) ((void)0)
>  #endif
>  
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__)
> +#define LZ4_FAST_DEC_LOOP 1
> +#elif defined(__aarch64__) && !defined(__clang__)
> +     /* On aarch64, we disable this optimization for clang because on certain
> +      * mobile chipsets and clang, it reduces performance. For more information
> +      * refer to https://github.com/lz4/lz4/pull/707. */
> +#define LZ4_FAST_DEC_LOOP 1
> +#else
> +#define LZ4_FAST_DEC_LOOP 0
> +#endif
> +#endif
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
> +			     const size_t offset)
> +{
> +	if (offset < 8) {
> +		dstPtr[0] = srcPtr[0];
> +
> +		dstPtr[1] = srcPtr[1];
> +		dstPtr[2] = srcPtr[2];
> +		dstPtr[3] = srcPtr[3];
> +		srcPtr += inc32table[offset];
> +		memcpy(dstPtr + 4, srcPtr, 4);
> +		srcPtr -= dec64table[offset];
> +		dstPtr += 8;
> +	} else {
> +		memcpy(dstPtr, srcPtr, 8);
> +		dstPtr += 8;
> +		srcPtr += 8;
> +	}
> +
> +	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
> +}
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 bytes)
> + * because it must be compatible with offsets >= 16. */
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +	BYTE *d = (BYTE *) dstPtr;
> +	const BYTE *s = (const BYTE *)srcPtr;
> +	BYTE *const e = (BYTE *) dstEnd;
> +
> +	do {
> +		memcpy(d, s, 16);
> +		memcpy(d + 16, s + 16, 16);
> +		d += 32;
> +		s += 32;
> +	} while (d < e);
> +}
> +
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			const size_t offset)
> +{
> +	BYTE v[8];
> +	switch (offset) {
> +
> +	case 1:
> +		memset(v, *srcPtr, 8);
> +		goto copy_loop;
> +	case 2:
> +		memcpy(v, srcPtr, 2);
> +		memcpy(&v[2], srcPtr, 2);
> +		memcpy(&v[4], &v[0], 4);
> +		goto copy_loop;
> +	case 4:
> +		memcpy(v, srcPtr, 4);
> +		memcpy(&v[4], srcPtr, 4);
> +		goto copy_loop;
> +	default:
> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +		return;
> +	}
> +
> +      copy_loop:
> +	memcpy(dstPtr, v, 8);
> +	dstPtr += 8;
> +	while (dstPtr < dstEnd) {
> +		memcpy(dstPtr, v, 8);
> +		dstPtr += 8;
> +	}
> +}
> +#endif
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	 const size_t dictSize
>  	 )
>  {
> -	const BYTE *ip = (const BYTE *) src;
> -	const BYTE * const iend = ip + srcSize;
> +	const BYTE *ip = (const BYTE *)src;
> +	const BYTE *const iend = ip + srcSize;
>  
>  	BYTE *op = (BYTE *) dst;
> -	BYTE * const oend = op + outputSize;
> +	BYTE *const oend = op + outputSize;
>  	BYTE *cpy;
>  
> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>  
>  	const int safeDecode = (endOnInput == endOnInputSize);
>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>  
>  	/* Set up the "end" pointers for the shortcut. */
>  	const BYTE *const shortiend = iend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>  	const BYTE *const shortoend = oend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +	const BYTE *match;
> +	size_t offset;
> +	unsigned int token;
> +	size_t length;
>  
>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>  		 srcSize, outputSize);
> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	if ((endOnInput) && unlikely(srcSize == 0))
>  		return -1;
>  
> -	/* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +		DEBUGLOG(6, "skip fast decode loop");
> +		goto safe_decode;
> +	}
> +
> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>  	while (1) {
> -		size_t length;
> -		const BYTE *match;
> -		size_t offset;
> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +		if (endOnInput) {
> +			assert(ip < iend);
> +		}
> +		token = *ip++;
> +		length = token >> ML_BITS;	/* literal length */
> +
> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
> +
> +		/* decode literal length */
> +		if (length == RUN_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length <
> +					(uptrval) (op))) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (ip) + length <
> +					(uptrval) (ip))) {
> +				goto _output_error;
> +			}
>  
> -		/* get literal length */
> -		unsigned int const token = *ip++;
> -		length = token>>ML_BITS;
> +			/* overflow detection */
> +			/* copy literals */
> +			cpy = op + length;
> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				if ((cpy > oend - 32)
> +				    || (ip + length > iend - 32)) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy32(op, ip, cpy);
> +			} else {	/* LZ4_decompress_fast() */
> +				if (cpy > oend - 8) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy8(op, ip, cpy);
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and only relies on end-of-block */
> +				/* properties */
> +			}
> +			ip += length;
> +			op = cpy;
> +		} else {
> +			cpy = op + length;
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				DEBUGLOG(7,
> +					 "copy %u bytes in a 16-bytes stripe",
> +					 (unsigned)length);
> +				/* We don't need to check oend */
> +				/* since we check it once for each loop below */
> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
> +					goto safe_literal_copy;
> +				}
> +				/* Literals can only be 14, but hope compilers optimize */
> +				/*if we copy by a register size */
> +				memcpy(op, ip, 16);
> +			} else {
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and relies on end-of-block */
> +				/* properties */
> +				memcpy(op, ip, 8);
> +				if (length > 8) {
> +					memcpy(op + 8, ip + 8, 8);
> +				}
> +			}
> +			ip += length;
> +			op = cpy;
> +		}
> +
> +		/* get offset */
> +		offset = LZ4_readLE16(ip);
> +		ip += 2;	/* end-of-block condition violated */
> +		match = op - offset;
> +
> +		/* get matchlength */
> +		length = token & ML_MASK;
> +
> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +			goto _output_error;
> +		}
> +		/* Error : offset outside buffers */
> +		if (length == ML_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +		} else {
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +
> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +				if (offset >= 8) {
> +					memcpy(op, match, 8);
> +					memcpy(op + 8, match + 8, 8);
> +					memcpy(op + 16, match + 16, 2);
> +					op += length;
> +					continue;
> +				}
> +			}
> +		}
> +
> +		/* match starting within external dictionary */
> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
> +			if (unlikely(op + length > oend - LASTLITERALS)) {
> +				if (partialDecoding) {
> +					/* reach end of buffer */
> +					length =
> +					    min(length, (size_t) (oend - op));
> +				} else {
> +					/* end-of-block condition violated */
> +					goto _output_error;
> +				}
> +			}
> +
> +			if (length <= (size_t) (lowPrefix - match)) {
> +				/* match fits entirely within external dictionary : just copy */
> +				memmove(op, dictEnd - (lowPrefix - match),
> +					length);
> +				op += length;
> +			} else {
> +				/* match stretches into both external dict and current block */
> +				size_t const copySize =
> +				    (size_t) (lowPrefix - match);
> +				size_t const restSize = length - copySize;
> +				memcpy(op, dictEnd - copySize, copySize);
> +				op += copySize;
> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
> +					BYTE *const endOfMatch = op + restSize;
> +					const BYTE *copyFrom = lowPrefix;
> +					while (op < endOfMatch) {
> +						*op++ = *copyFrom++;
> +					}
> +				} else {
> +					memcpy(op, lowPrefix, restSize);
> +					op += restSize;
> +				}
> +			}
> +			continue;
> +		}
> +
> +		/* copy match within block */
> +		cpy = op + length;
> +
> +		assert((op <= oend) && (oend - op >= 32));
> +		if (unlikely(offset < 16)) {
> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
> +		} else {
> +			LZ4_wildCopy32(op, match, cpy);
> +		}
> +
> +		op = cpy;	/* wildcopy correction */
> +	}
> +      safe_decode:
> +#endif
> +	/* Main Loop : decode sequences */
> +	while (1) {
> +		length = token >> ML_BITS;
>  
>  		/* ip < iend before the increment */
>  		assert(!endOnInput || ip <= iend);
> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		 * combined check for both stages).
>  		 */
>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
> -		   /*
> -		    * strictly "less than" on input, to re-enter
> -		    * the loop with at least one byte
> -		    */
> -		   && likely((endOnInput ? ip < shortiend : 1) &
> -			     (op <= shortoend))) {
> +		    /*
> +		     * strictly "less than" on input, to re-enter
> +		     * the loop with at least one byte
> +		     */
> +		    && likely((endOnInput ? ip < shortiend : 1) &
> +			      (op <= shortoend))) {
>  			/* Copy the literals */
>  			memcpy(op, ip, endOnInput ? 16 : 8);
> -			op += length; ip += length;
> +			op += length;
> +			ip += length;
>  
>  			/*
>  			 * The second stage:
>  			 * prepare for match copying, decode full info.
>  			 * If it doesn't work out, the info won't be wasted.
>  			 */
> -			length = token & ML_MASK; /* match length */
> +			length = token & ML_MASK;	/* match length */
>  			offset = LZ4_readLE16(ip);
>  			ip += 2;
>  			match = op - offset;
> -			assert(match <= op); /* check overflow */
> +			assert(match <= op);	/* check overflow */
>  
>  			/* Do not deal with overlapping matches. */
>  			if ((length != ML_MASK) &&
> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* decode literal length */
>  		if (length == RUN_MASK) {
> -			unsigned int s;
>  
> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -				/* overflow detection */
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error)
>  				goto _output_error;
> -			}
> -			do {
> -				s = *ip++;
> -				length += s;
> -			} while (likely(endOnInput
> -				? ip < iend - RUN_MASK
> -				: 1) & (s == 255));
>  
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(op) +
> -					length < (uptrval)(op))) {
> +			    && unlikely((uptrval) (op) +
> +					length < (uptrval) (op))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(ip) +
> -					length < (uptrval)(ip))) {
> +			    && unlikely((uptrval) (ip) +
> +					length < (uptrval) (ip))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* copy literals */
>  		cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +	      safe_literal_copy:
> +#endif
>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>  
>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +				      || (ip + length >
> +					  iend - (2 + 1 + LASTLITERALS))))
> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					length = oend - op;
>  				}
>  				if ((endOnInput)
> -					&& (ip + length > iend)) {
> +				    && (ip + length > iend)) {
>  					/*
>  					 * Error :
>  					 * read attempt beyond
> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				}
>  			} else {
>  				if ((!endOnInput)
> -					&& (cpy != oend)) {
> +				    && (cpy != oend)) {
>  					/*
>  					 * Error :
>  					 * block decoding must
> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					goto _output_error;
>  				}
>  				if ((endOnInput)
> -					&& ((ip + length != iend)
> +				    && ((ip + length != iend)
>  					|| (cpy > oend))) {
>  					/*
>  					 * Error :
> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				break;
>  		} else {
>  			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
> -			LZ4_wildCopy(op, ip, cpy);
> +			LZ4_wildCopy8(op, ip, cpy);
>  			ip += length;
>  			op = cpy;
>  		}
> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			goto _output_error;
>  		}
>  
> -		/* costs ~1%; silence an msan warning when offset == 0 */
> -		/*
> -		 * note : when partialDecoding, there is no guarantee that
> -		 * at least 4 bytes remain available in output buffer
> -		 */
> -		if (!partialDecoding) {
> -			assert(oend > op);
> -			assert(oend - op >= 4);
> -
> -			LZ4_write32(op, (U32)offset);
> -		}
> -
>  		if (length == ML_MASK) {
> -			unsigned int s;
> -
> -			do {
> -				s = *ip++;
> -
> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
> -					goto _output_error;
>  
> -				length += s;
> -			} while (s == 255);
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok)
> +				goto _output_error;
>  
>  			if ((safeDecode)
>  				&& unlikely(
> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		length += MINMATCH;
>  
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>  		/* match starting within external dictionary */
>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>  			if (unlikely(op + length > oend - LASTLITERALS)) {
> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			}
>  
>  			if (op < oCopyLimit) {
> -				LZ4_wildCopy(op, match, oCopyLimit);
> +				LZ4_wildCopy8(op, match, oCopyLimit);
>  				match += oCopyLimit - op;
>  				op = oCopyLimit;
>  			}
> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		} else {
>  			LZ4_copy8(op, match);
>  			if (length > 16)
> -				LZ4_wildCopy(op + 8, match + 8, cpy);
> +				LZ4_wildCopy8(op + 8, match + 8, cpy);
>  		}
>  		op = cpy; /* wildcopy correction */
>  	}
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 1a7fa9d9170f..4cc17cf589ed 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -40,6 +40,28 @@
>  
>  #define FORCE_INLINE __always_inline
>  
> +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
> + * together with a simple 8-byte copy loop as a fall-back path.
> + * However, this optimization hurts the decompression speed by >30%,
> + * because the execution does not go to the optimized loop
> + * for typical compressible data, and all of the preamble checks
> + * before going to the fall-back path become useless overhead.
> + * This optimization happens only with the -O3 flag, and -O2 generates
> + * a simple 8-byte copy loop.
> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
> + * functions are annotated with __attribute__((optimize("O2"))),
> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
> + * of LZ4_wildCopy8 does not affect the compression speed.
> + */
> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
> +#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
> +#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
> +#else
> +#  define FORCE_O2_GCC_PPC64LE		FORCE_INLINE
> +#  define FORCE_O2_INLINE_GCC_PPC64LE	FORCE_INLINE
> +#endif
> +
>  /*-************************************
>   *	Basic Types
>   **************************************/
> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>  #define RUN_BITS (8 - ML_BITS)
>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>  
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
>  /*-************************************
>   *	Reading and writing into memory
>   **************************************/
> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
>   */
> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>  	const void *srcPtr, void *dstEnd)
>  {
>  	BYTE *d = (BYTE *)dstPtr;
> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>  	return (unsigned int)(pIn - pStart);
>  }
>  
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
> + * error (output) - error code.  Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +					   const BYTE *lencheck,
> +					   int loop_check, int initial_check,
> +					   variable_length_error *error)
> +{
> +	unsigned length = 0;
> +	unsigned s;
> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +		*error = initial_error;
> +		return length;
> +	}
> +	do {
> +		s = **ip;
> +		(*ip)++;
> +		length += s;
> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +			*error = loop_error;
> +			return length;
> +		}
> +	} while (s == 255);
> +
> +	return length;
> +}
> +
>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>  
> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
> index 176f03b83e56..e02e041a01d9 100644
> --- a/lib/lz4/lz4hc_compress.c
> +++ b/lib/lz4/lz4hc_compress.c
> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>  		*token = (BYTE)(length<<ML_BITS);
>  
>  	/* Copy Literals */
> -	LZ4_wildCopy(*op, *anchor, (*op) + length);
> +	LZ4_wildCopy8(*op, *anchor, (*op) + length);
>  	*op += length;
>  
>  	/* Encode Offset */
> 
