lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <29ed736b-1653-561d-7ec6-8e301a1bdc7a@sony.com>
Date:   Fri, 17 May 2019 15:06:06 +0800
From:   "Mao, Chenxi" <Chenxi.Mao@...y.com>
To:     Gao Xiang <gaoxiang25@...wei.com>
CC:     <akpm@...ux-foundation.org>, <linux-kernel@...r.kernel.org>,
        <roy.feng@...y.com>, <yuanli.xu@...y.com>, <robert.alm@...y.com>,
        <masaya.a.takahashi@...y.com>, <yann.collet.73@...il.com>,
        <miaoxie@...wei.com>
Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on
 x86 and ARM64

Hi Xiang:

Got it and thanks.
Will follow the rules in next PS.

Chenxi

On 5/17/19 2:35 PM, Gao Xiang wrote:
> Hi Chenxi,
> 
> Some words about the patch format.. not important tho.
> 
> One suggestion is that the subject line should be better written
> as "[PATCH v2/v3/...] title" since it's more clear to know
> which patch is the latest patch among these emails.
> 
> On 2019/5/17 13:56, Chenxi Mao wrote:
>> FAST_DEC_LOOP was introduced from LZ4 1.9.0[1]
>> This change would be introduce 10% on decompress operation
>> according to LZ4 benchmark result on X86 devices.
>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements on ARM64,
>> however clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>
>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>
>> LZ4 FAST_DEC_LOOP bug fixes include as well.
>> 1. fixed read-after input in LZ4_decompress_safe() (issue 681)
>> 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)
>>
>> PS2:
>> 1. Move common API to lz4defs.h
>> 2. Add PPC related inline Macro defination.
>> 3. Force inline new static apis.
>>
>> Here is the test result on ARM64(cortex-A53)
>> Benchmark via ZRAM:
>>
>> Test case:
>> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>> --buffer_compress_percentage=75  --size=700M \
>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
>> --filename=/data/test/test --name=seq-read --rw=read --stonewall
>>
>> Patched:
>>     READ: bw=150MiB/s (157MB/s)
>> Vanilla:
>>     READ: bw=135MiB/s (142MB/s)
>>
>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>
>> Signed-off-by: chenxi.mao <chenxi.mao@...y.com>
>> ---
> 
> It's perferred to move all changelogs here if you don't want these changelog
> as a part of commit and there are some patches which can be referenced:
> https://lore.kernel.org/lkml/
> 
> Thanks,
> Gao Xiang
> 
>>  lib/lz4/lz4_compress.c   |   4 +-
>>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>>  lib/lz4/lz4defs.h        |  60 +++++-
>>  lib/lz4/lz4hc_compress.c |   2 +-
>>  4 files changed, 392 insertions(+), 71 deletions(-)
>>
>> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
>> index cc7b6d4cc7c7..b703ed1ca57d 100644
>> --- a/lib/lz4/lz4_compress.c
>> +++ b/lib/lz4/lz4_compress.c
>> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>>  				*token = (BYTE)(litLength << ML_BITS);
>>  
>>  			/* Copy Literals */
>> -			LZ4_wildCopy(op, anchor, op + litLength);
>> +			LZ4_wildCopy8(op, anchor, op + litLength);
>>  			op += litLength;
>>  		}
>>  
>> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>>  				*token = (BYTE)(litLength << ML_BITS);
>>  
>>  			/* Copy Literals */
>> -			LZ4_wildCopy(op, anchor, op + litLength);
>> +			LZ4_wildCopy8(op, anchor, op + litLength);
>>  			op += litLength;
>>  		}
>>  
>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
>> index 0c9d3ad17e0f..8622922304c3 100644
>> --- a/lib/lz4/lz4_decompress.c
>> +++ b/lib/lz4/lz4_decompress.c
>> @@ -50,6 +50,96 @@
>>  #define assert(condition) ((void)0)
>>  #endif
>>  
>> +#ifndef LZ4_FAST_DEC_LOOP
>> +#if defined(__i386__) || defined(__x86_64__)
>> +#define LZ4_FAST_DEC_LOOP 1
>> +#elif defined(__aarch64__) && !defined(__clang__)
>> +     /* On aarch64, we disable this optimization for clang because on certain
>> +      * mobile chipsets and clang, it reduces performance. For more information
>> +      * refer to https://github.com/lz4/lz4/pull/707. */
>> +#define LZ4_FAST_DEC_LOOP 1
>> +#else
>> +#define LZ4_FAST_DEC_LOOP 0
>> +#endif
>> +#endif
>> +
>> +#if LZ4_FAST_DEC_LOOP
>> +#define FASTLOOP_SAFE_DISTANCE 64
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>> +			     const size_t offset)
>> +{
>> +	if (offset < 8) {
>> +		dstPtr[0] = srcPtr[0];
>> +
>> +		dstPtr[1] = srcPtr[1];
>> +		dstPtr[2] = srcPtr[2];
>> +		dstPtr[3] = srcPtr[3];
>> +		srcPtr += inc32table[offset];
>> +		memcpy(dstPtr + 4, srcPtr, 4);
>> +		srcPtr -= dec64table[offset];
>> +		dstPtr += 8;
>> +	} else {
>> +		memcpy(dstPtr, srcPtr, 8);
>> +		dstPtr += 8;
>> +		srcPtr += 8;
>> +	}
>> +
>> +	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
>> +}
>> +
>> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
>> + * this version copies two times 16 bytes (instead of one time 32 bytes)
>> + * because it must be compatible with offsets >= 16. */
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
>> +{
>> +	BYTE *d = (BYTE *) dstPtr;
>> +	const BYTE *s = (const BYTE *)srcPtr;
>> +	BYTE *const e = (BYTE *) dstEnd;
>> +
>> +	do {
>> +		memcpy(d, s, 16);
>> +		memcpy(d + 16, s + 16, 16);
>> +		d += 32;
>> +		s += 32;
>> +	} while (d < e);
>> +}
>> +
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>> +			const size_t offset)
>> +{
>> +	BYTE v[8];
>> +	switch (offset) {
>> +
>> +	case 1:
>> +		memset(v, *srcPtr, 8);
>> +		goto copy_loop;
>> +	case 2:
>> +		memcpy(v, srcPtr, 2);
>> +		memcpy(&v[2], srcPtr, 2);
>> +		memcpy(&v[4], &v[0], 4);
>> +		goto copy_loop;
>> +	case 4:
>> +		memcpy(v, srcPtr, 4);
>> +		memcpy(&v[4], srcPtr, 4);
>> +		goto copy_loop;
>> +	default:
>> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>> +		return;
>> +	}
>> +
>> +      copy_loop:
>> +	memcpy(dstPtr, v, 8);
>> +	dstPtr += 8;
>> +	while (dstPtr < dstEnd) {
>> +		memcpy(dstPtr, v, 8);
>> +		dstPtr += 8;
>> +	}
>> +}
>> +#endif
>> +
>>  /*
>>   * LZ4_decompress_generic() :
>>   * This generic decompression function covers all use cases.
>> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  	 const size_t dictSize
>>  	 )
>>  {
>> -	const BYTE *ip = (const BYTE *) src;
>> -	const BYTE * const iend = ip + srcSize;
>> +	const BYTE *ip = (const BYTE *)src;
>> +	const BYTE *const iend = ip + srcSize;
>>  
>>  	BYTE *op = (BYTE *) dst;
>> -	BYTE * const oend = op + outputSize;
>> +	BYTE *const oend = op + outputSize;
>>  	BYTE *cpy;
>>  
>> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>  
>>  	const int safeDecode = (endOnInput == endOnInputSize);
>>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>>  
>>  	/* Set up the "end" pointers for the shortcut. */
>>  	const BYTE *const shortiend = iend -
>> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>  	const BYTE *const shortoend = oend -
>> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> +
>> +	const BYTE *match;
>> +	size_t offset;
>> +	unsigned int token;
>> +	size_t length;
>>  
>>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>  		 srcSize, outputSize);
>> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  	if ((endOnInput) && unlikely(srcSize == 0))
>>  		return -1;
>>  
>> -	/* Main Loop : decode sequences */
>> +#if LZ4_FAST_DEC_LOOP
>> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>> +		DEBUGLOG(6, "skip fast decode loop");
>> +		goto safe_decode;
>> +	}
>> +
>> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>>  	while (1) {
>> -		size_t length;
>> -		const BYTE *match;
>> -		size_t offset;
>> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>> +		if (endOnInput) {
>> +			assert(ip < iend);
>> +		}
>> +		token = *ip++;
>> +		length = token >> ML_BITS;	/* literal length */
>> +
>> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
>> +
>> +		/* decode literal length */
>> +		if (length == RUN_MASK) {
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - RUN_MASK,
>> +						 endOnInput, endOnInput,
>> +						 &error);
>> +			if (error == initial_error) {
>> +				goto _output_error;
>> +			}
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (op) + length <
>> +					(uptrval) (op))) {
>> +				goto _output_error;
>> +			}	/* overflow detection */
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (ip) + length <
>> +					(uptrval) (ip))) {
>> +				goto _output_error;
>> +			}
>>  
>> -		/* get literal length */
>> -		unsigned int const token = *ip++;
>> -		length = token>>ML_BITS;
>> +			/* overflow detection */
>> +			/* copy literals */
>> +			cpy = op + length;
>> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>> +			if (endOnInput) {	/* LZ4_decompress_safe() */
>> +				if ((cpy > oend - 32)
>> +				    || (ip + length > iend - 32)) {
>> +					goto safe_literal_copy;
>> +				}
>> +				LZ4_wildCopy32(op, ip, cpy);
>> +			} else {	/* LZ4_decompress_fast() */
>> +				if (cpy > oend - 8) {
>> +					goto safe_literal_copy;
>> +				}
>> +				LZ4_wildCopy8(op, ip, cpy);
>> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> +				/* it doesn't know input length, and only relies on end-of-block */
>> +				/* properties */
>> +			}
>> +			ip += length;
>> +			op = cpy;
>> +		} else {
>> +			cpy = op + length;
>> +			if (endOnInput) {	/* LZ4_decompress_safe() */
>> +				DEBUGLOG(7,
>> +					 "copy %u bytes in a 16-bytes stripe",
>> +					 (unsigned)length);
>> +				/* We don't need to check oend */
>> +				/* since we check it once for each loop below */
>> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
>> +					goto safe_literal_copy;
>> +				}
>> +				/* Literals can only be 14, but hope compilers optimize */
>> +				/*if we copy by a register size */
>> +				memcpy(op, ip, 16);
>> +			} else {
>> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> +				/* it doesn't know input length, and relies on end-of-block */
>> +				/* properties */
>> +				memcpy(op, ip, 8);
>> +				if (length > 8) {
>> +					memcpy(op + 8, ip + 8, 8);
>> +				}
>> +			}
>> +			ip += length;
>> +			op = cpy;
>> +		}
>> +
>> +		/* get offset */
>> +		offset = LZ4_readLE16(ip);
>> +		ip += 2;	/* end-of-block condition violated */
>> +		match = op - offset;
>> +
>> +		/* get matchlength */
>> +		length = token & ML_MASK;
>> +
>> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>> +			goto _output_error;
>> +		}
>> +		/* Error : offset outside buffers */
>> +		if (length == ML_MASK) {
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
>> +						 endOnInput, 0, &error);
>> +			if (error != ok) {
>> +				goto _output_error;
>> +			}
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
>> +				goto _output_error;
>> +			}	/* overflow detection */
>> +			length += MINMATCH;
>> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> +				goto safe_match_copy;
>> +			}
>> +		} else {
>> +			length += MINMATCH;
>> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> +				goto safe_match_copy;
>> +			}
>> +
>> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>> +				if (offset >= 8) {
>> +					memcpy(op, match, 8);
>> +					memcpy(op + 8, match + 8, 8);
>> +					memcpy(op + 16, match + 16, 2);
>> +					op += length;
>> +					continue;
>> +				}
>> +			}
>> +		}
>> +
>> +		/* match starting within external dictionary */
>> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
>> +			if (unlikely(op + length > oend - LASTLITERALS)) {
>> +				if (partialDecoding) {
>> +					/* reach end of buffer */
>> +					length =
>> +					    min(length, (size_t) (oend - op));
>> +				} else {
>> +					/* end-of-block condition violated */
>> +					goto _output_error;
>> +				}
>> +			}
>> +
>> +			if (length <= (size_t) (lowPrefix - match)) {
>> +				/* match fits entirely within external dictionary : just copy */
>> +				memmove(op, dictEnd - (lowPrefix - match),
>> +					length);
>> +				op += length;
>> +			} else {
>> +				/* match stretches into both external dict and current block */
>> +				size_t const copySize =
>> +				    (size_t) (lowPrefix - match);
>> +				size_t const restSize = length - copySize;
>> +				memcpy(op, dictEnd - copySize, copySize);
>> +				op += copySize;
>> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
>> +					BYTE *const endOfMatch = op + restSize;
>> +					const BYTE *copyFrom = lowPrefix;
>> +					while (op < endOfMatch) {
>> +						*op++ = *copyFrom++;
>> +					}
>> +				} else {
>> +					memcpy(op, lowPrefix, restSize);
>> +					op += restSize;
>> +				}
>> +			}
>> +			continue;
>> +		}
>> +
>> +		/* copy match within block */
>> +		cpy = op + length;
>> +
>> +		assert((op <= oend) && (oend - op >= 32));
>> +		if (unlikely(offset < 16)) {
>> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
>> +		} else {
>> +			LZ4_wildCopy32(op, match, cpy);
>> +		}
>> +
>> +		op = cpy;	/* wildcopy correction */
>> +	}
>> +      safe_decode:
>> +#endif
>> +	/* Main Loop : decode sequences */
>> +	while (1) {
>> +		length = token >> ML_BITS;
>>  
>>  		/* ip < iend before the increment */
>>  		assert(!endOnInput || ip <= iend);
>> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  		 * combined check for both stages).
>>  		 */
>>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
>> -		   /*
>> -		    * strictly "less than" on input, to re-enter
>> -		    * the loop with at least one byte
>> -		    */
>> -		   && likely((endOnInput ? ip < shortiend : 1) &
>> -			     (op <= shortoend))) {
>> +		    /*
>> +		     * strictly "less than" on input, to re-enter
>> +		     * the loop with at least one byte
>> +		     */
>> +		    && likely((endOnInput ? ip < shortiend : 1) &
>> +			      (op <= shortoend))) {
>>  			/* Copy the literals */
>>  			memcpy(op, ip, endOnInput ? 16 : 8);
>> -			op += length; ip += length;
>> +			op += length;
>> +			ip += length;
>>  
>>  			/*
>>  			 * The second stage:
>>  			 * prepare for match copying, decode full info.
>>  			 * If it doesn't work out, the info won't be wasted.
>>  			 */
>> -			length = token & ML_MASK; /* match length */
>> +			length = token & ML_MASK;	/* match length */
>>  			offset = LZ4_readLE16(ip);
>>  			ip += 2;
>>  			match = op - offset;
>> -			assert(match <= op); /* check overflow */
>> +			assert(match <= op);	/* check overflow */
>>  
>>  			/* Do not deal with overlapping matches. */
>>  			if ((length != ML_MASK) &&
>> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		/* decode literal length */
>>  		if (length == RUN_MASK) {
>> -			unsigned int s;
>>  
>> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>> -				/* overflow detection */
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - RUN_MASK,
>> +						 endOnInput, endOnInput,
>> +						 &error);
>> +			if (error == initial_error)
>>  				goto _output_error;
>> -			}
>> -			do {
>> -				s = *ip++;
>> -				length += s;
>> -			} while (likely(endOnInput
>> -				? ip < iend - RUN_MASK
>> -				: 1) & (s == 255));
>>  
>>  			if ((safeDecode)
>> -			    && unlikely((uptrval)(op) +
>> -					length < (uptrval)(op))) {
>> +			    && unlikely((uptrval) (op) +
>> +					length < (uptrval) (op))) {
>>  				/* overflow detection */
>>  				goto _output_error;
>>  			}
>>  			if ((safeDecode)
>> -			    && unlikely((uptrval)(ip) +
>> -					length < (uptrval)(ip))) {
>> +			    && unlikely((uptrval) (ip) +
>> +					length < (uptrval) (ip))) {
>>  				/* overflow detection */
>>  				goto _output_error;
>>  			}
>> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		/* copy literals */
>>  		cpy = op + length;
>> +#if LZ4_FAST_DEC_LOOP
>> +	      safe_literal_copy:
>> +#endif
>>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>  
>>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
>> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
>> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>> +				      || (ip + length >
>> +					  iend - (2 + 1 + LASTLITERALS))))
>> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>  			if (partialDecoding) {
>>  				if (cpy > oend) {
>>  					/*
>> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  					length = oend - op;
>>  				}
>>  				if ((endOnInput)
>> -					&& (ip + length > iend)) {
>> +				    && (ip + length > iend)) {
>>  					/*
>>  					 * Error :
>>  					 * read attempt beyond
>> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  				}
>>  			} else {
>>  				if ((!endOnInput)
>> -					&& (cpy != oend)) {
>> +				    && (cpy != oend)) {
>>  					/*
>>  					 * Error :
>>  					 * block decoding must
>> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  					goto _output_error;
>>  				}
>>  				if ((endOnInput)
>> -					&& ((ip + length != iend)
>> +				    && ((ip + length != iend)
>>  					|| (cpy > oend))) {
>>  					/*
>>  					 * Error :
>> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  				break;
>>  		} else {
>>  			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
>> -			LZ4_wildCopy(op, ip, cpy);
>> +			LZ4_wildCopy8(op, ip, cpy);
>>  			ip += length;
>>  			op = cpy;
>>  		}
>> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  			goto _output_error;
>>  		}
>>  
>> -		/* costs ~1%; silence an msan warning when offset == 0 */
>> -		/*
>> -		 * note : when partialDecoding, there is no guarantee that
>> -		 * at least 4 bytes remain available in output buffer
>> -		 */
>> -		if (!partialDecoding) {
>> -			assert(oend > op);
>> -			assert(oend - op >= 4);
>> -
>> -			LZ4_write32(op, (U32)offset);
>> -		}
>> -
>>  		if (length == ML_MASK) {
>> -			unsigned int s;
>> -
>> -			do {
>> -				s = *ip++;
>> -
>> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
>> -					goto _output_error;
>>  
>> -				length += s;
>> -			} while (s == 255);
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
>> +						 endOnInput, 0, &error);
>> +			if (error != ok)
>> +				goto _output_error;
>>  
>>  			if ((safeDecode)
>>  				&& unlikely(
>> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		length += MINMATCH;
>>  
>> +#if LZ4_FAST_DEC_LOOP
>> +safe_match_copy:
>> +#endif
>> +
>>  		/* match starting within external dictionary */
>>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>>  			if (unlikely(op + length > oend - LASTLITERALS)) {
>> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  			}
>>  
>>  			if (op < oCopyLimit) {
>> -				LZ4_wildCopy(op, match, oCopyLimit);
>> +				LZ4_wildCopy8(op, match, oCopyLimit);
>>  				match += oCopyLimit - op;
>>  				op = oCopyLimit;
>>  			}
>> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  		} else {
>>  			LZ4_copy8(op, match);
>>  			if (length > 16)
>> -				LZ4_wildCopy(op + 8, match + 8, cpy);
>> +				LZ4_wildCopy8(op + 8, match + 8, cpy);
>>  		}
>>  		op = cpy; /* wildcopy correction */
>>  	}
>> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
>> index 1a7fa9d9170f..4cc17cf589ed 100644
>> --- a/lib/lz4/lz4defs.h
>> +++ b/lib/lz4/lz4defs.h
>> @@ -40,6 +40,28 @@
>>  
>>  #define FORCE_INLINE __always_inline
>>  
>> +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
>> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
>> + * together with a simple 8-byte copy loop as a fall-back path.
>> + * However, this optimization hurts the decompression speed by >30%,
>> + * because the execution does not go to the optimized loop
>> + * for typical compressible data, and all of the preamble checks
>> + * before going to the fall-back path become useless overhead.
>> + * This optimization happens only with the -O3 flag, and -O2 generates
>> + * a simple 8-byte copy loop.
>> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
>> + * functions are annotated with __attribute__((optimize("O2"))),
>> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
>> + * of LZ4_wildCopy8 does not affect the compression speed.
>> + */
>> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
>> +#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
>> +#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
>> +#else
>> +#  define FORCE_O2_GCC_PPC64LE		FORCE_INLINE
>> +#  define FORCE_O2_INLINE_GCC_PPC64LE	FORCE_INLINE
>> +#endif
>> +
>>  /*-************************************
>>   *	Basic Types
>>   **************************************/
>> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>>  #define RUN_BITS (8 - ML_BITS)
>>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>>  
>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>> +
>>  /*-************************************
>>   *	Reading and writing into memory
>>   **************************************/
>> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>>   * customized variant of memcpy,
>>   * which can overwrite up to 7 bytes beyond dstEnd
>>   */
>> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
>> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>>  	const void *srcPtr, void *dstEnd)
>>  {
>>  	BYTE *d = (BYTE *)dstPtr;
>> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>>  	return (unsigned int)(pIn - pStart);
>>  }
>>  
>> +/* Read the variable-length literal or match length.
>> + *
>> + * ip - pointer to use as input.
>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>> + * error (output) - error code.  Should be set to 0 before call.
>> + */
>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
>> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>> +					   const BYTE *lencheck,
>> +					   int loop_check, int initial_check,
>> +					   variable_length_error *error)
>> +{
>> +	unsigned length = 0;
>> +	unsigned s;
>> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
>> +		*error = initial_error;
>> +		return length;
>> +	}
>> +	do {
>> +		s = **ip;
>> +		(*ip)++;
>> +		length += s;
>> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
>> +			*error = loop_error;
>> +			return length;
>> +		}
>> +	} while (s == 255);
>> +
>> +	return length;
>> +}
>> +
>>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>>  
>> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
>> index 176f03b83e56..e02e041a01d9 100644
>> --- a/lib/lz4/lz4hc_compress.c
>> +++ b/lib/lz4/lz4hc_compress.c
>> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>>  		*token = (BYTE)(length<<ML_BITS);
>>  
>>  	/* Copy Literals */
>> -	LZ4_wildCopy(*op, *anchor, (*op) + length);
>> +	LZ4_wildCopy8(*op, *anchor, (*op) + length);
>>  	*op += length;
>>  
>>  	/* Encode Offset */
>>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ