Re: [PATCH 1/1] lz4: Implement lz4 with dynamic offset length.

From: Nick Terrell
Date: Wed Mar 21 2018 - 15:59:25 EST


On (03/21/18 10:10), Maninder Singh wrote:
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4..185c358 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -183,7 +183,8 @@ static FORCE_INLINE int LZ4_compress_generic(
> const tableType_t tableType,
> const dict_directive dict,
> const dictIssue_directive dictIssue,
> - const U32 acceleration)
> + const U32 acceleration,
> + const Dynamic_Offset dynOffset)
> {
> const BYTE *ip = (const BYTE *) source;
> const BYTE *base;
> @@ -199,6 +200,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>
> BYTE *op = (BYTE *) dest;
> BYTE * const olimit = op + maxOutputSize;
> + int max_distance = dynOffset ? MAX_DISTANCE_DYN : MAX_DISTANCE;

Let's mark this variable `const`. If the compiler doesn't realize that this
variable and `dynOffset` are compile-time constants, I expect the speed to
be impacted.

>
> U32 forwardH;
> size_t refDelta = 0;
> @@ -245,6 +247,7 @@ static FORCE_INLINE int LZ4_compress_generic(
> for ( ; ; ) {
> const BYTE *match;
> BYTE *token;
> + int curr_offset;
>
> /* Find a match */
> {
> @@ -285,7 +288,7 @@ static FORCE_INLINE int LZ4_compress_generic(
> : 0)
> || ((tableType == byU16)
> ? 0
> - : (match + MAX_DISTANCE < ip))
> + : (match + max_distance < ip))
> || (LZ4_read32(match + refDelta)
> != LZ4_read32(ip)));
> }
> @@ -328,8 +331,26 @@ static FORCE_INLINE int LZ4_compress_generic(
>
> _next_match:
> /* Encode Offset */
> - LZ4_writeLE16(op, (U16)(ip - match));
> - op += 2;
> + if (dynOffset) {
> + curr_offset = (U16)(ip - match);
> +
> + /*
> + * If Offset is greater than 127, we need 2 bytes
> + * to store it. Otherwise 1 byte is enough.
> + */
> + if (curr_offset > 127) {
> + curr_offset = (curr_offset << 1) | DYN_BIT;
> + LZ4_writeLE16(op, (U16)curr_offset);
> + op += 2;
> + } else {
> + curr_offset = curr_offset << 1;
> + *op = (BYTE)curr_offset;
> + op++;
> + }

The standard way to encode variable-sized integers is to use the high bit as
the control bit, not the low bit. Do you have benchmarks showing that using
the low bit is faster than using the high bit? If so, let's note that in the
code; if not, let's use the high bit.

> + } else {
> + LZ4_writeLE16(op, (U16)(ip - match));
> + op += 2;
> + }
>
> /* Encode MatchLength */
> {
> @@ -480,39 +501,70 @@ static int LZ4_compress_fast_extState(
> return LZ4_compress_generic(ctx, source,
> dest, inputSize, 0,
> noLimit, byU16, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
> else
> return LZ4_compress_generic(ctx, source,
> dest, inputSize, 0,
> noLimit, tableType, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
> } else {
> if (inputSize < LZ4_64Klimit)
> return LZ4_compress_generic(ctx, source,
> dest, inputSize,
> maxOutputSize, limitedOutput, byU16, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
> else
> return LZ4_compress_generic(ctx, source,
> dest, inputSize,
> maxOutputSize, limitedOutput, tableType, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
> }
> }
>
> +static int LZ4_compress_fast_extState_dynamic(
> + void *state,
> + const char *source,
> + char *dest,
> + int inputSize,
> + int maxOutputSize,
> + int acceleration)
> +{
> + LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
> +
> + LZ4_resetStream((LZ4_stream_t *)state);
> +
> + if (acceleration < 1)
> + acceleration = LZ4_ACCELERATION_DEFAULT;
> +
> + if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize))
> + return LZ4_compress_generic(ctx, source,
> + dest, inputSize, 0,
> + noLimit, byU16, noDict,
> + noDictIssue, acceleration, DynOffset);
> + else
> + return LZ4_compress_generic(ctx, source,
> + dest, inputSize,
> + maxOutputSize, limitedOutput, byU16, noDict,
> + noDictIssue, acceleration, DynOffset);
> +}
> +
> int LZ4_compress_fast(const char *source, char *dest, int inputSize,
> - int maxOutputSize, int acceleration, void *wrkmem)
> + int maxOutputSize, int acceleration, void *wrkmem, bool dynOffset)
> {
> - return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> + if (!dynOffset)
> + return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize,
> + maxOutputSize, acceleration);
> +
> + return LZ4_compress_fast_extState_dynamic(wrkmem, source, dest, inputSize,
> maxOutputSize, acceleration);
> }
> EXPORT_SYMBOL(LZ4_compress_fast);
>
> int LZ4_compress_default(const char *source, char *dest, int inputSize,
> - int maxOutputSize, void *wrkmem)
> + int maxOutputSize, void *wrkmem, bool dynOffset)
> {
> return LZ4_compress_fast(source, dest, inputSize,
> - maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem);
> + maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem, dynOffset);
> }
> EXPORT_SYMBOL(LZ4_compress_default);
>
> @@ -900,12 +952,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
> result = LZ4_compress_generic(
> streamPtr, source, dest, inputSize,
> maxOutputSize, limitedOutput, byU32,
> - withPrefix64k, dictSmall, acceleration);
> + withPrefix64k, dictSmall, acceleration, NoDynOffset);
> } else {
> result = LZ4_compress_generic(
> streamPtr, source, dest, inputSize,
> maxOutputSize, limitedOutput, byU32,
> - withPrefix64k, noDictIssue, acceleration);
> + withPrefix64k, noDictIssue, acceleration, NoDynOffset);
> }
> streamPtr->dictSize += (U32)inputSize;
> streamPtr->currentOffset += (U32)inputSize;
> @@ -921,12 +973,12 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
> result = LZ4_compress_generic(
> streamPtr, source, dest, inputSize,
> maxOutputSize, limitedOutput, byU32,
> - usingExtDict, dictSmall, acceleration);
> + usingExtDict, dictSmall, acceleration, NoDynOffset);
> } else {
> result = LZ4_compress_generic(
> streamPtr, source, dest, inputSize,
> maxOutputSize, limitedOutput, byU32,
> - usingExtDict, noDictIssue, acceleration);
> + usingExtDict, noDictIssue, acceleration, NoDynOffset);
> }
> streamPtr->dictionary = (const BYTE *)source;
> streamPtr->dictSize = (U32)inputSize;
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 141734d..337a828 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -71,7 +71,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
> /* only if dict == usingExtDict */
> const BYTE * const dictStart,
> /* note : = 0 if noDict */
> - const size_t dictSize
> + const size_t dictSize,
> + /* offset == 1; dynamic offset */
> + const Dynamic_Offset dynOffset
> )
> {
> /* Local Variables */
> @@ -141,8 +143,8 @@ static FORCE_INLINE int LZ4_decompress_generic(
> /* copy literals */
> cpy = op + length;
> if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT))
> - || (ip + length > iend - (2 + 1 + LASTLITERALS))))
> - || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> + || (ip + length > iend - (2 + LASTLITERALS))))
> + || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH - 1))) {
> if (partialDecoding) {
> if (cpy > oend) {
> /*
> @@ -188,13 +190,31 @@ static FORCE_INLINE int LZ4_decompress_generic(
> break;
> }
>
> - LZ4_wildCopy(op, ip, cpy);
> + if (dynOffset && length < 4)
> + LZ4_copy4(op, ip);
> + else
> + LZ4_wildCopy(op, ip, cpy);
> +

The LZ4 format enforces that the last 5 bytes are literals so that
LZ4_wildCopy() can be used here. I suspect that having this extra branch
here for `dynOffset` mode hurts decompression speed.

> ip += length;
> op = cpy;
>
> /* get offset */
> - offset = LZ4_readLE16(ip);
> - ip += 2;
> + if (dynOffset) {
> + /*
> + * Check if DYN_BIT is set, means 2 Byte Offset,
> + * else 1 Byte Offset.
> + */
> + if (*ip & DYN_BIT) {
> + offset = LZ4_readLE16(ip) >> 1;
> + ip += 2;
> + } else {
> + offset = *ip >> 1;
> + ip += 1;

If we use the high bit as the control bit, this branch simply becomes
`offset = *ip`, though the long-offset branch becomes a bit longer.

> + }
> + } else {
> + offset = LZ4_readLE16(ip);
> + ip += 2;
> + }
> match = op - offset;
>
> if ((checkOffset) && (unlikely(match < lowLimit))) {
> @@ -335,11 +355,11 @@ static FORCE_INLINE int LZ4_decompress_generic(
> }
>
> int LZ4_decompress_safe(const char *source, char *dest,
> - int compressedSize, int maxDecompressedSize)
> + int compressedSize, int maxDecompressedSize, bool dynOffset)
> {
> return LZ4_decompress_generic(source, dest, compressedSize,
> maxDecompressedSize, endOnInputSize, full, 0,
> - noDict, (BYTE *)dest, NULL, 0);
> + noDict, (BYTE *)dest, NULL, 0, dynOffset);

You'll need to use the same trick that LZ4_compress_fast() uses, by hard-coding
`dynOffset`. We want the compiler to generate two versions of
LZ4_decompress_generic(): one with `dynOffset` and one with `!dynOffset`.
That way the tight loop won't contain the branches that check `dynOffset`.

if (dynOffset)
return LZ4_decompress_generic(..., true);
else
return LZ4_decompress_generic(..., false);

Without this trick, I expect that this patch causes a regression to both
LZ4 and LZ4_DYN decompression speed.

> }
>
> int LZ4_decompress_safe_partial(const char *source, char *dest,
> @@ -347,14 +367,14 @@ int LZ4_decompress_safe_partial(const char *source, char *dest,
> {
> return LZ4_decompress_generic(source, dest, compressedSize,
> maxDecompressedSize, endOnInputSize, partial,
> - targetOutputSize, noDict, (BYTE *)dest, NULL, 0);
> + targetOutputSize, noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
> }
>
> int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
> {
> return LZ4_decompress_generic(source, dest, 0, originalSize,
> endOnOutputSize, full, 0, withPrefix64k,
> - (BYTE *)(dest - 64 * KB), NULL, 64 * KB);
> + (BYTE *)(dest - 64 * KB), NULL, 64 * KB, NoDynOffset);
> }
>
> int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
> @@ -392,7 +412,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
> endOnInputSize, full, 0,
> usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize,
> lz4sd->externalDict,
> - lz4sd->extDictSize);
> + lz4sd->extDictSize, NoDynOffset);
>
> if (result <= 0)
> return result;
> @@ -406,7 +426,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
> compressedSize, maxOutputSize,
> endOnInputSize, full, 0,
> usingExtDict, (BYTE *)dest,
> - lz4sd->externalDict, lz4sd->extDictSize);
> + lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
> if (result <= 0)
> return result;
> lz4sd->prefixSize = result;
> @@ -427,7 +447,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
> endOnOutputSize, full, 0,
> usingExtDict,
> lz4sd->prefixEnd - lz4sd->prefixSize,
> - lz4sd->externalDict, lz4sd->extDictSize);
> + lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
>
> if (result <= 0)
> return result;
> @@ -440,7 +460,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
> result = LZ4_decompress_generic(source, dest, 0, originalSize,
> endOnOutputSize, full, 0,
> usingExtDict, (BYTE *)dest,
> - lz4sd->externalDict, lz4sd->extDictSize);
> + lz4sd->externalDict, lz4sd->extDictSize, NoDynOffset);
> if (result <= 0)
> return result;
> lz4sd->prefixSize = originalSize;
> @@ -463,19 +483,19 @@ static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
> if (dictSize == 0)
> return LZ4_decompress_generic(source, dest,
> compressedSize, maxOutputSize, safe, full, 0,
> - noDict, (BYTE *)dest, NULL, 0);
> + noDict, (BYTE *)dest, NULL, 0, NoDynOffset);
> if (dictStart + dictSize == dest) {
> if (dictSize >= (int)(64 * KB - 1))
> return LZ4_decompress_generic(source, dest,
> compressedSize, maxOutputSize, safe, full, 0,
> - withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0);
> + withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0, NoDynOffset);
> return LZ4_decompress_generic(source, dest, compressedSize,
> maxOutputSize, safe, full, 0, noDict,
> - (BYTE *)dest - dictSize, NULL, 0);
> + (BYTE *)dest - dictSize, NULL, 0, NoDynOffset);
> }
> return LZ4_decompress_generic(source, dest, compressedSize,
> maxOutputSize, safe, full, 0, usingExtDict,
> - (BYTE *)dest, (const BYTE *)dictStart, dictSize);
> + (BYTE *)dest, (const BYTE *)dictStart, dictSize, NoDynOffset);
> }
>
> int LZ4_decompress_safe_usingDict(const char *source, char *dest,
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 00a0b58..9451a73 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -75,6 +75,7 @@
> #define WILDCOPYLENGTH 8
> #define LASTLITERALS 5
> #define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
> +#define DYN_BIT 0x1
>
> /* Increase this value ==> compression run slower on incompressible data */
> #define LZ4_SKIPTRIGGER 6
> @@ -87,6 +88,7 @@
>
> #define MAXD_LOG 16
> #define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
> +#define MAX_DISTANCE_DYN ((1 << (MAXD_LOG - 1)) - 1)
> #define STEPSIZE sizeof(size_t)
>
> #define ML_BITS 4
> @@ -147,6 +149,13 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
> #endif
> }
>
> +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src)
> +{
> + U32 a = get_unaligned((const U32 *)src);
> +
> + put_unaligned(a, (U32 *)dst);
> +}
> +
> /*
> * customized variant of memcpy,
> * which can overwrite up to 7 bytes beyond dstEnd
> @@ -224,4 +233,6 @@ static FORCE_INLINE unsigned int LZ4_count(
> typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
> typedef enum { full = 0, partial = 1 } earlyEnd_directive;
>
> +typedef enum { NoDynOffset = 0, DynOffset = 1 } Dynamic_Offset;
> +
> #endif
> --
> 1.7.1