From d85962e654a3573129d67b1896e86b6aa1b44918 Mon Sep 17 00:00:00 2001 From: conor42 Date: Mon, 18 Mar 2019 00:05:50 +1000 Subject: [PATCH] Update to Fast LZMA2 1.0.0 --- C/fast-lzma2/compiler.h | 83 +- C/fast-lzma2/count.h | 12 +- C/fast-lzma2/dict_buffer.c | 230 ++ C/fast-lzma2/dict_buffer.h | 81 + C/fast-lzma2/fast-lzma2.h | 558 ++-- C/fast-lzma2/fl2_common.c | 64 +- C/fast-lzma2/fl2_compress.c | 1563 ++++++----- C/fast-lzma2/fl2_compress_internal.h | 55 +- C/fast-lzma2/fl2_error_private.c | 35 - C/fast-lzma2/fl2_error_private.h | 75 - C/fast-lzma2/fl2_errors.h | 28 +- C/fast-lzma2/fl2_internal.h | 17 +- C/fast-lzma2/fl2_pool.c | 198 ++ C/fast-lzma2/{fl2pool.h => fl2_pool.h} | 10 +- .../{fl2threading.c => fl2_threading.c} | 28 +- C/fast-lzma2/fl2_threading.h | 178 ++ C/fast-lzma2/fl2pool.c | 201 -- C/fast-lzma2/fl2threading.h | 120 - C/fast-lzma2/lzma2_enc.c | 2305 ++++++++--------- C/fast-lzma2/lzma2_enc.h | 34 +- C/fast-lzma2/mem.h | 25 +- C/fast-lzma2/platform.h | 100 +- C/fast-lzma2/radix_bitpack.c | 5 +- C/fast-lzma2/radix_engine.h | 562 ++-- C/fast-lzma2/radix_get.h | 210 ++ C/fast-lzma2/radix_internal.h | 56 +- C/fast-lzma2/radix_mf.c | 465 ++-- C/fast-lzma2/radix_mf.h | 19 +- C/fast-lzma2/radix_struct.c | 4 +- C/fast-lzma2/range_enc.c | 216 +- C/fast-lzma2/range_enc.h | 88 +- C/fast-lzma2/util.c | 707 +++++ C/fast-lzma2/util.h | 671 +---- CPP/7zip/7zip.mak | 4 +- CPP/7zip/Bundles/Alone/makefile | 7 +- CPP/7zip/Bundles/Codec_flzma2/makefile | 7 +- CPP/7zip/Bundles/Format7z/makefile | 7 +- CPP/7zip/Bundles/Format7zF/makefile | 7 +- CPP/7zip/Bundles/Format7zFO/makefile | 7 +- CPP/7zip/Bundles/Format7zUSB/makefile | 7 +- CPP/7zip/Compress/Lzma2Encoder.cpp | 282 +- CPP/7zip/Compress/Lzma2Encoder.h | 43 +- CPP/7zip/UI/GUI/CompressDialog.cpp | 36 +- 43 files changed, 5467 insertions(+), 3943 deletions(-) create mode 100644 C/fast-lzma2/dict_buffer.c create mode 100644 C/fast-lzma2/dict_buffer.h delete mode 100644 C/fast-lzma2/fl2_error_private.c delete mode 100644 C/fast-lzma2/fl2_error_private.h create mode 100644 C/fast-lzma2/fl2_pool.c rename C/fast-lzma2/{fl2pool.h => fl2_pool.h} (76%) rename C/fast-lzma2/{fl2threading.c => fl2_threading.c} (73%) create mode 100644 C/fast-lzma2/fl2_threading.h delete mode 100644 C/fast-lzma2/fl2pool.c delete mode 100644 C/fast-lzma2/fl2threading.h create mode 100644 C/fast-lzma2/radix_get.h create mode 100644 C/fast-lzma2/util.c diff --git a/C/fast-lzma2/compiler.h b/C/fast-lzma2/compiler.h index dc3bfff3..b33d18b7 100644 --- a/C/fast-lzma2/compiler.h +++ b/C/fast-lzma2/compiler.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. + * Modified for FL2 by Conor McCarthy * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found @@ -8,13 +9,15 @@ * You may select, at your option, one of the above-listed licenses. 
*/ -#ifndef ZSTD_COMPILER_H -#define ZSTD_COMPILER_H +#ifndef FL2_COMPILER_H +#define FL2_COMPILER_H /*-******************************************************* * Compiler specifics *********************************************************/ /* force inlining */ + +#if !defined(FL2_NO_INLINE) #if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # define INLINE_KEYWORD inline #else @@ -29,6 +32,13 @@ # define FORCE_INLINE_ATTR #endif +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif + /** * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant * parameters. They must be inlined for the compiler to eliminate the constant @@ -54,24 +64,69 @@ /* force no inlining */ #ifdef _MSC_VER -# define FORCE_NOINLINE static __declspec(noinline) +# define FORCE_NOINLINE __declspec(noinline) #else # ifdef __GNUC__ -# define FORCE_NOINLINE static __attribute__((__noinline__)) +# define FORCE_NOINLINE __attribute__((__noinline__)) # else -# define FORCE_NOINLINE static +# define FORCE_NOINLINE # endif #endif -/* prefetch */ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ -# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) -#elif defined(__GNUC__) -# define PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) -#else -# define PREFETCH(ptr) /* disabled */ +/* target attribute */ +#ifndef __has_attribute + #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ #endif +#if defined(__GNUC__) +# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) +#else +# define TARGET_ATTRIBUTE(target) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 
+ */ +#ifndef DYNAMIC_BMI2 + #if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ + && (defined(__x86_64__) || defined(_M_X86)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* prefetch + * can be disabled, by declaring NO_PREFETCH build macro */ +#if defined(NO_PREFETCH) +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# else +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* NO_PREFETCH */ + +#define CACHELINE_SIZE 64 + +#define PREFETCH_AREA(p, s) { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ +} /* disable warnings */ #ifdef _MSC_VER /* Visual Studio */ @@ -83,4 +138,4 @@ # pragma warning(disable : 4324) /* disable: C4324: padded structure */ #endif -#endif /* ZSTD_COMPILER_H */ +#endif /* FL2_COMPILER_H */ diff --git a/C/fast-lzma2/count.h b/C/fast-lzma2/count.h index 77f796a3..11bf1ef3 100644 --- a/C/fast-lzma2/count.h +++ b/C/fast-lzma2/count.h @@ -1,3 +1,13 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + #ifndef ZSTD_COUNT_H_ #define ZSTD_COUNT_H_ @@ -86,7 +96,7 @@ static unsigned ZSTD_NbCommonBytes(register size_t val) } -MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) +static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t) - 1); diff --git a/C/fast-lzma2/dict_buffer.c b/C/fast-lzma2/dict_buffer.c new file mode 100644 index 00000000..06d9a4f0 --- /dev/null +++ b/C/fast-lzma2/dict_buffer.c @@ -0,0 +1,230 @@ +/* +* Copyright (c) 2019, Conor McCarthy +* All rights reserved. +* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. 
+*/ + +#include +#include "dict_buffer.h" +#include "fl2_internal.h" + +#define ALIGNMENT_SIZE 16U +#define ALIGNMENT_MASK (~(size_t)(ALIGNMENT_SIZE-1)) + +/* DICT_buffer functions */ + +int DICT_construct(DICT_buffer * const buf, int const async) +{ + buf->data[0] = NULL; + buf->data[1] = NULL; + buf->size = 0; + + buf->async = (async != 0); + +#ifndef NO_XXHASH + buf->xxh = NULL; +#endif + + return 0; +} + +int DICT_init(DICT_buffer * const buf, size_t const dict_size, size_t const overlap, unsigned const reset_multiplier, int const do_hash) +{ + /* Allocate if not yet allocated or existing dict too small */ + if (buf->data[0] == NULL || dict_size > buf->size) { + /* Free any existing buffers */ + DICT_destruct(buf); + + buf->data[0] = malloc(dict_size); + + buf->data[1] = NULL; + if (buf->async) + buf->data[1] = malloc(dict_size); + + if (buf->data[0] == NULL || (buf->async && buf->data[1] == NULL)) { + DICT_destruct(buf); + return 1; + } + } + buf->index = 0; + buf->overlap = overlap; + buf->start = 0; + buf->end = 0; + buf->size = dict_size; + buf->total = 0; + buf->reset_interval = (reset_multiplier != 0) ? dict_size * reset_multiplier : ((size_t)1 << 31); + +#ifndef NO_XXHASH + if (do_hash) { + if (buf->xxh == NULL) { + buf->xxh = XXH32_createState(); + if (buf->xxh == NULL) { + DICT_destruct(buf); + return 1; + } + } + XXH32_reset(buf->xxh, 0); + } + else { + XXH32_freeState(buf->xxh); + buf->xxh = NULL; + } +#else + (void)do_hash; +#endif + + return 0; +} + +void DICT_destruct(DICT_buffer * const buf) +{ + free(buf->data[0]); + free(buf->data[1]); + buf->data[0] = NULL; + buf->data[1] = NULL; + buf->size = 0; +#ifndef NO_XXHASH + XXH32_freeState(buf->xxh); + buf->xxh = NULL; +#endif +} + +size_t DICT_size(const DICT_buffer * const buf) +{ + return buf->size; +} + +/* Get the dictionary buffer for adding input */ +size_t DICT_get(DICT_buffer * const buf, void **const dict) +{ + DICT_shift(buf); + + DEBUGLOG(5, "Getting dict buffer %u, pos %u, avail %u", (unsigned)buf->index, (unsigned)buf->end, (unsigned)(buf->size - buf->end)); + *dict = buf->data[buf->index] + buf->end; + return buf->size - buf->end; +} + +/* Update with the amount added */ +int DICT_update(DICT_buffer * const buf, size_t const added_size) +{ + DEBUGLOG(5, "Added %u bytes to dict buffer %u", (unsigned)added_size, (unsigned)buf->index); + buf->end += added_size; + assert(buf->end <= buf->size); + return !DICT_availSpace(buf); +} + +/* Read from input and write to the dict */ +void DICT_put(DICT_buffer * const buf, FL2_inBuffer * const input) +{ + size_t const to_read = MIN(buf->size - buf->end, input->size - input->pos); + + DEBUGLOG(5, "CStream : reading %u bytes", (U32)to_read); + + memcpy(buf->data[buf->index] + buf->end, (BYTE*)input->src + input->pos, to_read); + + input->pos += to_read; + buf->end += to_read; +} + +size_t DICT_availSpace(const DICT_buffer * const buf) +{ + return buf->size - buf->end; +} + +/* Get the size of uncompressed data. 
start is set to end after compression */ +int DICT_hasUnprocessed(const DICT_buffer * const buf) +{ + return buf->start < buf->end; +} + +/* Get the buffer, overlap and end for compression */ +void DICT_getBlock(DICT_buffer * const buf, FL2_dataBlock * const block) +{ + block->data = buf->data[buf->index]; + block->start = buf->start; + block->end = buf->end; + +#ifndef NO_XXHASH + if (buf->xxh != NULL) + XXH32_update(buf->xxh, buf->data[buf->index] + buf->start, buf->end - buf->start); +#endif + + buf->total += buf->end - buf->start; + buf->start = buf->end; +} + +/* Shift occurs when all is processed and end is beyond the overlap size */ +int DICT_needShift(DICT_buffer * const buf) +{ + if (buf->start < buf->end) + return 0; + /* Reset the dict if the next compression cycle would exceed the reset interval */ + size_t overlap = (buf->total + buf->size - buf->overlap > buf->reset_interval) ? 0 : buf->overlap; + return buf->start == buf->end && (overlap == 0 || buf->end >= overlap + ALIGNMENT_SIZE); +} + +int DICT_async(const DICT_buffer * const buf) +{ + return (int)buf->async; +} + +/* Shift the overlap amount to the start of either the only dict buffer or the alternate one + * if it exists */ +void DICT_shift(DICT_buffer * const buf) +{ + if (buf->start < buf->end) + return; + + size_t overlap = buf->overlap; + /* Reset the dict if the next compression cycle would exceed the reset interval */ + if (buf->total + buf->size - buf->overlap > buf->reset_interval) { + DEBUGLOG(4, "Resetting dictionary after %u bytes", (unsigned)buf->total); + overlap = 0; + } + + if (overlap == 0) { + /* No overlap means a simple buffer switch */ + buf->start = 0; + buf->end = 0; + buf->index ^= buf->async; + buf->total = 0; + } + else if (buf->end >= overlap + ALIGNMENT_SIZE) { + size_t const from = (buf->end - overlap) & ALIGNMENT_MASK; + const BYTE *const src = buf->data[buf->index]; + /* Copy to the alternate if one exists */ + BYTE *const dst = buf->data[buf->index ^ buf->async]; + + overlap = buf->end - from; + + if (overlap <= from || dst != src) { + DEBUGLOG(5, "Copy overlap data : %u bytes from %u", (unsigned)overlap, (unsigned)from); + memcpy(dst, src + from, overlap); + } + else if (from != 0) { + DEBUGLOG(5, "Move overlap data : %u bytes from %u", (unsigned)overlap, (unsigned)from); + memmove(dst, src + from, overlap); + } + /* New data will be written after the overlap */ + buf->start = overlap; + buf->end = overlap; + /* Switch buffers */ + buf->index ^= buf->async; + } +} + +#ifndef NO_XXHASH +XXH32_hash_t DICT_getDigest(const DICT_buffer * const buf) +{ + return XXH32_digest(buf->xxh); +} +#endif + +size_t DICT_memUsage(const DICT_buffer * const buf) +{ + return (1 + buf->async) * buf->size; +} diff --git a/C/fast-lzma2/dict_buffer.h b/C/fast-lzma2/dict_buffer.h new file mode 100644 index 00000000..436472fb --- /dev/null +++ b/C/fast-lzma2/dict_buffer.h @@ -0,0 +1,81 @@ +/* +* Copyright (c) 2018, Conor McCarthy +* All rights reserved. +* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. +*/ + +#include "fast-lzma2.h" +#include "mem.h" +#include "data_block.h" +#ifndef NO_XXHASH +# include "xxhash.h" +#endif + +#ifndef FL2_DICT_BUFFER_H_ +#define FL2_DICT_BUFFER_H_ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* DICT_buffer structure. 
+ * Maintains one or two dictionary buffers. In a dual dict configuration (asyc==1), when the + * current buffer is full, the overlap region will be copied to the other buffer and it + * becomes the destination for input while the first is compressed. This is useful when I/O + * is much slower than compression. */ +typedef struct { + BYTE* data[2]; + size_t index; + size_t async; + size_t overlap; + size_t start; /* start = 0 (first block) or overlap */ + size_t end; /* never < overlap */ + size_t size; /* allocation size */ + size_t total; /* total size compressed after last dict reset */ + size_t reset_interval; +#ifndef NO_XXHASH + XXH32_state_t *xxh; +#endif +} DICT_buffer; + +int DICT_construct(DICT_buffer *const buf, int const async); + +int DICT_init(DICT_buffer *const buf, size_t const dict_size, size_t const overlap, unsigned const reset_multiplier, int const do_hash); + +void DICT_destruct(DICT_buffer *const buf); + +size_t DICT_size(const DICT_buffer *const buf); + +size_t DICT_get(DICT_buffer *const buf, void **const dict); + +int DICT_update(DICT_buffer *const buf, size_t const added_size); + +void DICT_put(DICT_buffer *const buf, FL2_inBuffer* const input); + +size_t DICT_availSpace(const DICT_buffer *const buf); + +int DICT_hasUnprocessed(const DICT_buffer *const buf); + +void DICT_getBlock(DICT_buffer *const buf, FL2_dataBlock *const block); + +int DICT_needShift(DICT_buffer *const buf); + +int DICT_async(const DICT_buffer *const buf); + +void DICT_shift(DICT_buffer *const buf); + +#ifndef NO_XXHASH +XXH32_hash_t DICT_getDigest(const DICT_buffer *const buf); +#endif + +size_t DICT_memUsage(const DICT_buffer *const buf); + +#if defined (__cplusplus) +} +#endif + +#endif /* FL2_DICT_BUFFER_H_ */ \ No newline at end of file diff --git a/C/fast-lzma2/fast-lzma2.h b/C/fast-lzma2/fast-lzma2.h index a1d479c9..7f90de45 100644 --- a/C/fast-lzma2/fast-lzma2.h +++ b/C/fast-lzma2/fast-lzma2.h @@ -53,9 +53,9 @@ Introduction *********************************************************************************************************/ /*------ Version ------*/ -#define FL2_VERSION_MAJOR 0 -#define FL2_VERSION_MINOR 9 -#define FL2_VERSION_RELEASE 2 +#define FL2_VERSION_MAJOR 1 +#define FL2_VERSION_MINOR 0 +#define FL2_VERSION_RELEASE 0 #define FL2_VERSION_NUMBER (FL2_VERSION_MAJOR *100*100 + FL2_VERSION_MINOR *100 + FL2_VERSION_RELEASE) FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void); /**< useful to check dll version */ @@ -67,12 +67,13 @@ FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void); /**< useful to check FL2LIB_API const char* FL2LIB_CALL FL2_versionString(void); +#define FL2_MAXTHREADS 200 + + /*************************************** * Simple API ***************************************/ -#define FL2_MAXTHREADS 200 - /*! FL2_compress() : * Compresses `src` content as a single LZMA2 compressed stream into already allocated `dst`. * Call FL2_compressMt() to use > 1 thread. Specify nbThreads = 0 to use all cores. @@ -88,20 +89,30 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressMt(void* dst, size_t dstCapacity, unsigned nbThreads); /*! FL2_decompress() : - * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. - * `dstCapacity` is an upper bound of originalSize to regenerate. - * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * Decompresses a single LZMA2 compressed stream from `src` into already allocated `dst`. + * `compressedSize` : must be at least the size of the LZMA2 stream. 
+ * `dstCapacity` is the original, uncompressed size to regenerate, returned by calling + * FL2_findDecompressedSize(). + * Call FL2_decompressMt() to use > 1 thread. Specify nbThreads = 0 to use all cores. The stream + * must contain dictionary resets to use multiple threads. These are inserted during compression by + * default. The frequency can be changed/disabled with the FL2_p_resetInterval parameter setting. * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), * or an errorCode if it fails (which can be tested using FL2_isError()). */ FL2LIB_API size_t FL2LIB_CALL FL2_decompress(void* dst, size_t dstCapacity, const void* src, size_t compressedSize); +FL2LIB_API size_t FL2LIB_CALL FL2_decompressMt(void* dst, size_t dstCapacity, + const void* src, size_t compressedSize, + unsigned nbThreads); + /*! FL2_findDecompressedSize() * `src` should point to the start of a LZMA2 encoded stream. * `srcSize` must be at least as large as the LZMA2 stream including end marker. + * A property byte is assumed to exist at position 0 in `src`. If the stream was created without one, + * subtract 1 byte from `src` when passing it to the function. * @return : - decompressed size of the stream in `src`, if known * - FL2_CONTENTSIZE_ERROR if an error occurred (e.g. corruption, srcSize too small) - * note 1 : a 0 return value means the frame is valid but "empty". + * note 1 : a 0 return value means the stream is valid but "empty". * note 2 : decompressed size can be very large (64-bits value), * potentially larger than what local system can handle as a single memory segment. * In which case, it's necessary to use streaming mode to decompress data. @@ -109,122 +120,80 @@ FL2LIB_API size_t FL2LIB_CALL FL2_decompress(void* dst, size_t dstCapacity, * Always ensure return value fits within application's authorized limits. * Each application can set its own limits. */ #define FL2_CONTENTSIZE_ERROR (size_t)-1 -FL2LIB_API size_t FL2LIB_CALL FL2_findDecompressedSize(const void *src, size_t srcSize); +FL2LIB_API unsigned long long FL2LIB_CALL FL2_findDecompressedSize(const void *src, size_t srcSize); /*====== Helper functions ======*/ -#define FL2_COMPRESSBOUND(srcSize) ((srcSize) + (((srcSize) + 0xFFF) / 0x1000) * 3 + 6) /* this formula calculates the maximum size of data stored in uncompressed chunks */ +#define FL2_COMPRESSBOUND(srcSize) ((srcSize) + (((srcSize) + 0xFFF) / 0x1000) * 3 + 6) /*!< calculates the maximum size of data stored in a sequence of uncompressed chunks */ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */ FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +FL2LIB_API unsigned FL2LIB_CALL FL2_isTimedOut(size_t code); /*!< tells if a `size_t` function result is the timeout code */ FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code); /*!< provides readable string from an error code */ FL2LIB_API int FL2LIB_CALL FL2_maxCLevel(void); /*!< maximum compression level available */ FL2LIB_API int FL2LIB_CALL FL2_maxHighCLevel(void); /*!< maximum compression level available in high mode */ + /*************************************** * Explicit memory management ***************************************/ + /*= Compression context - * When compressing many times, - * it is recommended to allocate a context just once, and re-use it for each successive compression operation. - * This will make workload friendlier for system's memory. 
- * The context may not use the number of threads requested if the library is compiled for single-threaded - * compression or nbThreads > FL2_MAXTHREADS. Call FL2_CCtx_nbThreads to obtain the actual number. */ + * When compressing many times, it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. This will make workload + * friendlier for system's memory. The context may not use the number of threads requested + * if the library is compiled for single-threaded compression or nbThreads > FL2_MAXTHREADS. + * Call FL2_getCCtxThreadCount to obtain the actual number allocated. */ typedef struct FL2_CCtx_s FL2_CCtx; FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void); FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads); FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx); -FL2LIB_API unsigned FL2LIB_CALL FL2_CCtx_nbThreads(const FL2_CCtx* ctx); +FL2LIB_API unsigned FL2LIB_CALL FL2_getCCtxThreadCount(const FL2_CCtx* cctx); /*! FL2_compressCCtx() : - * Same as FL2_compress(), requires an allocated FL2_CCtx (see FL2_createCCtx()). */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* ctx, + * Same as FL2_compress(), but requires an allocated FL2_CCtx (see FL2_createCCtx()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel); -/************************************************ -* Caller-managed data buffer and overlap section -************************************************/ - -typedef struct { - unsigned char *data; - size_t start; /* start = 0 (first block) or overlap */ - size_t end; /* never < overlap */ - size_t bufSize; /* allocation size */ -} FL2_blockBuffer; - -typedef int (FL2LIB_CALL *FL2_progressFn)(size_t done, void* opaque); - -/* Get the size of the overlap section. */ -FL2LIB_API size_t FL2LIB_CALL FL2_blockOverlap(const FL2_CCtx* ctx); - -/* Copy the overlap section to the start to prepare for more data */ -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock(FL2_CCtx* ctx, FL2_blockBuffer *block); -/* Copy the overlap to a different buffer. This allows a dual-buffer configuration where - * data is read into one block while the other is compressed. */ -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock_switch(FL2_CCtx* ctx, FL2_blockBuffer *block, unsigned char *dst); - -FL2LIB_API void FL2LIB_CALL FL2_beginFrame(FL2_CCtx* const cctx); - -/*! FL2_compressCCtxBlock() : - * Same as FL2_compressCCtx except the caller is responsible for supplying an overlap section. - * The FL2_p_overlapFraction parameter will not be used. - * srcStart + srcSize should equal the dictionary size except on the last call. - * Can be called multiple times. FL2_endFrame() must be called when finished. - * For compatibility with this library the caller must write a property byte at - * the beginning of the output. Obtain it by calling FL2_dictSizeProp() before - * compressing the first block or after the last. No hash will be written, but - * the caller can calculate it using the interface in xxhash.h, write it at the end, - * and set bit 7 in the property byte. */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock(FL2_CCtx* ctx, - void* dst, size_t dstCapacity, - const FL2_blockBuffer *block, - FL2_progressFn progress, void* opaque); - -/*! FL2_endFrame() : - * Write the end marker to terminate the LZMA2 stream. 
- * Must be called after compressing with FL2_compressCCtxBlock() */ -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame(FL2_CCtx* ctx, - void* dst, size_t dstCapacity); - -typedef int (FL2LIB_CALL *FL2_writerFn)(const void* src, size_t srcSize, void* opaque); - -/*! FL2_compressCCtxBlock_toFn() : - * Same as FL2_compressCCtx except the caller is responsible for supplying an - * overlap section, and compressed data is written to a callback function. - * The FL2_p_overlapFraction parameter will not be used. - * Can be called multiple times. FL2_endFrame_toFn() must be called when finished. */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque, - const FL2_blockBuffer *block, - FL2_progressFn progress); - -/*! FL2_endFrame() : - * Write the end marker to a callback function to terminate the LZMA2 stream. - * Must be called after compressing with FL2_compressCCtxBlock_toFn() */ -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque); - -/*! FL2_dictSizeProp() : +/*! FL2_getCCtxDictProp() : * Get the dictionary size property. * Intended for use with the FL2_p_omitProperties parameter for creating a - * 7-zip compatible LZMA2 stream. */ -FL2LIB_API unsigned char FL2LIB_CALL FL2_dictSizeProp(FL2_CCtx* ctx); + * 7-zip or XZ compatible LZMA2 stream. */ +FL2LIB_API unsigned char FL2LIB_CALL FL2_getCCtxDictProp(FL2_CCtx* cctx); + + +/**************************** +* Decompression +****************************/ /*= Decompression context - * When decompressing many times, - * it is recommended to allocate a context only once, - * and re-use it for each successive compression operation. - * This will make the workload friendlier for the system's memory. - * Use one context per thread for parallel execution. */ -typedef struct CLzma2Dec_s FL2_DCtx; + * When decompressing many times, it is recommended to allocate a context only once, + * and re-use it for each successive decompression operation. This will make the workload + * friendlier for the system's memory. + * The context may not allocate the number of threads requested if the library is + * compiled for single-threaded compression or nbThreads > FL2_MAXTHREADS. + * Call FL2_getDCtxThreadCount to obtain the actual number allocated. + * At least nbThreads dictionary resets must exist in the stream to use all of the + * threads. Dictionary resets are inserted into the stream according to the + * FL2_p_resetInterval parameter used in the compression context. */ +typedef struct FL2_DCtx_s FL2_DCtx; FL2LIB_API FL2_DCtx* FL2LIB_CALL FL2_createDCtx(void); +FL2LIB_API FL2_DCtx* FL2LIB_CALL FL2_createDCtxMt(unsigned nbThreads); FL2LIB_API size_t FL2LIB_CALL FL2_freeDCtx(FL2_DCtx* dctx); +FL2LIB_API unsigned FL2LIB_CALL FL2_getDCtxThreadCount(const FL2_DCtx* dctx); + + +/*! FL2_initDCtx() : + * Use only when a property byte is not present at input byte 0. No init is necessary otherwise. + * The caller must store the result from FL2_getCCtxDictProp() and pass it to this function. */ +FL2LIB_API size_t FL2LIB_CALL FL2_initDCtx(FL2_DCtx* dctx, unsigned char prop); + /*! 
FL2_decompressDCtx() : * Same as FL2_decompress(), requires an allocated FL2_DCtx (see FL2_createDCtx()) */ -FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* ctx, +FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); @@ -232,90 +201,180 @@ FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* ctx, * Streaming ****************************/ -typedef struct FL2_inBuffer_s { +typedef struct { const void* src; /**< start of input buffer */ size_t size; /**< size of input buffer */ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ } FL2_inBuffer; -typedef struct FL2_outBuffer_s { +typedef struct { void* dst; /**< start of output buffer */ size_t size; /**< size of output buffer */ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ } FL2_outBuffer; +/*** Push/pull structs ***/ +typedef struct { + void* dst; /**< start of available dict buffer */ + unsigned long size; /**< size of dict remaining */ +} FL2_dictBuffer; + +typedef struct { + const void* src; /**< start of compressed data */ + size_t size; /**< size of compressed data */ +} FL2_cBuffer; /*-*********************************************************************** - * Streaming compression - HowTo + * Streaming compression * * A FL2_CStream object is required to track streaming operation. * Use FL2_createCStream() and FL2_freeCStream() to create/release resources. * FL2_CStream objects can be reused multiple times on consecutive compression operations. - * It is recommended to re-use FL2_CStream in situations where many streaming operations will be achieved consecutively, - * since it will play nicer with system's memory, by re-using already allocated memory. + * It is recommended to re-use FL2_CStream in situations where many streaming operations will be done + * consecutively, since it will reduce allocation and initialization time. * - * Start a new compression by initializing FL2_CStream. - * Use FL2_initCStream() to start a new compression operation. + * Call FL2_createCStreamMt() with a nonzero dualBuffer parameter to use two input dictionary buffers. + * The stream will not block on FL2_compressStream() and continues to accept data while compression is + * underway, until both buffers are full. Useful when I/O is slow. + * To compress with a single thread with dual buffering, call FL2_createCStreamMt with nbThreads=1. + * + * Use FL2_initCStream() on the FL2_CStream object to start a new compression operation. * * Use FL2_compressStream() repetitively to consume input stream. - * The function will automatically update both `pos` fields. - * It will always consume the entire input unless an error occurs, + * The function will automatically update the `pos` field. + * It will always consume the entire input unless an error occurs or the dictionary buffer is filled, * unlike the decompression function. - * @return : a size hint - remaining capacity to fill before compression occurs, - * or an error code, which can be tested using FL2_isError(). - * Note : it's just a hint, any other value will work fine. * - * At any moment, it's possible, but not recommended, to flush whatever data remains - * within internal buffer using FL2_flushStream(). - * `output->pos` will be updated. - * Note 1 : this will reduce compression ratio because the algorithm is block-based. 
- Note 2 : some content might still be left within internal buffers if `output->size` is too small. - @return : nb of bytes still present within internal buffers (0 if they're empty) - or an error code, which can be tested using FL2_isError(). + * The radix match finder allows compressed data to be stored in its match table during encoding. + * Applications may call streaming compression functions with output == NULL. In this case, + * when the function returns 1, the compressed data must be read from the internal buffers. + * Call FL2_getNextCStreamBuffer() repeatedly until it returns 0. + * Each call returns buffer information in the FL2_cBuffer parameter. Applications typically will + * pass this to an I/O write function or downstream filter. + * Alternatively, applications may pass an FL2_outBuffer object pointer to receive the output. In this + * case the return value is 1 if the buffer is full and more compressed data remains. * - * FL2_endStream() instructs to finish a frame. - * It will perform a flush and write the LZMA2 termination byte (required). - * FL2_endStream() may not be able to flush full data if `output->size` is too small. - * In which case, call again FL2_endStream() to complete the flush. - @return : 0 if stream fully completed and flushed, - or >0 to indicate the nb of bytes still present within the internal buffers, - or an error code, which can be tested using FL2_isError(). + * FL2_endStream() instructs to finish a stream. It will perform a flush and write the LZMA2 + * termination byte (required). Call FL2_endStream() repeatedly until it returns 0. + * + * Most functions may return a size_t error code, which can be tested using FL2_isError(). * * *******************************************************************/ -typedef struct FL2_CStream_s FL2_CStream; +typedef struct FL2_CCtx_s FL2_CStream; /*===== FL2_CStream management functions =====*/ FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void); -FL2LIB_API size_t FL2LIB_CALL FL2_freeCStream(FL2_CStream* fcs); +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStreamMt(unsigned nbThreads, int dualBuffer); +FL2LIB_API void FL2LIB_CALL FL2_freeCStream(FL2_CStream * fcs); /*===== Streaming compression functions =====*/ -FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel); -FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer* output, FL2_inBuffer* input); -FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer* output); -FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer* output); +/*! FL2_initCStream() : + * Call this function before beginning a new compressed data stream. To keep the stream object's + * current parameters, specify zero for the compression level. The object is set to the default + * level upon creation. */ +FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel); + +/*! FL2_setCStreamTimeout() : + * Sets a timeout in milliseconds. Zero disables the timeout (default). If a nonzero timeout is set, functions + * FL2_compressStream(), FL2_updateDictionary(), FL2_getNextCStreamBuffer(), FL2_flushStream(), and + * FL2_endStream() may return a timeout code before compression of the current dictionary of data + * completes. FL2_isError() returns true for the timeout code, so check the code with FL2_isTimedOut() before + * testing for errors. 
With the exception of FL2_updateDictionary(), the above functions may be called again + * to wait for completion. A typical application for timeouts is to update the user on compression progress. */ +FL2LIB_API size_t FL2LIB_CALL FL2_setCStreamTimeout(FL2_CStream * fcs, unsigned timeout); + +/*! FL2_compressStream() : + * Reads data from input into the dictionary buffer. Compression will begin if the buffer fills up. + * A dual buffering stream will fill the second buffer while compression proceeds on the first. + * A call to FL2_compressStream() will wait for ongoing compression to complete if all dictionary space + * is filled. FL2_compressStream() must not be called with output == NULL unless the caller has read all + * compressed data from the CStream object. + * Returns 1 to indicate compressed data must be read (or output is full), or 0 otherwise. */ +FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer *output, FL2_inBuffer* input); + +/*** Push/pull functions ***/ + +/*! FL2_getDictionaryBuffer() : + * Returns a buffer in the FL2_outBuffer object, which the caller can directly read data into. + * Applications will normally pass this buffer to an I/O read function or upstream filter. + * Returns 0, or an error or timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getDictionaryBuffer(FL2_CStream* fcs, FL2_dictBuffer* dict); + +/*! FL2_updateDictionary() : + * Informs the CStream how much data was added to the buffer. Compression begins if the dictionary + * was filled. Returns 1 to indicate compressed data must be read, 0 if not, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_updateDictionary(FL2_CStream* fcs, size_t addedSize); + +/*! FL2_getNextCStreamBuffer() : + * Returns a buffer containing a slice of the compressed data. Call this function and process the data + * until the function returns zero. In most cases it will return a buffer for each compression thread + * used. It is sometimes less but never more than nbThreads. If asynchronous compression is in progress, + * this function will wait for completion before returning, or it will return the timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getNextCStreamBuffer(FL2_CStream* fcs, FL2_cBuffer* cbuf); + +/******/ + +/*! FL2_getCStreamProgress() : + * Returns the number of bytes processed since the stream was initialized. This is a synthetic + * estimate because the match finder does not proceed sequentially through the data. If + * outputSize is not NULL, returns the number of bytes of compressed data generated. */ +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getCStreamProgress(const FL2_CStream * fcs, unsigned long long *outputSize); + +/*! FL2_waitCStream() : + * Waits for compression to end. This function returns after the timeout set using + * FL2_setCStreamTimeout has elapsed. Unnecessary when no timeout is set. + * Returns 1 if compressed output is available, 0 if not, or the timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_waitCStream(FL2_CStream * fcs); + +/*! FL2_cancelCStream() : + * Cancels any compression operation underway. Useful only when dual buffering and/or timeouts + * are enabled. The stream will be returned to an uninitialized state. */ +FL2LIB_API void FL2LIB_CALL FL2_cancelCStream(FL2_CStream *fcs); + +/*! FL2_remainingOutputSize() : + * The amount of compressed data remaining to be read from the CStream object. */ +FL2LIB_API size_t FL2LIB_CALL FL2_remainingOutputSize(const FL2_CStream* fcs); + +/*! 
FL2_flushStream() : + * Compress all data remaining in the dictionary buffer(s). It may be necessary to call + * FL2_flushStream() more than once. If output == NULL the compressed data must be read from the + * CStream object after each call. + * Flushing is not normally useful and produces larger output. + * Returns 1 if input or output still exists in the CStream object, 0 if complete, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer *output); + +/*! FL2_endStream() : + * Compress all data remaining in the dictionary buffer(s) and write the stream end marker. It may + * be necessary to call FL2_endStream() more than once. If output == NULL the compressed data must + * be read from the CStream object after each call. + * Returns 0 when compression is complete and all output has been flushed, 1 if not complete, or + * an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer *output); /*-*************************************************************************** - * Streaming decompression - HowTo + * Streaming decompression * * A FL2_DStream object is required to track streaming operations. * Use FL2_createDStream() and FL2_freeDStream() to create/release resources. * FL2_DStream objects can be re-used multiple times. * * Use FL2_initDStream() to start a new decompression operation. - * @return : recommended first input size + * @return : zero or an error code * * Use FL2_decompressStream() repetitively to consume your input. * The function will update both `pos` fields. * If `input.pos < input.size`, some input has not been consumed. - * It's up to the caller to present again remaining data. - * More data must be loaded if `input.pos + LZMA_REQUIRED_INPUT_MAX >= input.size` + * It's up to the caller to present again the remaining data. + * More data must be loaded if `input.pos + LZMA_REQUIRED_INPUT_MAX >= input.size`. In this case, + * move the remaining input (<= LZMA_REQUIRED_INPUT_MAX bytes) to the start of the buffer and + * load new data after it. * If `output.pos < output.size`, decoder has flushed everything it could. - * @return : 0 when a frame is completely decoded and fully flushed, - * an error code, which can be tested using FL2_isError(), - * 1, which means there is still some decoding to do to complete current frame. + * @return : 0 when a stream is completely decoded and fully flushed, + * 1, which means there is still some decoding to do to complete the stream, + * or an error code, which can be tested using FL2_isError(). * *******************************************************************************/ #define LZMA_REQUIRED_INPUT_MAX 20 @@ -324,101 +383,187 @@ typedef struct FL2_DStream_s FL2_DStream; /*===== FL2_DStream management functions =====*/ FL2LIB_API FL2_DStream* FL2LIB_CALL FL2_createDStream(void); +FL2LIB_API FL2_DStream* FL2LIB_CALL FL2_createDStreamMt(unsigned nbThreads); FL2LIB_API size_t FL2LIB_CALL FL2_freeDStream(FL2_DStream* fds); +/*! FL2_setDStreamMemoryLimitMt() : + * Set a total size limit for multithreaded decoder input and output buffers. MT decoder memory + * usage is unknown until the input is parsed. If the limit is exceeded, the decoder switches to + * using a single thread. + * MT decoding memory usage is typically dictionary_size * 4 * nbThreads for the output + * buffers plus the size of the compressed input for that amount of output. */ +FL2LIB_API void FL2LIB_CALL FL2_setDStreamMemoryLimitMt(FL2_DStream* fds, size_t limit); + +/*! 
FL2_setDStreamTimeout() : + * Sets a timeout in milliseconds. Zero disables the timeout. If a nonzero timeout is set, + * FL2_decompressStream() may return a timeout code before decompression of the available data + * completes. FL2_isError() returns true for the timeout code, so check the code with FL2_isTimedOut() + * before testing for errors. After a timeout occurs, do not call FL2_decompressStream() again unless + * a call to FL2_waitDStream() returns 1. A typical application for timeouts is to update the user on + * decompression progress. */ +FL2LIB_API size_t FL2LIB_CALL FL2_setDStreamTimeout(FL2_DStream * fds, unsigned timeout); + +/*! FL2_waitDStream() : + * Waits for decompression to end after a timeout has occurred. This function returns after the + * timeout set using FL2_setDStreamTimeout() has elapsed, or when decompression of available input is + * complete. Unnecessary when no timeout is set. + * Returns 0 if the stream is complete, 1 if not complete, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_waitDStream(FL2_DStream * fds); + +/*! FL2_cancelDStream() : + * Frees memory allocated for MT decoding. If a timeout is set and the caller is waiting + * for completion of MT decoding, decompression in progress will be canceled. */ +FL2LIB_API void FL2LIB_CALL FL2_cancelDStream(FL2_DStream *fds); + +/*! FL2_getDStreamProgress() : + * Returns the number of bytes decoded since the stream was initialized. */ +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getDStreamProgress(const FL2_DStream * fds); + /*===== Streaming decompression functions =====*/ + +/*! FL2_initDStream() : + * Call this function before decompressing a stream. FL2_initDStream_withProp() + * must be used for streams which do not include a property byte at position zero. + * The caller is responsible for storing and passing the property byte. + * Returns 0 if okay, or an error if the stream object is still in use from a + * previous call to FL2_decompressStream() (see timeout info above). */ FL2LIB_API size_t FL2LIB_CALL FL2_initDStream(FL2_DStream* fds); +FL2LIB_API size_t FL2LIB_CALL FL2_initDStream_withProp(FL2_DStream* fds, unsigned char prop); + +/*! FL2_decompressStream() : + * Reads data from input and decompresses to output. + * Returns 1 if the stream is unfinished, 0 if the terminator was encountered (he'll be back) + * and all data was written to output, or an error code. Call this function repeatedly if + * necessary, removing data from output and/or loading data into input before each call. + * Note the requirement for LZMA_REQUIRED_INPUT_MAX bytes of input if the input data is + * incomplete (see intro above). */ FL2LIB_API size_t FL2LIB_CALL FL2_decompressStream(FL2_DStream* fds, FL2_outBuffer* output, FL2_inBuffer* input); /*-*************************************************************************** - * Compression parameters - HowTo + * Compression parameters * * Any function that takes a 'compressionLevel' parameter will replace any * parameters affected by compression level that are already set. - * Call FL2_CCtx_setParameter with FL2_p_compressionLevel to set the level, - * then call FL2_CCtx_setParameter again with any other settings to change. - * Specify compressionLevel=0 when calling a compression function. + * To use a preset level and modify it, call FL2_CCtx_setParameter with + * FL2_p_compressionLevel to set the level, then call FL2_CCtx_setParameter again + * with any other settings to change. 
+ * Specify a compressionLevel of 0 when calling a compression function to keep + * the current parameters. * *******************************************************************************/ +#define FL2_DICTLOG_MIN 20 #define FL2_DICTLOG_MAX_32 27 #define FL2_DICTLOG_MAX_64 30 -#define FL2_DICTLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? FL2_DICTLOG_MAX_32 : FL2_DICTLOG_MAX_64)) -#define FL2_DICTLOG_MIN 20 -#define FL2_CHAINLOG_MAX 14 -#define FL2_CHAINLOG_MIN 4 -#define FL2_SEARCHLOG_MAX (FL2_CHAINLOG_MAX-1) -#define FL2_SEARCHLOG_MIN 0 -#define FL2_FASTLENGTH_MIN 6 /* only used by optimizer */ -#define FL2_FASTLENGTH_MAX 273 /* only used by optimizer */ +#define FL2_DICTLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? FL2_DICTLOG_MAX_32 : FL2_DICTLOG_MAX_64)) +#define FL2_DICTSIZE_MAX (1U << FL2_DICTLOG_MAX) +#define FL2_DICTSIZE_MIN (1U << FL2_DICTLOG_MIN) #define FL2_BLOCK_OVERLAP_MIN 0 #define FL2_BLOCK_OVERLAP_MAX 14 -#define FL2_BLOCK_LOG_MIN 12 -#define FL2_BLOCK_LOG_MAX 32 +#define FL2_RESET_INTERVAL_MIN 1 +#define FL2_RESET_INTERVAL_MAX 16 /* small enough to fit FL2_DICTSIZE_MAX * FL2_RESET_INTERVAL_MAX in 32-bit size_t */ +#define FL2_BUFFER_SIZE_LOG_MIN 0 +#define FL2_BUFFER_SIZE_LOG_MAX 6 +#define FL2_CHAINLOG_MIN 4 +#define FL2_CHAINLOG_MAX 14 +#define FL2_HYBRIDCYCLES_MIN 1 +#define FL2_HYBRIDCYCLES_MAX 64 #define FL2_SEARCH_DEPTH_MIN 6 #define FL2_SEARCH_DEPTH_MAX 254 -#define FL2_BUFFER_SIZE_LOG_MIN 6 -#define FL2_BUFFER_SIZE_LOG_MAX 12 +#define FL2_FASTLENGTH_MIN 6 /* only used by optimizer */ +#define FL2_FASTLENGTH_MAX 273 /* only used by optimizer */ #define FL2_LC_MIN 0 #define FL2_LC_MAX 4 #define FL2_LP_MIN 0 #define FL2_LP_MAX 4 #define FL2_PB_MIN 0 #define FL2_PB_MAX 4 +#define FL2_LCLP_MAX 4 + +typedef enum { + FL2_fast, + FL2_opt, + FL2_ultra +} FL2_strategy; + +typedef struct { + size_t dictionarySize; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory per byte, slower */ + unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ + unsigned chainLog; /* HC3 sliding window : larger == more compression, slower; hybrid mode only (ultra) */ + unsigned cyclesLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ + unsigned searchDepth; /* maximum depth for resolving string matches : larger == more compression, slower */ + unsigned fastLength; /* acceptable match size for parser : larger == more compression, slower; fast bytes parameter from 7-zip */ + unsigned divideAndConquer; /* split long chains of 2-byte matches into shorter chains with a small overlap : faster, somewhat less compression; enabled by default */ + unsigned bufferLog; /* buffer size for processing match chains is (dictionarySize >> (12 - bufferLog)) : affects compression when divideAndConquer enabled; */ + /* when divideAndConquer disabled, affects speed in a hardware-dependent manner */ + FL2_strategy strategy; /* encoder strategy : fast, optimized or ultra (hybrid) */ +} FL2_compressionParameters; typedef enum { /* compression parameters */ FL2_p_compressionLevel, /* Update all compression parameters according to pre-defined cLevel table - * Default level is FL2_CLEVEL_DEFAULT==9. - * Setting FL2_p_highCompression to 1 switches to an alternate cLevel table. - * Special: value 0 means "do not change cLevel". */ + * Default level is FL2_CLEVEL_DEFAULT==6. + * Setting FL2_p_highCompression to 1 switches to an alternate cLevel table. 
*/ FL2_p_highCompression, /* Maximize compression ratio for a given dictionary size. - * Has 9 levels instead of 12, with dictionaryLog 20 - 28. */ - FL2_p_7zLevel, /* For use by the 7-zip fork employing this library. 1 - 9 */ + * Levels 1..10 = dictionaryLog 20..29 (1 Mb..512 Mb). + * Typically provides a poor speed/ratio tradeoff. */ FL2_p_dictionaryLog, /* Maximum allowed back-reference distance, expressed as power of 2. - * Must be clamped between FL2_DICTLOG_MIN and FL2_DICTLOG_MAX. - * Special: value 0 means "do not change dictionaryLog". */ + * Must be clamped between FL2_DICTLOG_MIN and FL2_DICTLOG_MAX. + * Default = 24 */ + FL2_p_dictionarySize, /* Same as above but expressed as an absolute value. + * Must be clamped between FL2_DICTSIZE_MIN and FL2_DICTSIZE_MAX. + * Default = 16 Mb */ FL2_p_overlapFraction, /* The radix match finder is block-based, so some overlap is retained from * each block to improve compression of the next. This value is expressed * as n / 16 of the block size (dictionary size). Larger values are slower. - * Values above 2 mostly yield only a small improvement in compression. */ - FL2_p_blockSize, + * Values above 2 mostly yield only a small improvement in compression. + * A large value for a small dictionary may worsen multithreaded compression. + * Default = 2 */ + FL2_p_resetInterval, /* For multithreaded decompression. A dictionary reset will occur + * after each dictionarySize * resetInterval bytes of input. + * Default = 4 */ FL2_p_bufferLog, /* Buffering speeds up the matchfinder. Buffer size is - * 2 ^ (dictionaryLog - bufferLog). Lower number = slower, better compression, - * higher memory usage. */ - FL2_p_chainLog, /* Size of the full-search table, as a power of 2. - * Resulting table size is (1 << (chainLog+2)). - * Larger tables result in better and slower compression. - * This parameter is useless when using "fast" strategy. - * Special: value 0 means "do not change chainLog". */ - FL2_p_searchLog, /* Number of search attempts, as a power of 2, made by the HC3 match finder - * used only in hybrid mode. - * More attempts result in slightly better and slower compression. - * This parameter is not used by the "fast" and "optimize" strategies. - * Special: value 0 means "do not change searchLog". */ - FL2_p_literalCtxBits, /* lc value for LZMA2 encoder */ - FL2_p_literalPosBits, /* lp value for LZMA2 encoder */ - FL2_p_posBits, /* pb value for LZMA2 encoder */ + * (dictionarySize >> (12 - bufferLog)) * 12 bytes. Higher number = slower, + * better compression, higher memory usage. A CPU with a large memory cache + * may make effective use of a larger buffer. + * Default = 4 */ + FL2_p_hybridChainLog, /* Size of the hybrid mode HC3 hash chain, as a power of 2. + * Resulting table size is (1 << (chainLog+2)) bytes. + * Larger tables result in better and slower compression. + * This parameter is only used by the hybrid "ultra" strategy. + * Default = 9 */ + FL2_p_hybridCycles, /* Number of search attempts made by the HC3 match finder. + * Used only by the hybrid "ultra" strategy. + * More attempts result in slightly better and slower compression. + * Default = 1 */ FL2_p_searchDepth, /* Match finder will resolve string matches up to this length. If a longer - * match exists further back in the input, it will not be found. */ + * match exists further back in the input, it will not be found. + * Default = 42 */ FL2_p_fastLength, /* Only useful for strategies >= opt. - * Length of Match considered "good enough" to stop search. 
+ * Length of match considered "good enough" to stop search. * Larger values make compression stronger and slower. - * Special: value 0 means "do not change fastLength". */ + * Default = 48 */ FL2_p_divideAndConquer, /* Split long chains of 2-byte matches into shorter chains with a small overlap - * during further processing. Allows buffering of all chains at length 2. - * Faster, less compression. Generally a good tradeoff. Enabled by default. */ - FL2_p_strategy, /* 1 = fast; 2 = optimize, 3 = ultra (hybrid mode). + * for further processing. Allows buffering of all chains at length 2. + * Faster, less compression. Generally a good tradeoff. + * Default = enabled */ + FL2_p_strategy, /* 1 = fast; 2 = optimized, 3 = ultra (hybrid mode). * The higher the value of the selected strategy, the more complex it is, * resulting in stronger and slower compression. - * Special: value 0 means "do not change strategy". */ + * Default = ultra */ + FL2_p_literalCtxBits, /* lc value for LZMA2 encoder + * Default = 3 */ + FL2_p_literalPosBits, /* lp value for LZMA2 encoder + * Default = 0 */ + FL2_p_posBits, /* pb value for LZMA2 encoder + * Default = 2 */ + FL2_p_omitProperties, /* Omit the property byte at the start of the stream. For use within 7-zip */ + /* or other containers which store the property byte elsewhere. */ + /* A stream compressed under this setting cannot be decoded by this library. */ #ifndef NO_XXHASH FL2_p_doXXHash, /* Calculate a 32-bit xxhash value from the input data and store it * after the stream terminator. The value will be checked on decompression. * 0 = do not calculate; 1 = calculate (default) */ #endif - FL2_p_omitProperties, /* Omit the property byte at the start of the stream. For use within 7-zip */ - /* or other containers which store the property byte elsewhere. */ - /* Cannot be decoded by this library. */ #ifdef RMF_REFERENCE FL2_p_useReferenceMF /* Use the reference matchfinder for development purposes. SLOW. */ #endif @@ -429,8 +574,32 @@ typedef enum { * Set one compression parameter, selected by enum FL2_cParameter. * @result : informational value (typically, the one being set, possibly corrected), * or an error code (which can be tested with FL2_isError()). */ -FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, unsigned value); -FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, unsigned value); +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, size_t value); + +/*! FL2_CCtx_getParameter() : + * Get one compression parameter, selected by enum FL2_cParameter. + * @result : the parameter value, or the parameter_unsupported error code + * (which can be tested with FL2_isError()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_getParameter(FL2_CCtx* cctx, FL2_cParameter param); + +/*! FL2_CStream_setParameter() : + * Set one compression parameter, selected by enum FL2_cParameter. + * @result : informational value (typically, the one being set, possibly corrected), + * or an error code (which can be tested with FL2_isError()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, size_t value); + +/*! FL2_CStream_getParameter() : + * Get one compression parameter, selected by enum FL2_cParameter. + * @result : the parameter value, or the parameter_unsupported error code + * (which can be tested with FL2_isError()). 
*/ +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_getParameter(FL2_CStream* fcs, FL2_cParameter param); + +/*! FL2_getLevelParameters() : + * Get all compression parameter values defined by the preset compressionLevel. + * @result : the values in a FL2_compressionParameters struct, or the parameter_outOfBound error code + * (which can be tested with FL2_isError()) if compressionLevel is invalid. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getLevelParameters(int compressionLevel, int high, FL2_compressionParameters *params); + /*************************************** * Context memory usage @@ -441,12 +610,29 @@ FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cPa * FL2_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one. * To use FL2_estimateCCtxSize_usingCCtx, set the compression level and any other settings for the context, * then call the function. Some allocation occurs when the context is created, but the large memory buffers -* used for string matching are allocated only when compression begins. */ +* used for string matching are allocated only when compression is initialized. */ FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize(int compressionLevel, unsigned nbThreads); /*!< memory usage determined by level */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_byParams(const FL2_compressionParameters *params, unsigned nbThreads); /*!< memory usage determined by params */ FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_usingCCtx(const FL2_CCtx* cctx); /*!< memory usage determined by settings */ -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads); -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCCtx(const FL2_CStream* fcs); +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads, int dualBuffer); /*!< memory usage determined by level */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_byParams(const FL2_compressionParameters *params, unsigned nbThreads, int dualBuffer); /*!< memory usage determined by params */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCStream(const FL2_CStream* fcs); /*!< memory usage determined by settings */ + +/*! FL2_getDictSizeFromProp() : + * Get the dictionary size from the property byte for a stream. The property byte is the first byte +* in the stream, unless omitProperties was enabled, in which case the caller must store it. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getDictSizeFromProp(unsigned char prop); + +/*! FL2_estimateDCtxSize() : + * The size of a DCtx does not include a dictionary buffer because the caller must supply one. */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateDCtxSize(unsigned nbThreads); + +/*! FL2_estimateDStreamSize() : + * Estimate decompression memory use from the dictionary size and number of threads. + * For nbThreads == 0 the number of available cores will be used. + * Obtain dictSize by passing the property byte to FL2_getDictSizeFromProp. 
*/ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateDStreamSize(size_t dictSize, unsigned nbThreads); /*!< obtain dictSize from FL2_getDictSizeFromProp() */ #endif /* FAST_LZMA2_H */ diff --git a/C/fast-lzma2/fl2_common.c b/C/fast-lzma2/fl2_common.c index 85780c56..6db70714 100644 --- a/C/fast-lzma2/fl2_common.c +++ b/C/fast-lzma2/fl2_common.c @@ -14,10 +14,8 @@ /*-************************************* * Dependencies ***************************************/ -#include /* malloc, calloc, free */ -#include /* memset */ #include "fast-lzma2.h" -#include "fl2_error_private.h" +#include "fl2_errors.h" #include "fl2_internal.h" @@ -29,6 +27,9 @@ FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void) { return FL2_VERSION_NUM FL2LIB_API const char* FL2LIB_CALL FL2_versionString(void) { return FL2_VERSION_STRING; } +/*-**************************************** +* Compression helpers +******************************************/ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize) { return FL2_COMPRESSBOUND(srcSize); @@ -37,21 +38,70 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize) /*-**************************************** * FL2 Error Management ******************************************/ +HINT_INLINE +unsigned IsError(size_t code) +{ + return (code > FL2_ERROR(maxCode)); +} + /*! FL2_isError() : * tells if a return value is an error code */ -FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code) { return ERR_isError(code); } +FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code) +{ + return IsError(code); +} + +/*! FL2_isTimedOut() : + * tells if a return value is the timeout code */ +FL2LIB_API unsigned FL2LIB_CALL FL2_isTimedOut(size_t code) +{ + return (code == FL2_ERROR(timedOut)); +} /*! FL2_getErrorName() : * provides error code string from function result (useful for debugging) */ -FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code) { return ERR_getErrorName(code); } +FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code) +{ + return FL2_getErrorString(FL2_getErrorCode(code)); +} /*! FL2_getError() : * convert a `size_t` function result into a proper FL2_errorCode enum */ -FL2LIB_API FL2_ErrorCode FL2LIB_CALL FL2_getErrorCode(size_t code) { return ERR_getErrorCode(code); } +FL2LIB_API FL2_ErrorCode FL2LIB_CALL FL2_getErrorCode(size_t code) +{ + if (!IsError(code)) + return (FL2_ErrorCode)0; + + return (FL2_ErrorCode)(0 - code); +} /*! 
FL2_getErrorString() : * provides error code string from enum */ -FL2LIB_API const char* FL2LIB_CALL FL2_getErrorString(FL2_ErrorCode code) { return ERR_getFL2ErrorString(code); } +FL2LIB_API const char* FL2LIB_CALL FL2_getErrorString(FL2_ErrorCode code) +{ + static const char* const notErrorCode = "Unspecified error code"; + switch (code) + { + case PREFIX(no_error): return "No error detected"; + case PREFIX(GENERIC): return "Error (generic)"; + case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(lclpMax_exceeded): return "Parameters lc+lp > 4"; + case PREFIX(stage_wrong): return "Not possible at this stage of encoding"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(canceled): return "Processing was canceled by a call to FL2_cancelCStream() or FL2_cancelDStream()"; + case PREFIX(buffer): return "Streaming progress halted due to buffer(s) full/empty"; + case PREFIX(timedOut): return "Wait timed out. Timeouts should be handled before errors using FL2_isTimedOut()"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(maxCode): + default: return notErrorCode; + } +} /*! g_debuglog_enable : * turn on/off debug traces (global switch) */ diff --git a/C/fast-lzma2/fl2_compress.c b/C/fast-lzma2/fl2_compress.c index 7785364b..2f1af130 100644 --- a/C/fast-lzma2/fl2_compress.c +++ b/C/fast-lzma2/fl2_compress.c @@ -11,29 +11,100 @@ #include #include "fast-lzma2.h" +#include "fl2_errors.h" #include "fl2_internal.h" #include "platform.h" #include "mem.h" #include "util.h" #include "fl2_compress_internal.h" -#include "fl2threading.h" -#include "fl2pool.h" +#include "fl2_threading.h" +#include "fl2_pool.h" #include "radix_mf.h" #include "lzma2_enc.h" -#define MIN_BYTES_PER_THREAD 0x10000 - -#define ALIGNMENT_MASK (~(size_t)15) +#define FL2_MAX_LOOPS 10U /*-===== Pre-defined compression levels =====-*/ -#define FL2_CLEVEL_DEFAULT 9 -#define FL2_MAX_CLEVEL 12 -#define FL2_MAX_7Z_CLEVEL 9 -#define FL2_MAX_HIGH_CLEVEL 9 +#define MB *(1U<<20) + +#define FL2_MAX_HIGH_CLEVEL 10 + +#ifdef FL2_XZ_BUILD + +#define FL2_CLEVEL_DEFAULT 6 +#define FL2_MAX_CLEVEL 9 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 14, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 14, 40, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 26, 40, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 8, 0, 42, 48, 1, 4, FL2_opt }, /* 5 */ + { 16 MB, 2, 9, 1, 42, 48, 1, 4, FL2_ultra }, /* 6 */ + { 32 MB, 2, 10, 1, 50, 64, 1, 4, FL2_ultra }, /* 7 */ + { 64 MB, 2, 11, 2, 62, 96, 1, 3, FL2_ultra }, /* 8 */ + { 64 MB, 4, 12, 3, 90, 273, 0, 3, FL2_ultra }, /* 9 */ +}; + +#elif defined(FL2_7ZIP_BUILD) + +#define FL2_CLEVEL_DEFAULT 5 +#define FL2_MAX_CLEVEL 9 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 
MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 14, 32, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 5 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 6 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 7 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 8 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 9 */ +}; + +#else + +#define FL2_CLEVEL_DEFAULT 6 +#define FL2_MAX_CLEVEL 10 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 26, 40, 1, 4, FL2_opt }, /* 4 */ + { 8 MB, 2, 8, 0, 42, 48, 1, 4, FL2_opt }, /* 5 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 6 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 7 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 8 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 9 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 10 */ +}; + +#endif + +static const FL2_compressionParameters FL2_highCParameters[FL2_MAX_HIGH_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 4, 9, 2, 254, 273, 0, 4, FL2_ultra }, /* 1 */ + { 2 MB, 4, 10, 2, 254, 273, 0, 4, FL2_ultra }, /* 2 */ + { 4 MB, 4, 11, 2, 254, 273, 0, 4, FL2_ultra }, /* 3 */ + { 8 MB, 4, 12, 2, 254, 273, 0, 4, FL2_ultra }, /* 4 */ + { 16 MB, 4, 13, 3, 254, 273, 0, 4, FL2_ultra }, /* 5 */ + { 32 MB, 4, 14, 3, 254, 273, 0, 4, FL2_ultra }, /* 6 */ + { 64 MB, 4, 14, 4, 254, 273, 0, 4, FL2_ultra }, /* 7 */ + { 128 MB, 4, 14, 4, 254, 273, 0, 4, FL2_ultra }, /* 8 */ + { 256 MB, 4, 14, 5, 254, 273, 0, 3, FL2_ultra }, /* 9 */ + { 512 MB, 4, 14, 5, 254, 273, 0, 2, FL2_ultra } /* 10 */ +}; + +#undef MB FL2LIB_API int FL2LIB_CALL FL2_maxCLevel(void) -{ +{ return FL2_MAX_CLEVEL; } @@ -42,135 +113,89 @@ FL2LIB_API int FL2LIB_CALL FL2_maxHighCLevel(void) return FL2_MAX_HIGH_CLEVEL; } -static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 14, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_opt }, /* 4 */ - { 21, 2, 7, 0, 14, 40, 1, 8, FL2_opt }, /* 5 */ - { 22, 2, 7, 0, 26, 40, 1, 8, FL2_opt }, /* 6 */ - { 23, 2, 8, 0, 42, 48, 1, 8, FL2_opt }, /* 7 */ - { 24, 2, 9, 0, 42, 48, 1, 8, FL2_ultra }, /* 8 */ - { 25, 2, 10, 0, 50, 64, 1, 8, FL2_ultra }, /* 9 */ - { 26, 2, 11, 1, 60, 64, 1, 9, FL2_ultra }, /* 10 */ - { 27, 2, 12, 2, 126, 96, 1, 10, FL2_ultra }, /* 11 */ - { 28, 2, 14, 3, 254, 160, 1, 10, FL2_ultra } /* 12 */ -}; - -static const FL2_compressionParameters FL2_7zCParameters[FL2_MAX_7Z_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 16, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 16, 32, 1, 8, FL2_opt }, /* 4 */ - { 24, 2, 9, 0, 40, 48, 1, 8, FL2_ultra }, /* 5 */ - { 25, 2, 10, 0, 48, 64, 1, 8, FL2_ultra }, /* 6 */ - { 26, 2, 11, 1, 60, 96, 1, 9, FL2_ultra }, /* 7 */ - { 27, 2, 12, 2, 128, 128, 1, 10, FL2_ultra }, /* 8 */ - { 27, 3, 14, 3, 252, 160, 0, 10, FL2_ultra } /* 9 */ -}; - -static const FL2_compressionParameters FL2_highCParameters[FL2_MAX_HIGH_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 3, 9, 1, 60, 128, 0, 8, FL2_ultra }, /* 1 */ - { 21, 3, 10, 1, 60, 128, 0, 8, FL2_ultra }, /* 2 */ - 
{ 22, 3, 11, 2, 60, 128, 0, 8, FL2_ultra }, /* 3 */ - { 23, 3, 12, 2, 60, 128, 0, 8, FL2_ultra }, /* 4 */ - { 24, 3, 13, 3, 60, 128, 0, 8, FL2_ultra }, /* 5 */ - { 25, 3, 14, 3, 60, 160, 0, 8, FL2_ultra }, /* 6 */ - { 26, 3, 14, 4, 60, 160, 0, 8, FL2_ultra }, /* 7 */ - { 27, 3, 14, 4, 128, 160, 0, 8, FL2_ultra }, /* 8 */ - { 28, 3, 14, 5, 128, 160, 0, 9, FL2_ultra } /* 9 */ -}; - -void FL2_fillParameters(FL2_CCtx* const cctx, const FL2_compressionParameters* const params) +static void FL2_fillParameters(FL2_CCtx* const cctx, const FL2_compressionParameters* const params) { FL2_lzma2Parameters* const cParams = &cctx->params.cParams; - RMF_parameters* const rParams = &cctx->params.rParams; cParams->lc = 3; cParams->lp = 0; cParams->pb = 2; cParams->fast_length = params->fastLength; - cParams->match_cycles = 1U << params->searchLog; + cParams->match_cycles = 1U << params->cyclesLog; cParams->strategy = params->strategy; cParams->second_dict_bits = params->chainLog; - cParams->random_filter = 0; - rParams->dictionary_log = MIN(params->dictionaryLog, FL2_DICTLOG_MAX); /* allow for reduced dict in 32-bit version */ - rParams->match_buffer_log = params->bufferLog; + + RMF_parameters* const rParams = &cctx->params.rParams; + rParams->dictionary_size = MIN(params->dictionarySize, FL2_DICTSIZE_MAX); /* allows for reduced dict in 32-bit version */ + rParams->match_buffer_log = RMF_BUFFER_LOG_BASE - params->bufferLog; rParams->overlap_fraction = params->overlapFraction; - rParams->block_size_log = rParams->dictionary_log + 2; rParams->divide_and_conquer = params->divideAndConquer; rParams->depth = params->searchDepth; -} - -FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void) -{ - return FL2_createCCtxMt(1); -} - -FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads) -{ - FL2_CCtx* cctx; - -#ifndef FL2_SINGLETHREAD - if (!nbThreads) { - nbThreads = UTIL_countPhysicalCores(); - nbThreads += !nbThreads; - } - if (nbThreads > FL2_MAXTHREADS) { - nbThreads = FL2_MAXTHREADS; - } -#else - nbThreads = 1; +#ifdef RMF_REFERENCE + rParams->use_ref_mf = 1; #endif +} + +static FL2_CCtx* FL2_createCCtx_internal(unsigned nbThreads, int const dualBuffer) +{ + nbThreads = FL2_checkNbThreads(nbThreads); DEBUGLOG(3, "FL2_createCCtxMt : %u threads", nbThreads); - cctx = malloc(sizeof(FL2_CCtx) + (nbThreads - 1) * sizeof(FL2_job)); + FL2_CCtx* const cctx = calloc(1, sizeof(FL2_CCtx) + (nbThreads - 1) * sizeof(FL2_job)); if (cctx == NULL) return NULL; cctx->jobCount = nbThreads; - for (unsigned u = 0; u < nbThreads; ++u) { + for (unsigned u = 0; u < nbThreads; ++u) cctx->jobs[u].enc = NULL; - } - cctx->params.highCompression = 0; - FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, FL2_CLEVEL_DEFAULT); #ifndef NO_XXHASH cctx->params.doXXH = 1; #endif - cctx->params.omitProp = 0; - -#ifdef RMF_REFERENCE - cctx->params.rParams.use_ref_mf = 0; -#endif cctx->matchTable = NULL; #ifndef FL2_SINGLETHREAD + cctx->compressThread = NULL; cctx->factory = FL2POOL_create(nbThreads - 1); if (nbThreads > 1 && cctx->factory == NULL) { FL2_freeCCtx(cctx); return NULL; } + if (dualBuffer) { + cctx->compressThread = FL2POOL_create(1); + if (cctx->compressThread == NULL) + return NULL; + } #endif for (unsigned u = 0; u < nbThreads; ++u) { - cctx->jobs[u].enc = FL2_lzma2Create(); + cctx->jobs[u].enc = LZMA2_createECtx(); if (cctx->jobs[u].enc == NULL) { FL2_freeCCtx(cctx); return NULL; } cctx->jobs[u].cctx = cctx; } - cctx->dictMax = 0; - cctx->block_total = 0; + + DICT_construct(&cctx->buf, dualBuffer); + + 
FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, FL2_CLEVEL_DEFAULT); + cctx->params.cParams.reset_interval = 4; return cctx; } +FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void) +{ + return FL2_createCCtx_internal(1, 0); +} + +FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads) +{ + return FL2_createCCtx_internal(nbThreads, 0); +} + FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx) { if (cctx == NULL) @@ -178,137 +203,116 @@ FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx) DEBUGLOG(3, "FL2_freeCCtx : %u threads", cctx->jobCount); + DICT_destruct(&cctx->buf); + for (unsigned u = 0; u < cctx->jobCount; ++u) { - FL2_lzma2Free(cctx->jobs[u].enc); + LZMA2_freeECtx(cctx->jobs[u].enc); } #ifndef FL2_SINGLETHREAD FL2POOL_free(cctx->factory); + FL2POOL_free(cctx->compressThread); #endif RMF_freeMatchTable(cctx->matchTable); free(cctx); } -FL2LIB_API unsigned FL2LIB_CALL FL2_CCtx_nbThreads(const FL2_CCtx* cctx) +FL2LIB_API unsigned FL2LIB_CALL FL2_getCCtxThreadCount(const FL2_CCtx* cctx) { return cctx->jobCount; } /* FL2_buildRadixTable() : FL2POOL_function type */ -static void FL2_buildRadixTable(void* const jobDescription, size_t n) +static void FL2_buildRadixTable(void* const jobDescription, ptrdiff_t const n) { - const FL2_job* const job = (FL2_job*)jobDescription; - FL2_CCtx* const cctx = job->cctx; + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; - RMF_buildTable(cctx->matchTable, n, 1, cctx->curBlock, NULL, NULL, 0, 0); + RMF_buildTable(cctx->matchTable, n, 1, cctx->curBlock); } /* FL2_compressRadixChunk() : FL2POOL_function type */ -static void FL2_compressRadixChunk(void* const jobDescription, size_t n) +static void FL2_compressRadixChunk(void* const jobDescription, ptrdiff_t const n) { - const FL2_job* const job = (FL2_job*)jobDescription; - FL2_CCtx* const cctx = job->cctx; + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; - cctx->jobs[n].cSize = FL2_lzma2Encode(cctx->jobs[n].enc, cctx->matchTable, job->block, &cctx->params.cParams, NULL, NULL, 0, 0); + cctx->jobs[n].cSize = LZMA2_encode(cctx->jobs[n].enc, cctx->matchTable, + cctx->jobs[n].block, + &cctx->params.cParams, + -1, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); } static int FL2_initEncoders(FL2_CCtx* const cctx) { for(unsigned u = 0; u < cctx->jobCount; ++u) { - if (FL2_lzma2HashAlloc(cctx->jobs[u].enc, &cctx->params.cParams) != 0) + if (LZMA2_hashAlloc(cctx->jobs[u].enc, &cctx->params.cParams) != 0) return 1; } return 0; } -static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress, void* opaque) +static void FL2_initProgress(FL2_CCtx* const cctx) +{ + RMF_initProgress(cctx->matchTable); + cctx->progressIn = 0; + cctx->streamCsize += cctx->progressOut; + cctx->progressOut = 0; + cctx->canceled = 0; +} + +/* FL2_compressCurBlock_blocking() : + * Compress cctx->curBlock and wait until complete. 
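Example usage (a minimal sketch, assuming that a thread count of 0 selects auto-detection as described for the other entry points in this patch) of creating and releasing a multithreaded context and querying it with the renamed FL2_getCCtxThreadCount():

#include <stdio.h>
#include "fast-lzma2.h"

int main(void)
{
    /* 0 lets the library pick a thread count from the available cores. */
    FL2_CCtx *cctx = FL2_createCCtxMt(0);
    if (cctx == NULL)
        return 1;                       /* allocation failed */

    printf("using %u compression threads\n", FL2_getCCtxThreadCount(cctx));

    /* ... FL2_CCtx_setParameter() / FL2_compressCCtx() calls would go here ... */

    FL2_freeCCtx(cctx);
    return 0;
}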
+ * Write streamProp as the first byte if >= 0 + */ +static size_t FL2_compressCurBlock_blocking(FL2_CCtx* const cctx, int const streamProp) { size_t const encodeSize = (cctx->curBlock.end - cctx->curBlock.start); - size_t init_done; - U32 rmf_weight = ZSTD_highbit32((U32)cctx->curBlock.end); - U32 depth_weight = 2 + (cctx->params.rParams.depth >= 12) + (cctx->params.rParams.depth >= 28); - U32 enc_weight; - int err = 0; #ifndef FL2_SINGLETHREAD size_t mfThreads = cctx->curBlock.end / RMF_MIN_BYTES_PER_THREAD; - size_t nbThreads = MIN(cctx->jobCount, encodeSize / MIN_BYTES_PER_THREAD); + size_t nbThreads = MIN(cctx->jobCount, encodeSize / ENC_MIN_BYTES_PER_THREAD); nbThreads += !nbThreads; #else size_t mfThreads = 1; size_t nbThreads = 1; #endif - if (rmf_weight >= 20) { - rmf_weight = depth_weight * (rmf_weight - 10) + (rmf_weight - 19) * 12; - if (cctx->params.cParams.strategy == 0) - enc_weight = 20; - else if (cctx->params.cParams.strategy == 1) - enc_weight = 50; - else - enc_weight = 60 + cctx->params.cParams.second_dict_bits + ZSTD_highbit32(cctx->params.cParams.fast_length) * 3U; - rmf_weight = (rmf_weight << 4) / (rmf_weight + enc_weight); - enc_weight = 16 - rmf_weight; - } - else { - rmf_weight = 8; - enc_weight = 8; - } - DEBUGLOG(5, "FL2_compressCurBlock : %u threads, %u start, %u bytes", (U32)nbThreads, (U32)cctx->curBlock.start, (U32)encodeSize); - /* Free unsuitable match table before reallocating anything else */ - if (cctx->matchTable && !RMF_compatibleParameters(cctx->matchTable, &cctx->params.rParams, cctx->curBlock.end)) { - RMF_freeMatchTable(cctx->matchTable); - cctx->matchTable = NULL; + size_t sliceStart = cctx->curBlock.start; + size_t const sliceSize = encodeSize / nbThreads; + cctx->jobs[0].block.data = cctx->curBlock.data; + cctx->jobs[0].block.start = sliceStart; + cctx->jobs[0].block.end = sliceStart + sliceSize; + + for (size_t u = 1; u < nbThreads; ++u) { + sliceStart += sliceSize; + cctx->jobs[u].block.data = cctx->curBlock.data; + cctx->jobs[u].block.start = sliceStart; + cctx->jobs[u].block.end = sliceStart + sliceSize; } - - if(FL2_initEncoders(cctx) != 0) /* Create hash objects together, leaving the (large) match table last */ - return FL2_ERROR(memory_allocation); - - if (!cctx->matchTable) { - cctx->matchTable = RMF_createMatchTable(&cctx->params.rParams, cctx->curBlock.end, cctx->jobCount); - if (cctx->matchTable == NULL) - return FL2_ERROR(memory_allocation); - } - else { - DEBUGLOG(5, "Have compatible match table"); - RMF_applyParameters(cctx->matchTable, &cctx->params.rParams, cctx->curBlock.end); - } - - { size_t sliceStart = cctx->curBlock.start; - size_t sliceSize = encodeSize / nbThreads; - cctx->jobs[0].block.data = cctx->curBlock.data; - cctx->jobs[0].block.start = sliceStart; - cctx->jobs[0].block.end = sliceStart + sliceSize; - - for (size_t u = 1; u < nbThreads; ++u) { - sliceStart += sliceSize; - cctx->jobs[u].block.data = cctx->curBlock.data; - cctx->jobs[u].block.start = sliceStart; - cctx->jobs[u].block.end = sliceStart + sliceSize; - } - cctx->jobs[nbThreads - 1].block.end = cctx->curBlock.end; - } - - /* update largest dict size used */ - cctx->dictMax = MAX(cctx->dictMax, cctx->curBlock.end); + cctx->jobs[nbThreads - 1].block.end = cctx->curBlock.end; /* initialize to length 2 */ - init_done = RMF_initTable(cctx->matchTable, cctx->curBlock.data, cctx->curBlock.start, cctx->curBlock.end); + cctx->matchTable->progress = RMF_initTable(cctx->matchTable, cctx->curBlock.data, cctx->curBlock.end); + + if (cctx->canceled) { + 
RMF_resetIncompleteBuild(cctx->matchTable); + return FL2_ERROR(canceled); + } #ifndef FL2_SINGLETHREAD + mfThreads = MIN(RMF_threadCount(cctx->matchTable), mfThreads); - for (size_t u = 1; u < mfThreads; ++u) { - FL2POOL_add(cctx->factory, FL2_buildRadixTable, &cctx->jobs[u], u); - } + FL2POOL_addRange(cctx->factory, FL2_buildRadixTable, cctx, 1, mfThreads); + #endif - err = RMF_buildTable(cctx->matchTable, 0, mfThreads > 1, cctx->curBlock, progress, opaque, rmf_weight, init_done); + int err = RMF_buildTable(cctx->matchTable, 0, mfThreads > 1, cctx->curBlock); #ifndef FL2_SINGLETHREAD - FL2POOL_waitAll(cctx->factory); + FL2POOL_waitAll(cctx->factory, 0); if (err) return FL2_ERROR(canceled); @@ -319,12 +323,14 @@ static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress return FL2_ERROR(internal); #endif - for (size_t u = 1; u < nbThreads; ++u) { - FL2POOL_add(cctx->factory, FL2_compressRadixChunk, &cctx->jobs[u], u); - } + FL2POOL_addRange(cctx->factory, FL2_compressRadixChunk, cctx, 1, nbThreads); - cctx->jobs[0].cSize = FL2_lzma2Encode(cctx->jobs[0].enc, cctx->matchTable, cctx->jobs[0].block, &cctx->params.cParams, progress, opaque, (rmf_weight * encodeSize) >> 4, enc_weight * (U32)nbThreads); - FL2POOL_waitAll(cctx->factory); + cctx->jobs[0].cSize = LZMA2_encode(cctx->jobs[0].enc, cctx->matchTable, + cctx->jobs[0].block, + &cctx->params.cParams, streamProp, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); + + FL2POOL_waitAll(cctx->factory, 0); #else /* FL2_SINGLETHREAD */ @@ -336,88 +342,199 @@ static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress if (err) return FL2_ERROR(internal); #endif - cctx->jobs[0].cSize = FL2_lzma2Encode(cctx->jobs[0].enc, cctx->matchTable, cctx->jobs[0].block, &cctx->params.cParams, progress, opaque, (rmf_weight * encodeSize) >> 4, enc_weight); + cctx->jobs[0].cSize = LZMA2_encode(cctx->jobs[0].enc, cctx->matchTable, + cctx->jobs[0].block, + &cctx->params.cParams, streamProp, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); #endif - return nbThreads; + for (size_t u = 0; u < nbThreads; ++u) + if (FL2_isError(cctx->jobs[u].cSize)) + return cctx->jobs[u].cSize; + + cctx->threadCount = nbThreads; + + return FL2_error_no_error; } -FL2LIB_API void FL2LIB_CALL FL2_beginFrame(FL2_CCtx* const cctx) +/* FL2_compressCurBlock_async() : FL2POOL_function type */ +static void FL2_compressCurBlock_async(void* const jobDescription, ptrdiff_t const n) +{ + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; + + cctx->asyncRes = FL2_compressCurBlock_blocking(cctx, (int)n); +} + +/* FL2_compressCurBlock() : + * Update total input size. + * Clear the compressed data buffers. + * Init progress info. + * Start compression of cctx->curBlock, and wait for completion if no async compression thread exists. 
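Example usage (a hedged sketch, assuming a nonzero timeout was previously set with FL2_setCStreamTimeout(), which is added later in this file) of polling an asynchronous block compression while reporting approximate progress:

#include <stdio.h>
#include "fast-lzma2.h"

/* Sketch: wait for a background block compression started by the streaming
 * functions, printing an approximate progress figure on each timeout. */
static size_t wait_with_progress(FL2_CStream *fcs)
{
    for (;;) {
        size_t const res = FL2_waitCStream(fcs);   /* returns the timedOut code on expiry */
        if (FL2_isTimedOut(res)) {
            unsigned long long const done = FL2_getCStreamProgress(fcs, NULL);
            printf("processed ~%llu input bytes so far\n", done);
            continue;
        }
        return res;   /* 0/1, or a real error testable with FL2_isError() */
    }
}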
+ */ +static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, int const streamProp) +{ + FL2_initProgress(cctx); + + if (cctx->curBlock.start == cctx->curBlock.end) + return FL2_error_no_error; + + /* update largest dict size used */ + cctx->dictMax = MAX(cctx->dictMax, cctx->curBlock.end); + + cctx->outThread = 0; + cctx->threadCount = 0; + cctx->outPos = 0; + + U32 rmfWeight = ZSTD_highbit32((U32)cctx->curBlock.end); + U32 depthWeight = 2 + (cctx->params.rParams.depth >= 12) + (cctx->params.rParams.depth >= 28); + U32 encWeight; + + if (rmfWeight >= 20) { + rmfWeight = depthWeight * (rmfWeight - 10) + (rmfWeight - 19) * 12; + if (cctx->params.cParams.strategy == 0) + encWeight = 20; + else if (cctx->params.cParams.strategy == 1) + encWeight = 50; + else + encWeight = 60 + cctx->params.cParams.second_dict_bits + ZSTD_highbit32(cctx->params.cParams.fast_length) * 3U; + rmfWeight = (rmfWeight << 4) / (rmfWeight + encWeight); + encWeight = 16 - rmfWeight; + } + else { + rmfWeight = 8; + encWeight = 8; + } + + cctx->rmfWeight = rmfWeight; + cctx->encWeight = encWeight; + +#ifndef FL2_SINGLETHREAD + if(cctx->compressThread != NULL) + FL2POOL_add(cctx->compressThread, FL2_compressCurBlock_async, cctx, streamProp); + else +#endif + cctx->asyncRes = FL2_compressCurBlock_blocking(cctx, streamProp); + + return cctx->asyncRes; +} + +/* FL2_getProp() : + * Get the LZMA2 dictionary size property byte. If xxhash is enabled, includes the xxhash flag bit. + */ +static BYTE FL2_getProp(FL2_CCtx* const cctx, size_t const dictionarySize) +{ +#ifndef NO_XXHASH + return LZMA2_getDictSizeProp(dictionarySize) | (BYTE)((cctx->params.doXXH != 0) << FL2_PROP_HASH_BIT); +#else + (void)cctx; + return LZMA2_getDictSizeProp(dictionarySize); +#endif +} + +static void FL2_preBeginFrame(FL2_CCtx* const cctx, size_t const dictReduce) +{ + /* Free unsuitable match table before reallocating anything else */ + if (cctx->matchTable && !RMF_compatibleParameters(cctx->matchTable, &cctx->params.rParams, dictReduce)) { + RMF_freeMatchTable(cctx->matchTable); + cctx->matchTable = NULL; + } +} + +static size_t FL2_beginFrame(FL2_CCtx* const cctx, size_t const dictReduce) +{ + if (FL2_initEncoders(cctx) != 0) /* Create hash objects together, leaving the (large) match table last */ + return FL2_ERROR(memory_allocation); + + if (!cctx->matchTable) { + cctx->matchTable = RMF_createMatchTable(&cctx->params.rParams, dictReduce, cctx->jobCount); + if (cctx->matchTable == NULL) + return FL2_ERROR(memory_allocation); + } + else { + DEBUGLOG(5, "Have compatible match table"); + RMF_applyParameters(cctx->matchTable, &cctx->params.rParams, dictReduce); + } + + cctx->dictMax = 0; + cctx->streamTotal = 0; + cctx->streamCsize = 0; + cctx->progressIn = 0; + cctx->progressOut = 0; + RMF_initProgress(cctx->matchTable); + cctx->asyncRes = 0; + cctx->outThread = 0; + cctx->threadCount = 0; + cctx->outPos = 0; + cctx->curBlock.start = 0; + cctx->curBlock.end = 0; + cctx->lockParams = 1; + + return FL2_error_no_error; +} + +static void FL2_endFrame(FL2_CCtx* const cctx) { cctx->dictMax = 0; - cctx->block_total = 0; + cctx->asyncRes = 0; + cctx->lockParams = 0; } -static size_t FL2_compressBlock(FL2_CCtx* const cctx, - const void* const src, size_t srcStart, size_t const srcEnd, - void* const dst, size_t dstCapacity, - FL2_writerFn const writeFn, void* const opaque, - FL2_progressFn progress) +/* Compress a memory buffer which may be larger than the dictionary. + * The property byte is written first unless the omit flag is set. 
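Example usage (a hedged sketch; the helper is illustrative and the thread count of 0 assumes auto-detection) of one-shot compression of a whole buffer through the public FL2_compressMt() entry point, with the destination sized by FL2_compressBound():

#include <stdlib.h>
#include <stdio.h>
#include "fast-lzma2.h"

static size_t compress_once(const void *src, size_t srcSize, int level)
{
    size_t const bound = FL2_compressBound(srcSize);
    void *dst = malloc(bound);
    if (dst == NULL)
        return 0;

    /* Multithreaded one-shot compression; 0 threads = auto-detect. */
    size_t const cSize = FL2_compressMt(dst, bound, src, srcSize, level, 0);
    if (FL2_isError(cSize))
        fprintf(stderr, "compression failed: %s\n", FL2_getErrorName(cSize));
    else
        printf("%zu -> %zu bytes\n", srcSize, cSize);

    free(dst);
    return FL2_isError(cSize) ? 0 : cSize;
}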
+ * Return: compressed size. + */ +static size_t FL2_compressBuffer(FL2_CCtx* const cctx, + const void* const src, size_t srcSize, + void* const dst, size_t dstCapacity) { - BYTE* dstBuf = dst; - size_t outSize = 0; - size_t const dictionary_size = (size_t)1 << cctx->params.rParams.dictionary_log; - size_t const block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (srcStart >= srcEnd) + if (srcSize == 0) return 0; + + BYTE* dstBuf = dst; + size_t const dictionarySize = cctx->params.rParams.dictionary_size; + size_t const blockOverlap = OVERLAP_FROM_DICT_SIZE(dictionarySize, cctx->params.rParams.overlap_fraction); + int streamProp = cctx->params.omitProp ? -1 : FL2_getProp(cctx, MIN(srcSize, dictionarySize)); + cctx->curBlock.data = src; - cctx->curBlock.start = srcStart; + cctx->curBlock.start = 0; - while (srcStart < srcEnd) { - size_t nbThreads; + size_t blockTotal = 0; - cctx->curBlock.end = cctx->curBlock.start + MIN(srcEnd - srcStart, dictionary_size - cctx->curBlock.start); + do { + cctx->curBlock.end = cctx->curBlock.start + MIN(srcSize, dictionarySize - cctx->curBlock.start); + blockTotal += cctx->curBlock.end - cctx->curBlock.start; - nbThreads = FL2_compressCurBlock(cctx, progress, opaque); - if (FL2_isError(nbThreads)) - return nbThreads; + CHECK_F(FL2_compressCurBlock(cctx, streamProp)); - for (size_t u = 0; u < nbThreads; ++u) { - const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[u].block.start); - - if (FL2_isError(cctx->jobs[u].cSize)) - return cctx->jobs[u].cSize; + streamProp = -1; + for (size_t u = 0; u < cctx->threadCount; ++u) { DEBUGLOG(5, "Write thread %u : %u bytes", (U32)u, (U32)cctx->jobs[u].cSize); - if (writeFn == NULL && dstCapacity < cctx->jobs[u].cSize) { + if (dstCapacity < cctx->jobs[u].cSize) return FL2_ERROR(dstSize_tooSmall); - } - if (writeFn != NULL) { - if(writeFn(outBuf, cctx->jobs[u].cSize, opaque)) - return FL2_ERROR(write_failed); - outSize += cctx->jobs[u].cSize; - } - else { - memcpy(dstBuf, outBuf, cctx->jobs[u].cSize); - dstBuf += cctx->jobs[u].cSize; - dstCapacity -= cctx->jobs[u].cSize; - } + + const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[u].block.start); + memcpy(dstBuf, outBuf, cctx->jobs[u].cSize); + + dstBuf += cctx->jobs[u].cSize; + dstCapacity -= cctx->jobs[u].cSize; } - srcStart += cctx->curBlock.end - cctx->curBlock.start; - cctx->block_total += cctx->curBlock.end - cctx->curBlock.start; - if (cctx->params.rParams.block_size_log && cctx->block_total + MIN(cctx->curBlock.end - block_overlap, srcEnd - srcStart) > ((U64)1 << cctx->params.rParams.block_size_log)) { + srcSize -= cctx->curBlock.end - cctx->curBlock.start; + if (cctx->params.cParams.reset_interval + && blockTotal + MIN(dictionarySize - blockOverlap, srcSize) > dictionarySize * cctx->params.cParams.reset_interval) { /* periodically reset the dictionary for mt decompression */ + DEBUGLOG(4, "Resetting dictionary after %u bytes", (unsigned)blockTotal); cctx->curBlock.start = 0; - cctx->block_total = 0; + blockTotal = 0; } else { - cctx->curBlock.start = block_overlap; + cctx->curBlock.start = blockOverlap; } cctx->curBlock.data += cctx->curBlock.end - cctx->curBlock.start; - } - return (writeFn != NULL) ? 
outSize : dstBuf - (const BYTE*)dst; -} - -static BYTE FL2_getProp(FL2_CCtx* cctx, size_t dictionary_size) -{ - return FL2_getDictSizeProp(dictionary_size) -#ifndef NO_XXHASH - | (BYTE)((cctx->params.doXXH != 0) << FL2_PROP_HASH_BIT) -#endif - ; + } while (srcSize != 0); + return dstBuf - (const BYTE*)dst; } FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, @@ -425,31 +542,39 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, const void* src, size_t srcSize, int compressionLevel) { - BYTE* dstBuf = dst; - BYTE* const end = dstBuf + dstCapacity; - size_t cSize = 0; + if (dstCapacity < 2U - cctx->params.omitProp) /* empty LZMA2 stream is byte sequence {0, 0} */ + return FL2_ERROR(dstSize_tooSmall); if (compressionLevel > 0) FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, compressionLevel); DEBUGLOG(4, "FL2_compressCCtx : level %u, %u src => %u avail", cctx->params.compressionLevel, (U32)srcSize, (U32)dstCapacity); - if (dstCapacity < 2U - cctx->params.omitProp) /* empty LZMA2 stream is byte sequence {0, 0} */ - return FL2_ERROR(dstSize_tooSmall); +#ifndef FL2_SINGLETHREAD + /* No async compression for in-memory function */ + FL2POOL_free(cctx->compressThread); + cctx->compressThread = NULL; + cctx->timeout = 0; +#endif - FL2_beginFrame(cctx); + FL2_preBeginFrame(cctx, srcSize); + CHECK_F(FL2_beginFrame(cctx, srcSize)); - dstBuf += !cctx->params.omitProp; - cSize = FL2_compressBlock(cctx, src, 0, srcSize, dstBuf, end - dstBuf, NULL, NULL, NULL); - if(!cctx->params.omitProp) - dstBuf[-1] = FL2_getProp(cctx, cctx->dictMax); + size_t const cSize = FL2_compressBuffer(cctx, src, srcSize, dst, dstCapacity); if (FL2_isError(cSize)) return cSize; + BYTE* dstBuf = dst; + BYTE* const end = dstBuf + dstCapacity; + dstBuf += cSize; if(dstBuf >= end) return FL2_ERROR(dstSize_tooSmall); + + if (cSize == 0) + *dstBuf++ = FL2_getProp(cctx, 0); + *dstBuf++ = LZMA2_END_MARKER; #ifndef NO_XXHASH @@ -463,100 +588,25 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, dstBuf += XXHASH_SIZEOF; } #endif + + FL2_endFrame(cctx); + return dstBuf - (BYTE*)dst; } -FL2LIB_API size_t FL2LIB_CALL FL2_blockOverlap(const FL2_CCtx* cctx) -{ - return OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); -} - -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock(FL2_CCtx* cctx, FL2_blockBuffer *block) -{ - FL2_shiftBlock_switch(cctx, block, NULL); -} - -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock_switch(FL2_CCtx* cctx, FL2_blockBuffer *block, unsigned char *dst) -{ - size_t const block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (block_overlap == 0) { - block->start = 0; - block->end = 0; - } - else if (block->end > block_overlap) { - size_t const from = (block->end - block_overlap) & ALIGNMENT_MASK; - size_t overlap = block->end - from; - - cctx->block_total += block->end - block->start; - if (cctx->params.rParams.block_size_log && cctx->block_total + from > ((U64)1 << cctx->params.rParams.block_size_log)) { - /* periodically reset the dictionary for mt decompression */ - overlap = 0; - cctx->block_total = 0; - } - else if (overlap <= from || dst != NULL) { - DEBUGLOG(5, "Copy overlap data : %u bytes", (U32)overlap); - memcpy(dst ? 
dst : block->data, block->data + from, overlap); - } - else if (from != 0) { - DEBUGLOG(5, "Move overlap data : %u bytes", (U32)overlap); - memmove(block->data, block->data + from, overlap); - } - block->start = overlap; - block->end = overlap; - } - else { - block->start = block->end; - } -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock(FL2_CCtx* cctx, - void* dst, size_t dstCapacity, - const FL2_blockBuffer *block, - FL2_progressFn progress, void* opaque) -{ - return FL2_compressBlock(cctx, block->data, block->start, block->end, dst, dstCapacity, NULL, opaque, progress); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame(FL2_CCtx* ctx, - void* dst, size_t dstCapacity) -{ - if (!dstCapacity) - return FL2_ERROR(dstSize_tooSmall); - *(BYTE*)dst = LZMA2_END_MARKER; - return 1; -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock_toFn(FL2_CCtx* cctx, - FL2_writerFn writeFn, void* opaque, - const FL2_blockBuffer *block, - FL2_progressFn progress) -{ - return FL2_compressBlock(cctx, block->data, block->start, block->end, NULL, 0, writeFn, opaque, progress); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque) -{ - BYTE c = LZMA2_END_MARKER; - if(writeFn(&c, 1, opaque)) - return FL2_ERROR(write_failed); - return 1; -} - FL2LIB_API size_t FL2LIB_CALL FL2_compressMt(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel, unsigned nbThreads) { - size_t cSize; FL2_CCtx* const cctx = FL2_createCCtxMt(nbThreads); if (cctx == NULL) return FL2_ERROR(memory_allocation); - cSize = FL2_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + size_t const cSize = FL2_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); FL2_freeCCtx(cctx); + return cSize; } @@ -567,462 +617,691 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compress(void* dst, size_t dstCapacity, return FL2_compressMt(dst, dstCapacity, src, srcSize, compressionLevel, 1); } -FL2LIB_API BYTE FL2LIB_CALL FL2_dictSizeProp(FL2_CCtx* cctx) +FL2LIB_API BYTE FL2LIB_CALL FL2_getCCtxDictProp(FL2_CCtx* cctx) { - return FL2_getDictSizeProp(cctx->dictMax ? cctx->dictMax : (size_t)1 << cctx->params.rParams.dictionary_log); + return LZMA2_getDictSizeProp(cctx->dictMax ? 
cctx->dictMax : cctx->params.rParams.dictionary_size); } -#define CLAMPCHECK(val,min,max) { \ +#define MAXCHECK(val,max) do { \ + if ((val)>(max)) { \ + return FL2_ERROR(parameter_outOfBound); \ +} } while(0) + +#define CLAMPCHECK(val,min,max) do { \ if (((val)<(min)) | ((val)>(max))) { \ return FL2_ERROR(parameter_outOfBound); \ -} } +} } while(0) -FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, unsigned value) + +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, size_t value) +{ + if (cctx->lockParams + && param != FL2_p_literalCtxBits && param != FL2_p_literalPosBits && param != FL2_p_posBits) + return FL2_ERROR(stage_wrong); + + switch (param) + { + case FL2_p_compressionLevel: + if (cctx->params.highCompression) { + CLAMPCHECK(value, 1, FL2_MAX_HIGH_CLEVEL); + FL2_fillParameters(cctx, &FL2_highCParameters[value]); + } + else { + CLAMPCHECK(value, 1, FL2_MAX_CLEVEL); + FL2_fillParameters(cctx, &FL2_defaultCParameters[value]); + } + cctx->params.compressionLevel = (unsigned)value; + break; + + case FL2_p_highCompression: + cctx->params.highCompression = value != 0; + FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, cctx->params.compressionLevel); + break; + + case FL2_p_dictionaryLog: + CLAMPCHECK(value, FL2_DICTLOG_MIN, FL2_DICTLOG_MAX); + cctx->params.rParams.dictionary_size = (size_t)1 << value; + break; + + case FL2_p_dictionarySize: + CLAMPCHECK(value, FL2_DICTSIZE_MIN, FL2_DICTSIZE_MAX); + cctx->params.rParams.dictionary_size = value; + break; + + case FL2_p_overlapFraction: + MAXCHECK(value, FL2_BLOCK_OVERLAP_MAX); + cctx->params.rParams.overlap_fraction = (unsigned)value; + break; + + case FL2_p_resetInterval: + if (value != 0) + CLAMPCHECK(value, FL2_RESET_INTERVAL_MIN, FL2_RESET_INTERVAL_MAX); + cctx->params.cParams.reset_interval = (unsigned)value; + break; + + case FL2_p_bufferLog: + MAXCHECK(value, FL2_BUFFER_SIZE_LOG_MAX); + cctx->params.rParams.match_buffer_log = RMF_BUFFER_LOG_BASE - (unsigned)value; + break; + + case FL2_p_hybridChainLog: + CLAMPCHECK(value, FL2_CHAINLOG_MIN, FL2_CHAINLOG_MAX); + cctx->params.cParams.second_dict_bits = (unsigned)value; + break; + + case FL2_p_hybridCycles: + CLAMPCHECK(value, FL2_HYBRIDCYCLES_MIN, FL2_HYBRIDCYCLES_MAX); + cctx->params.cParams.match_cycles = (unsigned)value; + break; + + case FL2_p_searchDepth: + CLAMPCHECK(value, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); + cctx->params.rParams.depth = (unsigned)value; + break; + + case FL2_p_fastLength: + CLAMPCHECK(value, FL2_FASTLENGTH_MIN, FL2_FASTLENGTH_MAX); + cctx->params.cParams.fast_length = (unsigned)value; + break; + + case FL2_p_divideAndConquer: + cctx->params.rParams.divide_and_conquer = value != 0; + break; + + case FL2_p_strategy: + MAXCHECK(value, (unsigned)FL2_ultra); + cctx->params.cParams.strategy = (FL2_strategy)value; + break; + + /* lc, lp, pb can be changed between encoder chunks. + * A condition where lc+lp > 4 is permitted to allow sequential setting, + * but will return an error code to alert the calling function. + * If lc+lp is still >4 when encoding begins, lc will be reduced. 
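Example usage (a short sketch of the sequential lc/lp setting described in the comment above; only the final call is checked because it re-validates the pair with the updated values):

#include "fast-lzma2.h"

/* Sketch: set lc then lp. A transient lc+lp > 4 is reported by the first
 * call but is harmless as long as the final combination is valid. */
static size_t set_literal_params(FL2_CCtx *cctx, unsigned lc, unsigned lp)
{
    FL2_CCtx_setParameter(cctx, FL2_p_literalCtxBits, lc); /* may report lclpMax_exceeded */
    size_t const res = FL2_CCtx_setParameter(cctx, FL2_p_literalPosBits, lp);
    return FL2_isError(res) ? res : 0;
}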
*/ + case FL2_p_literalCtxBits: + MAXCHECK(value, FL2_LC_MAX); + cctx->params.cParams.lc = (unsigned)value; + if (value + cctx->params.cParams.lp > FL2_LCLP_MAX) + return FL2_ERROR(lclpMax_exceeded); + break; + + case FL2_p_literalPosBits: + MAXCHECK(value, FL2_LP_MAX); + cctx->params.cParams.lp = (unsigned)value; + if (cctx->params.cParams.lc + value > FL2_LCLP_MAX) + return FL2_ERROR(lclpMax_exceeded); + break; + + case FL2_p_posBits: + MAXCHECK(value, FL2_PB_MAX); + cctx->params.cParams.pb = (unsigned)value; + break; + +#ifndef NO_XXHASH + case FL2_p_doXXHash: + cctx->params.doXXH = value != 0; + break; +#endif + + case FL2_p_omitProperties: + cctx->params.omitProp = value != 0; + break; +#ifdef RMF_REFERENCE + case FL2_p_useReferenceMF: + cctx->params.rParams.use_ref_mf = value != 0; + break; +#endif + default: return FL2_ERROR(parameter_unsupported); + } + return value; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_getParameter(FL2_CCtx* cctx, FL2_cParameter param) { switch (param) { case FL2_p_compressionLevel: - if (value > 0) { /* 0 : does not change current level */ - if (cctx->params.highCompression) { - if ((int)value > FL2_MAX_HIGH_CLEVEL) value = FL2_MAX_HIGH_CLEVEL; - FL2_fillParameters(cctx, &FL2_highCParameters[value]); - } - else { - if ((int)value > FL2_MAX_CLEVEL) value = FL2_MAX_CLEVEL; - FL2_fillParameters(cctx, &FL2_defaultCParameters[value]); - } - cctx->params.compressionLevel = value; - } return cctx->params.compressionLevel; case FL2_p_highCompression: - if ((int)value >= 0) { /* < 0 : does not change highCompression */ - cctx->params.highCompression = value != 0; - FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, cctx->params.compressionLevel); - } return cctx->params.highCompression; - case FL2_p_7zLevel: - if (value > 0) { /* 0 : does not change current level */ - if ((int)value > FL2_MAX_7Z_CLEVEL) value = FL2_MAX_7Z_CLEVEL; - FL2_fillParameters(cctx, &FL2_7zCParameters[value]); - cctx->params.compressionLevel = value; - } - return cctx->params.compressionLevel; + case FL2_p_dictionaryLog: { + size_t dictLog = FL2_DICTLOG_MIN; + while (((size_t)1 << dictLog) < cctx->params.rParams.dictionary_size) + ++dictLog; + return dictLog; + } - case FL2_p_dictionaryLog: - if (value) { /* 0 : does not change current dictionaryLog */ - CLAMPCHECK(value, FL2_DICTLOG_MIN, FL2_DICTLOG_MAX); - cctx->params.rParams.dictionary_log = value; - } - return cctx->params.rParams.dictionary_log; + case FL2_p_dictionarySize: + return cctx->params.rParams.dictionary_size; case FL2_p_overlapFraction: - if ((int)value >= 0) { /* < 0 : does not change current overlapFraction */ - CLAMPCHECK(value, FL2_BLOCK_OVERLAP_MIN, FL2_BLOCK_OVERLAP_MAX); - cctx->params.rParams.overlap_fraction = value; - } return cctx->params.rParams.overlap_fraction; - case FL2_p_blockSize: - if ((int)value >= 0) { /* < 0 : does not change current overlapFraction */ - CLAMPCHECK(value, FL2_BLOCK_LOG_MIN, FL2_BLOCK_LOG_MAX); - cctx->params.rParams.block_size_log = value; - } - return cctx->params.rParams.block_size_log; + case FL2_p_resetInterval: + return cctx->params.cParams.reset_interval; case FL2_p_bufferLog: - if (value) { /* 0 : does not change current bufferLog */ - CLAMPCHECK(value, FL2_BUFFER_SIZE_LOG_MIN, FL2_BUFFER_SIZE_LOG_MAX); - cctx->params.rParams.match_buffer_log = value; - } - return cctx->params.rParams.match_buffer_log; + return RMF_BUFFER_LOG_BASE - cctx->params.rParams.match_buffer_log; - case FL2_p_chainLog: - if (value) { /* 0 : does not change current chainLog */ - CLAMPCHECK(value, 
FL2_CHAINLOG_MIN, FL2_CHAINLOG_MAX); - cctx->params.cParams.second_dict_bits = value; - } + case FL2_p_hybridChainLog: return cctx->params.cParams.second_dict_bits; - case FL2_p_searchLog: - if ((int)value >= 0) { /* < 0 : does not change current searchLog */ - CLAMPCHECK(value, FL2_SEARCHLOG_MIN, FL2_SEARCHLOG_MAX); - cctx->params.cParams.match_cycles = 1U << value; - } - return value; + case FL2_p_hybridCycles: + return cctx->params.cParams.match_cycles; case FL2_p_literalCtxBits: - if ((int)value >= 0) { /* < 0 : does not change current lc */ - CLAMPCHECK(value, FL2_LC_MIN, FL2_LC_MAX); - cctx->params.cParams.lc = value; - } return cctx->params.cParams.lc; case FL2_p_literalPosBits: - if ((int)value >= 0) { /* < 0 : does not change current lp */ - CLAMPCHECK(value, FL2_LP_MIN, FL2_LP_MAX); - cctx->params.cParams.lp = value; - } return cctx->params.cParams.lp; case FL2_p_posBits: - if ((int)value >= 0) { /* < 0 : does not change current pb */ - CLAMPCHECK(value, FL2_PB_MIN, FL2_PB_MAX); - cctx->params.cParams.pb = value; - } return cctx->params.cParams.pb; case FL2_p_searchDepth: - if (value) { /* 0 : does not change current depth */ - CLAMPCHECK(value, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); - cctx->params.rParams.depth = value; - } return cctx->params.rParams.depth; case FL2_p_fastLength: - if (value) { /* 0 : does not change current fast_length */ - CLAMPCHECK(value, FL2_FASTLENGTH_MIN, FL2_FASTLENGTH_MAX); - cctx->params.cParams.fast_length = value; - } return cctx->params.cParams.fast_length; case FL2_p_divideAndConquer: - if ((int)value >= 0) { /* < 0 : does not change current divide_and_conquer */ - cctx->params.rParams.divide_and_conquer = value; - } return cctx->params.rParams.divide_and_conquer; case FL2_p_strategy: - if ((int)value >= 0) { /* < 0 : does not change current strategy */ - CLAMPCHECK(value, (unsigned)FL2_fast, (unsigned)FL2_ultra); - cctx->params.cParams.strategy = (FL2_strategy)value; - } return (size_t)cctx->params.cParams.strategy; #ifndef NO_XXHASH case FL2_p_doXXHash: - if ((int)value >= 0) { /* < 0 : does not change doXXHash */ - cctx->params.doXXH = value != 0; - } return cctx->params.doXXH; #endif case FL2_p_omitProperties: - if ((int)value >= 0) { /* < 0 : does not change omitProp */ - cctx->params.omitProp = value != 0; - } return cctx->params.omitProp; #ifdef RMF_REFERENCE case FL2_p_useReferenceMF: - if ((int)value >= 0) { /* < 0 : does not change useRefMF */ - cctx->params.rParams.use_ref_mf = value != 0; - } return cctx->params.rParams.use_ref_mf; #endif default: return FL2_ERROR(parameter_unsupported); } } -FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void) +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, size_t value) { - FL2_CCtx* const cctx = FL2_createCCtx(); - FL2_CStream* const fcs = malloc(sizeof(FL2_CStream)); - - DEBUGLOG(3, "FL2_createCStream"); - - if (cctx == NULL || fcs == NULL) { - free(cctx); - free(fcs); - return NULL; - } - fcs->cctx = cctx; - fcs->inBuff.bufSize = 0; - fcs->inBuff.data = NULL; - fcs->inBuff.start = 0; - fcs->inBuff.end = 0; -#ifndef NO_XXHASH - fcs->xxh = NULL; -#endif - fcs->out_thread = 0; - fcs->thread_count = 0; - fcs->out_pos = 0; - fcs->hash_pos = 0; - fcs->end_marked = 0; - fcs->wrote_prop = 0; - return fcs; + return FL2_CCtx_setParameter(fcs, param, value); } -FL2LIB_API size_t FL2LIB_CALL FL2_freeCStream(FL2_CStream* fcs) +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_getParameter(FL2_CStream* fcs, FL2_cParameter param) { - if (fcs == NULL) - return 
0; + return FL2_CCtx_getParameter(fcs, param); +} - DEBUGLOG(3, "FL2_freeCStream"); +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void) +{ + return FL2_createCCtx_internal(1, 0); +} - free(fcs->inBuff.data); -#ifndef NO_XXHASH - XXH32_freeState(fcs->xxh); -#endif - FL2_freeCCtx(fcs->cctx); - free(fcs); - return 0; +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStreamMt(unsigned nbThreads, int dualBuffer) +{ + return FL2_createCCtx_internal(nbThreads, dualBuffer); +} + +FL2LIB_API void FL2LIB_CALL FL2_freeCStream(FL2_CStream * fcs) +{ + FL2_freeCCtx(fcs); } FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel) { DEBUGLOG(4, "FL2_initCStream level %d", compressionLevel); - fcs->inBuff.start = 0; - fcs->inBuff.end = 0; - fcs->out_thread = 0; - fcs->thread_count = 0; - fcs->out_pos = 0; - fcs->hash_pos = 0; - fcs->end_marked = 0; - fcs->wrote_prop = 0; + fcs->endMarked = 0; + fcs->wroteProp = 0; + fcs->loopCount = 0; - FL2_CCtx_setParameter(fcs->cctx, FL2_p_compressionLevel, compressionLevel); + if(compressionLevel > 0) + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, compressionLevel); -#ifndef NO_XXHASH - if (fcs->cctx->params.doXXH && !fcs->cctx->params.omitProp) { - if (fcs->xxh == NULL) { - fcs->xxh = XXH32_createState(); - if (fcs->xxh == NULL) - return FL2_ERROR(memory_allocation); - } - XXH32_reset(fcs->xxh, 0); - } + DICT_buffer *const buf = &fcs->buf; + size_t const dictSize = fcs->params.rParams.dictionary_size; + + /* Free unsuitable objects before reallocating anything new */ + if (DICT_size(buf) < dictSize) + DICT_destruct(buf); + + FL2_preBeginFrame(fcs, 0); + +#ifdef NO_XXHASH + int const doHash = 0; +#else + int const doHash = (fcs->params.doXXH && !fcs->params.omitProp); #endif + size_t dictOverlap = OVERLAP_FROM_DICT_SIZE(fcs->params.rParams.dictionary_size, fcs->params.rParams.overlap_fraction); + if (DICT_init(buf, dictSize, dictOverlap, fcs->params.cParams.reset_interval, doHash) != 0) + return FL2_ERROR(memory_allocation); + + CHECK_F(FL2_beginFrame(fcs, 0)); - FL2_beginFrame(fcs->cctx); return 0; } -static size_t FL2_compressStream_internal(FL2_CStream* const fcs, - FL2_outBuffer* const output, int const ending) +FL2LIB_API size_t FL2LIB_CALL FL2_setCStreamTimeout(FL2_CStream * fcs, unsigned timeout) { - FL2_CCtx* const cctx = fcs->cctx; - - if (output->pos >= output->size) - return 0; - - if (fcs->out_thread == fcs->thread_count) { - if (fcs->inBuff.start < fcs->inBuff.end) { -#ifndef NO_XXHASH - if (cctx->params.doXXH && !cctx->params.omitProp) { - XXH32_update(fcs->xxh, fcs->inBuff.data + fcs->inBuff.start, fcs->inBuff.end - fcs->inBuff.start); - } -#endif - cctx->curBlock.data = fcs->inBuff.data; - cctx->curBlock.start = fcs->inBuff.start; - cctx->curBlock.end = fcs->inBuff.end; - - fcs->out_thread = 0; - fcs->thread_count = FL2_compressCurBlock(cctx, NULL, NULL); - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - fcs->inBuff.start = fcs->inBuff.end; - } - if (!fcs->wrote_prop && !cctx->params.omitProp) { - size_t dictionary_size = ending ? 
cctx->dictMax : (size_t)1 << cctx->params.rParams.dictionary_log; - ((BYTE*)output->dst)[output->pos] = FL2_getProp(cctx, dictionary_size); - DEBUGLOG(4, "Writing property byte : 0x%X", ((BYTE*)output->dst)[output->pos]); - ++output->pos; - fcs->wrote_prop = 1; +#ifndef FL2_SINGLETHREAD + if (timeout != 0) { + if (fcs->compressThread == NULL) { + fcs->compressThread = FL2POOL_create(1); + if (fcs->compressThread == NULL) + return FL2_ERROR(memory_allocation); } } - for (; fcs->out_thread < fcs->thread_count; ++fcs->out_thread) { - const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[fcs->out_thread].block.start) + fcs->out_pos; + else if (!DICT_async(&fcs->buf) && fcs->dictMax == 0) { + /* Only free the thread if not dual buffering and compression not underway */ + FL2POOL_free(fcs->compressThread); + fcs->compressThread = NULL; + } + fcs->timeout = timeout; +#endif + return FL2_error_no_error; +} + +static size_t FL2_compressStream_internal(FL2_CStream* const fcs, int const ending) +{ + CHECK_F(FL2_waitCStream(fcs)); + + DICT_buffer *const buf = &fcs->buf; + + /* no compression can occur while compressed output exists */ + if (fcs->outThread == fcs->threadCount && DICT_hasUnprocessed(buf)) { + fcs->streamTotal += fcs->curBlock.end - fcs->curBlock.start; + + DICT_getBlock(buf, &fcs->curBlock); + + int streamProp = -1; + + if (!fcs->wroteProp && !fcs->params.omitProp) { + /* If the LZMA2 property byte is required and not already written, + * pass it to the compression function + */ + size_t dictionarySize = ending ? MAX(fcs->dictMax, fcs->curBlock.end) + : fcs->params.rParams.dictionary_size; + streamProp = FL2_getProp(fcs, dictionarySize); + DEBUGLOG(4, "Writing property byte : 0x%X", streamProp); + fcs->wroteProp = 1; + } + + CHECK_F(FL2_compressCurBlock(fcs, streamProp)); + } + return FL2_error_no_error; +} + +/* Copy the compressed output stored in the match table buffer. + * One slice exists per thread. 
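Example usage (a hedged sketch of a conventional streaming loop over FL2_compressStream()/FL2_endStream(); the buffer sizes are arbitrary, and FL2_inBuffer/FL2_outBuffer fields are assigned by the member names used throughout this file rather than by initializer order):

#include <stdio.h>
#include "fast-lzma2.h"

/* Sketch: compress the contents of fin to fout with a single-buffer stream. */
static int stream_compress(FILE *fin, FILE *fout, int level)
{
    static char inBuf[64 * 1024], outBuf[64 * 1024];
    FL2_CStream *fcs = FL2_createCStreamMt(0, 0 /* no dual buffering */);
    if (fcs == NULL)
        return 1;

    size_t res = FL2_initCStream(fcs, level);
    int last = 0;

    while (!FL2_isError(res) && !last) {
        FL2_inBuffer input;
        input.src = inBuf;
        input.size = fread(inBuf, 1, sizeof inBuf, fin);
        input.pos = 0;
        last = input.size < sizeof inBuf;

        do {   /* feed the dictionary; drain compressed output as it appears */
            FL2_outBuffer output;
            output.dst = outBuf;
            output.size = sizeof outBuf;
            output.pos = 0;
            res = FL2_compressStream(fcs, &output, &input);
            fwrite(outBuf, 1, output.pos, fout);
        } while (!FL2_isError(res) && input.pos < input.size);
    }

    while (!FL2_isError(res)) {   /* flush, then write the end marker and hash */
        FL2_outBuffer output;
        output.dst = outBuf;
        output.size = sizeof outBuf;
        output.pos = 0;
        res = FL2_endStream(fcs, &output);
        fwrite(outBuf, 1, output.pos, fout);
        if (res == 0)
            break;
    }

    if (FL2_isError(res))
        fprintf(stderr, "error: %s\n", FL2_getErrorName(res));
    FL2_freeCStream(fcs);
    return FL2_isError(res) != 0;
}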
+ */ +static void FL2_copyCStreamOutput(FL2_CStream* fcs, FL2_outBuffer *output) +{ + for (; fcs->outThread < fcs->threadCount; ++fcs->outThread) { + const BYTE* const outBuf = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[fcs->outThread].block.start) + fcs->outPos; BYTE* const dstBuf = (BYTE*)output->dst + output->pos; size_t const dstCapacity = output->size - output->pos; - size_t to_write = cctx->jobs[fcs->out_thread].cSize; + size_t toWrite = fcs->jobs[fcs->outThread].cSize; - if (FL2_isError(to_write)) - return to_write; + toWrite = MIN(toWrite - fcs->outPos, dstCapacity); - to_write = MIN(to_write - fcs->out_pos, dstCapacity); + DEBUGLOG(5, "CStream : writing %u bytes", (U32)toWrite); - DEBUGLOG(5, "CStream : writing %u bytes", (U32)to_write); + memcpy(dstBuf, outBuf, toWrite); + fcs->outPos += toWrite; + output->pos += toWrite; - memcpy(dstBuf, outBuf, to_write); - fcs->out_pos += to_write; - output->pos += to_write; - - if (fcs->out_pos < cctx->jobs[fcs->out_thread].cSize) + /* If the slice is not flushed, the output is full */ + if (fcs->outPos < fcs->jobs[fcs->outThread].cSize) break; - fcs->out_pos = 0; + fcs->outPos = 0; } - return 0; } -static size_t FL2_remainingOutputSize(FL2_CStream* const fcs) +static size_t FL2_compressStream_input(FL2_CStream* fcs, FL2_inBuffer* input) { - FL2_CCtx* const cctx = fcs->cctx; - size_t pos = fcs->out_pos; - size_t total = 0; + CHECK_F(fcs->asyncRes); - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; + DICT_buffer * const buf = &fcs->buf; - for (size_t u = fcs->out_thread; u < fcs->thread_count; ++u) { - size_t to_write = cctx->jobs[u].cSize; - - if (FL2_isError(to_write)) - return to_write; - total += to_write - pos; - pos = 0; - } - return total; -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer* output, FL2_inBuffer* input) -{ - FL2_blockBuffer* const inBuff = &fcs->inBuff; - FL2_CCtx* const cctx = fcs->cctx; - size_t block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - if (output->pos < output->size) while (input->pos < input->size) { - /* read input and/or write output until a buffer is full */ - if (inBuff->data == NULL) { - inBuff->bufSize = (size_t)1 << cctx->params.rParams.dictionary_log; - - DEBUGLOG(3, "Allocating input buffer : %u bytes", (U32)inBuff->bufSize); - - inBuff->data = malloc(inBuff->bufSize); - - if (inBuff->data == NULL) - return FL2_ERROR(memory_allocation); - - inBuff->start = 0; - inBuff->end = 0; + while (input->pos < input->size) { + /* read input until the buffer(s) are full */ + if (DICT_needShift(buf)) { + /* cannot shift single dict during compression */ + if(!DICT_async(buf)) + CHECK_F(FL2_waitCStream(fcs)); + DICT_shift(buf); } - if (inBuff->start > block_overlap && input->pos < input->size) { - FL2_shiftBlock(fcs->cctx, inBuff); + + CHECK_F(fcs->asyncRes); + + DICT_put(buf, input); + + if (!DICT_availSpace(buf)) { + /* break if the compressor is not available */ + if (fcs->outThread < fcs->threadCount) + break; + + CHECK_F(FL2_compressStream_internal(fcs, 0)); } - if (fcs->out_thread == fcs->thread_count) { - /* no compressed output to write, so read */ - size_t const toRead = MIN(input->size - input->pos, inBuff->bufSize - inBuff->end); - DEBUGLOG(5, "CStream : reading %u bytes", (U32)toRead); - - memcpy(inBuff->data + inBuff->end, (char*)input->src + input->pos, toRead); - input->pos += toRead; - inBuff->end += 
toRead; - } - if (inBuff->end == inBuff->bufSize || fcs->out_thread < fcs->thread_count) { - CHECK_F(FL2_compressStream_internal(fcs, output, 0)); - } - /* compressed output remains, so output buffer is full */ - if (fcs->out_thread < fcs->thread_count) - break; - } - return (inBuff->data == NULL) ? (size_t)1 << cctx->params.rParams.dictionary_log : inBuff->bufSize - inBuff->end; -} - -static size_t FL2_flushStream_internal(FL2_CStream* fcs, FL2_outBuffer* output, int ending) -{ - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - DEBUGLOG(4, "FL2_flushStream_internal : %u to compress, %u to write", - (U32)(fcs->inBuff.end - fcs->inBuff.start), - (U32)FL2_remainingOutputSize(fcs)); - - CHECK_F(FL2_compressStream_internal(fcs, output, ending)); - - return FL2_remainingOutputSize(fcs); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer* output) -{ - return FL2_flushStream_internal(fcs, output, 0); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer* output) -{ - { size_t cSize = FL2_flushStream_internal(fcs, output, 1); - if (cSize != 0) - return cSize; + CHECK_F(fcs->asyncRes); } - if(!fcs->end_marked) { - if (output->pos >= output->size) - return 1; - DEBUGLOG(4, "Writing end marker"); - ((BYTE*)output->dst)[output->pos] = LZMA2_END_MARKER; - ++output->pos; - fcs->end_marked = 1; + return FL2_error_no_error; +} + +static size_t FL2_loopCheck(FL2_CStream* fcs, int unchanged) +{ + if (unchanged) { + ++fcs->loopCount; + if (fcs->loopCount > FL2_MAX_LOOPS) + return FL2_ERROR(buffer); } + else { + fcs->loopCount = 0; + } + return FL2_error_no_error; +} -#ifndef NO_XXHASH - if (fcs->cctx->params.doXXH && !fcs->cctx->params.omitProp && fcs->hash_pos < XXHASH_SIZEOF) { - size_t const to_write = MIN(output->size - output->pos, XXHASH_SIZEOF - fcs->hash_pos); - XXH32_canonical_t canonical; +FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer *output, FL2_inBuffer* input) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); - if (output->pos >= output->size) - return 1; + size_t const prevIn = input->pos; + size_t const prevOut = (output != NULL) ? 
output->pos : 0; - XXH32_canonicalFromHash(&canonical, XXH32_digest(fcs->xxh)); - DEBUGLOG(4, "Writing XXH32 : %u bytes", (U32)to_write); - memcpy((BYTE*)output->dst + output->pos, canonical.digest + fcs->hash_pos, to_write); - output->pos += to_write; - fcs->hash_pos += to_write; - return fcs->hash_pos < XXHASH_SIZEOF; + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_compressStream_input(fcs, input)); + + if(output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_loopCheck(fcs, prevIn == input->pos && (output == NULL || prevOut == output->pos))); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getDictionaryBuffer(FL2_CStream * fcs, FL2_dictBuffer * dict) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); + + CHECK_F(fcs->asyncRes); + + DICT_buffer *buf = &fcs->buf; + + if (!DICT_availSpace(buf) && DICT_hasUnprocessed(buf)) + CHECK_F(FL2_compressStream_internal(fcs, 0)); + + if (DICT_needShift(buf) && !DICT_async(buf)) + CHECK_F(FL2_waitCStream(fcs)); + + dict->size = (unsigned long)DICT_get(buf, &dict->dst); + + return FL2_error_no_error; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_updateDictionary(FL2_CStream * fcs, size_t addedSize) +{ + if (DICT_update(&fcs->buf, addedSize)) + CHECK_F(FL2_compressStream_internal(fcs, 0)); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getNextCStreamBuffer(FL2_CStream* fcs, FL2_cBuffer* cbuf) +{ + cbuf->src = NULL; + cbuf->size = 0; + +#ifndef FL2_SINGLETHREAD + FL2POOL_waitAll(fcs->compressThread, 0); + CHECK_F(fcs->asyncRes); +#endif + + if (fcs->outThread < fcs->threadCount) { + cbuf->src = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[fcs->outThread].block.start) + fcs->outPos; + cbuf->size = fcs->jobs[fcs->outThread].cSize - fcs->outPos; + ++fcs->outThread; + fcs->outPos = 0; + } + return cbuf->size; +} + +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getCStreamProgress(const FL2_CStream * fcs, unsigned long long *outputSize) +{ + if (outputSize != NULL) + *outputSize = fcs->streamCsize + fcs->progressOut; + + U64 const encodeSize = fcs->curBlock.end - fcs->curBlock.start; + + if (fcs->progressIn == 0 && fcs->curBlock.end != 0) + return fcs->streamTotal + ((fcs->matchTable->progress * encodeSize / fcs->curBlock.end * fcs->rmfWeight) >> 4); + + return fcs->streamTotal + ((fcs->rmfWeight * encodeSize) >> 4) + ((fcs->progressIn * fcs->encWeight) >> 4); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_waitCStream(FL2_CStream * fcs) +{ +#ifndef FL2_SINGLETHREAD + if (FL2POOL_waitAll(fcs->compressThread, fcs->timeout) != 0) + return FL2_ERROR(timedOut); + CHECK_F(fcs->asyncRes); +#endif + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API void FL2LIB_CALL FL2_cancelCStream(FL2_CStream *fcs) +{ +#ifndef FL2_SINGLETHREAD + if (fcs->compressThread != NULL) { + fcs->canceled = 1; + + RMF_cancelBuild(fcs->matchTable); + FL2POOL_waitAll(fcs->compressThread, 0); + + fcs->canceled = 0; } #endif - return 0; + FL2_endFrame(fcs); } -FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, unsigned value) +FL2LIB_API size_t FL2LIB_CALL FL2_remainingOutputSize(const FL2_CStream* fcs) { - if (fcs->inBuff.start < fcs->inBuff.end) - return FL2_ERROR(stage_wrong); - return FL2_CCtx_setParameter(fcs->cctx, param, value); + CHECK_F(fcs->asyncRes); + + size_t cSize = 0; + for (size_t u = fcs->outThread; u < fcs->threadCount; ++u) + 
cSize += fcs->jobs[u].cSize; + + return cSize; } - -size_t FL2_memoryUsage_internal(unsigned const dictionaryLog, unsigned const bufferLog, unsigned const searchDepth, - unsigned chainLog, FL2_strategy strategy, - unsigned nbThreads) +/* Write the properties byte (if required), the hash and the end marker + * into the output buffer. + */ +static void FL2_writeEnd(FL2_CStream* const fcs) { - size_t size = RMF_memoryUsage(dictionaryLog, bufferLog, searchDepth, nbThreads); - return size + FL2_lzma2MemoryUsage(chainLog, strategy, nbThreads); + size_t thread = fcs->threadCount - 1; + if (fcs->outThread == fcs->threadCount) { + fcs->outThread = 0; + fcs->threadCount = 1; + fcs->jobs[0].cSize = 0; + thread = 0; + } + BYTE *const dst = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[thread].block.start) + + fcs->jobs[thread].cSize; + + size_t pos = 0; + + if (!fcs->wroteProp && !fcs->params.omitProp) { + /* no compression occurred */ + dst[pos] = FL2_getProp(fcs, 0); + DEBUGLOG(4, "Writing property byte : 0x%X", dst[pos]); + ++pos; + fcs->wroteProp = 1; + } + + DEBUGLOG(4, "Writing end marker"); + dst[pos++] = LZMA2_END_MARKER; + +#ifndef NO_XXHASH + if (fcs->params.doXXH && !fcs->params.omitProp) { + XXH32_canonical_t canonical; + + XXH32_canonicalFromHash(&canonical, DICT_getDigest(&fcs->buf)); + DEBUGLOG(4, "Writing XXH32"); + memcpy(dst + pos, &canonical, XXHASH_SIZEOF); + + pos += XXHASH_SIZEOF; + } +#endif + fcs->jobs[thread].cSize += pos; + fcs->endMarked = 1; + + FL2_endFrame(fcs); +} + +static size_t FL2_flushStream_internal(FL2_CStream* fcs, int const ending) +{ + CHECK_F(fcs->asyncRes); + + DEBUGLOG(4, "FL2_flushStream_internal : %u to compress, %u to write", + (U32)(fcs->buf.end - fcs->buf.start), + (U32)FL2_remainingOutputSize(fcs)); + + CHECK_F(FL2_compressStream_internal(fcs, ending)); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer *output) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); + + size_t const prevOut = (output != NULL) ? output->pos : 0; + + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + size_t res = FL2_flushStream_internal(fcs, 0); + CHECK_F(res); + + if (output != NULL && res != 0) { + FL2_copyCStreamOutput(fcs, output); + res = fcs->outThread < fcs->threadCount; + } + + CHECK_F(FL2_loopCheck(fcs, output != NULL && prevOut == output->pos)); + + return res; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer *output) +{ + if (!fcs->endMarked && !fcs->lockParams) + return FL2_ERROR(init_missing); + + size_t const prevOut = (output != NULL) ? 
output->pos : 0; + + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_flushStream_internal(fcs, 1)); + + size_t res = FL2_waitCStream(fcs); + CHECK_F(res); + + if (!fcs->endMarked && !DICT_hasUnprocessed(&fcs->buf)) { + FL2_writeEnd(fcs); + res = 1; + } + + if (output != NULL && res != 0) { + FL2_copyCStreamOutput(fcs, output); + res = fcs->outThread < fcs->threadCount || DICT_hasUnprocessed(&fcs->buf); + } + + CHECK_F(FL2_loopCheck(fcs, output != NULL && prevOut == output->pos)); + + return res; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getLevelParameters(int compressionLevel, int high, FL2_compressionParameters * params) +{ + if (high) { + if (compressionLevel < 0 || compressionLevel > FL2_MAX_HIGH_CLEVEL) + return FL2_ERROR(parameter_outOfBound); + *params = FL2_highCParameters[compressionLevel]; + } + else { + if (compressionLevel < 0 || compressionLevel > FL2_MAX_CLEVEL) + return FL2_ERROR(parameter_outOfBound); + *params = FL2_defaultCParameters[compressionLevel]; + } + return FL2_error_no_error; +} + +static size_t FL2_memoryUsage_internal(size_t const dictionarySize, unsigned const bufferLog, + unsigned const chainLog, + FL2_strategy const strategy, + unsigned const nbThreads) +{ + return RMF_memoryUsage(dictionarySize, bufferLog, nbThreads) + + LZMA2_encMemoryUsage(chainLog, strategy, nbThreads); } FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize(int compressionLevel, unsigned nbThreads) { - return FL2_memoryUsage_internal(FL2_defaultCParameters[compressionLevel].dictionaryLog, - FL2_defaultCParameters[compressionLevel].bufferLog, - FL2_defaultCParameters[compressionLevel].searchDepth, - FL2_defaultCParameters[compressionLevel].chainLog, - FL2_defaultCParameters[compressionLevel].strategy, + if (compressionLevel == 0) + compressionLevel = FL2_CLEVEL_DEFAULT; + + CLAMPCHECK(compressionLevel, 1, FL2_MAX_CLEVEL); + + return FL2_estimateCCtxSize_byParams(FL2_defaultCParameters + compressionLevel, nbThreads); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_byParams(const FL2_compressionParameters * params, unsigned nbThreads) +{ + nbThreads = FL2_checkNbThreads(nbThreads); + return FL2_memoryUsage_internal(params->dictionarySize, + params->bufferLog, + params->chainLog, + params->strategy, nbThreads); } FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_usingCCtx(const FL2_CCtx * cctx) { - return FL2_memoryUsage_internal(cctx->params.rParams.dictionary_log, + return FL2_memoryUsage_internal(cctx->params.rParams.dictionary_size, cctx->params.rParams.match_buffer_log, - cctx->params.rParams.depth, cctx->params.cParams.second_dict_bits, cctx->params.cParams.strategy, - cctx->jobCount); + cctx->jobCount) + DICT_memUsage(&cctx->buf); } -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads) +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads, int dualBuffer) { return FL2_estimateCCtxSize(compressionLevel, nbThreads) - + ((size_t)1 << FL2_defaultCParameters[compressionLevel].dictionaryLog); + + (FL2_defaultCParameters[compressionLevel].dictionarySize << (dualBuffer != 0)); } -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCCtx(const FL2_CStream* fcs) +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_byParams(const FL2_compressionParameters * params, unsigned nbThreads, int dualBuffer) { - return FL2_estimateCCtxSize_usingCCtx(fcs->cctx) - + ((size_t)1 << fcs->cctx->params.rParams.dictionary_log); + return 
FL2_estimateCCtxSize_byParams(params, nbThreads) + + (params->dictionarySize << (dualBuffer != 0)); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCStream(const FL2_CStream* fcs) +{ + return FL2_estimateCCtxSize_usingCCtx(fcs); } diff --git a/C/fast-lzma2/fl2_compress_internal.h b/C/fast-lzma2/fl2_compress_internal.h index ae69bd8f..166457ea 100644 --- a/C/fast-lzma2/fl2_compress_internal.h +++ b/C/fast-lzma2/fl2_compress_internal.h @@ -20,8 +20,9 @@ #include "radix_internal.h" #include "lzma2_enc.h" #include "fast-lzma2.h" -#include "fl2threading.h" -#include "fl2pool.h" +#include "fl2_threading.h" +#include "fl2_pool.h" +#include "dict_buffer.h" #ifndef NO_XXHASH # include "xxhash.h" #endif @@ -30,19 +31,6 @@ extern "C" { #endif -typedef struct { - unsigned dictionaryLog; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory, slower */ - unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ - unsigned chainLog; /* fully searched segment : larger == more compression, slower, more memory; hybrid mode only (ultra) */ - unsigned searchLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ - unsigned searchDepth; /* maximum depth for resolving string matches : larger == more compression, slower; >= 64 == more memory, slower */ - unsigned fastLength; /* acceptable match size for parser, not less than searchDepth : larger == more compression, slower; fast bytes parameter from 7-zip */ - unsigned divideAndConquer; /* split long chains of 2-byte matches into shorter chains with a small overlap : faster, somewhat less compression; enabled by default */ - unsigned bufferLog; /* buffer size for processing match chains is (dictionaryLog - bufferLog) : when divideAndConquer enabled, affects compression; */ - /* when divideAndConquer disabled, affects speed in a hardware-dependent manner */ - FL2_strategy strategy; /* encoder strategy : fast, optimized or ultra (hybrid) */ -} FL2_compressionParameters; - /*-************************************* * Context memory management ***************************************/ @@ -60,38 +48,43 @@ typedef struct { typedef struct { FL2_CCtx* cctx; - FL2_lzmaEncoderCtx* enc; + LZMA2_ECtx* enc; FL2_dataBlock block; size_t cSize; } FL2_job; struct FL2_CCtx_s { + DICT_buffer buf; FL2_CCtx_params params; #ifndef FL2_SINGLETHREAD FL2POOL_ctx* factory; + FL2POOL_ctx* compressThread; #endif FL2_dataBlock curBlock; + size_t asyncRes; + size_t threadCount; + size_t outThread; + size_t outPos; size_t dictMax; - U64 block_total; + U64 streamTotal; + U64 streamCsize; FL2_matchTable* matchTable; +#ifndef FL2_SINGLETHREAD + U32 timeout; +#endif + U32 rmfWeight; + U32 encWeight; + FL2_atomic progressIn; + FL2_atomic progressOut; + int canceled; + BYTE wroteProp; + BYTE endMarked; + BYTE loopCount; + BYTE lockParams; unsigned jobCount; FL2_job jobs[1]; }; -struct FL2_CStream_s { - FL2_CCtx* cctx; - FL2_blockBuffer inBuff; -#ifndef NO_XXHASH - XXH32_state_t *xxh; -#endif - size_t thread_count; - size_t out_thread; - size_t out_pos; - size_t hash_pos; - BYTE end_marked; - BYTE wrote_prop; -}; - #if defined (__cplusplus) } #endif diff --git a/C/fast-lzma2/fl2_error_private.c b/C/fast-lzma2/fl2_error_private.c deleted file mode 100644 index 66289586..00000000 --- a/C/fast-lzma2/fl2_error_private.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. 
- * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -/* The purpose of this file is to have a single list of error strings embedded in binary */ - -#include "fl2_error_private.h" - -const char* ERR_getFL2ErrorString(ERR_enum code) -{ - static const char* const notErrorCode = "Unspecified error code"; - switch( code ) - { - case PREFIX(no_error): return "No error detected"; - case PREFIX(GENERIC): return "Error (generic)"; - case PREFIX(corruption_detected): return "Corrupted block detected"; - case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; - case PREFIX(parameter_unsupported): return "Unsupported parameter"; - case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; - case PREFIX(init_missing): return "Context should be init first"; - case PREFIX(memory_allocation): return "Allocation error : not enough memory"; - case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; - case PREFIX(srcSize_wrong): return "Src size is incorrect"; - /* following error codes are not stable and may be removed or changed in a future version */ - case PREFIX(maxCode): - default: return notErrorCode; - } -} diff --git a/C/fast-lzma2/fl2_error_private.h b/C/fast-lzma2/fl2_error_private.h deleted file mode 100644 index 32532a9b..00000000 --- a/C/fast-lzma2/fl2_error_private.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. - * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. 
- */ - -/* Note : this module is expected to remain private, do not expose it */ - -#ifndef ERROR_H_MODULE -#define ERROR_H_MODULE - -#if defined (__cplusplus) -extern "C" { -#endif - - -/* **************************************** -* Dependencies -******************************************/ -#include /* size_t */ -#include "fl2_errors.h" /* enum list */ - - -/* **************************************** -* Compiler-specific -******************************************/ -#if defined(__GNUC__) -# define ERR_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define ERR_STATIC static inline -#elif defined(_MSC_VER) -# define ERR_STATIC static __inline -#else -# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif - - -/*-**************************************** -* Customization (error_public.h) -******************************************/ -typedef FL2_ErrorCode ERR_enum; -#define PREFIX(name) FL2_error_##name - - -/*-**************************************** -* Error codes handling -******************************************/ -#define FL2_ERROR(name) ((size_t)-PREFIX(name)) - -ERR_STATIC unsigned ERR_isError(size_t code) { return (code > FL2_ERROR(maxCode)); } - -ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } - - -/*-**************************************** -* Error Strings -******************************************/ - -const char* ERR_getFL2ErrorString(ERR_enum code); /* error_private.c */ - -ERR_STATIC const char* ERR_getErrorName(size_t code) -{ - return ERR_getFL2ErrorString(ERR_getErrorCode(code)); -} - -#if defined (__cplusplus) -} -#endif - -#endif /* ERROR_H_MODULE */ diff --git a/C/fast-lzma2/fl2_errors.h b/C/fast-lzma2/fl2_errors.h index d669618f..1068f463 100644 --- a/C/fast-lzma2/fl2_errors.h +++ b/C/fast-lzma2/fl2_errors.h @@ -28,21 +28,23 @@ extern "C" { * only static linking is allowed ******************************************/ typedef enum { - FL2_error_no_error = 0, - FL2_error_GENERIC = 1, - FL2_error_internal = 2, - FL2_error_corruption_detected = 3, - FL2_error_checksum_wrong = 4, + FL2_error_no_error = 0, + FL2_error_GENERIC = 1, + FL2_error_internal = 2, + FL2_error_corruption_detected = 3, + FL2_error_checksum_wrong = 4, FL2_error_parameter_unsupported = 5, FL2_error_parameter_outOfBound = 6, - FL2_error_stage_wrong = 7, - FL2_error_init_missing = 8, - FL2_error_memory_allocation = 9, - FL2_error_dstSize_tooSmall = 10, - FL2_error_srcSize_wrong = 11, - FL2_error_write_failed = 12, - FL2_error_canceled = 13, - FL2_error_maxCode = 20 /* never EVER use this value directly, it can change in future versions! Use FL2_isError() instead */ + FL2_error_lclpMax_exceeded = 7, + FL2_error_stage_wrong = 8, + FL2_error_init_missing = 9, + FL2_error_memory_allocation = 10, + FL2_error_dstSize_tooSmall = 11, + FL2_error_srcSize_wrong = 12, + FL2_error_canceled = 13, + FL2_error_buffer = 14, + FL2_error_timedOut = 15, + FL2_error_maxCode = 20 /* never EVER use this value directly, it can change in future versions! Use FL2_isError() instead */ } FL2_ErrorCode; /*! 
FL2_getErrorCode() : diff --git a/C/fast-lzma2/fl2_internal.h b/C/fast-lzma2/fl2_internal.h index aedda654..9f666458 100644 --- a/C/fast-lzma2/fl2_internal.h +++ b/C/fast-lzma2/fl2_internal.h @@ -18,19 +18,30 @@ ***************************************/ #include "mem.h" #include "compiler.h" -#include "fl2_error_private.h" #if defined (__cplusplus) extern "C" { #endif + +/*-**************************************** +* Error codes handling +******************************************/ +#define PREFIX(name) FL2_error_##name +#define FL2_ERROR(name) ((size_t)-PREFIX(name)) + + +/*-************************************* +* Stream properties +***************************************/ #define FL2_PROP_HASH_BIT 7 #define FL2_LZMA_PROP_MASK 0x3FU #ifndef NO_XXHASH # define XXHASH_SIZEOF sizeof(XXH32_canonical_t) #endif + /*-************************************* * Debug ***************************************/ @@ -77,8 +88,8 @@ extern int g_debuglog_enable; #undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) -#define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; } /* check and Forward error code */ -#define CHECK_E(f, e) { size_t const errcod = f; if (ERR_isError(errcod)) return FL2_ERROR(e); } /* check and send Error code */ +#define CHECK_F(f) do { size_t const errcod = f; if (FL2_isError(errcod)) return errcod; } while(0) /* check and Forward error code */ +#define CHECK_E(f, e) do { size_t const errcod = f; if (FL2_isError(errcod)) return FL2_ERROR(e); } while(0) /* check and send Error code */ MEM_STATIC U32 ZSTD_highbit32(U32 val) { diff --git a/C/fast-lzma2/fl2_pool.c b/C/fast-lzma2/fl2_pool.c new file mode 100644 index 00000000..8f90b44c --- /dev/null +++ b/C/fast-lzma2/fl2_pool.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * Modified for FL2 by Conor McCarthy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* ====== Dependencies ======= */ +#include /* size_t */ +#include /* malloc, calloc */ +#include "fl2_pool.h" +#include "fl2_internal.h" + + +#ifndef FL2_SINGLETHREAD + +#include "fl2_threading.h" /* pthread adaptation */ + +struct FL2POOL_ctx_s { + /* Keep track of the threads */ + size_t numThreads; + + /* All threads work on the same function and object during a job */ + FL2POOL_function function; + void *opaque; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates the number of threads requested and the values to pass */ + ptrdiff_t queueIndex; + ptrdiff_t queueEnd; + + /* The mutex protects the queue */ + FL2_pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + FL2_pthread_cond_t busyCond; + /* Condition variable for poppers to wait on when the queue is empty */ + FL2_pthread_cond_t newJobsCond; + /* Indicates if the queue is shutting down */ + int shutdown; + + /* The threads. Extras to be calloc'd */ + FL2_pthread_t threads[1]; +}; + +/* FL2POOL_thread() : + Work thread for the thread pool. + Waits for jobs and executes them. + @returns : NULL on failure else non-null. 
+*/ +static void* FL2POOL_thread(void* opaque) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)opaque; + if (!ctx) { return NULL; } + FL2_pthread_mutex_lock(&ctx->queueMutex); + for (;;) { + + /* While the mutex is locked, wait for a non-empty queue or until shutdown */ + while (ctx->queueIndex >= ctx->queueEnd && !ctx->shutdown) { + FL2_pthread_cond_wait(&ctx->newJobsCond, &ctx->queueMutex); + } + /* empty => shutting down: so stop */ + if (ctx->shutdown) { + FL2_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + /* Pop a job off the queue */ + size_t n = ctx->queueIndex; + ++ctx->queueIndex; + ++ctx->numThreadsBusy; + /* Unlock the mutex and run the job */ + FL2_pthread_mutex_unlock(&ctx->queueMutex); + + ctx->function(ctx->opaque, n); + + FL2_pthread_mutex_lock(&ctx->queueMutex); + --ctx->numThreadsBusy; + /* Signal the master thread waiting for jobs to complete */ + FL2_pthread_cond_signal(&ctx->busyCond); + } /* for (;;) */ + /* Unreachable */ +} + +FL2POOL_ctx* FL2POOL_create(size_t numThreads) +{ + FL2POOL_ctx* ctx; + /* Check the parameters */ + if (!numThreads) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = calloc(1, sizeof(FL2POOL_ctx) + (numThreads - 1) * sizeof(FL2_pthread_t)); + if (!ctx) { return NULL; } + /* Initialize the busy count and jobs range */ + ctx->numThreadsBusy = 0; + ctx->queueIndex = 0; + ctx->queueEnd = 0; + (void)FL2_pthread_mutex_init(&ctx->queueMutex, NULL); + (void)FL2_pthread_cond_init(&ctx->busyCond, NULL); + (void)FL2_pthread_cond_init(&ctx->newJobsCond, NULL); + ctx->shutdown = 0; + ctx->numThreads = 0; + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (FL2_pthread_create(&ctx->threads[i], NULL, &FL2POOL_thread, ctx)) { + ctx->numThreads = i; + FL2POOL_free(ctx); + return NULL; + } } + ctx->numThreads = numThreads; + } + return ctx; +} + +/*! FL2POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. 
+*/ +static void FL2POOL_join(FL2POOL_ctx* ctx) +{ + /* Shut down the queue */ + FL2_pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + /* Wake up sleeping threads */ + FL2_pthread_cond_broadcast(&ctx->newJobsCond); + FL2_pthread_mutex_unlock(&ctx->queueMutex); + /* Join all of the threads */ + for (size_t i = 0; i < ctx->numThreads; ++i) + FL2_pthread_join(ctx->threads[i], NULL); +} + +void FL2POOL_free(FL2POOL_ctx *ctx) +{ + if (!ctx) { return; } + FL2POOL_join(ctx); + FL2_pthread_mutex_destroy(&ctx->queueMutex); + FL2_pthread_cond_destroy(&ctx->busyCond); + FL2_pthread_cond_destroy(&ctx->newJobsCond); + free(ctx); +} + +size_t FL2POOL_sizeof(FL2POOL_ctx *ctx) +{ + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + ctx->numThreads * sizeof(FL2_pthread_t); +} + +void FL2POOL_addRange(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t first, ptrdiff_t end) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; + if (!ctx) + return; + + /* Callers always wait for jobs to complete before adding a new set */ + assert(!ctx->numThreadsBusy); + + FL2_pthread_mutex_lock(&ctx->queueMutex); + ctx->function = function; + ctx->opaque = opaque; + ctx->queueIndex = first; + ctx->queueEnd = end; + FL2_pthread_cond_broadcast(&ctx->newJobsCond); + FL2_pthread_mutex_unlock(&ctx->queueMutex); +} + +void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t n) +{ + FL2POOL_addRange(ctxVoid, function, opaque, n, n + 1); +} + +int FL2POOL_waitAll(void *ctxVoid, unsigned timeout) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; + if (!ctx || (!ctx->numThreadsBusy && ctx->queueIndex >= ctx->queueEnd) || ctx->shutdown) { return 0; } + + FL2_pthread_mutex_lock(&ctx->queueMutex); + /* Need to test for ctx->queueIndex < ctx->queueEnd in case not all jobs have started */ + if (timeout != 0) { + if ((ctx->numThreadsBusy || ctx->queueIndex < ctx->queueEnd) && !ctx->shutdown) + FL2_pthread_cond_timedwait(&ctx->busyCond, &ctx->queueMutex, timeout); + } + else { + while ((ctx->numThreadsBusy || ctx->queueIndex < ctx->queueEnd) && !ctx->shutdown) + FL2_pthread_cond_wait(&ctx->busyCond, &ctx->queueMutex); + } + FL2_pthread_mutex_unlock(&ctx->queueMutex); + return ctx->numThreadsBusy && !ctx->shutdown; +} + +size_t FL2POOL_threadsBusy(void * ctx) +{ + return ((FL2POOL_ctx*)ctx)->numThreadsBusy; +} + +#endif /* FL2_SINGLETHREAD */ diff --git a/C/fast-lzma2/fl2pool.h b/C/fast-lzma2/fl2_pool.h similarity index 76% rename from C/fast-lzma2/fl2pool.h rename to C/fast-lzma2/fl2_pool.h index 9c99f3c5..ccf1d003 100644 --- a/C/fast-lzma2/fl2pool.h +++ b/C/fast-lzma2/fl2_pool.h @@ -42,16 +42,20 @@ size_t FL2POOL_sizeof(FL2POOL_ctx *ctx); /*! FL2POOL_function : The function type that can be added to a thread pool. */ -typedef void(*FL2POOL_function)(void *, size_t); +typedef void(*FL2POOL_function)(void *, ptrdiff_t); /*! FL2POOL_add() : Add the job `function(opaque)` to the thread pool. +FL2POOL_addRange adds multiple jobs with size_t parameter from first to less than end. Possibly blocks until there is room in the queue. Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed. 
*/ -void FL2POOL_add(void *ctx, FL2POOL_function function, void *opaque, size_t n); +void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t n); +void FL2POOL_addRange(void *ctx, FL2POOL_function function, void *opaque, ptrdiff_t first, ptrdiff_t end); -void FL2POOL_waitAll(void *ctx); +int FL2POOL_waitAll(void *ctx, unsigned timeout); + +size_t FL2POOL_threadsBusy(void *ctx); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/fl2threading.c b/C/fast-lzma2/fl2_threading.c similarity index 73% rename from C/fast-lzma2/fl2threading.c rename to C/fast-lzma2/fl2_threading.c index 3372b109..d4ac2e91 100644 --- a/C/fast-lzma2/fl2threading.c +++ b/C/fast-lzma2/fl2_threading.c @@ -17,6 +17,10 @@ /* create fake symbol to avoid empty translation unit warning */ int g_ZSTD_threading_useles_symbol; +#include "fast-lzma2.h" +#include "fl2_threading.h" +#include "util.h" + #if !defined(FL2_SINGLETHREAD) && defined(_WIN32) /** @@ -28,19 +32,18 @@ int g_ZSTD_threading_useles_symbol; /* === Dependencies === */ #include #include -#include "fl2threading.h" /* === Implementation === */ static unsigned __stdcall worker(void *arg) { - ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg; + FL2_pthread_t* const thread = (FL2_pthread_t*) arg; thread->arg = thread->start_routine(thread->arg); return 0; } -int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, +int FL2_pthread_create(FL2_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg) { (void)unused; @@ -54,7 +57,7 @@ int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, return 0; } -int FL2_pthread_join(ZSTD_pthread_t thread, void **value_ptr) +int FL2_pthread_join(FL2_pthread_t thread, void **value_ptr) { DWORD result; @@ -73,3 +76,20 @@ int FL2_pthread_join(ZSTD_pthread_t thread, void **value_ptr) } #endif /* FL2_SINGLETHREAD */ + +unsigned FL2_checkNbThreads(unsigned nbThreads) +{ +#ifndef FL2_SINGLETHREAD + if (nbThreads == 0) { + nbThreads = UTIL_countPhysicalCores(); + nbThreads += !nbThreads; + } + if (nbThreads > FL2_MAXTHREADS) { + nbThreads = FL2_MAXTHREADS; + } +#else + nbThreads = 1; +#endif + return nbThreads; +} + diff --git a/C/fast-lzma2/fl2_threading.h b/C/fast-lzma2/fl2_threading.h new file mode 100644 index 00000000..c9259b0c --- /dev/null +++ b/C/fast-lzma2/fl2_threading.h @@ -0,0 +1,178 @@ +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + */ + +#ifndef THREADING_H_938743 +#define THREADING_H_938743 + +#include "mem.h" + +#ifndef FL2_XZ_BUILD +# ifdef _WIN32 +# define MYTHREAD_VISTA +# else +# define MYTHREAD_POSIX /* posix assumed ; need a better detection method */ +# endif +#elif defined(HAVE_CONFIG_H) +# include +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +unsigned FL2_checkNbThreads(unsigned nbThreads); + + +#if !defined(FL2_SINGLETHREAD) && defined(MYTHREAD_VISTA) + +/** + * Windows minimalist Pthread Wrapper, based on : + * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + */ +#ifdef WINVER +# undef WINVER +#endif +#define WINVER 0x0600 + +#ifdef _WIN32_WINNT +# undef _WIN32_WINNT +#endif +#define _WIN32_WINNT 0x0600 + +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#include +#include + + +/* mutex */ +#define FL2_pthread_mutex_t CRITICAL_SECTION +#define FL2_pthread_mutex_init(a, b) (InitializeCriticalSection((a)), 0) +#define FL2_pthread_mutex_destroy(a) DeleteCriticalSection((a)) +#define FL2_pthread_mutex_lock(a) EnterCriticalSection((a)) +#define FL2_pthread_mutex_unlock(a) LeaveCriticalSection((a)) + +/* condition variable */ +#define FL2_pthread_cond_t CONDITION_VARIABLE +#define FL2_pthread_cond_init(a, b) (InitializeConditionVariable((a)), 0) +#define FL2_pthread_cond_destroy(a) /* No delete */ +#define FL2_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) +#define FL2_pthread_cond_timedwait(a, b, c) SleepConditionVariableCS((a), (b), (c)) +#define FL2_pthread_cond_signal(a) WakeConditionVariable((a)) +#define FL2_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) + +/* FL2_pthread_create() and FL2_pthread_join() */ +typedef struct { + HANDLE handle; + void* (*start_routine)(void*); + void* arg; +} FL2_pthread_t; + +int FL2_pthread_create(FL2_pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg); + +int FL2_pthread_join(FL2_pthread_t thread, void** value_ptr); + +/** + * add here more wrappers as required + */ + + +#elif !defined(FL2_SINGLETHREAD) && defined(MYTHREAD_POSIX) +/* === POSIX Systems === */ +# include +# include + +#define FL2_pthread_mutex_t pthread_mutex_t +#define FL2_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) +#define FL2_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) +#define FL2_pthread_mutex_lock(a) pthread_mutex_lock((a)) +#define FL2_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) + +#define FL2_pthread_cond_t pthread_cond_t +#define FL2_pthread_cond_init(a, b) pthread_cond_init((a), (b)) +#define FL2_pthread_cond_destroy(a) pthread_cond_destroy((a)) +#define FL2_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) +#define FL2_pthread_cond_signal(a) pthread_cond_signal((a)) +#define FL2_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) + +#define FL2_pthread_t pthread_t +#define FL2_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define FL2_pthread_join(a, b) pthread_join((a),(b)) + +/* Timed wait functions from XZ by Lasse Collin +*/ + +/* Sets condtime to the absolute time that is timeout_ms milliseconds + * in the future. 
+ */ +static inline void +mythread_condtime_set(struct timespec *condtime, U32 timeout_ms) +{ + condtime->tv_sec = timeout_ms / 1000; + condtime->tv_nsec = (timeout_ms % 1000) * 1000000; + + struct timeval now; + gettimeofday(&now, NULL); + + condtime->tv_sec += now.tv_sec; + condtime->tv_nsec += now.tv_usec * 1000L; + + /* tv_nsec must stay in the range [0, 999_999_999]. */ + if (condtime->tv_nsec >= 1000000000L) { + condtime->tv_nsec -= 1000000000L; + ++condtime->tv_sec; + } +} + +/* Waits on a condition or until a timeout expires. If the timeout expires, + * non-zero is returned, otherwise zero is returned. + */ +static inline void +FL2_pthread_cond_timedwait(FL2_pthread_cond_t *cond, FL2_pthread_mutex_t *mutex, + U32 timeout_ms) +{ + struct timespec condtime; + mythread_condtime_set(&condtime, timeout_ms); + pthread_cond_timedwait(cond, mutex, &condtime); +} + + +#elif defined(FL2_SINGLETHREAD) +/* No multithreading support */ + +typedef int FL2_pthread_mutex_t; +#define FL2_pthread_mutex_init(a, b) ((void)a, 0) +#define FL2_pthread_mutex_destroy(a) +#define FL2_pthread_mutex_lock(a) +#define FL2_pthread_mutex_unlock(a) + +typedef int FL2_pthread_cond_t; +#define FL2_pthread_cond_init(a, b) ((void)a, 0) +#define FL2_pthread_cond_destroy(a) +#define FL2_pthread_cond_wait(a, b) +#define FL2_pthread_cond_signal(a) +#define FL2_pthread_cond_broadcast(a) + +/* do not use FL2_pthread_t */ + +#else +# error FL2_SINGLETHREAD not defined but no threading support found +#endif /* FL2_SINGLETHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ diff --git a/C/fast-lzma2/fl2pool.c b/C/fast-lzma2/fl2pool.c deleted file mode 100644 index 24d4f9e6..00000000 --- a/C/fast-lzma2/fl2pool.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. - * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - - -/* ====== Dependencies ======= */ -#include /* size_t */ -#include /* malloc, calloc */ -#include "fl2pool.h" -#include "fl2_internal.h" - -/* ====== Compiler specifics ====== */ -#if defined(_MSC_VER) -# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ -#endif - - -#ifndef FL2_SINGLETHREAD - -#include "fl2threading.h" /* pthread adaptation */ - -/* A job is a function and an opaque argument */ -typedef struct FL2POOL_job_s { - FL2POOL_function function; - void *opaque; - size_t n; -} FL2POOL_job; - -struct FL2POOL_ctx_s { - /* Keep track of the threads */ - ZSTD_pthread_t *threads; - size_t numThreads; - - /* The queue is a single job */ - FL2POOL_job queue; - - /* The number of threads working on jobs */ - size_t numThreadsBusy; - /* Indicates if the queue is empty */ - int queueEmpty; - - /* The mutex protects the queue */ - ZSTD_pthread_mutex_t queueMutex; - /* Condition variable for pushers to wait on when the queue is full */ - ZSTD_pthread_cond_t queuePushCond; - /* Condition variables for poppers to wait on when the queue is empty */ - ZSTD_pthread_cond_t queuePopCond; - /* Indicates if the queue is shutting down */ - int shutdown; -}; - -/* FL2POOL_thread() : - Work thread for the thread pool. - Waits for jobs and executes them. - @returns : NULL on failure else non-null. 
-*/ -static void* FL2POOL_thread(void* opaque) { - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)opaque; - if (!ctx) { return NULL; } - for (;;) { - /* Lock the mutex and wait for a non-empty queue or until shutdown */ - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - - while (ctx->queueEmpty && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); - } - /* empty => shutting down: so stop */ - if (ctx->queueEmpty) { - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - return opaque; - } - /* Pop a job off the queue */ - { FL2POOL_job const job = ctx->queue; - ctx->queueEmpty = 1; - /* Unlock the mutex, signal a pusher, and run the job */ - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePushCond); - - job.function(job.opaque, job.n); - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - ctx->numThreadsBusy--; - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePushCond); - } - } /* for (;;) */ - /* Unreachable */ -} - -FL2POOL_ctx* FL2POOL_create(size_t numThreads) { - FL2POOL_ctx* ctx; - /* Check the parameters */ - if (!numThreads) { return NULL; } - /* Allocate the context and zero initialize */ - ctx = (FL2POOL_ctx*)calloc(1, sizeof(FL2POOL_ctx)); - if (!ctx) { return NULL; } - /* Initialize the job queue. - * It needs one extra space since one space is wasted to differentiate empty - * and full queues. - */ - ctx->numThreadsBusy = 0; - ctx->queueEmpty = 1; - (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); - (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); - (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); - ctx->shutdown = 0; - /* Allocate space for the thread handles */ - ctx->threads = (ZSTD_pthread_t*)malloc(numThreads * sizeof(ZSTD_pthread_t)); - ctx->numThreads = 0; - /* Check for errors */ - if (!ctx->threads) { FL2POOL_free(ctx); return NULL; } - /* Initialize the threads */ - { size_t i; - for (i = 0; i < numThreads; ++i) { - if (FL2_pthread_create(&ctx->threads[i], NULL, &FL2POOL_thread, ctx)) { - ctx->numThreads = i; - FL2POOL_free(ctx); - return NULL; - } } - ctx->numThreads = numThreads; - } - return ctx; -} - -/*! FL2POOL_join() : - Shutdown the queue, wake any sleeping threads, and join all of the threads. 
-*/ -static void FL2POOL_join(FL2POOL_ctx* ctx) { - /* Shut down the queue */ - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - ctx->shutdown = 1; - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - /* Wake up sleeping threads */ - ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); - ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); - /* Join all of the threads */ - { size_t i; - for (i = 0; i < ctx->numThreads; ++i) { - FL2_pthread_join(ctx->threads[i], NULL); - } } -} - -void FL2POOL_free(FL2POOL_ctx *ctx) { - if (!ctx) { return; } - FL2POOL_join(ctx); - ZSTD_pthread_mutex_destroy(&ctx->queueMutex); - ZSTD_pthread_cond_destroy(&ctx->queuePushCond); - ZSTD_pthread_cond_destroy(&ctx->queuePopCond); - free(ctx->threads); - free(ctx); -} - -size_t FL2POOL_sizeof(FL2POOL_ctx *ctx) { - if (ctx==NULL) return 0; /* supports sizeof NULL */ - return sizeof(*ctx) - + ctx->numThreads * sizeof(ZSTD_pthread_t); -} - -void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, size_t n) { - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; - if (!ctx) - return; - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - { FL2POOL_job const job = {function, opaque, n}; - - /* Wait until there is space in the queue for the new job */ - while (!ctx->queueEmpty && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); - } - /* The queue is still going => there is space */ - if (!ctx->shutdown) { - ctx->numThreadsBusy++; - ctx->queueEmpty = 0; - ctx->queue = job; - } - } - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePopCond); -} - -void FL2POOL_waitAll(void *ctxVoid) -{ - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; - if (!ctx) { return; } - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - while (ctx->numThreadsBusy && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); - } - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); -} - -#endif /* FL2_SINGLETHREAD */ diff --git a/C/fast-lzma2/fl2threading.h b/C/fast-lzma2/fl2threading.h deleted file mode 100644 index 9f6ff3b1..00000000 --- a/C/fast-lzma2/fl2threading.h +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (c) 2016 Tino Reichardt - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). 
- * - * You can contact the author at: - * - zstdmt source repository: https://github.com/mcmilk/zstdmt - */ - -#ifndef THREADING_H_938743 -#define THREADING_H_938743 - -#if defined (__cplusplus) -extern "C" { -#endif - -#if !defined(FL2_SINGLETHREAD) && defined(_WIN32) - -/** - * Windows minimalist Pthread Wrapper, based on : - * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html - */ -#ifdef WINVER -# undef WINVER -#endif -#define WINVER 0x0600 - -#ifdef _WIN32_WINNT -# undef _WIN32_WINNT -#endif -#define _WIN32_WINNT 0x0600 - -#ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif - -#include - - -/* mutex */ -#define ZSTD_pthread_mutex_t CRITICAL_SECTION -#define ZSTD_pthread_mutex_init(a, b) (InitializeCriticalSection((a)), 0) -#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a)) -#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a)) -#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a)) - -/* condition variable */ -#define ZSTD_pthread_cond_t CONDITION_VARIABLE -#define ZSTD_pthread_cond_init(a, b) (InitializeConditionVariable((a)), 0) -#define ZSTD_pthread_cond_destroy(a) /* No delete */ -#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) -#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a)) -#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) - -/* FL2_pthread_create() and FL2_pthread_join() */ -typedef struct { - HANDLE handle; - void* (*start_routine)(void*); - void* arg; -} ZSTD_pthread_t; - -int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, - void* (*start_routine) (void*), void* arg); - -int FL2_pthread_join(ZSTD_pthread_t thread, void** value_ptr); - -/** - * add here more wrappers as required - */ - - -#elif !defined(FL2_SINGLETHREAD) /* posix assumed ; need a better detection method */ -/* === POSIX Systems === */ -# include - -#define ZSTD_pthread_mutex_t pthread_mutex_t -#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) -#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) -#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a)) -#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) - -#define ZSTD_pthread_cond_t pthread_cond_t -#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b)) -#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a)) -#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) -#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a)) -#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) - -#define ZSTD_pthread_t pthread_t -#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) -#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) - -#else /* FL2_SINGLETHREAD defined */ -/* No multithreading support */ - -typedef int ZSTD_pthread_mutex_t; -#define ZSTD_pthread_mutex_init(a, b) ((void)a, 0) -#define ZSTD_pthread_mutex_destroy(a) -#define ZSTD_pthread_mutex_lock(a) -#define ZSTD_pthread_mutex_unlock(a) - -typedef int ZSTD_pthread_cond_t; -#define ZSTD_pthread_cond_init(a, b) ((void)a, 0) -#define ZSTD_pthread_cond_destroy(a) -#define ZSTD_pthread_cond_wait(a, b) -#define ZSTD_pthread_cond_signal(a) -#define ZSTD_pthread_cond_broadcast(a) - -/* do not use ZSTD_pthread_t */ - -#endif /* FL2_SINGLETHREAD */ - -#if defined (__cplusplus) -} -#endif - -#endif /* THREADING_H_938743 */ diff --git a/C/fast-lzma2/lzma2_enc.c b/C/fast-lzma2/lzma2_enc.c index 2201ef45..b68084ef 100644 --- a/C/fast-lzma2/lzma2_enc.c +++ b/C/fast-lzma2/lzma2_enc.c @@ -7,13 
+7,24 @@ Public domain #include #include +#include "fl2_errors.h" #include "fl2_internal.h" -#include "mem.h" #include "lzma2_enc.h" #include "fl2_compress_internal.h" +#include "mem.h" +#include "count.h" #include "radix_mf.h" #include "range_enc.h" -#include "count.h" + +#ifdef FL2_XZ_BUILD +# include "tuklib_integer.h" +# define MEM_readLE32(a) unaligned_read32le(a) + +# ifdef TUKLIB_FAST_UNALIGNED_ACCESS +# define MEM_read16(a) (*(const U16*)(a)) +# endif + +#endif #define kNumReps 4U #define kNumStates 12U @@ -30,7 +41,8 @@ Public domain #define kNumAlignBits 4U #define kAlignTableSize (1U << kNumAlignBits) #define kAlignMask (kAlignTableSize - 1U) -#define kAlignRepriceFrequency kAlignTableSize +#define kMatchRepriceFrequency 64U +#define kRepLenRepriceFrequency 64U #define kStartPosModelIndex 4U #define kEndPosModelIndex 14U @@ -38,7 +50,6 @@ Public domain #define kNumFullDistancesBits (kEndPosModelIndex >> 1U) #define kNumFullDistances (1U << kNumFullDistancesBits) -#define kDistanceRepriceFrequency (1U << 7U) #define kNumPositionBitsMax 4U #define kNumPositionStatesMax (1U << kNumPositionBitsMax) @@ -49,23 +60,28 @@ Public domain #define kLenNumLowBits 3U #define kLenNumLowSymbols (1U << kLenNumLowBits) -#define kLenNumMidBits 3U -#define kLenNumMidSymbols (1U << kLenNumMidBits) #define kLenNumHighBits 8U #define kLenNumHighSymbols (1U << kLenNumHighBits) -#define kLenNumSymbolsTotal (kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols) +#define kLenNumSymbolsTotal (kLenNumLowSymbols * 2 + kLenNumHighSymbols) #define kMatchLenMin 2U #define kMatchLenMax (kMatchLenMin + kLenNumSymbolsTotal - 1U) -#define kOptimizerBufferSize (1U << 12U) +#define kMatchesMax 65U /* Doesn't need to be larger than FL2_HYBRIDCYCLES_MAX + 1 */ + +#define kOptimizerEndSize 32U +#define kOptimizerBufferSize (kMatchLenMax * 2U + kOptimizerEndSize) +#define kOptimizerSkipSize 16U #define kInfinityPrice (1UL << 30U) #define kNullDist (U32)-1 #define kChunkSize ((1UL << 16U) - 8192U) -#define kChunkBufferSize (1UL << 16U) +#define kSqrtChunkSize 239U +#define kTempMinOutput (LZMA_REQUIRED_INPUT_MAX * 4U) +#define kTempBufferSize (kTempMinOutput + kOptimizerBufferSize + kOptimizerBufferSize / 16U) #define kMaxChunkUncompressedSize ((1UL << 21U) - kMatchLenMax) +#define kMaxChunkCompressedSize (1UL << 16U) #define kChunkHeaderSize 5U #define kChunkResetShift 5U #define kChunkUncompressedDictReset 1U @@ -83,80 +99,93 @@ Public domain #define kMinTestChunkSize 0x4000U #define kRandomFilterMarginBits 8U +#define kState_LitAfterMatch 4 +#define kState_LitAfterRep 5 +#define kState_MatchAfterLit 7 +#define kState_RepAfterLit 8 + static const BYTE kLiteralNextStates[kNumStates] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; -#define LiteralNextState(s) kLiteralNextStates[s] +#define LIT_NEXT_STATE(s) kLiteralNextStates[s] static const BYTE kMatchNextStates[kNumStates] = { 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10 }; -#define MatchNextState(s) kMatchNextStates[s] +#define MATCH_NEXT_STATE(s) kMatchNextStates[s] static const BYTE kRepNextStates[kNumStates] = { 8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11 }; -#define RepNextState(s) kRepNextStates[s] +#define REP_NEXT_STATE(s) kRepNextStates[s] static const BYTE kShortRepNextStates[kNumStates] = { 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11 }; -#define ShortRepNextState(s) kShortRepNextStates[s] +#define SHORT_REP_NEXT_STATE(s) kShortRepNextStates[s] #include "fastpos_table.h" +#include "radix_get.h" +/* Probabilities and prices for encoding match lengths. 
+ * Two objects of this type are needed, one for normal matches + * and another for rep matches. + */ typedef struct { size_t table_size; unsigned prices[kNumPositionStatesMax][kLenNumSymbolsTotal]; - unsigned counters[kNumPositionStatesMax]; - Probability choice; - Probability choice_2; - Probability low[kNumPositionStatesMax << kLenNumLowBits]; - Probability mid[kNumPositionStatesMax << kLenNumMidBits]; + Probability choice; /* low[0] is choice_2. Must be consecutive for speed */ + Probability low[kNumPositionStatesMax << (kLenNumLowBits + 1)]; Probability high[kLenNumHighSymbols]; } LengthStates; +/* All probabilities for the encoder. This is a separate from the encoder object + * so the state can be saved and restored in case a chunk is not compressible. + */ typedef struct { - U32 reps[kNumReps]; - size_t state; + /* Fields are ordered for speed */ + LengthStates rep_len_states; + Probability is_rep0_long[kNumStates][kNumPositionStatesMax]; + size_t state; + U32 reps[kNumReps]; + + Probability is_match[kNumStates][kNumPositionStatesMax]; Probability is_rep[kNumStates]; Probability is_rep_G0[kNumStates]; Probability is_rep_G1[kNumStates]; Probability is_rep_G2[kNumStates]; - Probability is_rep0_long[kNumStates][kNumPositionStatesMax]; - Probability is_match[kNumStates][kNumPositionStatesMax]; + + LengthStates len_states; Probability dist_slot_encoders[kNumLenToPosStates][1 << kNumPosSlotBits]; Probability dist_align_encoders[1 << kNumAlignBits]; Probability dist_encoders[kNumFullDistances - kEndPosModelIndex]; - LengthStates len_states; - LengthStates rep_len_states; - Probability literal_probs[(kNumLiterals * kNumLitTables) << kLcLpMax]; } EncoderStates; +/* + * Linked list item for optimal parsing + */ typedef struct { size_t state; - U32 reps[kNumReps]; U32 price; - unsigned prev_index; - U32 prev_dist; - unsigned prev_index_2; - U32 prev_dist_2; - BYTE is_combination; - BYTE prev_2; - + unsigned extra; /* 0 : normal + * 1 : LIT : MATCH + * > 1 : MATCH (extra-1) : LIT : REP0 (len) */ + unsigned len; + U32 dist; + U32 reps[kNumReps]; } OptimalNode; -#define MakeAsLiteral(node) (node).prev_dist = kNullDist; (node).is_combination = 0; -#define MakeAsShortRep(node) (node).prev_dist = 0; (node).is_combination = 0; +#define MARK_LITERAL(node) (node).dist = kNullDist; (node).extra = 0; +#define MARK_SHORT_REP(node) (node).dist = 0; (node).extra = 0; +/* + * Table and chain for 3-byte hash. Extra elements in hash_chain_3 are malloced. + */ typedef struct { S32 table_3[1 << kHash3Bits]; S32 hash_chain_3[1]; } HashChains; -typedef struct -{ - U32 length; - U32 dist; -} Match; - -struct FL2_lzmaEncoderCtx_s +/* + * LZMA2 encoder. 
+ */ +struct LZMA2_ECtx_s { unsigned lc; unsigned lp; @@ -173,39 +202,35 @@ struct FL2_lzmaEncoderCtx_s EncoderStates states; unsigned match_price_count; - unsigned align_price_count; + unsigned rep_len_price_count; size_t dist_price_table_size; unsigned align_prices[kAlignTableSize]; unsigned dist_slot_prices[kNumLenToPosStates][kDistTableSizeMax]; unsigned distance_prices[kNumLenToPosStates][kNumFullDistances]; - Match matches[kMatchLenMax-kMatchLenMin]; + RMF_match base_match; /* Allows access to matches[-1] in LZMA_optimalParse */ + RMF_match matches[kMatchesMax]; size_t match_count; OptimalNode opt_buf[kOptimizerBufferSize]; - BYTE* out_buf; - HashChains* hash_buf; ptrdiff_t chain_mask_2; ptrdiff_t chain_mask_3; ptrdiff_t hash_dict_3; ptrdiff_t hash_prev_index; ptrdiff_t hash_alloc_3; + + BYTE out_buf[kTempBufferSize]; }; -FL2_lzmaEncoderCtx* FL2_lzma2Create() +LZMA2_ECtx* LZMA2_createECtx(void) { - FL2_lzmaEncoderCtx* enc = malloc(sizeof(FL2_lzmaEncoderCtx)); - DEBUGLOG(3, "FL2_lzma2Create"); + LZMA2_ECtx *const enc = malloc(sizeof(LZMA2_ECtx)); + DEBUGLOG(3, "LZMA2_createECtx"); if (enc == NULL) return NULL; - enc->out_buf = malloc(kChunkBufferSize); - if (enc->out_buf == NULL) { - free(enc); - return NULL; - } enc->lc = 3; enc->lp = 0; enc->pb = 2; @@ -215,8 +240,8 @@ FL2_lzmaEncoderCtx* FL2_lzma2Create() enc->pos_mask = (1 << enc->pb) - 1; enc->match_cycles = 1; enc->strategy = FL2_ultra; - enc->match_price_count = kDistanceRepriceFrequency; - enc->align_price_count = kAlignRepriceFrequency; + enc->match_price_count = 0; + enc->rep_len_price_count = 0; enc->dist_price_table_size = kDistTableSizeMax; enc->hash_buf = NULL; enc->hash_dict_3 = 0; @@ -225,399 +250,268 @@ FL2_lzmaEncoderCtx* FL2_lzma2Create() return enc; } -void FL2_lzma2Free(FL2_lzmaEncoderCtx* enc) +void LZMA2_freeECtx(LZMA2_ECtx *const enc) { if (enc == NULL) return; free(enc->hash_buf); - free(enc->out_buf); free(enc); } -#define GetLiteralProbs(enc, pos, prev_symbol) (enc->states.literal_probs + ((((pos) & enc->lit_pos_mask) << enc->lc) + ((prev_symbol) >> (8 - enc->lc))) * kNumLiterals * kNumLitTables) +#define LITERAL_PROBS(enc, pos, prev_symbol) (enc->states.literal_probs + ((((pos) & enc->lit_pos_mask) << enc->lc) + ((prev_symbol) >> (8 - enc->lc))) * kNumLiterals * kNumLitTables) -#define GetLenToDistState(len) (((len) < kNumLenToPosStates + 1) ? (len) - 2 : kNumLenToPosStates - 1) +#define LEN_TO_DIST_STATE(len) (((len) < kNumLenToPosStates + 1) ? 
(len) - 2 : kNumLenToPosStates - 1) -#define IsCharState(state) ((state) < 7) +#define IS_LIT_STATE(state) ((state) < 7) HINT_INLINE -unsigned GetRepLen1Price(FL2_lzmaEncoderCtx* enc, size_t state, size_t pos_state) +unsigned LZMA_getRepLen1Price(LZMA2_ECtx* const enc, size_t const state, size_t const pos_state) { - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; - return GET_PRICE_0(enc->rc, rep_G0_prob) + GET_PRICE_0(enc->rc, rep0_long_prob); + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + return GET_PRICE_0(rep_G0_prob) + GET_PRICE_0(rep0_long_prob); } -static unsigned GetRepPrice(FL2_lzmaEncoderCtx* enc, size_t rep_index, size_t state, size_t pos_state) +static unsigned LZMA_getRepPrice(LZMA2_ECtx* const enc, size_t const rep_index, size_t const state, size_t const pos_state) { unsigned price; - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; if (rep_index == 0) { - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; - price = GET_PRICE_0(enc->rc, rep_G0_prob); - price += GET_PRICE_1(enc->rc, rep0_long_prob); + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + price = GET_PRICE_0(rep_G0_prob); + price += GET_PRICE_1(rep0_long_prob); } else { - unsigned rep_G1_prob = enc->states.is_rep_G1[state]; - price = GET_PRICE_1(enc->rc, rep_G0_prob); + unsigned const rep_G1_prob = enc->states.is_rep_G1[state]; + price = GET_PRICE_1(rep_G0_prob); if (rep_index == 1) { - price += GET_PRICE_0(enc->rc, rep_G1_prob); + price += GET_PRICE_0(rep_G1_prob); } else { - unsigned rep_G2_prob = enc->states.is_rep_G2[state]; - price += GET_PRICE_1(enc->rc, rep_G1_prob); - price += GET_PRICE(enc->enc->rc, rep_G2_prob, (U32)(rep_index) - 2); + unsigned const rep_G2_prob = enc->states.is_rep_G2[state]; + price += GET_PRICE_1(rep_G1_prob); + price += GET_PRICE(rep_G2_prob, rep_index - 2); } } return price; } -static unsigned GetRepMatch0Price(FL2_lzmaEncoderCtx* enc, size_t len, size_t state, size_t pos_state) +static unsigned LZMA_getRepMatch0Price(LZMA2_ECtx *const enc, size_t const len, size_t const state, size_t const pos_state) { - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; return enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin] - + GET_PRICE_0(enc->rc, rep_G0_prob) - + GET_PRICE_1(enc->rc, rep0_long_prob); + + GET_PRICE_0(rep_G0_prob) + + GET_PRICE_1(rep0_long_prob); } -static unsigned GetLiteralPriceMatched(RangeEncoder* rc, const Probability *prob_table, U32 symbol, unsigned match_byte) +static unsigned LZMA_getLiteralPriceMatched(const Probability *const prob_table, U32 symbol, unsigned match_byte) { unsigned price = 0; unsigned offs = 0x100; symbol |= 0x100; do { match_byte <<= 1; - price += GET_PRICE(enc->rc, prob_table[offs + (match_byte & offs) + (symbol >> 8)], (symbol >> 7) & 1); + price += GET_PRICE(prob_table[offs + (match_byte & offs) + (symbol >> 8)], (symbol >> 7) & 1); symbol <<= 1; offs &= ~(match_byte ^ symbol); } while (symbol < 0x10000); return price; } -static void EncodeLiteral(FL2_lzmaEncoderCtx* enc, size_t index, U32 symbol, unsigned prev_symbol) +HINT_INLINE +void 
LZMA_encodeLiteral(LZMA2_ECtx *const enc, size_t const index, U32 symbol, unsigned const prev_symbol) { - EncodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); - enc->states.state = LiteralNextState(enc->states.state); + RC_encodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); + enc->states.state = LIT_NEXT_STATE(enc->states.state); - { Probability* prob_table = GetLiteralProbs(enc, index, prev_symbol); - symbol |= 0x100; - do { - EncodeBit(&enc->rc, prob_table + (symbol >> 8), symbol & (1 << 7)); - symbol <<= 1; - } while (symbol < 0x10000); + Probability* const prob_table = LITERAL_PROBS(enc, index, prev_symbol); + symbol |= 0x100; + do { + RC_encodeBit(&enc->rc, prob_table + (symbol >> 8), symbol & (1 << 7)); + symbol <<= 1; + } while (symbol < 0x10000); +} + +HINT_INLINE +void LZMA_encodeLiteralMatched(LZMA2_ECtx *const enc, const BYTE* const data_block, size_t const index, U32 symbol) +{ + RC_encodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); + enc->states.state = LIT_NEXT_STATE(enc->states.state); + + unsigned match_symbol = data_block[index - enc->states.reps[0] - 1]; + Probability* const prob_table = LITERAL_PROBS(enc, index, data_block[index - 1]); + unsigned offset = 0x100; + symbol |= 0x100; + do { + match_symbol <<= 1; + size_t prob_index = offset + (match_symbol & offset) + (symbol >> 8); + RC_encodeBit(&enc->rc, prob_table + prob_index, symbol & (1 << 7)); + symbol <<= 1; + offset &= ~(match_symbol ^ symbol); + } while (symbol < 0x10000); +} + +HINT_INLINE +void LZMA_encodeLiteralBuf(LZMA2_ECtx *const enc, const BYTE* const data_block, size_t const index) +{ + U32 const symbol = data_block[index]; + if (IS_LIT_STATE(enc->states.state)) { + unsigned const prev_symbol = data_block[index - 1]; + LZMA_encodeLiteral(enc, index, symbol, prev_symbol); + } + else { + LZMA_encodeLiteralMatched(enc, data_block, index, symbol); } } -static void EncodeLiteralMatched(FL2_lzmaEncoderCtx* enc, const BYTE* data_block, size_t index, U32 symbol) +static void LZMA_lengthStates_SetPrices(const Probability *probs, U32 start_price, unsigned *prices) { - EncodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); - enc->states.state = LiteralNextState(enc->states.state); + for (size_t i = 0; i < 8; i += 2) { + U32 prob = probs[4 + (i >> 1)]; + U32 price = start_price + GET_PRICE(probs[1], (i >> 2)) + + GET_PRICE(probs[2 + (i >> 2)], (i >> 1) & 1); + prices[i] = price + GET_PRICE_0(prob); + prices[i + 1] = price + GET_PRICE_1(prob); + } +} - { unsigned match_symbol = data_block[index - enc->states.reps[0] - 1]; - Probability* prob_table = GetLiteralProbs(enc, index, data_block[index - 1]); - unsigned offset = 0x100; - symbol |= 0x100; +FORCE_NOINLINE +static void LZMA_lengthStates_updatePrices(LZMA2_ECtx *const enc, LengthStates* const ls) +{ + U32 b; + + { + unsigned const prob = ls->choice; + U32 a, c; + b = GET_PRICE_1(prob); + a = GET_PRICE_0(prob); + c = b + GET_PRICE_0(ls->low[0]); + for (size_t pos_state = 0; pos_state <= enc->pos_mask; pos_state++) { + unsigned *const prices = ls->prices[pos_state]; + const Probability *const probs = ls->low + (pos_state << (1 + kLenNumLowBits)); + LZMA_lengthStates_SetPrices(probs, a, prices); + LZMA_lengthStates_SetPrices(probs + kLenNumLowSymbols, c, prices + kLenNumLowSymbols); + } + } + + size_t i = ls->table_size; + + if (i > kLenNumLowSymbols * 2) { + const Probability *const probs = ls->high; + unsigned *const prices = 
ls->prices[0] + kLenNumLowSymbols * 2; + i = (i - (kLenNumLowSymbols * 2 - 1)) >> 1; + b += GET_PRICE_1(ls->low[0]); do { - match_symbol <<= 1; - size_t prob_index = offset + (match_symbol & offset) + (symbol >> 8); - EncodeBit(&enc->rc, prob_table + prob_index, symbol & (1 << 7)); - symbol <<= 1; - offset &= ~(match_symbol ^ symbol); - } while (symbol < 0x10000); + --i; + size_t sym = i + (1 << (kLenNumHighBits - 1)); + U32 price = b; + do { + size_t bit = sym & 1; + sym >>= 1; + price += GET_PRICE(probs[sym], bit); + } while (sym >= 2); + + unsigned const prob = probs[i + (1 << (kLenNumHighBits - 1))]; + prices[i * 2] = price + GET_PRICE_0(prob); + prices[i * 2 + 1] = price + GET_PRICE_1(prob); + } while (i); + + size_t const size = (ls->table_size - kLenNumLowSymbols * 2) * sizeof(ls->prices[0][0]); + for (size_t pos_state = 1; pos_state <= enc->pos_mask; pos_state++) + memcpy(ls->prices[pos_state] + kLenNumLowSymbols * 2, ls->prices[0] + kLenNumLowSymbols * 2, size); + } +} + +/* Rare enough that not inlining is faster overall */ +FORCE_NOINLINE +static void LZMA_encodeLength_MidHigh(LZMA2_ECtx *const enc, LengthStates* const len_prob_table, unsigned const len, size_t const pos_state) +{ + RC_encodeBit1(&enc->rc, &len_prob_table->choice); + if (len < kLenNumLowSymbols * 2) { + RC_encodeBit0(&enc->rc, &len_prob_table->low[0]); + RC_encodeBitTree(&enc->rc, len_prob_table->low + kLenNumLowSymbols + (pos_state << (1 + kLenNumLowBits)), kLenNumLowBits, len - kLenNumLowSymbols); + } + else { + RC_encodeBit1(&enc->rc, &len_prob_table->low[0]); + RC_encodeBitTree(&enc->rc, len_prob_table->high, kLenNumHighBits, len - kLenNumLowSymbols * 2); } } HINT_INLINE -void EncodeLiteralBuf(FL2_lzmaEncoderCtx* enc, const BYTE* data_block, size_t index) -{ - U32 symbol = data_block[index]; - if (IsCharState(enc->states.state)) { - unsigned prev_symbol = data_block[index - 1]; - EncodeLiteral(enc, index, symbol, prev_symbol); - } - else { - EncodeLiteralMatched(enc, data_block, index, symbol); - } -} - -static size_t RMF_bitpackExtendMatch(const BYTE* const data, - const U32* const table, - ptrdiff_t const start_index, - ptrdiff_t limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t dist = start_index - link; - if (limit > start_index + (ptrdiff_t)kMatchLenMax) - limit = start_index + kMatchLenMax; - while (end_index < limit && end_index - (ptrdiff_t)(table[end_index] & RADIX_LINK_MASK) == dist) { - end_index += table[end_index] >> RADIX_LINK_BITS; - } - if (end_index >= limit) { - DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); - return end_index - start_index; -} - -#define GetMatchLink(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] - -#define GetMatchLength(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] - -static size_t RMF_structuredExtendMatch(const BYTE* const data, - const U32* const table, - ptrdiff_t const start_index, - ptrdiff_t limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t dist = start_index - link; - if 
(limit > start_index + (ptrdiff_t)kMatchLenMax) - limit = start_index + kMatchLenMax; - while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(table, end_index) == dist) { - end_index += GetMatchLength(table, end_index); - } - if (end_index >= limit) { - DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); - return end_index - start_index; -} - -FORCE_INLINE_TEMPLATE -Match FL2_radixGetMatch(FL2_dataBlock block, - FL2_matchTable* tbl, - unsigned max_depth, - int structTbl, - size_t index) -{ - if (structTbl) - { - Match match; - U32 link = GetMatchLink(tbl->table, index); - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = GetMatchLength(tbl->table, index); - dist = index - link - 1; - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } - else { - Match match; - U32 link = tbl->table[index]; - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = link >> RADIX_LINK_BITS; - link &= RADIX_LINK_MASK; - dist = index - link - 1; - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } -} - -FORCE_INLINE_TEMPLATE -Match FL2_radixGetNextMatch(FL2_dataBlock block, - FL2_matchTable* tbl, - unsigned max_depth, - int structTbl, - size_t index) -{ - if (structTbl) - { - Match match; - U32 link = GetMatchLink(tbl->table, index); - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = GetMatchLength(tbl->table, index); - dist = index - link - 1; - if (link - 1 == GetMatchLink(tbl->table, index - 1)) { - /* same as the previous match, one byte shorter */ - return match; - } - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } - else { - Match match; - U32 link = tbl->table[index]; - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = link >> RADIX_LINK_BITS; - link &= RADIX_LINK_MASK; - dist = index - link - 1; - if (link - 1 == (tbl->table[index - 1] & RADIX_LINK_MASK)) { - /* same distance, one byte shorter */ - return match; - } - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - 
|| length == BITPACK_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } -} - -static void LengthStates_SetPrices(RangeEncoder* rc, LengthStates* ls, size_t pos_state) -{ - unsigned prob = ls->choice; - unsigned a0 = GET_PRICE_0(rc, prob); - unsigned a1 = GET_PRICE_1(rc, prob); - unsigned b0, b1; - size_t i = 0; - prob = ls->choice_2; - b0 = a1 + GET_PRICE_0(rc, prob); - b1 = a1 + GET_PRICE_1(rc, prob); - for (; i < kLenNumLowSymbols && i < ls->table_size; ++i) { - ls->prices[pos_state][i] = a0 + GetTreePrice(rc, ls->low + (pos_state << kLenNumLowBits), kLenNumLowBits, i); - } - for (; i < kLenNumLowSymbols + kLenNumMidSymbols && i < ls->table_size; ++i) { - ls->prices[pos_state][i] = b0 + GetTreePrice(rc, ls->mid + (pos_state << kLenNumMidBits), kLenNumMidBits, i - kLenNumLowSymbols); - } - for (; i < ls->table_size; ++i) { - ls->prices[pos_state][i] = b1 + GetTreePrice(rc, ls->high, kLenNumHighBits, i - kLenNumLowSymbols - kLenNumMidSymbols); - } - ls->counters[pos_state] = (unsigned)(ls->table_size); -} - -static void EncodeLength(FL2_lzmaEncoderCtx* enc, LengthStates* len_prob_table, unsigned len, size_t pos_state) +void LZMA_encodeLength(LZMA2_ECtx *const enc, LengthStates* const len_prob_table, unsigned len, size_t const pos_state) { len -= kMatchLenMin; if (len < kLenNumLowSymbols) { - EncodeBit0(&enc->rc, &len_prob_table->choice); - EncodeBitTree(&enc->rc, len_prob_table->low + (pos_state << kLenNumLowBits), kLenNumLowBits, len); + RC_encodeBit0(&enc->rc, &len_prob_table->choice); + RC_encodeBitTree(&enc->rc, len_prob_table->low + (pos_state << (1 + kLenNumLowBits)), kLenNumLowBits, len); } else { - EncodeBit1(&enc->rc, &len_prob_table->choice); - if (len < kLenNumLowSymbols + kLenNumMidSymbols) { - EncodeBit0(&enc->rc, &len_prob_table->choice_2); - EncodeBitTree(&enc->rc, len_prob_table->mid + (pos_state << kLenNumMidBits), kLenNumMidBits, len - kLenNumLowSymbols); - } - else { - EncodeBit1(&enc->rc, &len_prob_table->choice_2); - EncodeBitTree(&enc->rc, len_prob_table->high, kLenNumHighBits, len - kLenNumLowSymbols - kLenNumMidSymbols); - } - } - if (enc->strategy != FL2_fast && --len_prob_table->counters[pos_state] == 0) { - LengthStates_SetPrices(&enc->rc, len_prob_table, pos_state); + LZMA_encodeLength_MidHigh(enc, len_prob_table, len, pos_state); } } -static void EncodeRepMatch(FL2_lzmaEncoderCtx* enc, unsigned len, unsigned rep, size_t pos_state) +FORCE_NOINLINE +static void LZMA_encodeRepMatchShort(LZMA2_ECtx *const enc, size_t const pos_state) { - DEBUGLOG(7, "EncodeRepMatch : length %u, rep %u", len, rep); - EncodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); - EncodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); + DEBUGLOG(7, "LZMA_encodeRepMatchShort"); + RC_encodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state]); + enc->states.state = SHORT_REP_NEXT_STATE(enc->states.state); +} + +FORCE_NOINLINE +static void LZMA_encodeRepMatchLong(LZMA2_ECtx *const enc, unsigned const len, unsigned const rep, size_t const pos_state) +{ + DEBUGLOG(7, "LZMA_encodeRepMatchLong : length %u, rep %u", len, rep); + RC_encodeBit1(&enc->rc, 
&enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); if (rep == 0) { - EncodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); - EncodeBit(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state], ((len == 1) ? 0 : 1)); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state]); } else { - U32 distance = enc->states.reps[rep]; - EncodeBit1(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + U32 const distance = enc->states.reps[rep]; + RC_encodeBit1(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); if (rep == 1) { - EncodeBit0(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); } else { - EncodeBit1(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); - EncodeBit(&enc->rc, &enc->states.is_rep_G2[enc->states.state], rep - 2); - if (rep == 3) { + RC_encodeBit1(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); + RC_encodeBit(&enc->rc, &enc->states.is_rep_G2[enc->states.state], rep - 2); + if (rep == 3) enc->states.reps[3] = enc->states.reps[2]; - } enc->states.reps[2] = enc->states.reps[1]; } enc->states.reps[1] = enc->states.reps[0]; enc->states.reps[0] = distance; } - if (len == 1) { - enc->states.state = ShortRepNextState(enc->states.state); - } - else { - EncodeLength(enc, &enc->states.rep_len_states, len, pos_state); - enc->states.state = RepNextState(enc->states.state); - } + LZMA_encodeLength(enc, &enc->states.rep_len_states, len, pos_state); + enc->states.state = REP_NEXT_STATE(enc->states.state); + ++enc->rep_len_price_count; } -/* *****************************************/ -/* Distance slot functions based on fastpos.h from XZ*/ + +/* + * Distance slot functions based on fastpos.h from XZ + */ HINT_INLINE -unsigned FastDistShift(unsigned n) +unsigned LZMA_fastDistShift(unsigned const n) { return n * (kFastDistBits - 1); } HINT_INLINE -unsigned FastDistResult(U32 dist, unsigned n) +unsigned LZMA_fastDistResult(U32 const dist, unsigned const n) { - return distance_table[dist >> FastDistShift(n)] - + 2 * FastDistShift(n); + return distance_table[dist >> LZMA_fastDistShift(n)] + + 2 * LZMA_fastDistShift(n); } -static size_t GetDistSlot(U32 distance) +static size_t LZMA_getDistSlot(U32 const distance) { U32 limit = 1UL << kFastDistBits; /* If it is small enough, we can pick the result directly from */ @@ -625,193 +519,192 @@ static size_t GetDistSlot(U32 distance) if (distance < limit) { return distance_table[distance]; } - limit <<= FastDistShift(1); + limit <<= LZMA_fastDistShift(1); if (distance < limit) { - return FastDistResult(distance, 1); + return LZMA_fastDistResult(distance, 1); } - return FastDistResult(distance, 2); + return LZMA_fastDistResult(distance, 2); } -/* **************************************** */ +/* * */ -static void EncodeNormalMatch(FL2_lzmaEncoderCtx* enc, unsigned len, U32 dist, size_t pos_state) + +HINT_INLINE +void LZMA_encodeNormalMatch(LZMA2_ECtx *const enc, unsigned const len, U32 const dist, size_t const pos_state) { - DEBUGLOG(7, "EncodeNormalMatch : length %u, dist %u", len, dist); - EncodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); - EncodeBit0(&enc->rc, &enc->states.is_rep[enc->states.state]); - enc->states.state = MatchNextState(enc->states.state); - EncodeLength(enc, &enc->states.len_states, len, pos_state); + DEBUGLOG(7, "LZMA_encodeNormalMatch : length %u, dist 
%u", len, dist); + RC_encodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep[enc->states.state]); + enc->states.state = MATCH_NEXT_STATE(enc->states.state); - { size_t dist_slot = GetDistSlot(dist); - EncodeBitTree(&enc->rc, enc->states.dist_slot_encoders[GetLenToDistState(len)], kNumPosSlotBits, (unsigned)(dist_slot)); - if (dist_slot >= kStartPosModelIndex) { - unsigned footerBits = ((unsigned)(dist_slot >> 1) - 1); - size_t base = ((2 | (dist_slot & 1)) << footerBits); - unsigned posReduced = (unsigned)(dist - base); - if (dist_slot < kEndPosModelIndex) { - EncodeBitTreeReverse(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, footerBits, posReduced); - } - else { - EncodeDirect(&enc->rc, posReduced >> kNumAlignBits, footerBits - kNumAlignBits); - EncodeBitTreeReverse(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, posReduced & kAlignMask); - ++enc->align_price_count; - } + LZMA_encodeLength(enc, &enc->states.len_states, len, pos_state); + + size_t const dist_slot = LZMA_getDistSlot(dist); + RC_encodeBitTree(&enc->rc, enc->states.dist_slot_encoders[LEN_TO_DIST_STATE(len)], kNumPosSlotBits, (unsigned)dist_slot); + if (dist_slot >= kStartPosModelIndex) { + unsigned const footer_bits = ((unsigned)(dist_slot >> 1) - 1); + size_t const base = ((2 | (dist_slot & 1)) << footer_bits); + unsigned const dist_reduced = (unsigned)(dist - base); + if (dist_slot < kEndPosModelIndex) { + RC_encodeBitTreeReverse(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, footer_bits, dist_reduced); + } + else { + RC_encodeDirect(&enc->rc, dist_reduced >> kNumAlignBits, footer_bits - kNumAlignBits); + RC_encodeBitTreeReverse(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, dist_reduced & kAlignMask); } } enc->states.reps[3] = enc->states.reps[2]; enc->states.reps[2] = enc->states.reps[1]; enc->states.reps[1] = enc->states.reps[0]; enc->states.reps[0] = dist; + ++enc->match_price_count; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - FORCE_INLINE_TEMPLATE -size_t EncodeChunkFast(FL2_lzmaEncoderCtx* enc, +size_t LZMA_encodeChunkFast(LZMA2_ECtx *const enc, FL2_dataBlock const block, - FL2_matchTable* tbl, - int structTbl, + FL2_matchTable* const tbl, + int const struct_tbl, size_t index, - size_t uncompressed_end) + size_t const uncompressed_end) { size_t const pos_mask = enc->pos_mask; size_t prev = index; - unsigned search_depth = tbl->params.depth; - while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) - { + unsigned const search_depth = tbl->params.depth; + while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) { size_t max_len; const BYTE* data; /* Table of distance restrictions for short matches */ static const U32 max_dist_table[] = { 0, 0, 0, 1 << 6, 1 << 14 }; /* Get a match from the table, extended to its full length */ - Match bestMatch = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); - if (bestMatch.length < kMatchLenMin) { + RMF_match best_match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); + if (best_match.length < kMatchLenMin) { ++index; continue; } /* Use if near enough */ - if (bestMatch.length >= 5 || bestMatch.dist < max_dist_table[bestMatch.length]) { - bestMatch.dist += kNumReps; - } - else { - bestMatch.length = 0; - } + if (best_match.length >= 5 || best_match.dist < max_dist_table[best_match.length]) + best_match.dist += kNumReps; + 
else + best_match.length = 0; + max_len = MIN(kMatchLenMax, block.end - index); data = block.data + index; - { Match bestRep; - Match repMatch; - bestRep.length = 0; - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { - continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length >= max_len) { - bestMatch = repMatch; - goto _encode; - } - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } - } - if (bestMatch.length >= max_len) + RMF_match best_rep = { 0, 0 }; + RMF_match rep_match; + /* Search all of the rep distances */ + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) + continue; + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length >= max_len) { + best_match = rep_match; goto _encode; - if (bestRep.length >= 2) { - int const gain2 = (int)(bestRep.length * 3 - bestRep.dist); - int const gain1 = (int)(bestMatch.length * 3 - ZSTD_highbit32(bestMatch.dist + 1) + 1); + } + if (rep_match.length > best_rep.length) + best_rep = rep_match; + } + /* Encode if it is kMatchLenMax or completes the block */ + if (best_match.length >= max_len) + goto _encode; + + if (best_rep.length >= 2) { + if (best_rep.length > best_match.length) { + best_match = best_rep; + } + else { + /* Modified ZSTD scheme for estimating cost */ + int const gain2 = (int)(best_rep.length * 3 - best_rep.dist); + int const gain1 = (int)(best_match.length * 3 - ZSTD_highbit32(best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; } } } - if (bestMatch.length < kMatchLenMin) { + if (best_match.length < kMatchLenMin) { ++index; continue; } - for (size_t next = index + 1; bestMatch.length < kMatchLenMax && next < uncompressed_end; ++next) { + for (size_t next = index + 1; best_match.length < kMatchLenMax && next < uncompressed_end; ++next) { /* lazy matching scheme from ZSTD */ - Match next_match = FL2_radixGetNextMatch(block, tbl, search_depth, structTbl, next); + RMF_match next_match = RMF_getNextMatch(block, tbl, search_depth, struct_tbl, next); if (next_match.length >= kMatchLenMin) { - Match bestRep; - Match repMatch; - bestRep.length = 0; + best_rep.length = 0; data = block.data + next; max_len = MIN(kMatchLenMax, block.end - next); - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length > best_rep.length) + best_rep = rep_match; } - if (bestRep.length >= 3) { - int const gain2 
= (int)(bestRep.length * 3 - bestRep.dist); - int const gain1 = (int)(bestMatch.length * 3 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 1); + if (best_rep.length >= 3) { + int const gain2 = (int)(best_rep.length * 3 - best_rep.dist); + int const gain1 = (int)(best_match.length * 3 - ZSTD_highbit32((U32)best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; index = next; } } - if (next_match.length >= 3 && next_match.dist != bestMatch.dist) { + if (next_match.length >= 3 && next_match.dist != best_match.dist) { int const gain2 = (int)(next_match.length * 4 - ZSTD_highbit32((U32)next_match.dist + 1)); /* raw approx */ - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 4); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 4); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", bestMatch.length, bestMatch.dist, next_match.length, next_match.dist + kNumReps); - bestMatch = next_match; - bestMatch.dist += kNumReps; + DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", best_match.length, best_match.dist, next_match.length, next_match.dist + kNumReps); + best_match = next_match; + best_match.dist += kNumReps; index = next; continue; } } } if (next < uncompressed_end - 4) { - Match bestRep; - Match repMatch; ++next; - next_match = FL2_radixGetNextMatch(block, tbl, search_depth, structTbl, next); + + next_match = RMF_getNextMatch(block, tbl, search_depth, struct_tbl, next); if (next_match.length < 4) break; + data = block.data + next; max_len = MIN(kMatchLenMax, block.end - next); - bestRep.length = 0; - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { + best_rep.length = 0; + + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length > best_rep.length) + best_rep = rep_match; } - if (bestRep.length >= 4) { - int const gain2 = (int)(bestRep.length * 4 - (bestRep.dist >> 1)); - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 1); + if (best_rep.length >= 4) { + int const gain2 = (int)(best_rep.length * 4 - (best_rep.dist >> 1)); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; index = next; } } - if (next_match.length >= 4 && next_match.dist != bestMatch.dist) { + if (next_match.length >= 4 && next_match.dist != best_match.dist) { int const gain2 = (int)(next_match.length * 4 - 
ZSTD_highbit32((U32)next_match.dist + 1)); - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 7); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 7); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", bestMatch.length, bestMatch.dist, next_match.length, next_match.dist + kNumReps); - bestMatch = next_match; - bestMatch.dist += kNumReps; + DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", best_match.length, best_match.dist, next_match.length, next_match.dist + kNumReps); + best_match = next_match; + best_match.dist += kNumReps; index = next; continue; } @@ -821,134 +714,165 @@ size_t EncodeChunkFast(FL2_lzmaEncoderCtx* enc, break; } _encode: - assert(index + bestMatch.length <= block.end); - while (prev < index && enc->rc.out_index < enc->rc.chunk_size) { - if (block.data[prev] == block.data[prev - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, prev & pos_mask); + assert(index + best_match.length <= block.end); + + /* Chunk overflow size is kOptimizerBufferSize + extra. + * Unlikely for this limit to be hit. */ + size_t rc_end = enc->rc.chunk_size + kOptimizerBufferSize; + while (prev < index && enc->rc.out_index < rc_end) { + if (block.data[prev] != block.data[prev - enc->states.reps[0] - 1]) { + LZMA_encodeLiteralBuf(enc, block.data, prev); + ++prev; } else { - EncodeLiteralBuf(enc, block.data, prev); + LZMA_encodeRepMatchShort(enc, prev & pos_mask); + ++prev; } - ++prev; } - if (enc->rc.out_index >= enc->rc.chunk_size) { + if (prev < index) break; - } - if(bestMatch.length >= kMatchLenMin) { - if (bestMatch.dist < kNumReps) { - EncodeRepMatch(enc, bestMatch.length, bestMatch.dist, index & pos_mask); + + if(best_match.length >= kMatchLenMin) { + if (best_match.dist >= kNumReps) { + LZMA_encodeNormalMatch(enc, best_match.length, best_match.dist - kNumReps, index & pos_mask); + index += best_match.length; + prev = index; } else { - EncodeNormalMatch(enc, bestMatch.length, bestMatch.dist - kNumReps, index & pos_mask); + LZMA_encodeRepMatchLong(enc, best_match.length, best_match.dist, index & pos_mask); + index += best_match.length; + prev = index; } - index += bestMatch.length; - prev = index; } } while (prev < index && enc->rc.out_index < enc->rc.chunk_size) { - if (block.data[prev] == block.data[prev - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, prev & pos_mask); - } - else { - EncodeLiteralBuf(enc, block.data, prev); - } + if (block.data[prev] != block.data[prev - enc->states.reps[0] - 1]) + LZMA_encodeLiteralBuf(enc, block.data, prev); + else + LZMA_encodeRepMatchShort(enc, prev & pos_mask); ++prev; } - Flush(&enc->rc); return prev; } -/* Reverse the direction of the linked list generated by the optimal parser */ -static void ReverseOptimalChain(OptimalNode* opt_buf, size_t cur) +/* + * Reverse the direction of the linked list generated by the optimal parser + */ +FORCE_NOINLINE +static void LZMA_reverseOptimalChain(OptimalNode* const opt_buf, size_t cur) { - size_t next_index = opt_buf[cur].prev_index; - U32 next_dist = opt_buf[cur].prev_dist; - do - { - if (opt_buf[cur].is_combination) - { - MakeAsLiteral(opt_buf[next_index]); - opt_buf[next_index].prev_index = (unsigned)(next_index - 1); - if (opt_buf[cur].prev_2) - { - opt_buf[next_index - 1].is_combination = 0; - opt_buf[next_index - 1].prev_index = opt_buf[cur].prev_index_2; - opt_buf[next_index - 1].prev_dist = opt_buf[cur].prev_dist_2; + unsigned len = (unsigned)opt_buf[cur].len; + U32 
dist = opt_buf[cur].dist; + + for(;;) { + unsigned const extra = (unsigned)opt_buf[cur].extra; + cur -= len; + + if (extra) { + opt_buf[cur].len = (U32)len; + len = extra; + if (extra == 1) { + opt_buf[cur].dist = dist; + dist = kNullDist; + --cur; + } + else { + opt_buf[cur].dist = 0; + --cur; + --len; + opt_buf[cur].dist = kNullDist; + opt_buf[cur].len = 1; + cur -= len; } } - { U32 temp = opt_buf[next_index].prev_dist; - opt_buf[next_index].prev_dist = next_dist; - next_dist = temp; - } + unsigned const next_len = opt_buf[cur].len; + U32 const next_dist = opt_buf[cur].dist; - { size_t prev_index = next_index; - next_index = opt_buf[prev_index].prev_index; - opt_buf[prev_index].prev_index = (unsigned)(cur); - cur = prev_index; - } - } while (cur != 0); + opt_buf[cur].dist = dist; + opt_buf[cur].len = (U32)len; + + if (cur == 0) + break; + + len = next_len; + dist = next_dist; + } } -static unsigned GetLiteralPrice(FL2_lzmaEncoderCtx* enc, size_t index, size_t state, unsigned prev_symbol, U32 symbol, unsigned match_byte) +static unsigned LZMA_getLiteralPrice(LZMA2_ECtx *const enc, size_t const index, size_t const state, unsigned const prev_symbol, U32 symbol, unsigned const match_byte) { - const Probability* prob_table = GetLiteralProbs(enc, index, prev_symbol); - if (IsCharState(state)) { + const Probability* const prob_table = LITERAL_PROBS(enc, index, prev_symbol); + if (IS_LIT_STATE(state)) { unsigned price = 0; symbol |= 0x100; do { - price += GET_PRICE(enc->rc, prob_table[symbol >> 8], (symbol >> 7) & 1); + price += GET_PRICE(prob_table[symbol >> 8], (symbol >> 7) & 1); symbol <<= 1; } while (symbol < 0x10000); return price; } - return GetLiteralPriceMatched(&enc->rc, prob_table, symbol, match_byte); + return LZMA_getLiteralPriceMatched(prob_table, symbol, match_byte); } -static void HashReset(FL2_lzmaEncoderCtx* enc, unsigned dictionary_bits_3) +/* + * Reset the hash object for encoding a new slice of a block + */ +static void LZMA_hashReset(LZMA2_ECtx *const enc, unsigned const dictionary_bits_3) { enc->hash_dict_3 = (ptrdiff_t)1 << dictionary_bits_3; enc->chain_mask_3 = enc->hash_dict_3 - 1; memset(enc->hash_buf->table_3, 0xFF, sizeof(enc->hash_buf->table_3)); } -static int HashCreate(FL2_lzmaEncoderCtx* enc, unsigned dictionary_bits_3) +/* + * Create hash table and chain with dict size dictionary_bits_3. Frees any existing object. + */ +static int LZMA_hashCreate(LZMA2_ECtx *const enc, unsigned const dictionary_bits_3) { DEBUGLOG(3, "Create hash chain : dict bits %u", dictionary_bits_3); - if (enc->hash_buf) { + + if (enc->hash_buf) free(enc->hash_buf); - } + enc->hash_alloc_3 = (ptrdiff_t)1 << dictionary_bits_3; enc->hash_buf = malloc(sizeof(HashChains) + (enc->hash_alloc_3 - 1) * sizeof(S32)); + if (enc->hash_buf == NULL) return 1; - HashReset(enc, dictionary_bits_3); + + LZMA_hashReset(enc, dictionary_bits_3); + return 0; } -/* Create a hash chain for hybrid mode */ -int FL2_lzma2HashAlloc(FL2_lzmaEncoderCtx* enc, const FL2_lzma2Parameters* options) +/* Create a hash chain for hybrid mode if options require one. + * Used for allocating before compression begins. Any existing table will be reused if + * it is at least as large as required. 
+ */ +int LZMA2_hashAlloc(LZMA2_ECtx *const enc, const FL2_lzma2Parameters* const options) { - if (enc->strategy == FL2_ultra && enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) { - return HashCreate(enc, options->second_dict_bits); - } + if (enc->strategy == FL2_ultra && enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) + return LZMA_hashCreate(enc, options->second_dict_bits); + return 0; } #define GET_HASH_3(data) ((((MEM_readLE32(data)) << 8) * 506832829U) >> (32 - kHash3Bits)) +/* Find matches nearer than the match from the RMF. If none is at least as long as + * the RMF match (most likely), insert that match at the end of the list. + */ HINT_INLINE -size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - ptrdiff_t index, - size_t length_limit, - Match match) +size_t LZMA_hashGetMatches(LZMA2_ECtx *const enc, FL2_dataBlock const block, + ptrdiff_t const index, + size_t const length_limit, + RMF_match const match) { ptrdiff_t const hash_dict_3 = enc->hash_dict_3; const BYTE* data = block.data; - HashChains* tbl = enc->hash_buf; + HashChains* const tbl = enc->hash_buf; ptrdiff_t const chain_mask_3 = enc->chain_mask_3; - size_t max_len; - ptrdiff_t first_3; enc->match_count = 0; enc->hash_prev_index = MAX(enc->hash_prev_index, index - hash_dict_3); @@ -959,15 +883,16 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, tbl->table_3[hash] = (S32)enc->hash_prev_index; } data += index; - max_len = 2; - { size_t hash = GET_HASH_3(data); - first_3 = tbl->table_3[hash]; - tbl->table_3[hash] = (S32)(index); - } + size_t const hash = GET_HASH_3(data); + ptrdiff_t const first_3 = tbl->table_3[hash]; + tbl->table_3[hash] = (S32)index; + + size_t max_len = 2; + if (first_3 >= 0) { int cycles = enc->match_cycles; - ptrdiff_t end_index = index - (((ptrdiff_t)match.dist < hash_dict_3) ? match.dist : hash_dict_3); + ptrdiff_t const end_index = index - (((ptrdiff_t)match.dist < hash_dict_3) ? match.dist : hash_dict_3); ptrdiff_t match_3 = first_3; if (match_3 >= end_index) { do { @@ -979,9 +904,8 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, enc->matches[enc->match_count].dist = (U32)(index - match_3 - 1); ++enc->match_count; max_len = len_test; - if (len_test >= length_limit) { + if (len_test >= length_limit) break; - } } if (cycles <= 0) break; @@ -990,7 +914,8 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, } } tbl->hash_chain_3[index & chain_mask_3] = (S32)first_3; - if ((unsigned)(max_len) < match.length) { + if ((unsigned)max_len < match.length) { + /* Insert the match from the RMF */ enc->matches[enc->match_count] = match; ++enc->match_count; return match.length; @@ -998,181 +923,167 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, return max_len; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - -/* The speed of this function is critical and the sections have so many variables -* in common that breaking it up would be inefficient. +/* The speed of this function is critical. The sections have many variables +* in common, so breaking it up into shorter functions is not feasible. * For each position cur, starting at 1, check some or all possible * encoding choices - a literal, 1-byte rep 0 match, all rep match lengths, and * all match lengths at available distances. It also checks the combined -* sequences literal+rep0, rep+rep0 and match+rep0. 
+* sequences literal+rep0, rep+lit+rep0 and match+lit+rep0. * If is_hybrid != 0, this method works in hybrid mode, using the * hash chain to find shorter matches at near distances. */ FORCE_INLINE_TEMPLATE -size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, - Match match, +size_t LZMA_optimalParse(LZMA2_ECtx* const enc, FL2_dataBlock const block, + RMF_match match, size_t const index, size_t const cur, size_t len_end, int const is_hybrid, U32* const reps) { - OptimalNode* cur_opt = &enc->opt_buf[cur]; - size_t prev_index = cur_opt->prev_index; - size_t state = enc->opt_buf[prev_index].state; + OptimalNode* const cur_opt = &enc->opt_buf[cur]; size_t const pos_mask = enc->pos_mask; - size_t pos_state = (index & pos_mask); - const BYTE* data = block.data + index; + size_t const pos_state = (index & pos_mask); + const BYTE* const data = block.data + index; size_t const fast_length = enc->fast_length; + size_t prev_index = cur - cur_opt->len; + size_t state; size_t bytes_avail; - size_t max_length; - size_t start_len; U32 match_price; U32 rep_match_price; - Probability is_rep_prob; - if (cur_opt->is_combination) { - --prev_index; - if (cur_opt->prev_2) { - state = enc->opt_buf[cur_opt->prev_index_2].state; - if (cur_opt->prev_dist_2 < kNumReps) { - state = RepNextState(state); - } - else { - state = MatchNextState(state); - } + /* Update the states according to how this location was reached */ + if (cur_opt->len == 1) { + /* Literal or 1-byte rep */ + const BYTE *next_state = (cur_opt->dist == 0) ? kShortRepNextStates : kLiteralNextStates; + state = next_state[enc->opt_buf[prev_index].state]; + } + else { + /* Match or rep match */ + size_t const dist = cur_opt->dist; + + if (cur_opt->extra) { + prev_index -= cur_opt->extra; + state = kState_RepAfterLit - ((dist >= kNumReps) & (cur_opt->extra == 1)); } else { state = enc->opt_buf[prev_index].state; + state = MATCH_NEXT_STATE(state) + (dist < kNumReps); } - state = LiteralNextState(state); - } - if (prev_index == cur - 1) { - if (cur_opt->prev_dist == 0) { - state = ShortRepNextState(state); - } - else { - state = LiteralNextState(state); - } - } - else { - size_t dist; - if (cur_opt->is_combination && cur_opt->prev_2) { - prev_index = cur_opt->prev_index_2; - dist = cur_opt->prev_dist_2; - state = RepNextState(state); - } - else { - dist = cur_opt->prev_dist; - if (dist < kNumReps) { - state = RepNextState(state); - } - else { - state = MatchNextState(state); - } - } - const OptimalNode* prev_opt = &enc->opt_buf[prev_index]; + const OptimalNode *const prev_opt = &enc->opt_buf[prev_index]; if (dist < kNumReps) { - size_t i = 1; + /* Move the chosen rep to the front. 
+ * The table is hideous but faster than branching :D */ reps[0] = prev_opt->reps[dist]; - for (; i <= dist; ++i) { - reps[i] = prev_opt->reps[i - 1]; - } - for (; i < kNumReps; ++i) { - reps[i] = prev_opt->reps[i]; - } + size_t table = 1 | (2 << 2) | (3 << 4) + | (0 << 8) | (2 << 10) | (3 << 12) + | (0L << 16) | (1L << 18) | (3L << 20) + | (0L << 24) | (1L << 26) | (2L << 28); + table >>= (dist << 3); + reps[1] = prev_opt->reps[table & 3]; + table >>= 2; + reps[2] = prev_opt->reps[table & 3]; + table >>= 2; + reps[3] = prev_opt->reps[table & 3]; } else { reps[0] = (U32)(dist - kNumReps); - for (size_t i = 1; i < kNumReps; ++i) { - reps[i] = prev_opt->reps[i - 1]; - } + reps[1] = prev_opt->reps[0]; + reps[2] = prev_opt->reps[1]; + reps[3] = prev_opt->reps[2]; } } cur_opt->state = state; memcpy(cur_opt->reps, reps, sizeof(cur_opt->reps)); - is_rep_prob = enc->states.is_rep[state]; + Probability const is_rep_prob = enc->states.is_rep[state]; - { Probability is_match_prob = enc->states.is_match[state][pos_state]; - unsigned cur_byte = *data; - unsigned match_byte = *(data - reps[0] - 1); - U32 cur_price = cur_opt->price; - U32 cur_and_lit_price = cur_price + GET_PRICE_0(rc, is_match_prob) + - GetLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); - OptimalNode* next_opt = &enc->opt_buf[cur + 1]; - BYTE next_is_char = 0; - /* Try literal */ - if (cur_and_lit_price < next_opt->price) { - next_opt->price = cur_and_lit_price; - next_opt->prev_index = (unsigned)cur; - MakeAsLiteral(*next_opt); - next_is_char = 1; + { OptimalNode *const next_opt = &enc->opt_buf[cur + 1]; + U32 const cur_price = cur_opt->price; + U32 const next_price = next_opt->price; + Probability const is_match_prob = enc->states.is_match[state][pos_state]; + unsigned const cur_byte = *data; + unsigned const match_byte = *(data - reps[0] - 1); + + U32 cur_and_lit_price = cur_price + GET_PRICE_0(is_match_prob); + /* This is a compromise to try to filter out cases where literal + rep0 is unlikely to be cheaper */ + BYTE try_lit = cur_and_lit_price + kMinLitPrice / 2U <= next_price; + if (try_lit) { + /* cur_and_lit_price is used later for the literal + rep0 test */ + cur_and_lit_price += LZMA_getLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); + /* Try literal */ + if (cur_and_lit_price < next_price) { + next_opt->price = cur_and_lit_price; + next_opt->len = 1; + MARK_LITERAL(*next_opt); + if (is_hybrid) /* Evaluates as a constant expression due to inlining */ + try_lit = 0; + } } - match_price = cur_price + GET_PRICE_1(rc, is_match_prob); - rep_match_price = match_price + GET_PRICE_1(rc, is_rep_prob); + match_price = cur_price + GET_PRICE_1(is_match_prob); + rep_match_price = match_price + GET_PRICE_1(is_rep_prob); if (match_byte == cur_byte) { /* Try 1-byte rep0 */ - U32 short_rep_price = rep_match_price + GetRepLen1Price(enc, state, pos_state); + U32 short_rep_price = rep_match_price + LZMA_getRepLen1Price(enc, state, pos_state); if (short_rep_price <= next_opt->price) { next_opt->price = short_rep_price; - next_opt->prev_index = (unsigned)cur; - MakeAsShortRep(*next_opt); - next_is_char = 1; + next_opt->len = 1; + MARK_SHORT_REP(*next_opt); } } bytes_avail = MIN(block.end - index, kOptimizerBufferSize - 1 - cur); if (bytes_avail < 2) return len_end; - if (!next_is_char && match_byte != cur_byte) { + + /* If match_byte == cur_byte a rep0 begins at the current position */ + if (is_hybrid && try_lit && match_byte != cur_byte) { /* Try literal + rep0 */ - const BYTE *data_2 = data - reps[0]; + const 
BYTE *const data_2 = data - reps[0]; size_t limit = MIN(bytes_avail - 1, fast_length); size_t len_test_2 = ZSTD_count(data + 1, data_2, data + 1 + limit); if (len_test_2 >= 2) { - size_t state_2 = LiteralNextState(state); - size_t pos_state_next = (index + 1) & pos_mask; - U32 next_rep_match_price = cur_and_lit_price + - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - size_t offset = cur + 1 + len_test_2; - U32 cur_and_len_price = next_rep_match_price + GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + size_t const state_2 = LIT_NEXT_STATE(state); + size_t const pos_state_next = (index + 1) & pos_mask; + U32 const next_rep_match_price = cur_and_lit_price + + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + U32 const cur_and_len_price = next_rep_match_price + LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + size_t const offset = cur + 1 + len_test_2; if (cur_and_len_price < enc->opt_buf[offset].price) { len_end = MAX(len_end, offset); enc->opt_buf[offset].price = cur_and_len_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + 1); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 0; + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].dist = 0; + enc->opt_buf[offset].extra = 1; } } } } - max_length = MIN(bytes_avail, fast_length); - start_len = 2; + size_t const max_length = MIN(bytes_avail, fast_length); + size_t start_len = 2; + if (match.length > 0) { size_t len_test; size_t len; U32 cur_rep_price; for (size_t rep_index = 0; rep_index < kNumReps; ++rep_index) { - const BYTE *data_2 = data - reps[rep_index] - 1; + const BYTE *const data_2 = data - reps[rep_index] - 1; if (MEM_read16(data) != MEM_read16(data_2)) continue; + /* Test is limited to fast_length, but it is rare for the RMF to miss the longest match, + * therefore this function is rarely called when a rep len > fast_length exists */ len_test = ZSTD_count(data + 2, data_2 + 2, data + max_length) + 2; len_end = MAX(len_end, cur + len_test); - cur_rep_price = rep_match_price + GetRepPrice(enc, rep_index, state, pos_state); + cur_rep_price = rep_match_price + LZMA_getRepPrice(enc, rep_index, state, pos_state); len = 2; /* Try rep match */ do { - U32 cur_and_len_price = cur_rep_price + enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin]; - OptimalNode* opt = &enc->opt_buf[cur + len]; + U32 const cur_and_len_price = cur_rep_price + enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin]; + OptimalNode *const opt = &enc->opt_buf[cur + len]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(rep_index); - opt->is_combination = 0; + opt->len = (unsigned)len; + opt->dist = (U32)rep_index; + opt->extra = 0; } } while (++len <= len_test); @@ -1180,69 +1091,64 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, /* Save time by exluding normal matches not longer than the rep */ start_len = len_test + 1; } + /* rep + literal + rep0 is not common so this test is skipped for faster, non-hybrid encoding */ if (is_hybrid && len_test + 3 <= bytes_avail && MEM_read16(data + len_test + 1) == MEM_read16(data_2 + len_test + 1)) { - /* Try rep + literal + rep0 */ - size_t len_test_2 = ZSTD_count(data + len_test + 3, + /* Try rep + literal + rep0. 
+ * The second rep may be > fast_length, but it is not worth the extra time to handle this case + * and the price table is not filled for it */ + size_t const len_test_2 = ZSTD_count(data + len_test + 3, data_2 + len_test + 3, data + MIN(len_test + 1 + fast_length, bytes_avail)) + 2; - size_t state_2 = RepNextState(state); + size_t state_2 = REP_NEXT_STATE(state); size_t pos_state_next = (index + len_test) & pos_mask; U32 rep_lit_rep_total_price = - cur_rep_price + enc->states.rep_len_states.prices[pos_state][len_test - kMatchLenMin] + - GET_PRICE_0(rc, enc->states.is_match[state_2][pos_state_next]) + - GetLiteralPriceMatched(&enc->rc, GetLiteralProbs(enc, index + len_test, data[len_test - 1]), + cur_rep_price + enc->states.rep_len_states.prices[pos_state][len_test - kMatchLenMin] + + GET_PRICE_0(enc->states.is_match[state_2][pos_state_next]) + + LZMA_getLiteralPriceMatched(LITERAL_PROBS(enc, index + len_test, data[len_test - 1]), data[len_test], data_2[len_test]); - size_t offset; - state_2 = LiteralNextState(state_2); + state_2 = kState_LitAfterRep; pos_state_next = (index + len_test + 1) & pos_mask; rep_lit_rep_total_price += - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - offset = cur + len_test + 1 + len_test_2; - rep_lit_rep_total_price += GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + size_t const offset = cur + len_test + 1 + len_test_2; + rep_lit_rep_total_price += LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); if (rep_lit_rep_total_price < enc->opt_buf[offset].price) { len_end = MAX(len_end, offset); enc->opt_buf[offset].price = rep_lit_rep_total_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + len_test + 1); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 1; - enc->opt_buf[offset].prev_index_2 = (unsigned)cur; - enc->opt_buf[offset].prev_dist_2 = (U32)(rep_index); + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].dist = (U32)rep_index; + enc->opt_buf[offset].extra = (unsigned)(len_test + 1); } } } } if (match.length >= start_len && max_length >= start_len) { /* Try normal match */ - U32 normal_match_price = match_price + GET_PRICE_0(rc, is_rep_prob); + U32 const normal_match_price = match_price + GET_PRICE_0(is_rep_prob); if (!is_hybrid) { /* Normal mode - single match */ - size_t length = MIN(match.length, max_length); - size_t cur_dist = match.dist; - size_t dist_slot = GetDistSlot(match.dist); + size_t const length = MIN(match.length, max_length); + size_t const cur_dist = match.dist; + size_t const dist_slot = LZMA_getDistSlot(match.dist); size_t len_test = length; len_end = MAX(len_end, cur + length); - /* Pre-load rep0 data bytes */ -/* unsigned rep_0_bytes = MEM_read16(data - cur_dist + length); */ for (; len_test >= start_len; --len_test) { - OptimalNode *opt; U32 cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len_test); + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); - if (cur_dist < kNumFullDistances) { + if (cur_dist < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][cur_dist]; - } - else { + else cur_and_len_price += enc->dist_slot_prices[len_to_dist_state][dist_slot] + enc->align_prices[cur_dist & 
kAlignMask]; - } - opt = &enc->opt_buf[cur + len_test]; + + OptimalNode *const opt = &enc->opt_buf[cur + len_test]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(cur_dist + kNumReps); - opt->is_combination = 0; + opt->len = (unsigned)len_test; + opt->dist = (U32)(cur_dist + kNumReps); + opt->extra = 0; } else break; } @@ -1250,90 +1156,80 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, else { /* Hybrid mode */ size_t main_len; - ptrdiff_t match_index; - ptrdiff_t start_match; match.length = MIN(match.length, (U32)max_length); - if (match.length < 3 || match.dist < 256) { + /* Need to test max_length < 4 because the hash fn reads a U32 */ + if (match.length < 3 || max_length < 4) { enc->matches[0] = match; enc->match_count = 1; main_len = match.length; } else { - main_len = HashGetMatches(enc, block, index, max_length, match); - } - match_index = enc->match_count - 1; - if (main_len == max_length - && match_index > 0 - && enc->matches[match_index - 1].length == main_len) - { - --match_index; + main_len = LZMA_hashGetMatches(enc, block, index, max_length, match); } + ptrdiff_t match_index = enc->match_count - 1; len_end = MAX(len_end, cur + main_len); - start_match = 0; - while (start_len > enc->matches[start_match].length) { + + /* Start with a match longer than the best rep if one exists */ + ptrdiff_t start_match = 0; + while (start_len > enc->matches[start_match].length) ++start_match; - } + + enc->matches[start_match - 1].length = (U32)start_len - 1; /* Avoids an if..else branch in the loop. [-1] is ok */ + for (; match_index >= start_match; --match_index) { size_t len_test = enc->matches[match_index].length; - size_t cur_dist = enc->matches[match_index].dist; - size_t dist_slot = GetDistSlot((U32)cur_dist); + size_t const cur_dist = enc->matches[match_index].dist; + const BYTE *const data_2 = data - cur_dist - 1; + size_t const rep_0_pos = len_test + 1; + size_t dist_slot = LZMA_getDistSlot((U32)cur_dist); U32 cur_and_len_price; - size_t base_len = (match_index > start_match) ? 
enc->matches[match_index - 1].length + 1 : start_len; - unsigned rep_0_bytes = MEM_read16(data - cur_dist + len_test); + /* Test from the full length down to 1 more than the next shorter match */ + size_t base_len = enc->matches[match_index - 1].length + 1; for (; len_test >= base_len; --len_test) { - size_t len_to_dist_state; - OptimalNode *opt; - cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; - len_to_dist_state = GetLenToDistState(len_test); - if (cur_dist < kNumFullDistances) { + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); + if (cur_dist < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][cur_dist]; - } - else { + else cur_and_len_price += enc->dist_slot_prices[len_to_dist_state][dist_slot] + enc->align_prices[cur_dist & kAlignMask]; - } - opt = &enc->opt_buf[cur + len_test]; + + BYTE const sub_len = len_test < enc->matches[match_index].length; + + OptimalNode *const opt = &enc->opt_buf[cur + len_test]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(cur_dist + kNumReps); - opt->is_combination = 0; + opt->len = (unsigned)len_test; + opt->dist = (U32)(cur_dist + kNumReps); + opt->extra = 0; } - else if(len_test < main_len) - break; - if (len_test == enc->matches[match_index].length) { - size_t rep_0_pos = len_test + 1; - if (rep_0_pos + 2 <= bytes_avail && rep_0_bytes == MEM_read16(data + rep_0_pos)) { - /* Try match + literal + rep0 */ - const BYTE *data_2 = data - cur_dist - 1; - size_t limit = MIN(rep_0_pos + fast_length, bytes_avail); - size_t len_test_2 = ZSTD_count(data + rep_0_pos + 2, data_2 + rep_0_pos + 2, data + limit) + 2; - size_t state_2 = MatchNextState(state); - size_t pos_state_next = (index + len_test) & pos_mask; - U32 match_lit_rep_total_price = cur_and_len_price + - GET_PRICE_0(rc, enc->states.is_match[state_2][pos_state_next]) + - GetLiteralPriceMatched(&enc->rc, GetLiteralProbs(enc, index + len_test, data[len_test - 1]), - data[len_test], data_2[len_test]); - size_t offset; + else if(sub_len) + break; /* End the tests if prices for shorter lengths are not lower than those already recorded */ - state_2 = LiteralNextState(state_2); - pos_state_next = (pos_state_next + 1) & pos_mask; - match_lit_rep_total_price += - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - offset = cur + rep_0_pos + len_test_2; - match_lit_rep_total_price += GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); - if (match_lit_rep_total_price < enc->opt_buf[offset].price) { - len_end = MAX(len_end, offset); - enc->opt_buf[offset].price = match_lit_rep_total_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + rep_0_pos); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 1; - enc->opt_buf[offset].prev_index_2 = (unsigned)cur; - enc->opt_buf[offset].prev_dist_2 = (U32)(cur_dist + kNumReps); - } + if (!sub_len && rep_0_pos + 2 <= bytes_avail && MEM_read16(data + rep_0_pos) == MEM_read16(data_2 + rep_0_pos)) { + /* Try match + literal + rep0 */ + size_t const limit = MIN(rep_0_pos + fast_length, bytes_avail); + size_t const len_test_2 = ZSTD_count(data + rep_0_pos + 2, data_2 + rep_0_pos + 2, data + limit) + 2; + size_t state_2 = MATCH_NEXT_STATE(state); + size_t pos_state_next = (index + len_test) & pos_mask; + U32 match_lit_rep_total_price = cur_and_len_price + + 
GET_PRICE_0(enc->states.is_match[state_2][pos_state_next]) + + LZMA_getLiteralPriceMatched(LITERAL_PROBS(enc, index + len_test, data[len_test - 1]), + data[len_test], data_2[len_test]); + + state_2 = kState_LitAfterMatch; + pos_state_next = (pos_state_next + 1) & pos_mask; + match_lit_rep_total_price += + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + size_t const offset = cur + rep_0_pos + len_test_2; + match_lit_rep_total_price += LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + if (match_lit_rep_total_price < enc->opt_buf[offset].price) { + len_end = MAX(len_end, offset); + enc->opt_buf[offset].price = match_lit_rep_total_price; + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].extra = (unsigned)rep_0_pos; + enc->opt_buf[offset].dist = (U32)(cur_dist + kNumReps); } } } @@ -1343,92 +1239,87 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, return len_end; } -HINT_INLINE -void InitMatchesPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t pos_state, +FORCE_NOINLINE +static void LZMA_initMatchesPos0(LZMA2_ECtx *const enc, + RMF_match const match, + size_t const pos_state, size_t len, - unsigned normal_match_price) + unsigned const normal_match_price) { if ((unsigned)len <= match.length) { - size_t distance = match.dist; - size_t slot = GetDistSlot(match.dist); + size_t const distance = match.dist; + size_t const slot = LZMA_getDistSlot(match.dist); /* Test every available length of the match */ - do - { + do { unsigned cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len); - if (distance < kNumFullDistances) { + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len); + + if (distance < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; - } - else { + else cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; - } + if (cur_and_len_price < enc->opt_buf[len].price) { enc->opt_buf[len].price = cur_and_len_price; - enc->opt_buf[len].prev_index = 0; - enc->opt_buf[len].prev_dist = (U32)(distance + kNumReps); - enc->opt_buf[len].is_combination = 0; + enc->opt_buf[len].len = (unsigned)len; + enc->opt_buf[len].dist = (U32)(distance + kNumReps); + enc->opt_buf[len].extra = 0; } ++len; - } while ((unsigned)len <= match.length); + } while ((U32)len <= match.length); } } -static size_t InitMatchesPos0Best(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t index, - size_t len, - unsigned normal_match_price) +FORCE_NOINLINE +static size_t LZMA_initMatchesPos0Best(LZMA2_ECtx *const enc, FL2_dataBlock const block, + RMF_match const match, + size_t const index, + size_t start_len, + unsigned const normal_match_price) { - if (len <= match.length) { + if (start_len <= match.length) { size_t main_len; - size_t match_index; - size_t pos_state; - size_t distance; - size_t slot; - - if (match.length < 3 || match.dist < 256) { + if (match.length < 3 || block.end - index < 4) { enc->matches[0] = match; enc->match_count = 1; main_len = match.length; } else { - main_len = HashGetMatches(enc, block, index, MIN(block.end - index, enc->fast_length), match); + main_len = LZMA_hashGetMatches(enc, block, index, MIN(block.end - index, enc->fast_length), match); } - match_index = 0; - while (len > enc->matches[match_index].length) { - 
++match_index; - } - pos_state = index & enc->pos_mask; - distance = enc->matches[match_index].dist; - slot = GetDistSlot(enc->matches[match_index].dist); - /* Test every available match length at the shortest distance. The buffer is sorted */ - /* in order of increasing length, and therefore increasing distance too. */ - for (;; ++len) { - unsigned cur_and_len_price = normal_match_price - + enc->states.len_states.prices[pos_state][len - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len); - if (distance < kNumFullDistances) { - cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; - } - else { - cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; - } - if (cur_and_len_price < enc->opt_buf[len].price) { - enc->opt_buf[len].price = cur_and_len_price; - enc->opt_buf[len].prev_index = 0; - enc->opt_buf[len].prev_dist = (U32)(distance + kNumReps); - enc->opt_buf[len].is_combination = 0; - } - if (len == enc->matches[match_index].length) { - /* Run out of length for this match. Get the next if any. */ - if (len == main_len) { - break; + + ptrdiff_t start_match = 0; + while (start_len > enc->matches[start_match].length) + ++start_match; + + enc->matches[start_match - 1].length = (U32)start_len - 1; /* Avoids an if..else branch in the loop. [-1] is ok */ + + size_t pos_state = index & enc->pos_mask; + + for (ptrdiff_t match_index = enc->match_count - 1; match_index >= start_match; --match_index) { + size_t len_test = enc->matches[match_index].length; + size_t const distance = enc->matches[match_index].dist; + size_t const slot = LZMA_getDistSlot((U32)distance); + size_t const base_len = enc->matches[match_index - 1].length + 1; + /* Test every available match length at the shortest distance. The buffer is sorted */ + /* in order of increasing length, and therefore increasing distance too. */ + for (; len_test >= base_len; --len_test) { + unsigned cur_and_len_price = normal_match_price + + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); + + if (distance < kNumFullDistances) + cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; + else + cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; + + if (cur_and_len_price < enc->opt_buf[len_test].price) { + enc->opt_buf[len_test].price = cur_and_len_price; + enc->opt_buf[len_test].len = (unsigned)len_test; + enc->opt_buf[len_test].dist = (U32)(distance + kNumReps); + enc->opt_buf[len_test].extra = 0; } - ++match_index; - distance = enc->matches[match_index].dist; - slot = GetDistSlot(enc->matches[match_index].dist); + else break; } } return main_len; @@ -1441,14 +1332,14 @@ static size_t InitMatchesPos0Best(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock b * This function must not be called at a position where no match is * available. 
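 * Position 0 is seeded here: the four repeat distances are measured first,
 * and a repeat or match reaching fast_length ends the parse immediately;
 * otherwise the literal, short rep0, rep and normal match prices are all
 * written into opt_buf before the main parse loop runs.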
*/ FORCE_INLINE_TEMPLATE -size_t InitOptimizerPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t index, +size_t LZMA_initOptimizerPos0(LZMA2_ECtx *const enc, FL2_dataBlock const block, + RMF_match const match, + size_t const index, int const is_hybrid, - U32* reps) + U32* const reps) { - size_t max_length = MIN(block.end - index, kMatchLenMax); - const BYTE *data = block.data + index; + size_t const max_length = MIN(block.end - index, kMatchLenMax); + const BYTE *const data = block.data + index; const BYTE *data_2; size_t rep_max_index = 0; size_t rep_lens[kNumReps]; @@ -1462,288 +1353,352 @@ size_t InitOptimizerPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, continue; } rep_lens[i] = ZSTD_count(data + 2, data_2 + 2, data + max_length) + 2; - if (rep_lens[i] > rep_lens[rep_max_index]) { + if (rep_lens[i] > rep_lens[rep_max_index]) rep_max_index = i; - } } if (rep_lens[rep_max_index] >= enc->fast_length) { - enc->opt_buf[0].prev_index = (unsigned)(rep_lens[rep_max_index]); - enc->opt_buf[0].prev_dist = (U32)(rep_max_index); + enc->opt_buf[0].len = (unsigned)(rep_lens[rep_max_index]); + enc->opt_buf[0].dist = (U32)rep_max_index; return 0; } if (match.length >= enc->fast_length) { - enc->opt_buf[0].prev_index = match.length; - enc->opt_buf[0].prev_dist = match.dist + kNumReps; + enc->opt_buf[0].len = match.length; + enc->opt_buf[0].dist = match.dist + kNumReps; return 0; } - { unsigned cur_byte = *data; - unsigned match_byte = *(data - reps[0] - 1); - unsigned match_price; - unsigned normal_match_price; - unsigned rep_match_price; - size_t len; - size_t state = enc->states.state; - size_t pos_state = index & enc->pos_mask; - Probability is_match_prob = enc->states.is_match[state][pos_state]; - Probability is_rep_prob = enc->states.is_rep[state]; + unsigned const cur_byte = *data; + unsigned const match_byte = *(data - reps[0] - 1); + size_t const state = enc->states.state; + size_t const pos_state = index & enc->pos_mask; + Probability const is_match_prob = enc->states.is_match[state][pos_state]; + Probability const is_rep_prob = enc->states.is_rep[state]; - enc->opt_buf[0].state = state; - /* Set the price for literal */ - enc->opt_buf[1].price = GET_PRICE_0(rc, is_match_prob) + - GetLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); - MakeAsLiteral(enc->opt_buf[1]); + enc->opt_buf[0].state = state; + /* Set the price for literal */ + enc->opt_buf[1].price = GET_PRICE_0(is_match_prob) + + LZMA_getLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); + MARK_LITERAL(enc->opt_buf[1]); - match_price = GET_PRICE_1(rc, is_match_prob); - rep_match_price = match_price + GET_PRICE_1(rc, is_rep_prob); - if (match_byte == cur_byte) { - /* Try 1-byte rep0 */ - unsigned short_rep_price = rep_match_price + GetRepLen1Price(enc, state, pos_state); - if (short_rep_price < enc->opt_buf[1].price) { - enc->opt_buf[1].price = short_rep_price; - MakeAsShortRep(enc->opt_buf[1]); + unsigned const match_price = GET_PRICE_1(is_match_prob); + unsigned const rep_match_price = match_price + GET_PRICE_1(is_rep_prob); + if (match_byte == cur_byte) { + /* Try 1-byte rep0 */ + unsigned const short_rep_price = rep_match_price + LZMA_getRepLen1Price(enc, state, pos_state); + if (short_rep_price < enc->opt_buf[1].price) { + enc->opt_buf[1].price = short_rep_price; + MARK_SHORT_REP(enc->opt_buf[1]); + } + } + memcpy(enc->opt_buf[0].reps, reps, sizeof(enc->opt_buf[0].reps)); + enc->opt_buf[1].len = 1; + /* Test the rep match prices */ + for (size_t i = 0; i < 
kNumReps; ++i) { + size_t rep_len = rep_lens[i]; + if (rep_len < 2) + continue; + + unsigned const price = rep_match_price + LZMA_getRepPrice(enc, i, state, pos_state); + /* Test every available length of the rep */ + do { + unsigned const cur_and_len_price = price + enc->states.rep_len_states.prices[pos_state][rep_len - kMatchLenMin]; + if (cur_and_len_price < enc->opt_buf[rep_len].price) { + enc->opt_buf[rep_len].price = cur_and_len_price; + enc->opt_buf[rep_len].len = (unsigned)rep_len; + enc->opt_buf[rep_len].dist = (U32)i; + enc->opt_buf[rep_len].extra = 0; } - } - memcpy(enc->opt_buf[0].reps, reps, sizeof(enc->opt_buf[0].reps)); - enc->opt_buf[1].prev_index = 0; - /* Test the rep match prices */ - for (size_t i = 0; i < kNumReps; ++i) { - unsigned price; - size_t rep_len = rep_lens[i]; - if (rep_len < 2) { - continue; - } - price = rep_match_price + GetRepPrice(enc, i, state, pos_state); - /* Test every available length of the rep */ - do { - unsigned cur_and_len_price = price + enc->states.rep_len_states.prices[pos_state][rep_len - kMatchLenMin]; - if (cur_and_len_price < enc->opt_buf[rep_len].price) { - enc->opt_buf[rep_len].price = cur_and_len_price; - enc->opt_buf[rep_len].prev_index = 0; - enc->opt_buf[rep_len].prev_dist = (U32)(i); - enc->opt_buf[rep_len].is_combination = 0; - } - } while (--rep_len >= kMatchLenMin); - } - normal_match_price = match_price + GET_PRICE_0(rc, is_rep_prob); - len = (rep_lens[0] >= 2) ? rep_lens[0] + 1 : 2; - /* Test the match prices */ - if (!is_hybrid) { - /* Normal mode */ - InitMatchesPos0(enc, block, match, pos_state, len, normal_match_price); - return MAX(match.length, rep_lens[rep_max_index]); - } - else { - /* Hybrid mode */ - size_t main_len = InitMatchesPos0Best(enc, block, match, index, len, normal_match_price); - return MAX(main_len, rep_lens[rep_max_index]); - } + } while (--rep_len >= kMatchLenMin); + } + unsigned const normal_match_price = match_price + GET_PRICE_0(is_rep_prob); + size_t const len = (rep_lens[0] >= 2) ? 
rep_lens[0] + 1 : 2; + /* Test the match prices */ + if (!is_hybrid) { + /* Normal mode */ + LZMA_initMatchesPos0(enc, match, pos_state, len, normal_match_price); + return MAX(match.length, rep_lens[rep_max_index]); + } + else { + /* Hybrid mode */ + size_t main_len = LZMA_initMatchesPos0Best(enc, block, match, index, len, normal_match_price); + return MAX(main_len, rep_lens[rep_max_index]); } } FORCE_INLINE_TEMPLATE -size_t EncodeOptimumSequence(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - FL2_matchTable* tbl, - int const structTbl, +size_t LZMA_encodeOptimumSequence(LZMA2_ECtx *const enc, FL2_dataBlock const block, + FL2_matchTable* const tbl, + int const struct_tbl, int const is_hybrid, size_t start_index, - size_t uncompressed_end, - Match match) + size_t const uncompressed_end, + RMF_match match) { size_t len_end = enc->len_end_max; - unsigned search_depth = tbl->params.depth; + unsigned const search_depth = tbl->params.depth; do { - U32 reps[kNumReps]; - size_t index; - size_t cur; - unsigned prev_index; - size_t i; size_t const pos_mask = enc->pos_mask; - for (; (len_end & 3) != 0; --len_end) { + + /* Reset all prices that were set last time */ + for (; (len_end & 3) != 0; --len_end) enc->opt_buf[len_end].price = kInfinityPrice; - } for (; len_end >= 4; len_end -= 4) { enc->opt_buf[len_end].price = kInfinityPrice; enc->opt_buf[len_end - 1].price = kInfinityPrice; enc->opt_buf[len_end - 2].price = kInfinityPrice; enc->opt_buf[len_end - 3].price = kInfinityPrice; } - index = start_index; + /* Set everything up at position 0 */ - len_end = InitOptimizerPos0(enc, block, match, index, is_hybrid, reps); + size_t index = start_index; + U32 reps[kNumReps]; + len_end = LZMA_initOptimizerPos0(enc, block, match, index, is_hybrid, reps); match.length = 0; - cur = 1; + size_t cur = 1; + /* len_end == 0 if a match of fast_length was found */ if (len_end > 0) { ++index; - /* Lazy termination of the optimal parser. In the second half of the buffer */ - /* a resolution within one byte is enough */ - for (; cur < (len_end - cur / (kOptimizerBufferSize / 2U)); ++cur, ++index) { - if (enc->opt_buf[cur + 1].price < enc->opt_buf[cur].price) - continue; - match = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); - if (match.length >= enc->fast_length) { + for (; cur < len_end; ++cur, ++index) { + /* Terminate if the farthest calculated price is too near the buffer end */ + if (len_end >= kOptimizerBufferSize - kOptimizerEndSize) { + U32 price = enc->opt_buf[cur].price; + /* This is a compromise to favor more distant end points + * even if the price is a bit higher */ + U32 const delta = price / (U32)cur / 2U; + for (size_t j = cur + 1; j <= len_end; j++) { + U32 const price2 = enc->opt_buf[j].price; + if (price >= price2) { + price = price2; + cur = j; + } + price += delta; + } break; } - len_end = OptimalParse(enc, block, match, index, cur, len_end, is_hybrid, reps); - } - if (cur < len_end && match.length < enc->fast_length) { - /* Adjust the end point base on scaling up the price. 
*/ - cur += (enc->opt_buf[cur].price + enc->opt_buf[cur].price / cur) >= enc->opt_buf[cur + 1].price; - } - DEBUGLOG(6, "End optimal parse at %u", (U32)cur); - ReverseOptimalChain(enc->opt_buf, cur); - } - /* Encode the selections in the buffer */ - prev_index = 0; - i = 0; - do { - unsigned len = enc->opt_buf[i].prev_index - prev_index; - prev_index = enc->opt_buf[i].prev_index; - if (len == 1 && enc->opt_buf[i].prev_dist == kNullDist) - { - EncodeLiteralBuf(enc, block.data, start_index + i); - } - else { - size_t match_index = start_index + i; - U32 dist = enc->opt_buf[i].prev_dist; - /* The last match will be truncated to fit in the optimal buffer so get the full length */ - if (i + len >= kOptimizerBufferSize - 1 && dist >= kNumReps) { - Match lastmatch = FL2_radixGetMatch(block, tbl, search_depth, tbl->isStruct, match_index); - if (lastmatch.length > len) { - len = lastmatch.length; - dist = lastmatch.dist + kNumReps; + + /* Skip ahead if a lower or equal price is available at greater distance */ + size_t const end = MIN(cur + kOptimizerSkipSize, len_end); + U32 price = enc->opt_buf[cur].price; + for (size_t j = cur + 1; j <= end; j++) { + U32 const price2 = enc->opt_buf[j].price; + if (price >= price2) { + price = price2; + index += j - cur; + cur = j; + if (cur == len_end) + goto reverse; } } - if (dist < kNumReps) { - EncodeRepMatch(enc, len, dist, match_index & pos_mask); + + match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); + if (match.length >= enc->fast_length) + break; + + len_end = LZMA_optimalParse(enc, block, match, index, cur, len_end, is_hybrid, reps); + } +reverse: + DEBUGLOG(6, "End optimal parse at %u", (U32)cur); + LZMA_reverseOptimalChain(enc->opt_buf, cur); + } + /* Encode the selections in the buffer */ + size_t i = 0; + do { + unsigned const len = enc->opt_buf[i].len; + + if (len == 1 && enc->opt_buf[i].dist == kNullDist) { + LZMA_encodeLiteralBuf(enc, block.data, start_index + i); + ++i; + } + else { + size_t const pos_state = (start_index + i) & pos_mask; + U32 const dist = enc->opt_buf[i].dist; + /* Updating i separately for each case may allow a branch to be eliminated */ + if (dist >= kNumReps) { + LZMA_encodeNormalMatch(enc, len, dist - kNumReps, pos_state); + i += len; + } + else if(len == 1) { + LZMA_encodeRepMatchShort(enc, pos_state); + ++i; } else { - EncodeNormalMatch(enc, len, dist - kNumReps, match_index & pos_mask); + LZMA_encodeRepMatchLong(enc, len, dist, pos_state); + i += len; } } - i += len; } while (i < cur); start_index += i; - /* Do another round if there is a long match pending, because the reps must be checked */ - /* and the match encoded. */ + /* Do another round if there is a long match pending, + * because the reps must be checked and the match encoded. 
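+         * A match of fast_length or more ends the optimal parse early, so the
+         * next round restarts at that position with the match still pending.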
*/ } while (match.length >= enc->fast_length && start_index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size); enc->len_end_max = len_end; return start_index; } -static void UpdateLengthPrices(FL2_lzmaEncoderCtx* enc, LengthStates* len_states) +static void FORCE_NOINLINE LZMA_fillAlignPrices(LZMA2_ECtx *const enc) { - for (size_t pos_state = 0; pos_state <= enc->pos_mask; ++pos_state) { - LengthStates_SetPrices(&enc->rc, len_states, pos_state); + unsigned i; + const Probability *const probs = enc->states.dist_align_encoders; + for (i = 0; i < kAlignTableSize / 2; i++) { + U32 price = 0; + unsigned sym = i; + unsigned m = 1; + unsigned bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + U32 const prob = probs[m]; + enc->align_prices[i] = price + GET_PRICE_0(prob); + enc->align_prices[i + 8] = price + GET_PRICE_1(prob); } } -static void FillAlignPrices(FL2_lzmaEncoderCtx* enc) +static void FORCE_NOINLINE LZMA_fillDistancesPrices(LZMA2_ECtx *const enc) { - for (size_t i = 0; i < kAlignTableSize; ++i) { - enc->align_prices[i] = GetReverseTreePrice(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, i); - } - enc->align_price_count = 0; -} + U32 * const temp_prices = enc->distance_prices[kNumLenToPosStates - 1]; -static void FillDistancesPrices(FL2_lzmaEncoderCtx* enc) -{ - static const size_t kLastLenToPosState = kNumLenToPosStates - 1; - for (size_t i = kStartPosModelIndex; i < kNumFullDistances; ++i) { - size_t dist_slot = distance_table[i]; - unsigned footerBits = (unsigned)((dist_slot >> 1) - 1); - size_t base = ((2 | (dist_slot & 1)) << footerBits); - enc->distance_prices[kLastLenToPosState][i] = GetReverseTreePrice(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, - footerBits, - i - base); - } - for (size_t lenToPosState = 0; lenToPosState < kNumLenToPosStates; ++lenToPosState) { - const Probability* encoder = enc->states.dist_slot_encoders[lenToPosState]; - for (size_t dist_slot = 0; dist_slot < enc->dist_price_table_size; ++dist_slot) { - enc->dist_slot_prices[lenToPosState][dist_slot] = GetTreePrice(&enc->rc, encoder, kNumPosSlotBits, dist_slot); - } - for (size_t dist_slot = kEndPosModelIndex; dist_slot < enc->dist_price_table_size; ++dist_slot) { - enc->dist_slot_prices[lenToPosState][dist_slot] += (((unsigned)(dist_slot >> 1) - 1) - kNumAlignBits) << kNumBitPriceShiftBits; - } - size_t i = 0; - for (; i < kStartPosModelIndex; ++i) { - enc->distance_prices[lenToPosState][i] = enc->dist_slot_prices[lenToPosState][i]; - } - for (; i < kNumFullDistances; ++i) { - enc->distance_prices[lenToPosState][i] = enc->dist_slot_prices[lenToPosState][distance_table[i]] - + enc->distance_prices[kLastLenToPosState][i]; - } - } enc->match_price_count = 0; + + for (size_t i = kStartPosModelIndex / 2; i < kNumFullDistances / 2; i++) { + unsigned const dist_slot = distance_table[i]; + unsigned footer_bits = (dist_slot >> 1) - 1; + size_t base = ((2 | (dist_slot & 1)) << footer_bits); + const Probability *probs = enc->states.dist_encoders + base * 2U; + base += i; + probs = probs - distance_table[base] - 1; + U32 price = 0; + unsigned m = 1; + unsigned sym = (unsigned)i; + unsigned const offset = (unsigned)1 << footer_bits; + + for (; footer_bits != 0; --footer_bits) { + unsigned bit = sym & 1; + sym >>= 1; + price += GET_PRICE(probs[m], bit); + m = (m << 1) + bit; + }; + + 
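+        /* The loop above prices the low footer bits through the reverse bit tree;
+         * the last probability below covers the top footer bit, so each pass fills
+         * the prices for a pair of distances at once. */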
unsigned const prob = probs[m]; + temp_prices[base] = price + GET_PRICE_0(prob); + temp_prices[base + offset] = price + GET_PRICE_1(prob); + } + + for (unsigned lps = 0; lps < kNumLenToPosStates; lps++) { + size_t slot; + size_t const dist_table_size2 = (enc->dist_price_table_size + 1) >> 1; + U32 *const dist_slot_prices = enc->dist_slot_prices[lps]; + const Probability *const probs = enc->states.dist_slot_encoders[lps]; + + for (slot = 0; slot < dist_table_size2; slot++) { + /* dist_slot_prices[slot] = RcTree_GetPrice(encoder, kNumPosSlotBits, slot, p->ProbPrices); */ + U32 price; + unsigned bit; + unsigned sym = (unsigned)slot + (1 << (kNumPosSlotBits - 1)); + bit = sym & 1; sym >>= 1; price = GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + unsigned const prob = probs[slot + (1 << (kNumPosSlotBits - 1))]; + dist_slot_prices[slot * 2] = price + GET_PRICE_0(prob); + dist_slot_prices[slot * 2 + 1] = price + GET_PRICE_1(prob); + } + + { + U32 delta = ((U32)((kEndPosModelIndex / 2 - 1) - kNumAlignBits) << kNumBitPriceShiftBits); + for (slot = kEndPosModelIndex / 2; slot < dist_table_size2; slot++) { + dist_slot_prices[slot * 2] += delta; + dist_slot_prices[slot * 2 + 1] += delta; + delta += ((U32)1 << kNumBitPriceShiftBits); + } + } + + { + U32 *const dp = enc->distance_prices[lps]; + + dp[0] = dist_slot_prices[0]; + dp[1] = dist_slot_prices[1]; + dp[2] = dist_slot_prices[2]; + dp[3] = dist_slot_prices[3]; + + for (size_t i = 4; i < kNumFullDistances; i += 2) { + U32 slot_price = dist_slot_prices[distance_table[i]]; + dp[i] = slot_price + temp_prices[i]; + dp[i + 1] = slot_price + temp_prices[i + 1]; + } + } + } } FORCE_INLINE_TEMPLATE -size_t EncodeChunkBest(FL2_lzmaEncoderCtx* enc, +size_t LZMA_encodeChunkBest(LZMA2_ECtx *const enc, FL2_dataBlock const block, - FL2_matchTable* tbl, - int const structTbl, + FL2_matchTable* const tbl, + int const struct_tbl, size_t index, - size_t uncompressed_end) + size_t const uncompressed_end) { - unsigned search_depth = tbl->params.depth; - FillDistancesPrices(enc); - FillAlignPrices(enc); - UpdateLengthPrices(enc, &enc->states.len_states); - UpdateLengthPrices(enc, &enc->states.rep_len_states); + unsigned const search_depth = tbl->params.depth; + LZMA_fillDistancesPrices(enc); + LZMA_fillAlignPrices(enc); + LZMA_lengthStates_updatePrices(enc, &enc->states.len_states); + LZMA_lengthStates_updatePrices(enc, &enc->states.rep_len_states); while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) { - Match match = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); + RMF_match const match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); if (match.length > 1) { - if (enc->strategy != FL2_ultra) { - index = EncodeOptimumSequence(enc, block, tbl, structTbl, 0, index, uncompressed_end, match); + /* Template-like inline function */ + if (enc->strategy == FL2_ultra) { + index = LZMA_encodeOptimumSequence(enc, block, tbl, struct_tbl, 1, index, uncompressed_end, match); } else { - index = EncodeOptimumSequence(enc, block, tbl, structTbl, 1, index, uncompressed_end, match); + index = LZMA_encodeOptimumSequence(enc, block, tbl, struct_tbl, 0, index, uncompressed_end, match); } - if (enc->match_price_count >= kDistanceRepriceFrequency) { - FillDistancesPrices(enc); + if 
(enc->match_price_count >= kMatchRepriceFrequency) { + LZMA_fillAlignPrices(enc); + LZMA_fillDistancesPrices(enc); + LZMA_lengthStates_updatePrices(enc, &enc->states.len_states); } - if (enc->align_price_count >= kAlignRepriceFrequency) { - FillAlignPrices(enc); + if (enc->rep_len_price_count >= kRepLenRepriceFrequency) { + enc->rep_len_price_count = 0; + LZMA_lengthStates_updatePrices(enc, &enc->states.rep_len_states); } } else { - if (block.data[index] == block.data[index - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, index & enc->pos_mask); + if (block.data[index] != block.data[index - enc->states.reps[0] - 1]) { + LZMA_encodeLiteralBuf(enc, block.data, index); + ++index; } else { - EncodeLiteralBuf(enc, block.data, index); + LZMA_encodeRepMatchShort(enc, index & enc->pos_mask); + ++index; } - ++index; } } - Flush(&enc->rc); return index; } -static void LengthStates_Reset(LengthStates* ls, unsigned fast_length) +static void LZMA_lengthStates_Reset(LengthStates* const ls, unsigned const fast_length) { ls->choice = kProbInitValue; - ls->choice_2 = kProbInitValue; - for (size_t i = 0; i < (kNumPositionStatesMax << kLenNumLowBits); ++i) { + + for (size_t i = 0; i < (kNumPositionStatesMax << (kLenNumLowBits + 1)); ++i) ls->low[i] = kProbInitValue; - } - for (size_t i = 0; i < (kNumPositionStatesMax << kLenNumMidBits); ++i) { - ls->mid[i] = kProbInitValue; - } - for (size_t i = 0; i < kLenNumHighSymbols; ++i) { + + for (size_t i = 0; i < kLenNumHighSymbols; ++i) ls->high[i] = kProbInitValue; - } + ls->table_size = fast_length + 1 - kMatchLenMin; } -static void EncoderStates_Reset(EncoderStates* es, unsigned lc, unsigned lp, unsigned fast_length) +static void LZMA_encoderStates_Reset(EncoderStates* const es, unsigned const lc, unsigned const lp, unsigned fast_length) { es->state = 0; - for (size_t i = 0; i < kNumReps; ++i) { + + for (size_t i = 0; i < kNumReps; ++i) es->reps[i] = 0; - } + for (size_t i = 0; i < kNumStates; ++i) { for (size_t j = 0; j < kNumPositionStatesMax; ++j) { es->is_match[i][j] = kProbInitValue; @@ -1754,27 +1709,26 @@ static void EncoderStates_Reset(EncoderStates* es, unsigned lc, unsigned lp, uns es->is_rep_G1[i] = kProbInitValue; es->is_rep_G2[i] = kProbInitValue; } - size_t num = (size_t)(kNumLiterals * kNumLitTables) << (lp + lc); - for (size_t i = 0; i < num; ++i) { + size_t const num = (size_t)(kNumLiterals * kNumLitTables) << (lp + lc); + for (size_t i = 0; i < num; ++i) es->literal_probs[i] = kProbInitValue; - } + for (size_t i = 0; i < kNumLenToPosStates; ++i) { Probability *probs = es->dist_slot_encoders[i]; - for (size_t j = 0; j < (1 << kNumPosSlotBits); ++j) { + for (size_t j = 0; j < (1 << kNumPosSlotBits); ++j) probs[j] = kProbInitValue; - } } - for (size_t i = 0; i < kNumFullDistances - kEndPosModelIndex; ++i) { + for (size_t i = 0; i < kNumFullDistances - kEndPosModelIndex; ++i) es->dist_encoders[i] = kProbInitValue; - } - LengthStates_Reset(&es->len_states, fast_length); - LengthStates_Reset(&es->rep_len_states, fast_length); - for (size_t i = 0; i < (1 << kNumAlignBits); ++i) { + + LZMA_lengthStates_Reset(&es->len_states, fast_length); + LZMA_lengthStates_Reset(&es->rep_len_states, fast_length); + + for (size_t i = 0; i < (1 << kNumAlignBits); ++i) es->dist_align_encoders[i] = kProbInitValue; - } } -BYTE FL2_getDictSizeProp(size_t dictionary_size) +BYTE LZMA2_getDictSizeProp(size_t const dictionary_size) { BYTE dict_size_prop = 0; for (BYTE bit = 11; bit < 32; ++bit) { @@ -1790,34 +1744,54 @@ BYTE FL2_getDictSizeProp(size_t dictionary_size) 
return dict_size_prop; } -size_t FL2_lzma2MemoryUsage(unsigned chain_log, FL2_strategy strategy, unsigned thread_count) +size_t LZMA2_encMemoryUsage(unsigned const chain_log, FL2_strategy const strategy, unsigned const thread_count) { - size_t size = sizeof(FL2_lzmaEncoderCtx) + kChunkBufferSize; + size_t size = sizeof(LZMA2_ECtx); if(strategy == FL2_ultra) size += sizeof(HashChains) + (sizeof(U32) << chain_log) - sizeof(U32); return size * thread_count; } -static void Reset(FL2_lzmaEncoderCtx* enc, size_t max_distance) +static void LZMA2_reset(LZMA2_ECtx *const enc, size_t const max_distance) { DEBUGLOG(5, "LZMA encoder reset : max_distance %u", (unsigned)max_distance); - U32 i = 0; - RangeEncReset(&enc->rc); - EncoderStates_Reset(&enc->states, enc->lc, enc->lp, enc->fast_length); + RC_reset(&enc->rc); + LZMA_encoderStates_Reset(&enc->states, enc->lc, enc->lp, enc->fast_length); enc->pos_mask = (1 << enc->pb) - 1; enc->lit_pos_mask = (1 << enc->lp) - 1; + U32 i = 0; for (; max_distance > (size_t)1 << i; ++i) { } enc->dist_price_table_size = i * 2; + enc->rep_len_price_count = 0; + enc->match_price_count = 0; } -static BYTE GetLcLpPbCode(FL2_lzmaEncoderCtx* enc) +static BYTE LZMA_getLcLpPbCode(LZMA2_ECtx *const enc) { return (BYTE)((enc->pb * 5 + enc->lp) * 9 + enc->lc); } -BYTE IsChunkRandom(const FL2_matchTable* const tbl, - const FL2_dataBlock block, size_t const start, +/* Integer square root from https://stackoverflow.com/a/1101217 */ +static U32 LZMA2_isqrt(U32 op) +{ + U32 res = 0; + /* "one" starts at the highest power of four <= than the argument. */ + U32 one = (U32)1 << (ZSTD_highbit32(op) & ~1); + + while (one != 0) { + if (op >= res + one) { + op -= res + one; + res = res + 2U * one; + } + res >>= 1; + one >>= 2; + } + return res; +} + +static BYTE LZMA2_chunkNotCompressible(const FL2_matchTable* const tbl, + FL2_dataBlock const block, size_t const start, unsigned const strategy) { if (block.end - start >= kMinTestChunkSize) { @@ -1826,7 +1800,7 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, { 0, 0, 1U << 6, 1U << 14, 1U << 22 }, /* opt */ { 0, 0, 1U << 6, 1U << 14, 1U << 22 } }; /* ultra */ static const size_t margin_divisor[3] = { 60U, 45U, 120U }; - static const double dev_table[3] = { 6.0, 6.0, 5.0 }; + static const U32 dev_table[3] = { 24, 24, 20}; size_t const end = MIN(start + kChunkSize, block.end); size_t const chunk_size = end - start; @@ -1834,7 +1808,7 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, size_t const margin = chunk_size / margin_divisor[strategy]; size_t const terminator = start + margin; - if (tbl->isStruct) { + if (tbl->is_struct) { size_t prev_dist = 0; for (size_t index = start; index < end; ) { U32 const link = GetMatchLink(tbl->table, index); @@ -1844,12 +1818,16 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, prev_dist = 0; } else { - size_t length = GetMatchLength(tbl->table, index); - size_t dist = index - GetMatchLink(tbl->table, index); - if (length > 4) - count += dist != prev_dist; - else - count += (dist < max_dist_table[strategy][length]) ? 1 : length; + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - GetMatchLink(tbl->table, index); + if (length > 4) { + /* Increase the cost if it's not the same match */ + count += dist != prev_dist; + } + else { + /* Increment the cost for a short match. The cost is the entire length if it's too far */ + count += (dist < max_dist_table[strategy][length]) ? 
1 : length; + } index += length; prev_dist = dist; } @@ -1867,8 +1845,8 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, prev_dist = 0; } else { - size_t length = link >> RADIX_LINK_BITS; - size_t dist = index - (link & RADIX_LINK_MASK); + size_t const length = link >> RADIX_LINK_BITS; + size_t const dist = index - (link & RADIX_LINK_MASK); if (length > 4) count += dist != prev_dist; else @@ -1881,166 +1859,187 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, } } - { U32 char_count[256]; - double char_total = 0.0; - /* Expected normal character count */ - double const avg = (double)chunk_size / 256.0; + U32 char_count[256]; + U32 char_total = 0; + /* Expected normal character count * 4 */ + U32 const avg = (U32)(chunk_size / 64U); - memset(char_count, 0, sizeof(char_count)); - for (size_t index = start; index < end; ++index) - ++char_count[block.data[index]]; - /* Sum the deviations */ - for (size_t i = 0; i < 256; ++i) { - double delta = (double)char_count[i] - avg; - char_total += delta * delta; - } - return sqrt(char_total) / sqrt((double)chunk_size) <= dev_table[strategy]; - } + memset(char_count, 0, sizeof(char_count)); + for (size_t index = start; index < end; ++index) + char_count[block.data[index]] += 4; + /* Sum the deviations */ + for (size_t i = 0; i < 256; ++i) { + S32 delta = char_count[i] - avg; + char_total += delta * delta; + } + U32 sqrt_chunk = (chunk_size == kChunkSize) ? kSqrtChunkSize : LZMA2_isqrt((U32)chunk_size); + /* Result base on character count std dev */ + return LZMA2_isqrt(char_total) / sqrt_chunk <= dev_table[strategy]; } return 0; } -#ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#else -__pragma(warning(disable:4701)) -#endif +static size_t LZMA2_encodeChunk(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + size_t const index, size_t const end) +{ + /* Template-like inline functions */ + if (enc->strategy == FL2_fast) { + if (tbl->is_struct) { + return LZMA_encodeChunkFast(enc, block, tbl, 1, + index, end); + } + else { + return LZMA_encodeChunkFast(enc, block, tbl, 0, + index, end); + } + } + else { + if (tbl->is_struct) { + return LZMA_encodeChunkBest(enc, block, tbl, 1, + index, end); + } + else { + return LZMA_encodeChunkBest(enc, block, tbl, 0, + index, end); + } + } +} -size_t FL2_lzma2Encode(FL2_lzmaEncoderCtx* enc, - FL2_matchTable* tbl, - const FL2_dataBlock block, - const FL2_lzma2Parameters* options, - FL2_progressFn progress, void* opaque, size_t base, U32 weight) +size_t LZMA2_encode(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + const FL2_lzma2Parameters* const options, + int stream_prop, + FL2_atomic *const progress_in, + FL2_atomic *const progress_out, + int *const canceled) { size_t const start = block.start; BYTE* out_dest = enc->out_buf; /* Each encoder writes a properties byte because the upstream encoder(s) could */ /* write only uncompressed chunks with no properties. 
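   Only a compressed chunk whose control byte signals a state + properties
   reset carries the lc/lp/pb byte, so this encoder sends it with its first
   compressed chunk.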
*/ BYTE encode_properties = 1; - BYTE next_is_random = 0; + BYTE not_compressible = 0; - if (block.end <= block.start) { + if (block.end <= block.start) return 0; - } + enc->lc = options->lc; - enc->lp = options->lp; - if (enc->lc + enc->lp > 4) { - enc->lc = 3; - enc->lp = 0; - } + enc->lp = MIN(options->lp, 4); + + if (enc->lc + enc->lp > 4) + enc->lc = 4 - enc->lp; + enc->pb = options->pb; enc->strategy = options->strategy; enc->fast_length = options->fast_length; - enc->match_cycles = options->match_cycles; - Reset(enc, block.end); + enc->match_cycles = MIN(options->match_cycles, kMatchesMax - 1); + + LZMA2_reset(enc, block.end); + if (enc->strategy == FL2_ultra) { /* Create a hash chain to put the encoder into hybrid mode */ if (enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) { - if(HashCreate(enc, options->second_dict_bits) != 0) + if(LZMA_hashCreate(enc, options->second_dict_bits) != 0) return FL2_ERROR(memory_allocation); } else { - HashReset(enc, options->second_dict_bits); + LZMA_hashReset(enc, options->second_dict_bits); } - enc->hash_prev_index = (start >= (size_t)enc->hash_dict_3) ? start - enc->hash_dict_3 : -1; + enc->hash_prev_index = (start >= (size_t)enc->hash_dict_3) ? (ptrdiff_t)(start - enc->hash_dict_3) : (ptrdiff_t)-1; } enc->len_end_max = kOptimizerBufferSize - 1; RMF_limitLengths(tbl, block.end); for (size_t index = start; index < block.end;) { - unsigned header_size = encode_properties ? kChunkHeaderSize + 1 : kChunkHeaderSize; + size_t header_size = (stream_prop >= 0) + (encode_properties ? kChunkHeaderSize + 1 : kChunkHeaderSize); EncoderStates saved_states; size_t next_index; - size_t compressed_size; - size_t uncompressed_size; - RangeEncReset(&enc->rc); - SetOutputBuffer(&enc->rc, out_dest + header_size, kChunkSize); - if (!next_is_random) { + RC_reset(&enc->rc); + RC_setOutputBuffer(&enc->rc, out_dest + header_size, kChunkSize); + if (!not_compressible) { + size_t cur = index; + size_t const end = (enc->strategy == FL2_fast) ? MIN(block.end, index + kMaxChunkUncompressedSize) + : MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize); saved_states = enc->states; if (index == 0) { - EncodeLiteral(enc, 0, block.data[0], 0); + /* First byte of the dictionary */ + LZMA_encodeLiteral(enc, 0, block.data[0], 0); + ++cur; } - if (enc->strategy == FL2_fast) { - if (tbl->isStruct) { - next_index = EncodeChunkFast(enc, block, tbl, 1, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize)); - } - else { - next_index = EncodeChunkFast(enc, block, tbl, 0, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize)); - } - } - else { - if (tbl->isStruct) { - next_index = EncodeChunkBest(enc, block, tbl, 1, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize)); - } - else { - next_index = EncodeChunkBest(enc, block, tbl, 0, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize)); - } + if (index == start) { + /* After four bytes we can write data to the match table because the */ + /* compressed data will never catch up with the table position being read. 
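+             * The first chunk therefore starts in the small temporary buffer and is
+             * copied into the match table once it is safe to write there.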
*/ + enc->rc.chunk_size = kTempMinOutput; + cur = LZMA2_encodeChunk(enc, tbl, block, cur, end); + enc->rc.chunk_size = kChunkSize; + out_dest = RMF_getTableAsOutputBuffer(tbl, start); + memcpy(out_dest, enc->out_buf, header_size + enc->rc.out_index); + enc->rc.out_buffer = out_dest + header_size; } + next_index = LZMA2_encodeChunk(enc, tbl, block, cur, end); + RC_flush(&enc->rc); } else { next_index = MIN(index + kChunkSize, block.end); } - compressed_size = enc->rc.out_index; - uncompressed_size = next_index - index; - out_dest[1] = (BYTE)((uncompressed_size - 1) >> 8); - out_dest[2] = (BYTE)(uncompressed_size - 1); + size_t compressed_size = enc->rc.out_index; + size_t uncompressed_size = next_index - index; + + if (compressed_size > kMaxChunkCompressedSize) + return FL2_ERROR(internal); + + BYTE* header = out_dest; + + if (stream_prop >= 0) + *header++ = (BYTE)stream_prop; + stream_prop = -1; + + header[1] = (BYTE)((uncompressed_size - 1) >> 8); + header[2] = (BYTE)(uncompressed_size - 1); /* Output an uncompressed chunk if necessary */ - if (next_is_random || uncompressed_size + 3 <= compressed_size + header_size) { - DEBUGLOG(5, "Storing chunk : was %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); - if (index == 0) { - out_dest[0] = kChunkUncompressedDictReset; - } - else { - out_dest[0] = kChunkUncompressed; - } - memcpy(out_dest + 3, block.data + index, uncompressed_size); + if (not_compressible || uncompressed_size + 3 <= compressed_size + header_size) { + DEBUGLOG(6, "Storing chunk : was %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); + + header[0] = (index == 0) ? kChunkUncompressedDictReset : kChunkUncompressed; + + /* Copy uncompressed data into the output */ + memcpy(header + 3, block.data + index, uncompressed_size); + compressed_size = uncompressed_size; - header_size = 3; - if (!next_is_random) { + header_size = 3 + (header - out_dest); + if (!not_compressible) enc->states = saved_states; - } } else { - DEBUGLOG(5, "Compressed chunk : %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); - if (index == 0) { - out_dest[0] = kChunkCompressedFlag | kChunkAllReset; - } - else if (encode_properties) { - out_dest[0] = kChunkCompressedFlag | kChunkStatePropertiesReset; - } - else { - out_dest[0] = kChunkCompressedFlag | kChunkNothingReset; - } - out_dest[0] |= (BYTE)((uncompressed_size - 1) >> 16); - out_dest[3] = (BYTE)((compressed_size - 1) >> 8); - out_dest[4] = (BYTE)(compressed_size - 1); + DEBUGLOG(6, "Compressed chunk : %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); + + if (index == 0) + header[0] = kChunkCompressedFlag | kChunkAllReset; + else if (encode_properties) + header[0] = kChunkCompressedFlag | kChunkStatePropertiesReset; + else + header[0] = kChunkCompressedFlag | kChunkNothingReset; + + header[0] |= (BYTE)((uncompressed_size - 1) >> 16); + header[3] = (BYTE)((compressed_size - 1) >> 8); + header[4] = (BYTE)(compressed_size - 1); if (encode_properties) { - out_dest[5] = GetLcLpPbCode(enc); + header[5] = LZMA_getLcLpPbCode(enc); encode_properties = 0; } } - if (next_is_random || uncompressed_size + 3 <= compressed_size + (compressed_size >> kRandomFilterMarginBits) + header_size) - { + if (not_compressible || uncompressed_size + 3 <= compressed_size + (compressed_size >> kRandomFilterMarginBits) + header_size) { /* Test the next chunk for compressibility */ - next_is_random = IsChunkRandom(tbl, block, next_index, enc->strategy); - } - if (index == start) { - /* After the first chunk we can 
write data to the match table because the */ - /* compressed data will never catch up with the table position being read. */ - out_dest = RMF_getTableAsOutputBuffer(tbl, start); - memcpy(out_dest, enc->out_buf, compressed_size + header_size); + not_compressible = LZMA2_chunkNotCompressible(tbl, block, next_index, enc->strategy); } out_dest += compressed_size + header_size; + FL2_atomic_add(*progress_in, (long)(next_index - index)); + FL2_atomic_add(*progress_out, (long)(compressed_size + header_size)); index = next_index; - if (progress && progress(base + (((index - start) * weight) >> 4), opaque) != 0) + if (*canceled) return FL2_ERROR(canceled); } return out_dest - RMF_getTableAsOutputBuffer(tbl, start); diff --git a/C/fast-lzma2/lzma2_enc.h b/C/fast-lzma2/lzma2_enc.h index 9fbda523..aa821f78 100644 --- a/C/fast-lzma2/lzma2_enc.h +++ b/C/fast-lzma2/lzma2_enc.h @@ -10,6 +10,7 @@ Public domain #include "mem.h" #include "data_block.h" #include "radix_mf.h" +#include "atomic.h" #if defined (__cplusplus) extern "C" { @@ -19,14 +20,10 @@ extern "C" { #define LZMA2_END_MARKER '\0' #define LZMA_MIN_DICT_BITS 12 +#define ENC_MIN_BYTES_PER_THREAD 0x20000 -typedef struct FL2_lzmaEncoderCtx_s FL2_lzmaEncoderCtx; -typedef enum { - FL2_fast, - FL2_opt, - FL2_ultra -} FL2_strategy; +typedef struct LZMA2_ECtx_s LZMA2_ECtx; typedef struct { @@ -37,25 +34,28 @@ typedef struct unsigned match_cycles; FL2_strategy strategy; unsigned second_dict_bits; - unsigned random_filter; + unsigned reset_interval; } FL2_lzma2Parameters; -FL2_lzmaEncoderCtx* FL2_lzma2Create(); +LZMA2_ECtx* LZMA2_createECtx(void); -void FL2_lzma2Free(FL2_lzmaEncoderCtx* enc); +void LZMA2_freeECtx(LZMA2_ECtx *const enc); -int FL2_lzma2HashAlloc(FL2_lzmaEncoderCtx* enc, const FL2_lzma2Parameters* options); +int LZMA2_hashAlloc(LZMA2_ECtx *const enc, const FL2_lzma2Parameters* const options); -size_t FL2_lzma2Encode(FL2_lzmaEncoderCtx* enc, - FL2_matchTable* tbl, - const FL2_dataBlock block, - const FL2_lzma2Parameters* options, - FL2_progressFn progress, void* opaque, size_t base, U32 weight); +size_t LZMA2_encode(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + const FL2_lzma2Parameters* const options, + int stream_prop, + FL2_atomic *const progress_in, + FL2_atomic *const progress_out, + int *const canceled); -BYTE FL2_getDictSizeProp(size_t dictionary_size); +BYTE LZMA2_getDictSizeProp(size_t const dictionary_size); -size_t FL2_lzma2MemoryUsage(unsigned chain_log, FL2_strategy strategy, unsigned thread_count); +size_t LZMA2_encMemoryUsage(unsigned const chain_log, FL2_strategy const strategy, unsigned const thread_count); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/mem.h b/C/fast-lzma2/mem.h index f54a45ce..5da24875 100644 --- a/C/fast-lzma2/mem.h +++ b/C/fast-lzma2/mem.h @@ -28,9 +28,6 @@ extern "C" { #if defined(_MSC_VER) /* Visual Studio */ # include /* _byteswap_ulong */ # include /* _byteswap_* */ -# pragma warning(disable : 4389) /* disable: C4389: '==' : signed/unsigned mismatch */ -#endif - #endif #if defined(__GNUC__) # define MEM_STATIC static __inline __attribute__((unused)) @@ -42,6 +39,10 @@ extern "C" { # define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ #endif +#ifndef __has_builtin +# define __has_builtin(x) 0 /* compat. 
with non-clang compilers */ +#endif + /* code only tested on 32 and 64 bits systems */ #define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } @@ -60,11 +61,23 @@ MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (size typedef uint64_t U64; typedef int64_t S64; #else +# include +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif typedef unsigned short U16; typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif typedef unsigned int U32; typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, is preferred */ typedef unsigned long long U64; typedef signed long long S64; #endif @@ -189,7 +202,8 @@ MEM_STATIC U32 MEM_swap32(U32 in) { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_ulong(in); -#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) return __builtin_bswap32(in); #else return ((in << 24) & 0xff000000 ) | @@ -203,7 +217,8 @@ MEM_STATIC U64 MEM_swap64(U64 in) { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_uint64(in); -#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) return __builtin_bswap64(in); #else return ((in << 56) & 0xff00000000000000ULL) | diff --git a/C/fast-lzma2/platform.h b/C/fast-lzma2/platform.h index a4d7850f..155ebcd1 100644 --- a/C/fast-lzma2/platform.h +++ b/C/fast-lzma2/platform.h @@ -21,10 +21,10 @@ extern "C" { * Compiler Options ****************************************/ #if defined(_MSC_VER) -# define _CRT_SECURE_NO_WARNINGS /* Disable Visual Studio warning messages for fopen, strncpy, strerror */ -# define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before and */ -# if (_MSC_VER <= 1800) /* (1800 = Visual Studio 2013) */ -# define snprintf sprintf_s /* snprintf unsupported by Visual <= 2013 */ +# define _CRT_SECURE_NO_WARNINGS /* Disable Visual Studio warning messages for fopen, strncpy, strerror */ +# if (_MSC_VER <= 1800) /* 1800 == Visual Studio 2013 */ +# define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before and */ +# define snprintf sprintf_s /* snprintf unsupported by Visual <= 2013 */ # endif #endif @@ -50,53 +50,70 @@ extern "C" { /* ********************************************************* * Turn on Large Files support (>4GB) for 32-bit Linux/Unix ***********************************************************/ -#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ +#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ # if !defined(_FILE_OFFSET_BITS) -# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ +# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ # endif -# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with 
_FILE_OFFSET_BITS */ -# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ +# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */ +# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ # endif # if defined(_AIX) || defined(__hpux) -# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ +# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ # endif #endif /* ************************************************************ * Detect POSIX version -* PLATFORM_POSIX_VERSION = -1 for non-Unix e.g. Windows -* PLATFORM_POSIX_VERSION = 0 for Unix-like non-POSIX -* PLATFORM_POSIX_VERSION >= 1 is equal to found _POSIX_VERSION +* PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows +* PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX +* PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION +* Value of PLATFORM_POSIX_VERSION can be forced on command line ***************************************************************/ -#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \ - || defined(__midipix__) || defined(__VMS)) -# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1–2001 (SUSv3) conformant */ \ +#ifndef PLATFORM_POSIX_VERSION + +# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \ || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* BSD distros */ + /* exception rule : force posix version to 200112L, + * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */ # define PLATFORM_POSIX_VERSION 200112L -# else + +/* try to determine posix version through official unistd.h's _POSIX_VERSION (http://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html). + * note : there is no simple way to know in advance if is present or not on target system, + * Posix specification mandates its presence and its content, but target system must respect this spec. + * It's necessary to _not_ #include whenever target OS is not unix-like + * otherwise it will block preprocessing stage. 
+ * The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include + */ +# elif !defined(_WIN32) \ + && (defined(__unix__) || defined(__unix) \ + || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__)) + # if defined(__linux__) || defined(__linux) # ifndef _POSIX_C_SOURCE -# define _POSIX_C_SOURCE 200112L /* use feature test macro */ +# define _POSIX_C_SOURCE 200112L /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */ # endif # endif # include /* declares _POSIX_VERSION */ # if defined(_POSIX_VERSION) /* POSIX compliant */ # define PLATFORM_POSIX_VERSION _POSIX_VERSION # else -# define PLATFORM_POSIX_VERSION 0 +# define PLATFORM_POSIX_VERSION 1 # endif -# endif -#endif -#if !defined(PLATFORM_POSIX_VERSION) -# define PLATFORM_POSIX_VERSION -1 -#endif +# else /* non-unix target platform (like Windows) */ +# define PLATFORM_POSIX_VERSION 0 +# endif + +#endif /* PLATFORM_POSIX_VERSION */ /*-********************************************* * Detect if isatty() and fileno() are available ************************************************/ -#if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) || (PLATFORM_POSIX_VERSION >= 200112L) || defined(__DJGPP__) +#if (defined(__linux__) && (PLATFORM_POSIX_VERSION > 1)) \ + || (PLATFORM_POSIX_VERSION >= 200112L) \ + || defined(__DJGPP__) \ + || defined(__MSYS__) # include /* isatty */ # define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) #elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__) @@ -106,8 +123,7 @@ extern "C" { # include /* _isatty */ # include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ # include /* FILE */ -static __inline int IS_CONSOLE(FILE* stdStream) -{ +static __inline int IS_CONSOLE(FILE* stdStream) { DWORD dummy; return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy); } @@ -117,7 +133,7 @@ static __inline int IS_CONSOLE(FILE* stdStream) /****************************** -* OS-specific Includes +* OS-specific IO behaviors ******************************/ #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) # include /* _O_BINARY */ @@ -125,7 +141,7 @@ static __inline int IS_CONSOLE(FILE* stdStream) # if !defined(__DJGPP__) # include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ # include /* FSCTL_SET_SPARSE */ -# define SET_BINARY_MODE(file) { int unused=_setmode(_fileno(file), _O_BINARY); (void)unused; } +# define SET_BINARY_MODE(file) { int const unused=_setmode(_fileno(file), _O_BINARY); (void)unused; } # define SET_SPARSE_FILE_MODE(file) { DWORD dw; DeviceIoControl((HANDLE) _get_osfhandle(_fileno(file)), FSCTL_SET_SPARSE, 0, 0, 0, 0, &dw, 0); } # else # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) @@ -146,6 +162,34 @@ static __inline int IS_CONSOLE(FILE* stdStream) #endif +#ifndef ZSTD_START_SYMBOLLIST_FRAME +# ifdef __linux__ +# define ZSTD_START_SYMBOLLIST_FRAME 2 +# elif defined __APPLE__ +# define ZSTD_START_SYMBOLLIST_FRAME 4 +# else +# define ZSTD_START_SYMBOLLIST_FRAME 0 +# endif +#endif + + +#ifndef ZSTD_SETPRIORITY_SUPPORT + /* mandates presence of and support for setpriority() : http://man7.org/linux/man-pages/man2/setpriority.2.html */ +# define ZSTD_SETPRIORITY_SUPPORT (PLATFORM_POSIX_VERSION >= 200112L) +#endif + + +#ifndef ZSTD_NANOSLEEP_SUPPORT + /* mandates support of nanosleep() within : http://man7.org/linux/man-pages/man2/nanosleep.2.html */ +# if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) \ + 
|| (PLATFORM_POSIX_VERSION >= 200112L) +# define ZSTD_NANOSLEEP_SUPPORT 1 +# else +# define ZSTD_NANOSLEEP_SUPPORT 0 +# endif +#endif + + #if defined (__cplusplus) } #endif diff --git a/C/fast-lzma2/radix_bitpack.c b/C/fast-lzma2/radix_bitpack.c index a20b0d60..c7e5484c 100644 --- a/C/fast-lzma2/radix_bitpack.c +++ b/C/fast-lzma2/radix_bitpack.c @@ -9,7 +9,7 @@ */ #include "mem.h" /* U32, U64 */ -#include "fl2threading.h" +#include "fl2_threading.h" #include "fl2_internal.h" #include "radix_internal.h" @@ -52,9 +52,8 @@ void RMF_bitpackLimitLengths(FL2_matchTable* const tbl, size_t const index) SetNull(index - 1); for (U32 length = 2; length < RADIX_MAX_LENGTH && length <= index; ++length) { U32 const link = tbl->table[index - length]; - if (link != RADIX_NULL_LINK) { + if (link != RADIX_NULL_LINK) tbl->table[index - length] = (MIN(length, link >> RADIX_LINK_BITS) << RADIX_LINK_BITS) | (link & RADIX_LINK_MASK); - } } } diff --git a/C/fast-lzma2/radix_engine.h b/C/fast-lzma2/radix_engine.h index 1697f942..0886d87d 100644 --- a/C/fast-lzma2/radix_engine.h +++ b/C/fast-lzma2/radix_engine.h @@ -9,80 +9,82 @@ */ #include -#include "count.h" #define MAX_READ_BEYOND_DEPTH 2 /* If a repeating byte is found, fill that section of the table with matches of distance 1 */ -static size_t HandleRepeat(FL2_matchTable* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t const block_size, ptrdiff_t i, size_t const radix_16) +static size_t RMF_handleRepeat(RMF_builder* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t i, U32 depth) { - ptrdiff_t const rpt_index = i - (MAX_REPEAT / 2 - 2); - ptrdiff_t rpt_end; - /* Set the head to the first byte of the repeat and adjust the count */ - tbl->list_heads[radix_16].head = (U32)(rpt_index - 1); - tbl->list_heads[radix_16].count -= MAX_REPEAT / 2 - 2; - /* Find the end */ - i += ZSTD_count(data_block + i + 2, data_block + i + 1, data_block + block_size); - rpt_end = i; + /* Normally the last 2 bytes, but may be 4 if depth == 4 */ + ptrdiff_t const last_2 = i + MAX_REPEAT / 2 - 1; + + /* Find the start */ + i += (4 - (i & 3)) & 3; + U32 u = *(U32*)(data_block + i); + while (i != 0 && *(U32*)(data_block + i - 4) == u) + i -= 4; + while (i != 0 && data_block[i - 1] == (BYTE)u) + --i; + + ptrdiff_t const rpt_index = i; /* No point if it's in the overlap region */ - if (i >= (ptrdiff_t)start) { - U32 len = 2; + if (last_2 >= (ptrdiff_t)start) { + U32 len = depth; /* Set matches at distance 1 and available length */ - for (; i >= rpt_index && len <= RADIX_MAX_LENGTH; --i) { + for (i = last_2; i > rpt_index && len <= RADIX_MAX_LENGTH; --i) { SetMatchLinkAndLength(i, (U32)(i - 1), len); ++len; } /* Set matches at distance 1 and max length */ - for (; i >= rpt_index; --i) { + for (; i > rpt_index; --i) SetMatchLinkAndLength(i, (U32)(i - 1), RADIX_MAX_LENGTH); - } } - return rpt_end; + return rpt_index; } /* If a 2-byte repeat is found, fill that section of the table with matches of distance 2 */ -static size_t HandleRepeat2(FL2_matchTable* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t const block_size, ptrdiff_t i, size_t const radix_16) +static size_t RMF_handleRepeat2(RMF_builder* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t i, U32 depth) { - size_t radix_16_rev; - ptrdiff_t const rpt_index = i - (MAX_REPEAT - 3); - ptrdiff_t rpt_end; + /* Normally the last 2 bytes, but may be 4 if depth == 4 */ + ptrdiff_t const last_2 = i + MAX_REPEAT * 2U - 4; - /* Set the head to the first byte 
of the repeat and adjust the count */ - tbl->list_heads[radix_16].head = (U32)(rpt_index - 1); - tbl->list_heads[radix_16].count -= MAX_REPEAT / 2 - 2; - radix_16_rev = ((radix_16 >> 8) | (radix_16 << 8)) & 0xFFFF; - tbl->list_heads[radix_16_rev].head = (U32)(rpt_index - 2); - tbl->list_heads[radix_16_rev].count -= MAX_REPEAT / 2 - 1; - /* Find the end */ - i += ZSTD_count(data_block + i + 2, data_block + i, data_block + block_size); - rpt_end = i; + /* Find the start */ + ptrdiff_t realign = i & 1; + i += (4 - (i & 3)) & 3; + U32 u = *(U32*)(data_block + i); + while (i != 0 && *(U32*)(data_block + i - 4) == u) + i -= 4; + while (i != 0 && data_block[i - 1] == data_block[i + 1]) + --i; + i += (i & 1) ^ realign; + + ptrdiff_t const rpt_index = i; /* No point if it's in the overlap region */ if (i >= (ptrdiff_t)start) { - U32 len = 2; + U32 len = depth + (data_block[last_2 + depth] == data_block[last_2]); /* Set matches at distance 2 and available length */ - for (; i >= rpt_index && len <= RADIX_MAX_LENGTH; --i) { + for (i = last_2; i > rpt_index && len <= RADIX_MAX_LENGTH; i -= 2) { SetMatchLinkAndLength(i, (U32)(i - 2), len); - ++len; + len += 2; } /* Set matches at distance 2 and max length */ - for (; i >= rpt_index; --i) { + for (; i > rpt_index; i -= 2) SetMatchLinkAndLength(i, (U32)(i - 2), RADIX_MAX_LENGTH); - } } - return rpt_end; + return rpt_index; } /* Initialization for the reference algortithm */ #ifdef RMF_REFERENCE -static void RadixInitReference(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +static void RMF_initReference(FL2_matchTable* const tbl, const void* const data, size_t const end) { const BYTE* const data_block = (const BYTE*)data; ptrdiff_t const block_size = end - 1; size_t st_index = 0; for (ptrdiff_t i = 0; i < block_size; ++i) { - size_t radix_16 = ((size_t)data_block[i] << 8) | data_block[i + 1]; - U32 prev = tbl->list_heads[radix_16].head; + size_t const radix_16 = ((size_t)data_block[i] << 8) | data_block[i + 1]; + U32 const prev = tbl->list_heads[radix_16].head; if (prev != RADIX_NULL_LINK) { SetMatchLinkAndLength(i, prev, 2U); tbl->list_heads[radix_16].head = (U32)i; @@ -98,7 +100,6 @@ static void RadixInitReference(FL2_matchTable* const tbl, const void* const data SetNull(end - 1); tbl->end_index = (U32)st_index; tbl->st_index = ATOMIC_INITIAL_VALUE; - (void)start; } #endif @@ -108,82 +109,50 @@ RMF_bitpackInit #else RMF_structuredInit #endif -(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +(FL2_matchTable* const tbl, const void* const data, size_t const end) { - const BYTE* const data_block = (const BYTE*)data; - size_t st_index = 0; - size_t radix_16; - ptrdiff_t const block_size = end - 2; - ptrdiff_t rpt_total = 0; - U32 count = 0; - if (end <= 2) { - for (size_t i = 0; i < end; ++i) { + for (size_t i = 0; i < end; ++i) SetNull(i); - } + tbl->end_index = 0; return 0; } #ifdef RMF_REFERENCE if (tbl->params.use_ref_mf) { - RadixInitReference(tbl, data, start, end); + RMF_initReference(tbl, data, end); return 0; } #endif + SetNull(0); + + const BYTE* const data_block = (const BYTE*)data; + size_t st_index = 0; /* Initial 2-byte radix value */ - radix_16 = ((size_t)data_block[0] << 8) | data_block[1]; + size_t radix_16 = ((size_t)data_block[0] << 8) | data_block[1]; tbl->stack[st_index++] = (U32)radix_16; tbl->list_heads[radix_16].head = 0; tbl->list_heads[radix_16].count = 1; radix_16 = ((size_t)((BYTE)radix_16) << 8) | data_block[2]; - ptrdiff_t i = 1; + ptrdiff_t 
rpt_total = 0; + ptrdiff_t i = 1; + ptrdiff_t const block_size = end - 2; for (; i < block_size; ++i) { - /* Pre-load the next value for speed increase */ + /* Pre-load the next value for speed increase on some hardware. Execution can continue while memory read is pending */ size_t const next_radix = ((size_t)((BYTE)radix_16) << 8) | data_block[i + 2]; U32 const prev = tbl->list_heads[radix_16].head; if (prev != RADIX_NULL_LINK) { - S32 dist = (S32)i - prev; - /* Check for repeat */ - if (dist > 2) { - count = 0; - /* Link this position to the previous occurance */ - InitMatchLink(i, prev); - /* Set the previous to this position */ - tbl->list_heads[radix_16].head = (U32)i; - ++tbl->list_heads[radix_16].count; - radix_16 = next_radix; - } - else { - count += 3 - dist; - /* Do the usual if the repeat is too short */ - if (count < MAX_REPEAT - 2) { - InitMatchLink(i, prev); - tbl->list_heads[radix_16].head = (U32)i; - ++tbl->list_heads[radix_16].count; - radix_16 = next_radix; - } - else { - ptrdiff_t const prev_i = i; - /* Eliminate the repeat from the linked list to save time */ - if (dist == 1) { - i = HandleRepeat(tbl, data_block, start, end, i, radix_16); - rpt_total += i - prev_i + MAX_REPEAT / 2U - 1; - } - else { - i = HandleRepeat2(tbl, data_block, start, end, i, radix_16); - rpt_total += i - prev_i + MAX_REPEAT - 2; - } - if (i < block_size) - radix_16 = ((size_t)data_block[i + 1] << 8) | data_block[i + 2]; - count = 0; - } - } + /* Link this position to the previous occurrence */ + InitMatchLink(i, prev); + /* Set the previous to this position */ + tbl->list_heads[radix_16].head = (U32)i; + ++tbl->list_heads[radix_16].count; + radix_16 = next_radix; } else { - count = 0; SetNull(i); tbl->list_heads[radix_16].head = (U32)i; tbl->list_heads[radix_16].count = 1; @@ -192,65 +161,100 @@ RMF_structuredInit } } /* Handle the last value */ - if (i <= block_size && tbl->list_heads[radix_16].head != RADIX_NULL_LINK) { + if (tbl->list_heads[radix_16].head != RADIX_NULL_LINK) SetMatchLinkAndLength(block_size, tbl->list_heads[radix_16].head, 2); - } - else { + else SetNull(block_size); - } + /* Never a match at the last byte */ SetNull(end - 1); tbl->end_index = (U32)st_index; - tbl->st_index = ATOMIC_INITIAL_VALUE; return rpt_total; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - - /* Copy the list into a buffer and recurse it there. 
This decreases cache misses and allows */ /* data characters to be loaded every fourth pass and stored for use in the next 4 passes */ -static void RecurseListsBuffered(RMF_builder* const tbl, +static void RMF_recurseListsBuffered(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, - BYTE depth, - BYTE const max_depth, + U32 depth, + U32 const max_depth, U32 orig_list_count, size_t const stack_base) { + if (orig_list_count < 2 || tbl->match_buffer_limit < 2) + return; + /* Create an offset data buffer pointer for reading the next bytes */ const BYTE* data_src = data_block + depth; size_t start = 0; - if (orig_list_count < 2 || tbl->match_buffer_limit < 2) - return; do { - size_t count = start; U32 list_count = (U32)(start + orig_list_count); - U32 overlap; - if (list_count > tbl->match_buffer_limit) { + if (list_count > tbl->match_buffer_limit) list_count = (U32)tbl->match_buffer_limit; - } + + size_t count = start; + size_t prev_link = (size_t)-1; + size_t rpt = 0; + size_t rpt_tail = link; for (; count < list_count; ++count) { /* Pre-load next link */ size_t const next_link = GetMatchLink(link); - /* Get 4 data characters for later. This doesn't block on a cache miss. */ - tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); - /* Record the actual location of this suffix */ - tbl->match_buffer[count].from = (U32)link; - /* Initialize the next link */ - tbl->match_buffer[count].next = (U32)(count + 1) | ((U32)depth << 24); - link = next_link; + size_t dist = prev_link - link; + if (dist > 2) { + /* Get 4 data characters for later. This doesn't block on a cache miss. */ + tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); + /* Record the actual location of this suffix */ + tbl->match_buffer[count].from = (U32)link; + /* Initialize the next link */ + tbl->match_buffer[count].next = (U32)(count + 1) | (depth << 24); + rpt = 0; + prev_link = link; + rpt_tail = link; + link = next_link; + } + else { + rpt += 3 - dist; + /* Do the usual if the repeat is too short */ + if (rpt < MAX_REPEAT - 2) { + /* Get 4 data characters for later. This doesn't block on a cache miss. */ + tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); + /* Record the actual location of this suffix */ + tbl->match_buffer[count].from = (U32)link; + /* Initialize the next link */ + tbl->match_buffer[count].next = (U32)(count + 1) | (depth << 24); + prev_link = link; + link = next_link; + } + else { + /* Eliminate the repeat from the linked list to save time */ + if (dist == 1) { + link = RMF_handleRepeat(tbl, data_block, block_start, link, depth); + count -= MAX_REPEAT / 2; + orig_list_count -= (U32)(rpt_tail - link); + } + else { + link = RMF_handleRepeat2(tbl, data_block, block_start, link, depth); + count -= MAX_REPEAT - 1; + orig_list_count -= (U32)(rpt_tail - link) >> 1; + } + rpt = 0; + list_count = (U32)(start + orig_list_count); + + if (list_count > tbl->match_buffer_limit) + list_count = (U32)tbl->match_buffer_limit; + } + } } + count = list_count; /* Make the last element circular so pre-loading doesn't read past the end. 
*/ - tbl->match_buffer[count - 1].next = (U32)(count - 1) | ((U32)depth << 24); - overlap = 0; + tbl->match_buffer[count - 1].next = (U32)(count - 1) | (depth << 24); + U32 overlap = 0; if (list_count < (U32)(start + orig_list_count)) { overlap = list_count >> MATCH_BUFFER_OVERLAP; overlap += !overlap; @@ -259,15 +263,25 @@ static void RecurseListsBuffered(RMF_builder* const tbl, orig_list_count -= (U32)(list_count - start); /* Copy everything back, except the last link which never changes, and any extra overlap */ count -= overlap + (overlap == 0); - for (size_t index = 0; index < count; ++index) { +#ifdef RMF_BITPACK + if (max_depth > RADIX_MAX_LENGTH) for (size_t index = 0; index < count; ++index) { size_t const from = tbl->match_buffer[index].from; if (from < block_start) return; - - { U32 length = tbl->match_buffer[index].next >> 24; - size_t next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; - SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); - } + U32 length = tbl->match_buffer[index].next >> 24; + length = (length > RADIX_MAX_LENGTH) ? RADIX_MAX_LENGTH : length; + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); + } + else +#endif + for (size_t index = 0; index < count; ++index) { + size_t const from = tbl->match_buffer[index].from; + if (from < block_start) + return; + U32 const length = tbl->match_buffer[index].next >> 24; + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); } start = 0; if (overlap) { @@ -275,7 +289,7 @@ static void RecurseListsBuffered(RMF_builder* const tbl, for (size_t src = list_count - overlap; src < list_count; ++src) { tbl->match_buffer[dest].from = tbl->match_buffer[src].from; tbl->match_buffer[dest].src.u32 = MEM_read32(data_src + tbl->match_buffer[src].from); - tbl->match_buffer[dest].next = (U32)(dest + 1) | ((U32)depth << 24); + tbl->match_buffer[dest].next = (U32)(dest + 1) | (depth << 24); ++dest; } start = dest; @@ -283,30 +297,23 @@ static void RecurseListsBuffered(RMF_builder* const tbl, } while (orig_list_count != 0); } -/* Parse the list with bounds checks on data reads. Stop at the point where bound checks are not required. */ +/* Parse the list with an upper bound check on data reads. Stop at the point where bound checks are not required. */ /* Buffering is used so that parsing can continue below the bound to find a few matches without altering the main table. 
*/ -static void RecurseListsBound(RMF_builder* const tbl, +static void RMF_recurseListsBound(RMF_builder* const tbl, const BYTE* const data_block, ptrdiff_t const block_size, RMF_tableHead* const list_head, - U32 const max_depth) + U32 max_depth) { U32 list_count = list_head->count; + if (list_count < 2) + return; + ptrdiff_t link = list_head->head; ptrdiff_t const bounded_size = max_depth + MAX_READ_BEYOND_DEPTH; ptrdiff_t const bounded_start = block_size - MIN(block_size, bounded_size); - /* Create an offset data buffer pointer for reading the next bytes */ size_t count = 0; size_t extra_count = (max_depth >> 4) + 4; - ptrdiff_t limit; - const BYTE* data_src; - U32 depth; - size_t index; - size_t st_index; - RMF_listTail* tails_8; - - if (list_count < 2) - return; list_count = MIN((U32)bounded_size, list_count); list_count = MIN(list_count, (U32)tbl->match_buffer_size); @@ -314,9 +321,8 @@ static void RecurseListsBound(RMF_builder* const tbl, ptrdiff_t next_link = GetMatchLink(link); if (link >= bounded_start) { --list_head->count; - if (next_link < bounded_start) { + if (next_link < bounded_start) list_head->head = (U32)next_link; - } } else { --extra_count; @@ -328,18 +334,20 @@ static void RecurseListsBound(RMF_builder* const tbl, link = next_link; } list_count = (U32)count; - limit = block_size - 2; - data_src = data_block + 2; - depth = 3; - index = 0; - st_index = 0; - tails_8 = tbl->tails_8; + ptrdiff_t limit = block_size - 2; + /* Create an offset data buffer pointer for reading the next bytes */ + const BYTE* data_src = data_block + 2; + U32 depth = 3; + size_t index = 0; + size_t st_index = 0; + RMF_listTail* const tails_8 = tbl->tails_8; do { link = tbl->match_buffer[index].from; if (link < limit) { size_t const radix_8 = data_src[link]; /* Seen this char before? 
*/ - const U32 prev = tails_8[radix_8].prev_index; + U32 const prev = tails_8[radix_8].prev_index; + tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tails_8[radix_8].list_count; /* Link the previous occurrence to this one and record the new length */ @@ -353,7 +361,6 @@ static void RecurseListsBound(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tails_8[radix_8].prev_index = (U32)index; } ++index; } while (index < list_count); @@ -368,10 +375,9 @@ static void RecurseListsBound(RMF_builder* const tbl, /* Pop an item off the stack */ --st_index; list_count = tbl->stack[st_index].count; - if (list_count < 2) { - /* Nothing to match with */ + if (list_count < 2) /* Nothing to match with */ continue; - } + index = tbl->stack[st_index].head; depth = (tbl->match_buffer[index].next >> 24); if (depth >= max_depth) @@ -390,9 +396,10 @@ static void RecurseListsBound(RMF_builder* const tbl, if (link < limit) { size_t const radix_8 = data_src[link]; U32 const prev = tails_8[radix_8].prev_index; + tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tails_8[radix_8].list_count = 1; @@ -400,7 +407,6 @@ static void RecurseListsBound(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tails_8[radix_8].prev_index = (U32)index; } index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; } while (--list_count != 0); @@ -413,20 +419,20 @@ static void RecurseListsBound(RMF_builder* const tbl, --count; for (index = 0; index < count; ++index) { ptrdiff_t const from = tbl->match_buffer[index].from; - size_t next; - U32 length; - if (from < bounded_start) break; - length = tbl->match_buffer[index].next >> 24; + + U32 length = tbl->match_buffer[index].next >> 24; length = MIN(length, (U32)(block_size - from)); - next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + length = MIN(length, RADIX_MAX_LENGTH); + + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); } } /* Compare each string with all others to find the best match */ -static void BruteForce(RMF_builder* const tbl, +static void RMF_bruteForce(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, @@ -445,6 +451,7 @@ static void BruteForce(RMF_builder* const tbl, link = GetMatchLink(link); buffer[i] = link; } while (++i < list_count); + i = 0; do { size_t longest = 0; @@ -454,34 +461,37 @@ static void BruteForce(RMF_builder* const tbl, do { const BYTE* data_2 = data_src + buffer[j]; size_t len_test = 0; - while (data[len_test] == data_2[len_test] && len_test < limit) { + while (data[len_test] == data_2[len_test] && len_test < limit) ++len_test; - } + if (len_test > longest) { longest_index = j; longest = len_test; - if (len_test >= limit) { + if (len_test >= limit) break; - } } } while (++j < list_count); - if (longest > 0) { - SetMatchLinkAndLength(buffer[i], - (U32)buffer[longest_index], - depth + (U32)longest); - } + + if (longest > 0) + SetMatchLinkAndLength(buffer[i], (U32)buffer[longest_index], depth + (U32)longest); + ++i; + /* Test with block_start to avoid wasting time matching strings in the overlap region with each other */ } while (i < list_count - 1 && buffer[i] >= block_start); } -static void RecurseLists16(RMF_builder* const tbl, +/* 
RMF_recurseLists16() : + * Match strings at depth 2 using a 16-bit radix to lengthen to depth 4 + */ +static void RMF_recurseLists16(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, U32 count, U32 const max_depth) { - /* Offset data pointer. This method is only called at depth 2 */ + U32 const table_max_depth = MIN(max_depth, RADIX_MAX_LENGTH); + /* Offset data pointer. This function is only called at depth 2 */ const BYTE* const data_src = data_block + 2; /* Load radix values from the data chars */ size_t next_radix_8 = data_src[link]; @@ -489,7 +499,6 @@ static void RecurseLists16(RMF_builder* const tbl, size_t reset_list[RADIX8_TABLE_SIZE]; size_t reset_count = 0; size_t st_index = 0; - U32 prev; /* Last one is done separately */ --count; do @@ -504,7 +513,8 @@ static void RecurseLists16(RMF_builder* const tbl, next_radix_8 = data_src[next_link]; next_radix_16 = next_radix_8 + ((size_t)(data_src[next_link + 1]) << 8); - prev = tbl->tails_8[radix_8].prev_index; + U32 prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)link; if (prev != RADIX_NULL_LINK) { /* Link the previous occurrence to this one at length 3. */ /* This will be overwritten if a 4 is found. */ @@ -513,9 +523,9 @@ static void RecurseLists16(RMF_builder* const tbl, else { reset_list[reset_count++] = radix_8; } - tbl->tails_8[radix_8].prev_index = (U32)link; prev = tbl->tails_16[radix_16].prev_index; + tbl->tails_16[radix_16].prev_index = (U32)link; if (prev != RADIX_NULL_LINK) { ++tbl->tails_16[radix_16].list_count; /* Link at length 4, overwriting the 3 */ @@ -524,35 +534,35 @@ static void RecurseLists16(RMF_builder* const tbl, else { tbl->tails_16[radix_16].list_count = 1; tbl->stack[st_index].head = (U32)link; + /* Store a reference to this table location to retrieve the count at the end */ tbl->stack[st_index].count = (U32)radix_16; ++st_index; } - tbl->tails_16[radix_16].prev_index = (U32)link; link = next_link; } while (--count > 0); + /* Do the last location */ - prev = tbl->tails_8[next_radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { + U32 prev = tbl->tails_8[next_radix_8].prev_index; + if (prev != RADIX_NULL_LINK) SetMatchLinkAndLength(prev, (U32)link, 3); - } + prev = tbl->tails_16[next_radix_16].prev_index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_16[next_radix_16].list_count; SetMatchLinkAndLength(prev, (U32)link, 4); } - for (size_t i = 0; i < reset_count; ++i) { + + for (size_t i = 0; i < reset_count; ++i) tbl->tails_8[reset_list[i]].prev_index = RADIX_NULL_LINK; - } + for (size_t i = 0; i < st_index; ++i) { tbl->tails_16[tbl->stack[i].count].prev_index = RADIX_NULL_LINK; tbl->stack[i].count = tbl->tails_16[tbl->stack[i].count].list_count; } - while (st_index > 0) { - U32 list_count; - U32 depth; + while (st_index > 0) { --st_index; - list_count = tbl->stack[st_index].count; + U32 const list_count = tbl->stack[st_index].count; if (list_count < 2) { /* Nothing to do */ continue; @@ -567,19 +577,19 @@ static void RecurseLists16(RMF_builder* const tbl, continue; } /* The current depth */ - depth = GetMatchLength(link); + U32 const depth = GetMatchLength(link); if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForce(tbl, data_block, + RMF_bruteForce(tbl, data_block, block_start, link, list_count, depth, - max_depth); + table_max_depth); continue; } /* Send to the buffer at depth 4 */ - RecurseListsBuffered(tbl, + RMF_recurseListsBuffered(tbl, 
data_block, block_start, link, @@ -591,7 +601,10 @@ static void RecurseLists16(RMF_builder* const tbl, } #if 0 -static void RecurseListsUnbuf16(RMF_builder* const tbl, +/* Unbuffered complete processing to max_depth. + * This may be faster on CPUs without a large memory cache. + */ +static void RMF_recurseListsUnbuf16(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, @@ -607,7 +620,6 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, size_t reset_list[RADIX8_TABLE_SIZE]; size_t reset_count = 0; size_t st_index = 0; - U32 prev; /* Last one is done separately */ --count; do @@ -620,7 +632,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, size_t radix_16 = next_radix_16; next_radix_8 = data_src[next_link]; next_radix_16 = next_radix_8 + ((size_t)(data_src[next_link + 1]) << 8); - prev = tails_8[radix_8].prev_index; + U32 prev = tails_8[radix_8].prev_index; if (prev != RADIX_NULL_LINK) { /* Link the previous occurrence to this one at length 3. */ /* This will be overwritten if a 4 is found. */ @@ -646,7 +658,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, link = next_link; } while (--count > 0); /* Do the last location */ - prev = tails_8[next_radix_8].prev_index; + U32 prev = tails_8[next_radix_8].prev_index; if (prev != RADIX_NULL_LINK) { SetMatchLinkAndLength(prev, (U32)link, 3); } @@ -683,7 +695,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, U32 depth = GetMatchLength(link); if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForce(tbl, data_block, + RMF_bruteForce(tbl, data_block, block_start, link, list_count, @@ -800,7 +812,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, #ifdef RMF_REFERENCE /* Simple, slow, complete parsing for reference */ -static void RecurseListsReference(RMF_builder* const tbl, +static void RMF_recurseListsReference(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_size, size_t link, @@ -836,12 +848,8 @@ static void RecurseListsReference(RMF_builder* const tbl, } memset(tbl->tails_8, 0xFF, sizeof(tbl->tails_8)); while (st_index > 0) { - U32 list_count; - U32 depth; - size_t prev_st_index; - --st_index; - list_count = tbl->stack[st_index].count; + U32 list_count = tbl->stack[st_index].count; if (list_count < 2) { /* Nothing to do */ continue; @@ -854,14 +862,14 @@ static void RecurseListsReference(RMF_builder* const tbl, } link = tbl->stack[st_index].head; /* The current depth */ - depth = GetMatchLength(link); + U32 depth = GetMatchLength(link); if (depth >= max_depth) continue; data_src = data_block + depth; limit = block_size - depth; /* Next depth for 1 extra char */ ++depth; - prev_st_index = st_index; + size_t prev_st_index = st_index; do { if (link < limit) { size_t const radix_8 = data_src[link]; @@ -890,21 +898,29 @@ static void RecurseListsReference(RMF_builder* const tbl, #endif /* RMF_REFERENCE */ /* Atomically take a list from the head table */ -static ptrdiff_t RMF_getNextList(FL2_matchTable* const tbl, unsigned const multi_thread) +static ptrdiff_t RMF_getNextList_mt(FL2_matchTable* const tbl) { if (tbl->st_index < tbl->end_index) { - long index = multi_thread ? 
FL2_atomic_increment(tbl->st_index) : FL2_nonAtomic_increment(tbl->st_index); - if (index < tbl->end_index) { + long index = FL2_atomic_increment(tbl->st_index); + if (index < tbl->end_index) return index; - } } return -1; } -#define UPDATE_INTERVAL 0x40000U +/* Non-atomically take a list from the head table */ +static ptrdiff_t RMF_getNextList_st(FL2_matchTable* const tbl) +{ + if (tbl->st_index < tbl->end_index) { + long index = FL2_nonAtomic_increment(tbl->st_index); + if (index < tbl->end_index) + return index; + } + return -1; +} /* Iterate the head table concurrently with other threads, and recurse each list until max_depth is reached */ -int +void #ifdef RMF_BITPACK RMF_bitpackBuildTable #else @@ -913,69 +929,58 @@ RMF_structuredBuildTable (FL2_matchTable* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done) + FL2_dataBlock const block) { - if (!block.end) - return 0; - U64 const enc_size = block.end - block.start; + if (block.end == 0) + return; + unsigned const best = !tbl->params.divide_and_conquer; - unsigned const max_depth = MIN(tbl->params.depth, RADIX_MAX_LENGTH) & ~1; - size_t const bounded_start = block.end - max_depth - MAX_READ_BEYOND_DEPTH; - ptrdiff_t next_progress = 0; - size_t update = UPDATE_INTERVAL; - size_t total = init_done; + unsigned const max_depth = MIN(tbl->params.depth, STRUCTURED_MAX_LENGTH) & ~1; + size_t bounded_start = max_depth + MAX_READ_BEYOND_DEPTH; + bounded_start = block.end - MIN(block.end, bounded_start); + ptrdiff_t next_progress = (job == 0) ? 0 : RADIX16_TABLE_SIZE; + ptrdiff_t(*getNextList)(FL2_matchTable* const tbl) + = multi_thread ? RMF_getNextList_mt : RMF_getNextList_st; for (;;) { /* Get the next to process */ - ptrdiff_t index = RMF_getNextList(tbl, multi_thread); - RMF_tableHead list_head; + ptrdiff_t index = getNextList(tbl); - if (index < 0) { + if (index < 0) break; - } - if (progress) { - while (next_progress < index) { - total += tbl->list_heads[tbl->stack[next_progress]].count; - ++next_progress; - } - if (total >= update) { - if (progress((size_t)((total * enc_size / block.end * weight) >> 4), opaque)) { - FL2_atomic_add(tbl->st_index, RADIX16_TABLE_SIZE); - return 1; - } - update = total + UPDATE_INTERVAL; - } + + while (next_progress < index) { + /* initial value of next_progress ensures only thread 0 executes this */ + tbl->progress += tbl->list_heads[tbl->stack[next_progress]].count; + ++next_progress; } index = tbl->stack[index]; - list_head = tbl->list_heads[index]; + RMF_tableHead list_head = tbl->list_heads[index]; tbl->list_heads[index].head = RADIX_NULL_LINK; - if (list_head.count < 2 || list_head.head < block.start) { + if (list_head.count < 2 || list_head.head < block.start) continue; - } + #ifdef RMF_REFERENCE if (tbl->params.use_ref_mf) { - RecurseListsReference(tbl->builders[job], block.data, block.end, list_head.head, list_head.count, max_depth); + RMF_recurseListsReference(tbl->builders[job], block.data, block.end, list_head.head, list_head.count, max_depth); continue; } #endif if (list_head.head >= bounded_start) { - RecurseListsBound(tbl->builders[job], block.data, block.end, &list_head, (BYTE)max_depth); - if (list_head.count < 2 || list_head.head < block.start) { + RMF_recurseListsBound(tbl->builders[job], block.data, block.end, &list_head, max_depth); + if (list_head.count < 2 || list_head.head < block.start) continue; - } } if (best && list_head.count > tbl->builders[job]->match_buffer_limit) { /* 
Not worth buffering or too long */ - RecurseLists16(tbl->builders[job], block.data, block.start, list_head.head, list_head.count, max_depth); + RMF_recurseLists16(tbl->builders[job], block.data, block.start, list_head.head, list_head.count, max_depth); } else { - RecurseListsBuffered(tbl->builders[job], block.data, block.start, list_head.head, 2, (BYTE)max_depth, list_head.count, 0); + RMF_recurseListsBuffered(tbl->builders[job], block.data, block.start, list_head.head, 2, (BYTE)max_depth, list_head.count, 0); } } - return 0; } int @@ -984,28 +989,24 @@ RMF_bitpackIntegrityCheck #else RMF_structuredIntegrityCheck #endif -(const FL2_matchTable* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth) +(const FL2_matchTable* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth) { + max_depth &= ~1; int err = 0; for (index += !index; index < end; ++index) { - U32 link; - U32 length; - U32 len_test; - U32 limit; - if (IsNull(index)) continue; - link = GetMatchLink(index); + U32 const link = GetMatchLink(index); if (link >= index) { printf("Forward link at %X to %u\r\n", (U32)index, link); err = 1; continue; } - length = GetMatchLength(index); + U32 const length = GetMatchLength(index); if (index && length < RADIX_MAX_LENGTH && link - 1 == GetMatchLink(index - 1) && length + 1 == GetMatchLength(index - 1)) continue; - len_test = 0; - limit = MIN((U32)(end - index), RADIX_MAX_LENGTH); + U32 len_test = 0; + U32 const limit = MIN((U32)(end - index), RADIX_MAX_LENGTH); for (; len_test < limit && data[link + len_test] == data[index + len_test]; ++len_test) { } if (len_test < length) { @@ -1013,63 +1014,8 @@ RMF_structuredIntegrityCheck err = 1; } if (length < max_depth && len_test > length) + /* These occur occasionally due to splitting of chains in the buffer when long repeats are present */ printf("Shortened match at %X: %u of %u\r\n", (U32)index, length, len_test); } return err; } - - -static size_t ExtendMatch(const FL2_matchTable* const tbl, - const BYTE* const data, - ptrdiff_t const start_index, - ptrdiff_t const limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t const dist = start_index - link; - while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(end_index) == dist) { - end_index += GetMatchLength(end_index); - } - if (end_index >= limit) { - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - return end_index - start_index; -} - -size_t -#ifdef RMF_BITPACK -RMF_bitpackGetMatch -#else -RMF_structuredGetMatch -#endif -(const FL2_matchTable* const tbl, - const BYTE* const data, - size_t const index, - size_t const limit, - unsigned const max_depth, - size_t* const offset_ptr) -{ - size_t length; - size_t dist; - U32 link; - if (IsNull(index)) - return 0; - link = GetMatchLink(index); - length = GetMatchLength(index); - if (length < 2) - return 0; - dist = index - link; - *offset_ptr = dist; - if (length > limit - index) - return limit - index; - if (length == max_depth - || length == RADIX_MAX_LENGTH /* from HandleRepeat */) - { - length = ExtendMatch(tbl, data, index, limit, link, length); - } - return length; -} diff --git a/C/fast-lzma2/radix_get.h b/C/fast-lzma2/radix_get.h new file mode 100644 index 00000000..8696fdae --- /dev/null +++ b/C/fast-lzma2/radix_get.h @@ -0,0 +1,210 @@ +/* +* Copyright (c) 2018, Conor McCarthy +* All rights reserved. 
+* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. +*/ + +#ifndef FL2_RADIX_GET_H_ +#define FL2_RADIX_GET_H_ + +#if defined (__cplusplus) +extern "C" { +#endif + +typedef struct +{ + U32 length; + U32 dist; +} RMF_match; + +static size_t RMF_bitpackExtendMatch(const BYTE* const data, + const U32* const table, + ptrdiff_t const start_index, + ptrdiff_t limit, + U32 const link, + size_t const length) +{ + ptrdiff_t end_index = start_index + length; + ptrdiff_t const dist = start_index - link; + + if (limit > start_index + (ptrdiff_t)kMatchLenMax) + limit = start_index + kMatchLenMax; + + while (end_index < limit && end_index - (ptrdiff_t)(table[end_index] & RADIX_LINK_MASK) == dist) + end_index += table[end_index] >> RADIX_LINK_BITS; + + if (end_index >= limit) { + DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); + return limit - start_index; + } + + while (end_index < limit && data[end_index - dist] == data[end_index]) + ++end_index; + + DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); + return end_index - start_index; +} + +#define GetMatchLink(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] + +#define GetMatchLength(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] + +static size_t RMF_structuredExtendMatch(const BYTE* const data, + const U32* const table, + ptrdiff_t const start_index, + ptrdiff_t limit, + U32 const link, + size_t const length) +{ + ptrdiff_t end_index = start_index + length; + ptrdiff_t const dist = start_index - link; + + if (limit > start_index + (ptrdiff_t)kMatchLenMax) + limit = start_index + kMatchLenMax; + + while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(table, end_index) == dist) + end_index += GetMatchLength(table, end_index); + + if (end_index >= limit) { + DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); + return limit - start_index; + } + + while (end_index < limit && data[end_index - dist] == data[end_index]) + ++end_index; + + DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); + return end_index - start_index; +} + +FORCE_INLINE_TEMPLATE +RMF_match RMF_getMatch(FL2_dataBlock block, + FL2_matchTable* tbl, + unsigned max_depth, + int structTbl, + size_t index) +{ + if (structTbl) + { + U32 const link = GetMatchLink(tbl->table, index); + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - link - 1; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } + else { + U32 
link = tbl->table[index]; + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = link >> RADIX_LINK_BITS; + link &= RADIX_LINK_MASK; + size_t const dist = index - link - 1; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } +} + +FORCE_INLINE_TEMPLATE +RMF_match RMF_getNextMatch(FL2_dataBlock block, + FL2_matchTable* tbl, + unsigned max_depth, + int structTbl, + size_t index) +{ + if (structTbl) + { + U32 const link = GetMatchLink(tbl->table, index); + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - link - 1; + + /* same distance, one byte shorter */ + if (link - 1 == GetMatchLink(tbl->table, index - 1)) + return match; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } + else { + U32 link = tbl->table[index]; + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = link >> RADIX_LINK_BITS; + link &= RADIX_LINK_MASK; + size_t const dist = index - link - 1; + + /* same distance, one byte shorter */ + if (link - 1 == (tbl->table[index - 1] & RADIX_LINK_MASK)) + return match; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } +} + +#if defined (__cplusplus) +} +#endif + +#endif /* FL2_RADIX_GET_H_ */ \ No newline at end of file diff --git a/C/fast-lzma2/radix_internal.h b/C/fast-lzma2/radix_internal.h index 4a9ba359..36431939 100644 --- a/C/fast-lzma2/radix_internal.h +++ b/C/fast-lzma2/radix_internal.h @@ -14,6 +14,10 @@ #include "atomic.h" #include "radix_mf.h" +#if defined(FL2_XZ_BUILD) && defined(TUKLIB_FAST_UNALIGNED_ACCESS) +# define MEM_read32(a) (*(const U32*)(a)) +#endif + #if defined (__cplusplus) extern "C" { #endif @@ -21,26 +25,27 @@ extern "C" { #define DICTIONARY_LOG_MIN 12U #define DICTIONARY_LOG_MAX_64 30U #define DICTIONARY_LOG_MAX_32 27U -#define DEFAULT_BUFFER_LOG 8U -#define DEFAULT_BLOCK_OVERLAP 2U -#define DEFAULT_SEARCH_DEPTH 32U -#define DEFAULT_DIVIDEANDCONQUER 1 -#define MAX_REPEAT 32 -#define RADIX16_TABLE_SIZE (1UL << 16) -#define RADIX8_TABLE_SIZE (1UL << 8) +#define DICTIONARY_SIZE_MIN ((size_t)1 << DICTIONARY_LOG_MIN) +#define DICTIONARY_SIZE_MAX_64 ((size_t)1 << DICTIONARY_LOG_MAX_64) +#define DICTIONARY_SIZE_MAX_32 ((size_t)1 << DICTIONARY_LOG_MAX_32) +#define MAX_REPEAT 24 +#define RADIX16_TABLE_SIZE ((size_t)1 << 16) +#define RADIX8_TABLE_SIZE ((size_t)1 << 8) #define STACK_SIZE (RADIX16_TABLE_SIZE * 3) #define MAX_BRUTE_FORCE_LIST_SIZE 5 #define BUFFER_LINK_MASK 0xFFFFFFU #define MATCH_BUFFER_OVERLAP 6 
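/* For orientation, a minimal sketch of how one bitpacked match-table entry decodes,
 * drawn from the RADIX_LINK_* constants defined just below and the accessors in
 * radix_get.h; it is an editor's illustration, not part of the upstream change, and
 * the local names entry/length/link are illustrative only. The match length occupies
 * the top 6 bits of the U32 and the link to the previous occurrence the low 26 bits,
 * which is why BITPACK_MAX_LENGTH is 63 and why the bitpacked layout is only used
 * when the dictionary fits in (1 << RADIX_LINK_BITS) bytes (see RMF_isStruct() in
 * radix_mf.c).
 *
 *     U32 const entry = tbl->table[index];              // one bitpacked entry
 *     if (entry != RADIX_NULL_LINK) {                   // 0xFFFFFFFF means no match here
 *         U32 const length = entry >> RADIX_LINK_BITS;  // 0..BITPACK_MAX_LENGTH
 *         U32 const link   = entry & RADIX_LINK_MASK;   // previous occurrence of this string
 *     }
 */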
-#define BITPACK_MAX_LENGTH 63UL -#define STRUCTURED_MAX_LENGTH 255UL +#define BITPACK_MAX_LENGTH 63U +#define STRUCTURED_MAX_LENGTH 255U #define RADIX_LINK_BITS 26 -#define RADIX_LINK_MASK ((1UL << RADIX_LINK_BITS) - 1) -#define RADIX_NULL_LINK 0xFFFFFFFFUL +#define RADIX_LINK_MASK ((1U << RADIX_LINK_BITS) - 1) +#define RADIX_NULL_LINK 0xFFFFFFFFU #define UNIT_BITS 2 -#define UNIT_MASK ((1UL << UNIT_BITS) - 1) +#define UNIT_MASK ((1U << UNIT_BITS) - 1) + +#define RADIX_CANCEL_INDEX (long)(RADIX16_TABLE_SIZE + FL2_MAXTHREADS + 2) typedef struct { @@ -88,9 +93,10 @@ struct FL2_matchTable_s { FL2_atomic st_index; long end_index; - int isStruct; - int allocStruct; + int is_struct; + int alloc_struct; unsigned thread_count; + size_t progress; RMF_parameters params; RMF_builder** builders; U32 stack[RADIX16_TABLE_SIZE]; @@ -98,27 +104,25 @@ struct FL2_matchTable_s U32 table[1]; }; -size_t RMF_bitpackInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const start, size_t const end); -size_t RMF_structuredInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const start, size_t const end); -int RMF_bitpackBuildTable(struct FL2_matchTable_s* const tbl, +size_t RMF_bitpackInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const end); +size_t RMF_structuredInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const end); +void RMF_bitpackBuildTable(struct FL2_matchTable_s* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); -int RMF_structuredBuildTable(struct FL2_matchTable_s* const tbl, + FL2_dataBlock const block); +void RMF_structuredBuildTable(struct FL2_matchTable_s* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); + FL2_dataBlock const block); void RMF_recurseListChunk(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE const depth, - BYTE const max_depth, + U32 const depth, + U32 const max_depth, U32 const list_count, size_t const stack_base); -int RMF_bitpackIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth); -int RMF_structuredIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth); +int RMF_bitpackIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth); +int RMF_structuredIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth); void RMF_bitpackLimitLengths(struct FL2_matchTable_s* const tbl, size_t const index); void RMF_structuredLimitLengths(struct FL2_matchTable_s* const tbl, size_t const index); BYTE* RMF_bitpackAsOutputBuffer(struct FL2_matchTable_s* const tbl, size_t const index); diff --git a/C/fast-lzma2/radix_mf.c b/C/fast-lzma2/radix_mf.c index 55187638..ebfa4d33 100644 --- a/C/fast-lzma2/radix_mf.c +++ b/C/fast-lzma2/radix_mf.c @@ -11,21 +11,20 @@ #include /* size_t, ptrdiff_t */ #include /* malloc, free */ #include "fast-lzma2.h" +#include "fl2_errors.h" #include "mem.h" /* U32, U64, MEM_64bits */ #include "fl2_internal.h" #include "radix_internal.h" #ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* warning: 'rpt_head_next' may be 
used uninitialized in this function */ +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* warning: 'rpt_head_next' may be used uninitialized in this function */ #elif defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ +# pragma warning(disable : 4701) /* warning: 'rpt_head_next' may be used uninitialized in this function */ #endif #define MIN_MATCH_BUFFER_SIZE 256U /* min buffer size at least FL2_SEARCH_DEPTH_MAX + 2 for bounded build */ #define MAX_MATCH_BUFFER_SIZE (1UL << 24) /* max buffer size constrained by 24-bit link values */ -#define REPEAT_CHECK_TABLE ((1 << 1) | (1 << 2) | (1 << 4) | (1 << 8) | (1 << 16) | (1ULL << 32)) - static void RMF_initTailTable(RMF_builder* const tbl) { for (size_t i = 0; i < RADIX8_TABLE_SIZE; i += 2) { @@ -43,146 +42,175 @@ static RMF_builder* RMF_createBuilder(size_t match_buffer_size) match_buffer_size = MIN(match_buffer_size, MAX_MATCH_BUFFER_SIZE); match_buffer_size = MAX(match_buffer_size, MIN_MATCH_BUFFER_SIZE); - { RMF_builder* const builder = (RMF_builder*)malloc( - sizeof(RMF_builder) + (match_buffer_size - 1) * sizeof(RMF_buildMatch)); - builder->match_buffer_size = match_buffer_size; - builder->match_buffer_limit = match_buffer_size; - RMF_initTailTable(builder); - return builder; - } + RMF_builder* const builder = malloc( + sizeof(RMF_builder) + (match_buffer_size - 1) * sizeof(RMF_buildMatch)); + + if (builder == NULL) + return NULL; + + builder->match_buffer_size = match_buffer_size; + builder->match_buffer_limit = match_buffer_size; + + RMF_initTailTable(builder); + + return builder; } static void RMF_freeBuilderTable(RMF_builder** const builders, unsigned const size) { if (builders == NULL) return; - for (unsigned i = 0; i < size; ++i) { + + for (unsigned i = 0; i < size; ++i) free(builders[i]); - } + free(builders); } -static RMF_builder** RMF_createBuilderTable(U32* const matchTable, size_t const match_buffer_size, unsigned const max_len, unsigned const size) +/* RMF_createBuilderTable() : + * Create one match table builder object per thread. + * max_len : maximum match length supported by the table structure + * size : number of threads + */ +static RMF_builder** RMF_createBuilderTable(U32* const match_table, size_t const match_buffer_size, unsigned const max_len, unsigned const size) { - RMF_builder** builders = (RMF_builder**)malloc(size * sizeof(RMF_builder*)); DEBUGLOG(3, "RMF_createBuilderTable : match_buffer_size %u, builders %u", (U32)match_buffer_size, size); + + RMF_builder** const builders = malloc(size * sizeof(RMF_builder*)); + if (builders == NULL) return NULL; + for (unsigned i = 0; i < size; ++i) builders[i] = NULL; + for (unsigned i = 0; i < size; ++i) { builders[i] = RMF_createBuilder(match_buffer_size); if (builders[i] == NULL) { RMF_freeBuilderTable(builders, i); return NULL; } - builders[i]->table = matchTable; + builders[i]->table = match_table; builders[i]->max_len = max_len; } return builders; } -static int RMF_isStruct(unsigned dictionary_log, unsigned depth) +static int RMF_isStruct(size_t const dictionary_size) { - return dictionary_log > RADIX_LINK_BITS || depth > BITPACK_MAX_LENGTH; + return dictionary_size > ((size_t)1 << RADIX_LINK_BITS); } -static int RMF_isStructParam(const RMF_parameters* const params) -{ - return RMF_isStruct(params->dictionary_log, params->depth); -} - -/** RMF_clampCParams() : -* make CParam values within valid range. -* @return : valid CParams */ +/* RMF_clampParams() : +* Make param values within valid range. 
+* Return : valid RMF_parameters */ static RMF_parameters RMF_clampParams(RMF_parameters params) { # define CLAMP(val,min,max) { \ if (val<(min)) val=(min); \ else if (val>(max)) val=(max); \ } - CLAMP(params.dictionary_log, DICTIONARY_LOG_MIN, MEM_64bits() ? DICTIONARY_LOG_MAX_64 : DICTIONARY_LOG_MAX_32); - CLAMP(params.match_buffer_log, FL2_BUFFER_SIZE_LOG_MIN, FL2_BUFFER_SIZE_LOG_MAX); - CLAMP(params.overlap_fraction, FL2_BLOCK_OVERLAP_MIN, FL2_BLOCK_OVERLAP_MAX); + CLAMP(params.dictionary_size, DICTIONARY_SIZE_MIN, MEM_64bits() ? DICTIONARY_SIZE_MAX_64 : DICTIONARY_SIZE_MAX_32); + CLAMP(params.match_buffer_log, RMF_BUFFER_LOG_MIN, RMF_BUFFER_LOG_MAX); + if (params.overlap_fraction > FL2_BLOCK_OVERLAP_MAX) + params.overlap_fraction = FL2_BLOCK_OVERLAP_MAX; CLAMP(params.depth, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); return params; +# undef CLAMP } +/* RMF_applyParameters_internal() : + * Set parameters to those specified. + * Create a builder table if none exists. Free an existing one if incompatible. + * Set match_buffer_limit and max supported match length. + * Returns an error if dictionary won't fit. + */ static size_t RMF_applyParameters_internal(FL2_matchTable* const tbl, const RMF_parameters* const params) { - int const isStruct = RMF_isStructParam(params); - unsigned const dictionary_log = tbl->params.dictionary_log; + int const is_struct = RMF_isStruct(params->dictionary_size); + size_t const dictionary_size = tbl->params.dictionary_size; /* dictionary is allocated with the struct and is immutable */ - if (params->dictionary_log > tbl->params.dictionary_log - || (params->dictionary_log == tbl->params.dictionary_log && isStruct > tbl->allocStruct)) + if (params->dictionary_size > tbl->params.dictionary_size + || (params->dictionary_size == tbl->params.dictionary_size && is_struct > tbl->alloc_struct)) return FL2_ERROR(parameter_unsupported); - { size_t const match_buffer_size = (size_t)1 << (params->dictionary_log - params->match_buffer_log); - tbl->params = *params; - tbl->params.dictionary_log = dictionary_log; - tbl->isStruct = isStruct; - if (tbl->builders == NULL - || match_buffer_size > tbl->builders[0]->match_buffer_size) - { - RMF_freeBuilderTable(tbl->builders, tbl->thread_count); - tbl->builders = RMF_createBuilderTable(tbl->table, match_buffer_size, tbl->isStruct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH, tbl->thread_count); - if (tbl->builders == NULL) { - return FL2_ERROR(memory_allocation); - } + size_t const match_buffer_size = params->dictionary_size >> params->match_buffer_log; + tbl->params = *params; + tbl->params.dictionary_size = dictionary_size; + tbl->is_struct = is_struct; + if (tbl->builders == NULL + || match_buffer_size > tbl->builders[0]->match_buffer_size) + { + RMF_freeBuilderTable(tbl->builders, tbl->thread_count); + tbl->builders = RMF_createBuilderTable(tbl->table, match_buffer_size, tbl->is_struct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH, tbl->thread_count); + if (tbl->builders == NULL) { + return FL2_ERROR(memory_allocation); } - else { - for (unsigned i = 0; i < tbl->thread_count; ++i) { - tbl->builders[i]->match_buffer_limit = match_buffer_size; - tbl->builders[i]->max_len = tbl->isStruct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH; - } + } + else { + for (unsigned i = 0; i < tbl->thread_count; ++i) { + tbl->builders[i]->match_buffer_limit = match_buffer_size; + tbl->builders[i]->max_len = tbl->is_struct ? 
STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH; } } return 0; } +/* RMF_reduceDict() : + * Reduce dictionary and match buffer size if the total input size is known and < dictionary_size. + */ static void RMF_reduceDict(RMF_parameters* const params, size_t const dict_reduce) { - if (dict_reduce) - while (params->dictionary_log > DICTIONARY_LOG_MIN && (size_t)1 << (params->dictionary_log - 1) >= dict_reduce) { - --params->dictionary_log; - params->match_buffer_log = MAX(params->match_buffer_log - 1, FL2_BUFFER_SIZE_LOG_MIN); + if (dict_reduce) { + for (size_t dict_size = params->dictionary_size; dict_size > DICTIONARY_SIZE_MIN && (dict_size >> 1) >= dict_reduce; dict_size >>= 1) { + /* Use unchanged match buffer size for reduced dict */ + params->match_buffer_log = MAX(params->match_buffer_log - 1, RMF_BUFFER_LOG_MIN); } + params->dictionary_size = MIN(params->dictionary_size, MAX(dict_reduce, DICTIONARY_SIZE_MIN)); + } } -FL2_matchTable* RMF_createMatchTable(const RMF_parameters* const p, size_t const dict_reduce, unsigned const thread_count) +static void RMF_initListHeads(FL2_matchTable* const tbl) { - int isStruct; - size_t dictionary_size; - size_t table_bytes; - FL2_matchTable* tbl; - RMF_parameters params = RMF_clampParams(*p); - - RMF_reduceDict(¶ms, dict_reduce); - isStruct = RMF_isStructParam(¶ms); - dictionary_size = (size_t)1 << params.dictionary_log; - - DEBUGLOG(3, "RMF_createMatchTable : isStruct %d, dict %u", isStruct, (U32)dictionary_size); - - table_bytes = isStruct ? ((dictionary_size + 3U) / 4U) * sizeof(RMF_unit) - : dictionary_size * sizeof(U32); - tbl = (FL2_matchTable*)malloc( - sizeof(FL2_matchTable) + table_bytes - sizeof(U32)); - if (!tbl) return NULL; - - tbl->isStruct = isStruct; - tbl->allocStruct = isStruct; - tbl->thread_count = thread_count + !thread_count; - tbl->params = params; - tbl->builders = NULL; - - RMF_applyParameters_internal(tbl, ¶ms); - for (size_t i = 0; i < RADIX16_TABLE_SIZE; i += 2) { tbl->list_heads[i].head = RADIX_NULL_LINK; tbl->list_heads[i].count = 0; tbl->list_heads[i + 1].head = RADIX_NULL_LINK; tbl->list_heads[i + 1].count = 0; } +} + +/* RMF_createMatchTable() : + * Create a match table. Reduce the dict size to input size if possible. + * A thread_count of 0 will be raised to 1. + */ +FL2_matchTable* RMF_createMatchTable(const RMF_parameters* const p, size_t const dict_reduce, unsigned const thread_count) +{ + RMF_parameters params = RMF_clampParams(*p); + RMF_reduceDict(¶ms, dict_reduce); + + int const is_struct = RMF_isStruct(params.dictionary_size); + size_t dictionary_size = params.dictionary_size; + + DEBUGLOG(3, "RMF_createMatchTable : is_struct %d, dict %u", is_struct, (U32)dictionary_size); + + size_t const table_bytes = is_struct ? 
((dictionary_size + 3U) / 4U) * sizeof(RMF_unit) + : dictionary_size * sizeof(U32); + FL2_matchTable* const tbl = malloc(sizeof(FL2_matchTable) + table_bytes - sizeof(U32)); + if (!tbl) return NULL; + + tbl->is_struct = is_struct; + tbl->alloc_struct = is_struct; + tbl->thread_count = thread_count + !thread_count; + tbl->params = params; + tbl->builders = NULL; + + RMF_applyParameters_internal(tbl, ¶ms); + + RMF_initListHeads(tbl); + + RMF_initProgress(tbl); + return tbl; } @@ -190,7 +218,9 @@ void RMF_freeMatchTable(FL2_matchTable* const tbl) { if (tbl == NULL) return; + DEBUGLOG(3, "RMF_freeMatchTable"); + RMF_freeBuilderTable(tbl->builders, tbl->thread_count); free(tbl); } @@ -199,8 +229,8 @@ BYTE RMF_compatibleParameters(const FL2_matchTable* const tbl, const RMF_paramet { RMF_parameters params = RMF_clampParams(*p); RMF_reduceDict(¶ms, dict_reduce); - return tbl->params.dictionary_log > params.dictionary_log - || (tbl->params.dictionary_log == params.dictionary_log && tbl->allocStruct >= RMF_isStructParam(¶ms)); + return tbl->params.dictionary_size > params.dictionary_size + || (tbl->params.dictionary_size == params.dictionary_size && tbl->alloc_struct >= RMF_isStruct(params.dictionary_size)); } size_t RMF_applyParameters(FL2_matchTable* const tbl, const RMF_parameters* const p, size_t const dict_reduce) @@ -215,18 +245,25 @@ size_t RMF_threadCount(const FL2_matchTable* const tbl) return tbl->thread_count; } -size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +void RMF_initProgress(FL2_matchTable * const tbl) { - DEBUGLOG(5, "RMF_initTable : start %u, size %u", (U32)start, (U32)end); - if (tbl->isStruct) { - return RMF_structuredInit(tbl, data, start, end); - } - else { - return RMF_bitpackInit(tbl, data, start, end); - } + if (tbl != NULL) + tbl->progress = 0; } -static void HandleRepeat(RMF_buildMatch* const match_buffer, +size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const end) +{ + DEBUGLOG(5, "RMF_initTable : size %u", (U32)end); + + tbl->st_index = ATOMIC_INITIAL_VALUE; + + if (tbl->is_struct) + return RMF_structuredInit(tbl, data, end); + else + return RMF_bitpackInit(tbl, data, end); +} + +static void RMF_handleRepeat(RMF_buildMatch* const match_buffer, const BYTE* const data_block, size_t const next, U32 count, @@ -235,20 +272,22 @@ static void HandleRepeat(RMF_buildMatch* const match_buffer, U32 const max_len) { size_t index = next; - size_t next_i; U32 length = depth + rpt_len; + const BYTE* const data = data_block + match_buffer[index].from; const BYTE* const data_2 = data - rpt_len; + while (data[length] == data_2[length] && length < max_len) ++length; + for (; length <= max_len && count; --count) { - next_i = match_buffer[index].next & 0xFFFFFF; + size_t next_i = match_buffer[index].next & 0xFFFFFF; match_buffer[index].next = (U32)next_i | (length << 24); length += rpt_len; index = next_i; } for (; count; --count) { - next_i = match_buffer[index].next & 0xFFFFFF; + size_t next_i = match_buffer[index].next & 0xFFFFFF; match_buffer[index].next = (U32)next_i | (max_len << 24); index = next_i; } @@ -261,27 +300,29 @@ typedef struct union src_data_u src; } BruteForceMatch; -static void BruteForceBuffered(RMF_builder* const tbl, +static void RMF_bruteForceBuffered(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t index, - size_t list_count, + size_t const list_count, size_t const slot, size_t const depth, size_t const max_depth) { BruteForceMatch 
buffer[MAX_BRUTE_FORCE_LIST_SIZE + 1]; - const BYTE* data_src = data_block + depth; - size_t limit = max_depth - depth; - const BYTE* start = data_src + block_start; + const BYTE* const data_src = data_block + depth; + size_t const limit = max_depth - depth; + const BYTE* const start = data_src + block_start; size_t i = 0; for (;;) { + /* Load all locations from the match buffer */ buffer[i].index = index; buffer[i].data_src = data_src + tbl->match_buffer[index].from; buffer[i].src.u32 = tbl->match_buffer[index].src.u32; - if (++i >= list_count) { + + if (++i >= list_count) break; - } + index = tbl->match_buffer[index].next & 0xFFFFFF; } i = 0; @@ -289,28 +330,29 @@ static void BruteForceBuffered(RMF_builder* const tbl, size_t longest = 0; size_t j = i + 1; size_t longest_index = j; - const BYTE* data = buffer[i].data_src; + const BYTE* const data = buffer[i].data_src; do { + /* Begin with the remaining chars pulled from the match buffer */ size_t len_test = slot; - while (len_test < 4 && buffer[i].src.chars[len_test] == buffer[j].src.chars[len_test] && len_test - slot < limit) { + while (len_test < 4 && buffer[i].src.chars[len_test] == buffer[j].src.chars[len_test] && len_test - slot < limit) ++len_test; - } + len_test -= slot; if (len_test) { + /* Complete the match length count in the raw input buffer */ const BYTE* data_2 = buffer[j].data_src; - while (data[len_test] == data_2[len_test] && len_test < limit) { + while (data[len_test] == data_2[len_test] && len_test < limit) ++len_test; - } } if (len_test > longest) { longest_index = j; longest = len_test; - if (len_test >= limit) { + if (len_test >= limit) break; - } } } while (++j < list_count); if (longest > 0) { + /* If the existing match was extended, store the new link and length info in the match buffer */ index = buffer[i].index; tbl->match_buffer[index].next = (U32)(buffer[longest_index].index | ((depth + longest) << 24)); } @@ -318,17 +360,19 @@ static void BruteForceBuffered(RMF_builder* const tbl, } while (i < list_count - 1 && buffer[i].data_src >= start); } +/* Lengthen and divide buffered chains into smaller chains, save them on a stack and process in turn. + * The match finder spends most of its time here. + */ FORCE_INLINE_TEMPLATE void RMF_recurseListChunk_generic(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE depth, - BYTE const max_depth, + U32 depth, + U32 const max_depth, U32 list_count, size_t const stack_base) { - /* Create an offset data buffer pointer for reading the next bytes */ - const BYTE base_depth = depth; + U32 const base_depth = depth; size_t st_index = stack_base; size_t index = 0; ++depth; @@ -338,10 +382,11 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, size_t const radix_8 = tbl->match_buffer[index].src.chars[0]; /* Seen this char before? 
*/ U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; /* Link the previous occurrence to this one and record the new length */ - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -351,7 +396,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; ++index; } while (index < list_count); @@ -361,7 +405,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, U32 const prev = tbl->tails_8[radix_8].prev_index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } } /* Convert radix values on the stack to counts and reset any used tail slots */ @@ -370,11 +414,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[j].count = (U32)tbl->tails_8[tbl->stack[j].count].list_count; } while (st_index > stack_base) { - const BYTE* data_src; - size_t link; - size_t slot; - U32 test; - /* Pop an item off the stack */ --st_index; list_count = tbl->stack[st_index].count; @@ -383,7 +422,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, continue; } index = tbl->stack[st_index].head; - link = tbl->match_buffer[index].from; + size_t link = tbl->match_buffer[index].from; if (link < block_start) { /* Chain starts in the overlap region which is already encoded */ continue; @@ -396,10 +435,11 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, continue; } depth = tbl->match_buffer[index].next >> 24; - slot = (depth - base_depth) & 3; + /* Index into the 4-byte pre-loaded input char cache */ + size_t slot = (depth - base_depth) & 3; if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForceBuffered(tbl, + RMF_bruteForceBuffered(tbl, data_block, block_start, index, @@ -409,35 +449,41 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, max_depth); continue; } - /* check for repeats at depth 4,8,16,32 etc */ - test = max_depth != 6 && ((depth & 3) == 0) && ((REPEAT_CHECK_TABLE >> ((depth >> 2) & 31)) & 1) && (max_depth >= depth + (depth >> 1)); + /* check for repeats at depth 4,8,16,32 etc unless depth is near max_depth */ + U32 const test = max_depth != 6 && ((depth & 3) == 0) + && (depth & (depth - 1)) == 0 + && (max_depth >= depth + (depth >> 1)); ++depth; - /* Update the offset data buffer pointer */ - data_src = data_block + depth; + /* Create an offset data buffer pointer for reading the next bytes */ + const BYTE* const data_src = data_block + depth; /* Last pass is done separately */ if (!test && depth < max_depth) { size_t const prev_st_index = st_index; /* Last element done separately */ --list_count; - /* slot is the char cache index. If 3 then chars need to be loaded. */ + /* If slot is 3 then chars need to be loaded. */ if (slot == 3 && max_depth != 6) do { size_t const radix_8 = tbl->match_buffer[index].src.chars[3]; size_t const next_index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; - /* Pre-load the next link and data bytes to avoid waiting for RAM access */ + /* Pre-load the next link and data bytes. 
On some hardware execution can continue + * ahead while the data is retrieved if no operations except move are done on the data. */ tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev!=RADIX_NULL_LINK) { + tbl->tails_8[radix_8].prev_index = (U32)index; + if (prev != RADIX_NULL_LINK) { + /* This char has occurred before in the chain. Link the previous (> index) occurance with this */ ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { + /* First occurrence in the chain */ tbl->tails_8[radix_8].list_count = 1; tbl->stack[st_index].head = (U32)index; + /* Save the char as a reference to load the count at the end */ tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); @@ -447,9 +493,10 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, /* Pre-load the next link to avoid waiting for RAM access */ size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -457,20 +504,18 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); - { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; - U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { - if (slot == 3) { - tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); - } - ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); - } + size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; + U32 const prev = tbl->tails_8[radix_8].prev_index; + if (prev != RADIX_NULL_LINK) { + if (slot == 3) + tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); + + ++tbl->tails_8[radix_8].list_count; + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } for (size_t j = prev_st_index; j < st_index; ++j) { tbl->tails_8[tbl->stack[j].count].prev_index = RADIX_NULL_LINK; @@ -490,14 +535,15 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, size_t const next_index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; size_t const next_link = tbl->match_buffer[next_index].from; if ((link - next_link) > rpt_depth) { - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + rpt = -1; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -505,23 +551,23 @@ void 
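
/* Editor's note: illustrative sketch, not part of the patch. The repeat scan
 * above is only worth doing at depths 4, 8, 16, 32, ... and only when enough
 * depth budget remains to amortize it; the power-of-two test below expresses
 * the same condition the new code uses in place of the old lookup table. */
static int should_check_repeats(unsigned depth, unsigned max_depth)
{
    return (depth & 3) == 0                   /* multiple of 4 */
        && (depth & (depth - 1)) == 0         /* exactly one bit set: 4, 8, 16, ... */
        && max_depth >= depth + (depth >> 1); /* at least 1.5x depth still to go */
}
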
RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } else { U32 const dist = (U32)(link - next_link); if (rpt < 0 || dist != rpt_dist) { - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + rpt = 0; rpt_head_next = next_index; rpt_dist = dist; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -529,7 +575,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; } else { ++rpt; @@ -538,19 +583,18 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, link = next_link; } } while (--list_count != 0); - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } - { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; - U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { - if (slot == 3) { - tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); - } - ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + + size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; + U32 const prev = tbl->tails_8[radix_8].prev_index; + if (prev != RADIX_NULL_LINK) { + if (slot == 3) { + tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); } + ++tbl->tails_8[radix_8].list_count; + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } for (size_t j = prev_st_index; j < st_index; ++j) { tbl->tails_8[tbl->stack[j].count].prev_index = RADIX_NULL_LINK; @@ -558,7 +602,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, } } else { - size_t prev_st_index = st_index; + size_t const prev_st_index = st_index; /* The last pass at max_depth */ do { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; @@ -567,14 +611,14 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, /* The last element in tbl->match_buffer is circular so this is never an access violation. 
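
/* Editor's note: illustrative sketch, not part of the patch. It shows the run
 * detection feeding RMF_handleRepeat above: while walking a chain whose links
 * decrease, consecutive links separated by the same small distance are counted
 * as one repeat run and flushed in a single call. flush_run() is a hypothetical
 * stand-in for RMF_handleRepeat. */
#include <stddef.h>

static void scan_for_runs(const size_t* links, size_t count, size_t rpt_depth,
                          void (*flush_run)(size_t head, int len, size_t dist))
{
    int rpt = -1;                          /* -1: no run in progress */
    size_t rpt_dist = 0, rpt_head = 0;
    for (size_t i = 0; i + 1 < count; ++i) {
        size_t const dist = links[i] - links[i + 1];
        if (dist > rpt_depth) {            /* gap too large: close any open run */
            if (rpt > 0)
                flush_run(rpt_head, rpt, rpt_dist);
            rpt = -1;
        } else if (rpt < 0 || dist != rpt_dist) {
            if (rpt > 0)                   /* distance changed: close the old run */
                flush_run(rpt_head, rpt, rpt_dist);
            rpt = 0;                       /* start a new candidate run */
            rpt_head = i + 1;
            rpt_dist = dist;
        } else {
            ++rpt;                         /* same distance again: extend the run */
        }
    }
    if (rpt > 0)
        flush_run(rpt_head, rpt, rpt_dist);
}
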
*/ size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); @@ -589,84 +633,81 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, void RMF_recurseListChunk(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE const depth, - BYTE const max_depth, + U32 const depth, + U32 const max_depth, U32 const list_count, size_t const stack_base) { - if (max_depth > 6) { + if (list_count < 2) + return; + /* Template-like inline functions */ + if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) + RMF_bruteForceBuffered(tbl, data_block, block_start, 0, list_count, 0, depth, max_depth); + else if (max_depth > 6) RMF_recurseListChunk_generic(tbl, data_block, block_start, depth, max_depth, list_count, stack_base); - } - else { + else RMF_recurseListChunk_generic(tbl, data_block, block_start, depth, 6, list_count, stack_base); - } } /* Iterate the head table concurrently with other threads, and recurse each list until max_depth is reached */ int RMF_buildTable(FL2_matchTable* const tbl, - size_t const job, + size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done) + FL2_dataBlock const block) { DEBUGLOG(5, "RMF_buildTable : thread %u", (U32)job); - if (tbl->isStruct) { - return RMF_structuredBuildTable(tbl, job, multi_thread, block, progress, opaque, weight, init_done); - } - else { - return RMF_bitpackBuildTable(tbl, job, multi_thread, block, progress, opaque, weight, init_done); + + if (tbl->is_struct) + RMF_structuredBuildTable(tbl, job, multi_thread, block); + else + RMF_bitpackBuildTable(tbl, job, multi_thread, block); + + if (job == 0 && tbl->st_index >= RADIX_CANCEL_INDEX) { + RMF_initListHeads(tbl); + return 1; } + return 0; +} + +void RMF_cancelBuild(FL2_matchTable * const tbl) +{ + if(tbl != NULL) + FL2_atomic_add(tbl->st_index, RADIX_CANCEL_INDEX - ATOMIC_INITIAL_VALUE); +} + +void RMF_resetIncompleteBuild(FL2_matchTable * const tbl) +{ + RMF_initListHeads(tbl); } int RMF_integrityCheck(const FL2_matchTable* const tbl, const BYTE* const data, size_t const index, size_t const end, unsigned const max_depth) { - if (tbl->isStruct) { + if (tbl->is_struct) return RMF_structuredIntegrityCheck(tbl, data, index, end, max_depth); - } - else { + else return RMF_bitpackIntegrityCheck(tbl, data, index, end, max_depth); - } -} - -size_t RMF_getMatch(FL2_matchTable* const tbl, - const BYTE* const data, - size_t const index, - size_t const limit, - unsigned max_depth, - size_t* const offset_ptr) -{ - if (tbl->isStruct) { - return RMF_structuredGetMatch(tbl, data, index, limit, max_depth, offset_ptr); - } - else { - return RMF_bitpackGetMatch(tbl, data, index, limit, max_depth, offset_ptr); - } } void RMF_limitLengths(FL2_matchTable* const tbl, size_t const index) { - if (tbl->isStruct) { + if (tbl->is_struct) RMF_structuredLimitLengths(tbl, index); - } - else { + else RMF_bitpackLimitLengths(tbl, index); - } } BYTE* RMF_getTableAsOutputBuffer(FL2_matchTable* const tbl, size_t const index) { - if (tbl->isStruct) { + if (tbl->is_struct) return 
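
/* Editor's note: illustrative sketch, not part of the patch. It shows the
 * cancellation pattern behind RMF_cancelBuild above, using C11 atomics as a
 * stand-in for FL2_atomic_add: bumping the shared work index past a sentinel
 * makes every worker's next claim fail, so all threads drain out quickly. */
#include <stdatomic.h>

#define CANCEL_SENTINEL 0x40000000L   /* plays the role of RADIX_CANCEL_INDEX */

static atomic_long work_index;

static void cancel_build(void)
{
    atomic_fetch_add(&work_index, CANCEL_SENTINEL);
}

static long claim_next_job(void)
{
    long const i = atomic_fetch_add(&work_index, 1);
    return (i >= CANCEL_SENTINEL) ? -1 : i;   /* -1 means: cancelled, stop */
}
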
RMF_structuredAsOutputBuffer(tbl, index); - } - else { + else return RMF_bitpackAsOutputBuffer(tbl, index); - } } -size_t RMF_memoryUsage(unsigned const dict_log, unsigned const buffer_log, unsigned const depth, unsigned thread_count) +size_t RMF_memoryUsage(size_t const dict_size, unsigned const buffer_log, unsigned const thread_count) { - size_t size = (size_t)(4U + RMF_isStruct(dict_log, depth)) << dict_log; - U32 buf_size = (U32)1 << (dict_log - buffer_log); + size_t size = (size_t)(4U + RMF_isStruct(dict_size)) * dict_size; + size_t const buf_size = dict_size >> buffer_log; size += ((buf_size - 1) * sizeof(RMF_buildMatch) + sizeof(RMF_builder)) * thread_count; return size; } diff --git a/C/fast-lzma2/radix_mf.h b/C/fast-lzma2/radix_mf.h index c5bf943d..e6b7711b 100644 --- a/C/fast-lzma2/radix_mf.h +++ b/C/fast-lzma2/radix_mf.h @@ -20,16 +20,19 @@ extern "C" { typedef struct FL2_matchTable_s FL2_matchTable; -#define OVERLAP_FROM_DICT_LOG(d, o) (((size_t)1 << ((d) - 4)) * (o)) +#define OVERLAP_FROM_DICT_SIZE(d, o) (((d) >> 4) * (o)) #define RMF_MIN_BYTES_PER_THREAD 1024 +#define RMF_BUFFER_LOG_BASE 12 +#define RMF_BUFFER_LOG_MIN 6 +#define RMF_BUFFER_LOG_MAX 12 + typedef struct { - unsigned dictionary_log; + size_t dictionary_size; unsigned match_buffer_log; unsigned overlap_fraction; - unsigned block_size_log; unsigned divide_and_conquer; unsigned depth; #ifdef RMF_REFERENCE @@ -42,16 +45,18 @@ void RMF_freeMatchTable(FL2_matchTable* const tbl); BYTE RMF_compatibleParameters(const FL2_matchTable* const tbl, const RMF_parameters* const params, size_t const dict_reduce); size_t RMF_applyParameters(FL2_matchTable* const tbl, const RMF_parameters* const params, size_t const dict_reduce); size_t RMF_threadCount(const FL2_matchTable * const tbl); -size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end); +void RMF_initProgress(FL2_matchTable * const tbl); +size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const end); int RMF_buildTable(FL2_matchTable* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); + FL2_dataBlock const block); +void RMF_cancelBuild(FL2_matchTable* const tbl); +void RMF_resetIncompleteBuild(FL2_matchTable* const tbl); int RMF_integrityCheck(const FL2_matchTable* const tbl, const BYTE* const data, size_t const index, size_t const end, unsigned const max_depth); void RMF_limitLengths(FL2_matchTable* const tbl, size_t const index); BYTE* RMF_getTableAsOutputBuffer(FL2_matchTable* const tbl, size_t const index); -size_t RMF_memoryUsage(unsigned const dict_log, unsigned const buffer_log, unsigned const depth, unsigned thread_count); +size_t RMF_memoryUsage(size_t const dict_size, unsigned const buffer_log, unsigned const thread_count); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/radix_struct.c b/C/fast-lzma2/radix_struct.c index 2aac9093..ce8b6ee1 100644 --- a/C/fast-lzma2/radix_struct.c +++ b/C/fast-lzma2/radix_struct.c @@ -9,7 +9,7 @@ */ #include "mem.h" /* U32, U64 */ -#include "fl2threading.h" +#include "fl2_threading.h" #include "fl2_internal.h" #include "radix_internal.h" @@ -34,7 +34,7 @@ typedef struct FL2_matchTable_s FL2_matchTable; #define SetMatchLength(index, link, length) ((RMF_unit*)tbl->table)[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] = (BYTE)(length) -#define SetMatchLinkAndLength(index, link, length) { size_t i_ = (index) >> UNIT_BITS, u_ = (index) & 
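
/* Editor's note: illustrative arithmetic only, not part of the patch. With the
 * new size-based API, OVERLAP_FROM_DICT_SIZE(d, o) = (d >> 4) * o expresses the
 * overlap fraction in 1/16 units, and the match table costs 4 bytes per
 * dictionary byte in bitpack mode (5 when the structured table is selected),
 * before the per-thread builder buffers are added. */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t const dict_size = (size_t)64 << 20;          /* 64 MiB dictionary */
    unsigned const overlap_fraction = 2;                /* out of 16 */
    size_t const overlap = (dict_size >> 4) * overlap_fraction;
    size_t const table_bytes = 4 * dict_size;           /* bitpack table */
    printf("overlap = %zu MiB, table = %zu MiB\n",
           overlap >> 20, table_bytes >> 20);           /* 8 MiB, 256 MiB */
    return 0;
}
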
UNIT_MASK; ((RMF_unit*)tbl->table)[i_].links[u_] = (U32)(link); ((RMF_unit*)tbl->table)[i_].lengths[u_] = (BYTE)(length); } +#define SetMatchLinkAndLength(index, link, length) do { size_t i_ = (index) >> UNIT_BITS, u_ = (index) & UNIT_MASK; ((RMF_unit*)tbl->table)[i_].links[u_] = (U32)(link); ((RMF_unit*)tbl->table)[i_].lengths[u_] = (BYTE)(length); } while(0) #define SetNull(index) ((RMF_unit*)tbl->table)[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] = RADIX_NULL_LINK diff --git a/C/fast-lzma2/range_enc.c b/C/fast-lzma2/range_enc.c index aff9ab80..1da8a1c5 100644 --- a/C/fast-lzma2/range_enc.c +++ b/C/fast-lzma2/range_enc.c @@ -7,84 +7,194 @@ #include "fl2_internal.h" #include "mem.h" +#include "platform.h" #include "range_enc.h" -const unsigned price_table[kBitModelTotal >> kNumMoveReducingBits] = { - 128, 103, 91, 84, 78, 73, 69, 66, - 63, 61, 58, 56, 54, 52, 51, 49, - 48, 46, 45, 44, 43, 42, 41, 40, - 39, 38, 37, 36, 35, 34, 34, 33, - 32, 31, 31, 30, 29, 29, 28, 28, - 27, 26, 26, 25, 25, 24, 24, 23, - 23, 22, 22, 22, 21, 21, 20, 20, - 19, 19, 19, 18, 18, 17, 17, 17, - 16, 16, 16, 15, 15, 15, 14, 14, - 14, 13, 13, 13, 12, 12, 12, 11, - 11, 11, 11, 10, 10, 10, 10, 9, - 9, 9, 9, 8, 8, 8, 8, 7, - 7, 7, 7, 6, 6, 6, 6, 5, - 5, 5, 5, 5, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 2, 2, 2, - 2, 2, 2, 1, 1, 1, 1, 1 -}; +/* The first and last elements of these tables are never used */ +BYTE price_table[2][kPriceTableSize] = { { + 0, 193, 182, 166, 154, 145, 137, 131, + 125, 120, 115, 111, 107, 103, 100, 97, + 94, 91, 89, 86, 84, 82, 80, 78, + 76, 74, 72, 71, 69, 67, 66, 64, + 63, 61, 60, 59, 57, 56, 55, 54, + 53, 52, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 42, 41, 40, 39, 38, + 37, 36, 36, 35, 34, 33, 33, 32, + 31, 30, 30, 29, 28, 28, 27, 26, + 26, 25, 25, 24, 23, 23, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + 9, 8, 8, 8, 7, 7, 6, 6, + 5, 5, 5, 4, 4, 3, 3, 3, + 2, 2, 2, 1, 1, 0, 0, 0 +}, { + 0, 0, 0, 1, 1, 2, 2, 2, + 3, 3, 3, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 8, 9, + 9, 10, 10, 11, 11, 12, 12, 13, + 13, 13, 14, 14, 15, 15, 16, 17, + 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 23, 23, 24, 24, 25, 26, + 26, 27, 28, 28, 29, 30, 30, 31, + 32, 33, 33, 34, 35, 36, 36, 37, + 38, 39, 40, 41, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 53, + 54, 55, 56, 57, 59, 60, 61, 63, + 64, 66, 67, 69, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 89, 91, 94, + 97, 100, 103, 107, 111, 115, 119, 125, + 130, 137, 145, 154, 165, 181, 192, 0 +} }; -void SetOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size) +#if 0 + +#include + +/* Generates price_table */ +void RC_printPriceTable() +{ + static const unsigned test_size = 0x4000; + const unsigned test_div = test_size >> 8; + BYTE buf[0x3062]; + unsigned table0[kPriceTableSize]; + unsigned table1[kPriceTableSize]; + unsigned count[kPriceTableSize]; + memset(table0, 0, sizeof(table0)); + memset(table1, 0, sizeof(table1)); + memset(count, 0, sizeof(count)); + for (Probability i = 31; i <= kBitModelTotal - 31; ++i) { + RangeEncoder rc; + RC_reset(&rc); + RC_setOutputBuffer(&rc, buf, sizeof(buf)); + for (unsigned j = 0; j < test_size; ++j) { + Probability prob = i; + RC_encodeBit0(&rc, &prob); + } + RC_flush(&rc); + table0[i >> kNumMoveReducingBits] += (unsigned)rc.out_index - 5; + RC_reset(&rc); + RC_setOutputBuffer(&rc, buf, sizeof(buf)); + for (unsigned j = 0; j < test_size; ++j) { + Probability prob = i; + RC_encodeBit1(&rc, &prob); + } + RC_flush(&rc); + table1[i >> kNumMoveReducingBits] += 
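
/* Editor's note: illustrative sketch, not part of the patch. Wrapping a
 * multi-statement macro in do { ... } while (0), as SetMatchLinkAndLength now
 * is, makes it behave like a single statement, so it composes safely with an
 * unbraced if/else: */
#define SET_BOTH(a, b) do { (a) = 1; (b) = 2; } while (0)

static void example(int cond, int* x, int* y)
{
    if (cond)
        SET_BOTH(*x, *y);   /* expands to one statement; the else still binds */
    else
        *x = 0;
}
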
(unsigned)rc.out_index - 5; + ++count[i >> kNumMoveReducingBits]; + } + for (int i = 0; i < kPriceTableSize; ++i) if (count[i]) { + table0[i] = (table0[i] / count[i]) / test_div; + table1[i] = (table1[i] / count[i]) / test_div; + } + fputs("const BYTE price_table[2][kPriceTableSize] = {\r\n", stdout); + for (int i = 0; i < kPriceTableSize;) { + for (int j = 0; j < 8; ++j, ++i) + printf("%4d,", table0[i]); + fputs("\r\n", stdout); + } + fputs("}, {\r\n", stdout); + for (int i = 0; i < kPriceTableSize;) { + for (int j = 0; j < 8; ++j, ++i) + printf("%4d,", table1[i]); + fputs("\r\n", stdout); + } + fputs("} };\r\n", stdout); +} + +#endif + +void RC_setOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size) { rc->out_buffer = out_buffer; rc->chunk_size = chunk_size; rc->out_index = 0; } -void RangeEncReset(RangeEncoder* const rc) +void RC_reset(RangeEncoder* const rc) { rc->low = 0; rc->range = (U32)-1; - rc->cache_size = 1; + rc->cache_size = 0; rc->cache = 0; } -void ShiftLow(RangeEncoder* const rc) +#ifdef __64BIT__ + +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc) { - if (rc->low < 0xFF000000 || rc->low > 0xFFFFFFFF) - { - BYTE temp = rc->cache; - do { - assert (rc->out_index < rc->chunk_size - 4096); - rc->out_buffer[rc->out_index++] = temp + (BYTE)(rc->low >> 32); - temp = 0xFF; - } while (--rc->cache_size != 0); - rc->cache = (BYTE)(rc->low >> 24); - } - ++rc->cache_size; - rc->low = (rc->low << 8) & 0xFFFFFFFF; + U64 low = rc->low; + rc->low = (U32)(low << 8); + if (low < 0xFF000000 || low > 0xFFFFFFFF) { + BYTE high = (BYTE)(low >> 32); + rc->out_buffer[rc->out_index++] = rc->cache + high; + rc->cache = (BYTE)(low >> 24); + if (rc->cache_size != 0) { + high += 0xFF; + do { + rc->out_buffer[rc->out_index++] = high; + } while (--rc->cache_size != 0); + } + } + else { + rc->cache_size++; + } } -void EncodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +#else + +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc) { - size_t tree_index = 1; - assert(bit_count > 0); + U32 low = (U32)rc->low; + unsigned high = (unsigned)(rc->low >> 32); + rc->low = low << 8; + if (low < (U32)0xFF000000 || high != 0) { + rc->out_buffer[rc->out_index++] = rc->cache + (BYTE)high; + rc->cache = (BYTE)(low >> 24); + if (rc->cache_size != 0) { + high += 0xFF; + do { + rc->out_buffer[rc->out_index++] = (BYTE)high; + } while (--rc->cache_size != 0); + } + } + else { + rc->cache_size++; + } +} + +#endif + +void RC_encodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +{ + assert(bit_count > 1); + --bit_count; + unsigned bit = symbol >> bit_count; + RC_encodeBit(rc, &probs[1], bit); + size_t tree_index = 1; do { - unsigned bit; - --bit_count; - bit = (symbol >> bit_count) & 1; - EncodeBit(rc, &probs[tree_index], bit); - tree_index = (tree_index << 1) | bit; - } while (bit_count != 0); + --bit_count; + tree_index = (tree_index << 1) | bit; + bit = (symbol >> bit_count) & 1; + RC_encodeBit(rc, &probs[tree_index], bit); + } while (bit_count != 0); } -void EncodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +void RC_encodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) { - unsigned tree_index = 1; assert(bit_count != 0); - do { - unsigned bit = symbol & 1; - EncodeBit(rc, &probs[tree_index], bit); - tree_index = (tree_index << 1) + bit; - symbol >>= 1; - } while (--bit_count != 0); + 
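
/* Editor's note: illustrative check, not part of the patch. The generator above
 * encodes 0x4000 identical bits, so output_bytes is about 0x4000 * b / 8 where
 * b is the cost of one bit in bits; dividing by test_div (0x4000 >> 8 = 64)
 * leaves roughly 32 * b, i.e. the tables store prices in 1/32-bit units, which
 * matches kNumBitPriceShiftBits = 5. A direct -log2 computation gives values
 * close to the measured rows above: */
#include <math.h>

static unsigned price_in_32nds(double probability_of_bit)
{
    return (unsigned)(-log2(probability_of_bit) * 32.0 + 0.5);
    /* e.g. p = 0.5 -> 32; p = 0.25 -> 64; compare with the table entries */
}
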
unsigned bit = symbol & 1; + RC_encodeBit(rc, &probs[1], bit); + unsigned tree_index = 1; + while (--bit_count != 0) { + tree_index = (tree_index << 1) + bit; + symbol >>= 1; + bit = symbol & 1; + RC_encodeBit(rc, &probs[tree_index], bit); + } } -void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) +void FORCE_NOINLINE RC_encodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) { assert(bit_count > 0); do { @@ -93,7 +203,7 @@ void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) rc->low += rc->range & -((int)(value >> bit_count) & 1); if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } while (bit_count != 0); } diff --git a/C/fast-lzma2/range_enc.h b/C/fast-lzma2/range_enc.h index 54672f4e..159449ad 100644 --- a/C/fast-lzma2/range_enc.h +++ b/C/fast-lzma2/range_enc.h @@ -28,9 +28,13 @@ typedef U16 Probability; #define kNumMoveBits 5U #define kProbInitValue (kBitModelTotal >> 1U) #define kNumMoveReducingBits 4U -#define kNumBitPriceShiftBits 4U +#define kNumBitPriceShiftBits 5U +#define kPriceTableSize (kBitModelTotal >> kNumMoveReducingBits) -extern const unsigned price_table[kBitModelTotal >> kNumMoveReducingBits]; +extern BYTE price_table[2][kPriceTableSize]; +#if 0 +void RC_printPriceTable(); +#endif typedef struct { @@ -43,22 +47,20 @@ typedef struct BYTE cache; } RangeEncoder; -void RangeEncReset(RangeEncoder* const rc); +void RC_reset(RangeEncoder* const rc); -void SetOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size); +void RC_setOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size); -void RangeEncReset(RangeEncoder* const rc); +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc); -void ShiftLow(RangeEncoder* const rc); +void RC_encodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); -void EncodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); +void RC_encodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); -void EncodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); - -void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count); +void FORCE_NOINLINE RC_encodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count); HINT_INLINE -void EncodeBit0(RangeEncoder* const rc, Probability *const rprob) +void RC_encodeBit0(RangeEncoder* const rc, Probability *const rprob) { unsigned prob = *rprob; rc->range = (rc->range >> kNumBitModelTotalBits) * prob; @@ -66,12 +68,12 @@ void EncodeBit0(RangeEncoder* const rc, Probability *const rprob) *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } HINT_INLINE -void EncodeBit1(RangeEncoder* const rc, Probability *const rprob) +void RC_encodeBit1(RangeEncoder* const rc, Probability *const rprob) { unsigned prob = *rprob; U32 new_bound = (rc->range >> kNumBitModelTotalBits) * prob; @@ -81,16 +83,16 @@ void EncodeBit1(RangeEncoder* const rc, Probability *const rprob) *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } HINT_INLINE -void EncodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const bit) +void RC_encodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const bit) { unsigned prob = *rprob; if (bit != 0) { - U32 new_bound 
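
/* Editor's note: illustrative sketch, not part of the patch. It shows the
 * probability slots a bit-tree coder touches for a given symbol: the tree is a
 * flat array indexed like a heap, the walk starts at node 1, codes the symbol
 * MSB-first, and shifts each coded bit into the index to descend. emit_bit()
 * is a hypothetical stand-in for RC_encodeBit. */
static void encode_bit_tree(unsigned short* probs, unsigned bit_count,
                            unsigned symbol,
                            void (*emit_bit)(unsigned short* prob, unsigned bit))
{
    unsigned tree_index = 1;
    while (bit_count-- != 0) {
        unsigned const bit = (symbol >> bit_count) & 1;  /* MSB first */
        emit_bit(&probs[tree_index], bit);
        tree_index = (tree_index << 1) | bit;            /* descend */
    }
}
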
= (rc->range >> kNumBitModelTotalBits) * prob; + U32 const new_bound = (rc->range >> kNumBitModelTotalBits) * prob; rc->low += new_bound; rc->range -= new_bound; prob -= prob >> kNumMoveBits; @@ -102,52 +104,56 @@ void EncodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } -#define GET_PRICE(rc, prob, symbol) \ - price_table[((prob) ^ ((-(int)(symbol)) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; +#define GET_PRICE(prob, symbol) \ + price_table[symbol][(prob) >> kNumMoveReducingBits] -#define GET_PRICE_0(rc, prob) price_table[(prob) >> kNumMoveReducingBits] +#define GET_PRICE_0(prob) price_table[0][(prob) >> kNumMoveReducingBits] -#define GET_PRICE_1(rc, prob) price_table[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] +#define GET_PRICE_1(prob) price_table[1][(prob) >> kNumMoveReducingBits] + +#define kMinLitPrice 8U HINT_INLINE -unsigned GetTreePrice(RangeEncoder* const rc, const Probability* const prob_table, unsigned const bit_count, size_t symbol) +unsigned RC_getTreePrice(const Probability* const prob_table, unsigned bit_count, size_t symbol) { unsigned price = 0; - symbol |= ((size_t)1 << bit_count); - while (symbol != 1) { - size_t next_symbol = symbol >> 1; + symbol |= ((size_t)1 << bit_count); + do { + size_t const next_symbol = symbol >> 1; unsigned prob = prob_table[next_symbol]; - unsigned bit = (unsigned)symbol & 1; - price += GET_PRICE(rc, prob, bit); + size_t bit = symbol & 1; + price += GET_PRICE(prob, bit); symbol = next_symbol; - } + } while (symbol != 1); return price; } HINT_INLINE -unsigned GetReverseTreePrice(RangeEncoder* const rc, const Probability* const prob_table, unsigned const bit_count, size_t symbol) +unsigned RC_getReverseTreePrice(const Probability* const prob_table, unsigned bit_count, size_t symbol) { - unsigned price = 0; - size_t m = 1; - for (unsigned i = bit_count; i != 0; --i) { - unsigned prob = prob_table[m]; - unsigned bit = symbol & 1; - symbol >>= 1; - price += GET_PRICE(rc, prob, bit); - m = (m << 1) | bit; - } - return price; + unsigned prob = prob_table[1]; + size_t bit = symbol & 1; + unsigned price = GET_PRICE(prob, bit); + size_t m = 1; + while (--bit_count != 0) { + m = (m << 1) | bit; + symbol >>= 1; + prob = prob_table[m]; + bit = symbol & 1; + price += GET_PRICE(prob, bit); + } + return price; } HINT_INLINE -void Flush(RangeEncoder* const rc) +void RC_flush(RangeEncoder* const rc) { for (int i = 0; i < 5; ++i) - ShiftLow(rc); + RC_shiftLow(rc); } #if defined (__cplusplus) diff --git a/C/fast-lzma2/util.c b/C/fast-lzma2/util.c new file mode 100644 index 00000000..d6466063 --- /dev/null +++ b/C/fast-lzma2/util.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#if defined (__cplusplus) +extern "C" { +#endif + + +/*-**************************************** +* Dependencies +******************************************/ +#include "util.h" /* note : ensure that platform.h is included first ! 
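
/* Editor's note: illustrative usage only, not part of the patch. With the new
 * two-row table, the price of coding a bit is a plain lookup on the bit value
 * (no XOR trick), and prices are fixed-point bit counts, so an encoder can
 * compare candidate codings by summing them. 128 equals kPriceTableSize and
 * the >> 4 is kNumMoveReducingBits for the 11-bit probability model. */
extern unsigned char price_table[2][128];

static unsigned price_of_two_bits(unsigned short p0, unsigned short p1)
{
    /* cost of coding a 0 with model p0, then a 1 with model p1 */
    return price_table[0][p0 >> 4] + price_table[1][p1 >> 4];
}
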
*/ +#include +#include + + +int UTIL_fileExist(const char* filename) +{ + stat_t statbuf; +#if defined(_MSC_VER) + int const stat_error = _stat64(filename, &statbuf); +#else + int const stat_error = stat(filename, &statbuf); +#endif + return !stat_error; +} + +int UTIL_isRegularFile(const char* infilename) +{ + stat_t statbuf; + return UTIL_getFileStat(infilename, &statbuf); /* Only need to know whether it is a regular file */ +} + +int UTIL_getFileStat(const char* infilename, stat_t *statbuf) +{ + int r; +#if defined(_MSC_VER) + r = _stat64(infilename, statbuf); + if (r || !(statbuf->st_mode & S_IFREG)) return 0; /* No good... */ +#else + r = stat(infilename, statbuf); + if (r || !S_ISREG(statbuf->st_mode)) return 0; /* No good... */ +#endif + return 1; +} + +int UTIL_setFileStat(const char *filename, stat_t *statbuf) +{ + int res = 0; + struct utimbuf timebuf; + + if (!UTIL_isRegularFile(filename)) + return -1; + + timebuf.actime = time(NULL); + timebuf.modtime = statbuf->st_mtime; + res += utime(filename, &timebuf); /* set access and modification times */ + +#if !defined(_WIN32) + res += chown(filename, statbuf->st_uid, statbuf->st_gid); /* Copy ownership */ +#endif + + res += chmod(filename, statbuf->st_mode & 07777); /* Copy file permissions */ + + errno = 0; + return -res; /* number of errors is returned */ +} + +U32 UTIL_isDirectory(const char* infilename) +{ + int r; + stat_t statbuf; +#if defined(_MSC_VER) + r = _stat64(infilename, &statbuf); + if (!r && (statbuf.st_mode & _S_IFDIR)) return 1; +#else + r = stat(infilename, &statbuf); + if (!r && S_ISDIR(statbuf.st_mode)) return 1; +#endif + return 0; +} + +U32 UTIL_isLink(const char* infilename) +{ +/* macro guards, as defined in : https://linux.die.net/man/2/lstat */ +#ifndef __STRICT_ANSI__ +#if defined(_BSD_SOURCE) \ + || (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE >= 500)) \ + || (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) \ + || (defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) \ + || (defined(__APPLE__) && defined(__MACH__)) \ + || defined(__OpenBSD__) \ + || defined(__FreeBSD__) + int r; + stat_t statbuf; + r = lstat(infilename, &statbuf); + if (!r && S_ISLNK(statbuf.st_mode)) return 1; +#endif +#endif + (void)infilename; + return 0; +} + +U64 UTIL_getFileSize(const char* infilename) +{ + if (!UTIL_isRegularFile(infilename)) return UTIL_FILESIZE_UNKNOWN; + { int r; +#if defined(_MSC_VER) + struct __stat64 statbuf; + r = _stat64(infilename, &statbuf); + if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; +#elif defined(__MINGW32__) && defined (__MSVCRT__) + struct _stati64 statbuf; + r = _stati64(infilename, &statbuf); + if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; +#else + struct stat statbuf; + r = stat(infilename, &statbuf); + if (r || !S_ISREG(statbuf.st_mode)) return UTIL_FILESIZE_UNKNOWN; +#endif + return (U64)statbuf.st_size; + } +} + + +U64 UTIL_getTotalFileSize(const char* const * const fileNamesTable, unsigned nbFiles) +{ + U64 total = 0; + int error = 0; + unsigned n; + for (n=0; n= *bufEnd) { + ptrdiff_t const newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; + *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); + if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; } + *bufEnd = *bufStart + newListSize; + } + if (*bufStart + *pos + pathLength < *bufEnd) { + memcpy(*bufStart + *pos, path, pathLength+1 /* include final \0 */); + *pos += pathLength + 1; + nbFiles++; + } + } + free(path); + } while (FindNextFileA(hFile, 
&cFile)); + + FindClose(hFile); + return nbFiles; +} + +#elif defined(__linux__) || (PLATFORM_POSIX_VERSION >= 200112L) /* opendir, readdir require POSIX.1-2001 */ + +int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) +{ + DIR *dir; + struct dirent *entry; + char* path; + int dirLength, fnameLength, pathLength, nbFiles = 0; + + if (!(dir = opendir(dirName))) { + UTIL_DISPLAYLEVEL(1, "Cannot open directory '%s': %s\n", dirName, strerror(errno)); + return 0; + } + + dirLength = (int)strlen(dirName); + errno = 0; + while ((entry = readdir(dir)) != NULL) { + if (strcmp (entry->d_name, "..") == 0 || + strcmp (entry->d_name, ".") == 0) continue; + fnameLength = (int)strlen(entry->d_name); + path = (char*) malloc(dirLength + fnameLength + 2); + if (!path) { closedir(dir); return 0; } + memcpy(path, dirName, dirLength); + + path[dirLength] = '/'; + memcpy(path+dirLength+1, entry->d_name, fnameLength); + pathLength = dirLength+1+fnameLength; + path[pathLength] = 0; + + if (!followLinks && UTIL_isLink(path)) { + UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path); + continue; + } + + if (UTIL_isDirectory(path)) { + nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks); /* Recursively call "UTIL_prepareFileList" with the new path. */ + if (*bufStart == NULL) { free(path); closedir(dir); return 0; } + } else { + if (*bufStart + *pos + pathLength >= *bufEnd) { + ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; + *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); + *bufEnd = *bufStart + newListSize; + if (*bufStart == NULL) { free(path); closedir(dir); return 0; } + } + if (*bufStart + *pos + pathLength < *bufEnd) { + memcpy(*bufStart + *pos, path, pathLength + 1); /* with final \0 */ + *pos += pathLength + 1; + nbFiles++; + } + } + free(path); + errno = 0; /* clear errno after UTIL_isDirectory, UTIL_prepareFileList */ + } + + if (errno != 0) { + UTIL_DISPLAYLEVEL(1, "readdir(%s) error: %s\n", dirName, strerror(errno)); + free(*bufStart); + *bufStart = NULL; + } + closedir(dir); + return nbFiles; +} + +#else + +int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) +{ + (void)bufStart; (void)bufEnd; (void)pos; (void)followLinks; + UTIL_DISPLAYLEVEL(1, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName); + return 0; +} + +#endif /* #ifdef _WIN32 */ + +/* + * UTIL_createFileList - takes a list of files and directories (params: inputNames, inputNamesNb), scans directories, + * and returns a new list of files (params: return value, allocatedBuffer, allocatedNamesNb). + * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer) + * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called. 
+ */ +const char** +UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, + char** allocatedBuffer, unsigned* allocatedNamesNb, + int followLinks) +{ + size_t pos; + unsigned i, nbFiles; + char* buf = (char*)malloc(LIST_SIZE_INCREASE); + char* bufend = buf + LIST_SIZE_INCREASE; + const char** fileTable; + + if (!buf) return NULL; + + for (i=0, pos=0, nbFiles=0; i= bufend) { + ptrdiff_t newListSize = (bufend - buf) + LIST_SIZE_INCREASE; + buf = (char*)UTIL_realloc(buf, newListSize); + bufend = buf + newListSize; + if (!buf) return NULL; + } + if (buf + pos + len < bufend) { + memcpy(buf+pos, inputNames[i], len+1); /* with final \0 */ + pos += len + 1; + nbFiles++; + } + } else { + nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks); + if (buf == NULL) return NULL; + } } + + if (nbFiles == 0) { free(buf); return NULL; } + + fileTable = (const char**)malloc((nbFiles+1) * sizeof(const char*)); + if (!fileTable) { free(buf); return NULL; } + + for (i=0, pos=0; i bufend) { free(buf); free((void*)fileTable); return NULL; } + + *allocatedBuffer = buf; + *allocatedNamesNb = nbFiles; + + return fileTable; +} + +/*-**************************************** +* Console log +******************************************/ +int g_utilDisplayLevel; + + +/*-**************************************** +* Time functions +******************************************/ +#if defined(_WIN32) /* Windows */ + +UTIL_time_t UTIL_getTime(void) { UTIL_time_t x; QueryPerformanceCounter(&x); return x; } + +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static LARGE_INTEGER ticksPerSecond; + static int init = 0; + if (!init) { + if (!QueryPerformanceFrequency(&ticksPerSecond)) + UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); + init = 1; + } + return 1000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static LARGE_INTEGER ticksPerSecond; + static int init = 0; + if (!init) { + if (!QueryPerformanceFrequency(&ticksPerSecond)) + UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); + init = 1; + } + return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; +} + +#elif defined(__APPLE__) && defined(__MACH__) + +UTIL_time_t UTIL_getTime(void) { return mach_absolute_time(); } + +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static mach_timebase_info_data_t rate; + static int init = 0; + if (!init) { + mach_timebase_info(&rate); + init = 1; + } + return (((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom))/1000ULL; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static mach_timebase_info_data_t rate; + static int init = 0; + if (!init) { + mach_timebase_info(&rate); + init = 1; + } + return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom); +} + +#elif (PLATFORM_POSIX_VERSION >= 200112L) \ + && (defined(__UCLIBC__) \ + || (defined(__GLIBC__) \ + && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) \ + || (__GLIBC__ > 2)))) + +UTIL_time_t UTIL_getTime(void) +{ + UTIL_time_t time; + if (clock_gettime(CLOCK_MONOTONIC, &time)) + UTIL_DISPLAYLEVEL(1, "ERROR: Failed to get time\n"); /* we could also exit() */ + return time; +} + +UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t diff; + if (end.tv_nsec < begin.tv_nsec) { + diff.tv_sec = (end.tv_sec - 1) - begin.tv_sec; + diff.tv_nsec = 
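
/* Editor's note: illustrative usage only, not part of the patch. The pair
 * UTIL_createFileList / UTIL_freeFileList expands directories into a flat file
 * list as described in the comment above; on error the create call returns
 * NULL and the free helper must not be called. */
#include "util.h"

static void list_inputs(const char** names, unsigned count)
{
    char* buffer = NULL;
    unsigned expanded_count = 0;
    const char** files = UTIL_createFileList(names, count, &buffer,
                                             &expanded_count, 0 /* no links */);
    if (files == NULL)
        return;
    for (unsigned i = 0; i < expanded_count; ++i)
        UTIL_DISPLAY("%s\n", files[i]);
    UTIL_freeFileList(files, buffer);
}
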
(end.tv_nsec + 1000000000ULL) - begin.tv_nsec; + } else { + diff.tv_sec = end.tv_sec - begin.tv_sec; + diff.tv_nsec = end.tv_nsec - begin.tv_nsec; + } + return diff; +} + +U64 UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t const diff = UTIL_getSpanTime(begin, end); + U64 micro = 0; + micro += 1000000ULL * diff.tv_sec; + micro += diff.tv_nsec / 1000ULL; + return micro; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t const diff = UTIL_getSpanTime(begin, end); + U64 nano = 0; + nano += 1000000000ULL * diff.tv_sec; + nano += diff.tv_nsec; + return nano; +} + +#else /* relies on standard C (note : clock_t measurements can be wrong when using multi-threading) */ + +UTIL_time_t UTIL_getTime(void) { return clock(); } +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } + +#endif + +/* returns time span in microseconds */ +U64 UTIL_clockSpanMicro(UTIL_time_t clockStart ) +{ + UTIL_time_t const clockEnd = UTIL_getTime(); + return UTIL_getSpanTimeMicro(clockStart, clockEnd); +} + +/* returns time span in microseconds */ +U64 UTIL_clockSpanNano(UTIL_time_t clockStart ) +{ + UTIL_time_t const clockEnd = UTIL_getTime(); + return UTIL_getSpanTimeNano(clockStart, clockEnd); +} + +void UTIL_waitForNextTick(void) +{ + UTIL_time_t const clockStart = UTIL_getTime(); + UTIL_time_t clockEnd; + do { + clockEnd = UTIL_getTime(); + } while (UTIL_getSpanTimeNano(clockStart, clockEnd) == 0); +} + +/* count the number of physical cores */ +#if defined(_WIN32) || defined(WIN32) + +#include + +typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + if (numPhysicalCores != 0) return numPhysicalCores; + + { LPFN_GLPI glpi; + BOOL done = FALSE; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL; + DWORD returnLength = 0; + size_t byteOffset = 0; + + glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), + "GetLogicalProcessorInformation"); + + if (glpi == NULL) { + goto failed; + } + + while(!done) { + DWORD rc = glpi(buffer, &returnLength); + if (FALSE == rc) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + if (buffer) + free(buffer); + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength); + + if (buffer == NULL) { + perror("zstd"); + exit(1); + } + } else { + /* some other error */ + goto failed; + } + } else { + done = TRUE; + } + } + + ptr = buffer; + + while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { + + if (ptr->Relationship == RelationProcessorCore) { + numPhysicalCores++; + } + + ptr++; + byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + } + + free(buffer); + + return numPhysicalCores; + } + +failed: + /* try to fall back on GetSystemInfo */ + { SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + numPhysicalCores = sysinfo.dwNumberOfProcessors; + if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */ + } + return numPhysicalCores; +} + +#elif defined(__APPLE__) + +#include + +/* Use apple-provided syscall + * see: man 3 sysctl */ +int UTIL_countPhysicalCores(void) +{ + static S32 numPhysicalCores = 0; /* apple specifies int32_t */ + if (numPhysicalCores != 0) return 
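
/* Editor's note: illustrative usage only, not part of the patch. The timing
 * helpers now defined in util.c are used like this: take a start stamp, do the
 * work, then ask for the elapsed span in microseconds. */
#include "util.h"

static void time_something(void (*work)(void))
{
    UTIL_time_t const start = UTIL_getTime();
    work();
    {   U64 const micros = UTIL_clockSpanMicro(start);
        UTIL_DISPLAYLEVEL(2, "work took %llu us\n", (unsigned long long)micros);
    }
}
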
numPhysicalCores; + + { size_t size = sizeof(S32); + int const ret = sysctlbyname("hw.physicalcpu", &numPhysicalCores, &size, NULL, 0); + if (ret != 0) { + if (errno == ENOENT) { + /* entry not present, fall back on 1 */ + numPhysicalCores = 1; + } else { + perror("zstd: can't get number of physical cpus"); + exit(1); + } + } + + return numPhysicalCores; + } +} + +#elif defined(__linux__) + +/* parse /proc/cpuinfo + * siblings / cpu cores should give hyperthreading ratio + * otherwise fall back on sysconf */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + + if (numPhysicalCores != 0) return numPhysicalCores; + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + return numPhysicalCores = 1; + } + + /* try to determine if there's hyperthreading */ + { FILE* const cpuinfo = fopen("/proc/cpuinfo", "r"); +#define BUF_SIZE 80 + char buff[BUF_SIZE]; + + int siblings = 0; + int cpu_cores = 0; + int ratio = 1; + + if (cpuinfo == NULL) { + /* fall back on the sysconf value */ + return numPhysicalCores; + } + + /* assume the cpu cores/siblings values will be constant across all + * present processors */ + while (!feof(cpuinfo)) { + if (fgets(buff, BUF_SIZE, cpuinfo) != NULL) { + if (strncmp(buff, "siblings", 8) == 0) { + const char* const sep = strchr(buff, ':'); + if (*sep == '\0') { + /* formatting was broken? */ + goto failed; + } + + siblings = atoi(sep + 1); + } + if (strncmp(buff, "cpu cores", 9) == 0) { + const char* const sep = strchr(buff, ':'); + if (*sep == '\0') { + /* formatting was broken? */ + goto failed; + } + + cpu_cores = atoi(sep + 1); + } + } else if (ferror(cpuinfo)) { + /* fall back on the sysconf value */ + goto failed; + } + } + if (siblings && cpu_cores) { + ratio = siblings / cpu_cores; + } +failed: + fclose(cpuinfo); + return numPhysicalCores = numPhysicalCores / ratio; + } +} + +#elif defined(__FreeBSD__) + +#include +#include + +/* Use physical core sysctl when available + * see: man 4 smp, man 3 sysctl */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; /* freebsd sysctl is native int sized */ + if (numPhysicalCores != 0) return numPhysicalCores; + +#if __FreeBSD_version >= 1300008 + { size_t size = sizeof(numPhysicalCores); + int ret = sysctlbyname("kern.smp.cores", &numPhysicalCores, &size, NULL, 0); + if (ret == 0) return numPhysicalCores; + if (errno != ENOENT) { + perror("zstd: can't get number of physical cpus"); + exit(1); + } + /* sysctl not present, fall through to older sysconf method */ + } +#endif + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + numPhysicalCores = 1; + } + return numPhysicalCores; +} + +#elif defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) + +/* Use POSIX sysconf + * see: man 3 sysconf */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + + if (numPhysicalCores != 0) return numPhysicalCores; + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + return numPhysicalCores = 1; + } + return numPhysicalCores; +} + +#else + +int UTIL_countPhysicalCores(void) +{ + /* assume 1 */ + return 1; +} + +#endif + +#if defined (__cplusplus) +} +#endif diff --git a/C/fast-lzma2/util.h b/C/fast-lzma2/util.h index d5688203..f78bcbe1 100644 --- a/C/fast-lzma2/util.h +++ b/C/fast-lzma2/util.h @@ -16,17 +16,15 @@ extern 
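
/* Editor's note: illustrative arithmetic only, not part of the patch. On a
 * typical hyper-threaded CPU /proc/cpuinfo reports e.g. "siblings : 8" and
 * "cpu cores : 4"; the Linux path above divides the online CPU count by that
 * ratio, so 8 logical processors become 8 / (8 / 4) = 4 physical cores. */
static int physical_from_logical(int logical, int siblings, int cpu_cores)
{
    int const ratio = (siblings > 0 && cpu_cores > 0) ? siblings / cpu_cores : 1;
    return logical / ratio;
}
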
"C" { #endif - /*-**************************************** * Dependencies ******************************************/ -#include "platform.h" /* PLATFORM_POSIX_VERSION */ -#include /* malloc */ +#include "platform.h" /* PLATFORM_POSIX_VERSION, ZSTD_NANOSLEEP_SUPPORT, ZSTD_SETPRIORITY_SUPPORT */ +#include /* malloc, realloc, free */ #include /* size_t, ptrdiff_t */ #include /* fprintf */ -#include /* strncmp */ #include /* stat, utime */ -#include /* stat */ +#include /* stat, chmod */ #if defined(_MSC_VER) # include /* utime */ # include /* _chmod */ @@ -34,13 +32,12 @@ extern "C" { # include /* chown, stat */ # include /* utime */ #endif -#include /* time */ -#include +#include /* clock_t, clock, CLOCKS_PER_SEC, nanosleep */ #include "mem.h" /* U32, U64 */ -/* ************************************************************ -* Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW +/*-************************************************************ +* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW ***************************************************************/ #if defined(_MSC_VER) && (_MSC_VER >= 1400) # define UTIL_fseek _fseeki64 @@ -53,37 +50,38 @@ extern "C" { #endif -/*-**************************************** -* Sleep functions: Windows - Posix - others -******************************************/ +/*-************************************************* +* Sleep & priority functions: Windows - Posix - others +***************************************************/ #if defined(_WIN32) # include # define SET_REALTIME_PRIORITY SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS) # define UTIL_sleep(s) Sleep(1000*s) # define UTIL_sleepMilli(milli) Sleep(milli) -#elif PLATFORM_POSIX_VERSION >= 0 /* Unix-like operating system */ -# include -# include /* setpriority */ -# include /* clock_t, nanosleep, clock, CLOCKS_PER_SEC */ -# if defined(PRIO_PROCESS) -# define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20) -# else -# define SET_REALTIME_PRIORITY /* disabled */ -# endif + +#elif PLATFORM_POSIX_VERSION > 0 /* Unix-like operating system */ +# include /* sleep */ # define UTIL_sleep(s) sleep(s) -# if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) || (PLATFORM_POSIX_VERSION >= 200112L) /* nanosleep requires POSIX.1-2001 */ +# if ZSTD_NANOSLEEP_SUPPORT /* necessarily defined in platform.h */ # define UTIL_sleepMilli(milli) { struct timespec t; t.tv_sec=0; t.tv_nsec=milli*1000000ULL; nanosleep(&t, NULL); } # else # define UTIL_sleepMilli(milli) /* disabled */ # endif -#else -# define SET_REALTIME_PRIORITY /* disabled */ +# if ZSTD_SETPRIORITY_SUPPORT +# include /* setpriority */ +# define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20) +# else +# define SET_REALTIME_PRIORITY /* disabled */ +# endif + +#else /* unknown non-unix operating systen */ # define UTIL_sleep(s) /* disabled */ # define UTIL_sleepMilli(milli) /* disabled */ +# define SET_REALTIME_PRIORITY /* disabled */ #endif -/* ************************************* +/*-************************************* * Constants ***************************************/ #define LIST_SIZE_INCREASE (8*1024) @@ -101,8 +99,6 @@ extern "C" { # define UTIL_STATIC static inline #elif defined(_MSC_VER) # define UTIL_STATIC static __inline -# pragma warning(disable : 4996) /* disable: C4996: 'strncpy': This function or variable may be unsafe. 
*/ -# pragma warning(disable : 4389) /* disable: C4389: '==' : signed/unsigned mismatch */ #else # define UTIL_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ #endif @@ -111,7 +107,7 @@ extern "C" { /*-**************************************** * Console log ******************************************/ -static int g_utilDisplayLevel; +extern int g_utilDisplayLevel; #define UTIL_DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define UTIL_DISPLAYLEVEL(l, ...) { if (g_utilDisplayLevel>=l) { UTIL_DISPLAY(__VA_ARGS__); } } @@ -120,119 +116,47 @@ static int g_utilDisplayLevel; * Time functions ******************************************/ #if defined(_WIN32) /* Windows */ + + #define UTIL_TIME_INITIALIZER { { 0, 0 } } typedef LARGE_INTEGER UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { UTIL_time_t x; QueryPerformanceCounter(&x); return x; } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static LARGE_INTEGER ticksPerSecond; - static int init = 0; - if (!init) { - if (!QueryPerformanceFrequency(&ticksPerSecond)) - UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); - init = 1; - } - return 1000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static LARGE_INTEGER ticksPerSecond; - static int init = 0; - if (!init) { - if (!QueryPerformanceFrequency(&ticksPerSecond)) - UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); - init = 1; - } - return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; - } + #elif defined(__APPLE__) && defined(__MACH__) + #include + #define UTIL_TIME_INITIALIZER 0 typedef U64 UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { return mach_absolute_time(); } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static mach_timebase_info_data_t rate; - static int init = 0; - if (!init) { - mach_timebase_info(&rate); - init = 1; - } - return (((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom))/1000ULL; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static mach_timebase_info_data_t rate; - static int init = 0; - if (!init) { - mach_timebase_info(&rate); - init = 1; - } - return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom); - } -#elif (PLATFORM_POSIX_VERSION >= 200112L) - #include + +#elif (PLATFORM_POSIX_VERSION >= 200112L) \ + && (defined(__UCLIBC__) \ + || (defined(__GLIBC__) \ + && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) \ + || (__GLIBC__ > 2)))) + + #define UTIL_TIME_INITIALIZER { 0, 0 } typedef struct timespec UTIL_freq_t; typedef struct timespec UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) - { - UTIL_time_t time; - if (clock_gettime(CLOCK_MONOTONIC, &time)) - UTIL_DISPLAYLEVEL(1, "ERROR: Failed to get time\n"); /* we could also exit() */ - return time; - } - UTIL_STATIC UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t diff; - if (end.tv_nsec < begin.tv_nsec) { - diff.tv_sec = (end.tv_sec - 1) - begin.tv_sec; - diff.tv_nsec = (end.tv_nsec + 1000000000ULL) - begin.tv_nsec; - } else { - diff.tv_sec = end.tv_sec - begin.tv_sec; - diff.tv_nsec = end.tv_nsec - begin.tv_nsec; - } - return diff; - } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t const diff = 
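
/* Editor's note: illustrative sketch, not part of the patch. The old header
 * defined "static int g_utilDisplayLevel", giving every .c file that includes
 * util.h its own private copy, so a verbosity level set in one file was not
 * seen by the others. The new arrangement shares a single variable: */

/* util.h */
extern int g_utilDisplayLevel;   /* one declaration, visible everywhere */

/* util.c */
int g_utilDisplayLevel;          /* exactly one definition for the program */
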
UTIL_getSpanTime(begin, end); - U64 micro = 0; - micro += 1000000ULL * diff.tv_sec; - micro += diff.tv_nsec / 1000ULL; - return micro; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t const diff = UTIL_getSpanTime(begin, end); - U64 nano = 0; - nano += 1000000000ULL * diff.tv_sec; - nano += diff.tv_nsec; - return nano; - } + + UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end); + #else /* relies on standard C (note : clock_t measurements can be wrong when using multi-threading) */ + typedef clock_t UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { return clock(); } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } + #define UTIL_TIME_INITIALIZER 0 + #endif +UTIL_time_t UTIL_getTime(void); +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd); +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd); + +#define SEC_TO_MICRO 1000000 /* returns time span in microseconds */ -UTIL_STATIC U64 UTIL_clockSpanMicro( UTIL_time_t clockStart ) -{ - UTIL_time_t const clockEnd = UTIL_getTime(); - return UTIL_getSpanTimeMicro(clockStart, clockEnd); -} - - -UTIL_STATIC void UTIL_waitForNextTick(void) -{ - UTIL_time_t const clockStart = UTIL_getTime(); - UTIL_time_t clockEnd; - do { - clockEnd = UTIL_getTime(); - } while (UTIL_getSpanTimeNano(clockStart, clockEnd) == 0); -} - +U64 UTIL_clockSpanMicro(UTIL_time_t clockStart); +/* returns time span in microseconds */ +U64 UTIL_clockSpanNano(UTIL_time_t clockStart); +void UTIL_waitForNextTick(void); /*-**************************************** * File functions @@ -245,118 +169,23 @@ UTIL_STATIC void UTIL_waitForNextTick(void) #endif -UTIL_STATIC int UTIL_setFileStat(const char *filename, stat_t *statbuf) -{ - int res = 0; - struct utimbuf timebuf; - - timebuf.actime = time(NULL); - timebuf.modtime = statbuf->st_mtime; - res += utime(filename, &timebuf); /* set access and modification times */ - -#if !defined(_WIN32) - res += chown(filename, statbuf->st_uid, statbuf->st_gid); /* Copy ownership */ -#endif - - res += chmod(filename, statbuf->st_mode & 07777); /* Copy file permissions */ - - errno = 0; - return -res; /* number of errors is returned */ -} - - -UTIL_STATIC int UTIL_getFileStat(const char* infilename, stat_t *statbuf) -{ - int r; -#if defined(_MSC_VER) - r = _stat64(infilename, statbuf); - if (r || !(statbuf->st_mode & S_IFREG)) return 0; /* No good... */ -#else - r = stat(infilename, statbuf); - if (r || !S_ISREG(statbuf->st_mode)) return 0; /* No good... 
*/ -#endif - return 1; -} - - -UTIL_STATIC int UTIL_isRegularFile(const char* infilename) -{ - stat_t statbuf; - return UTIL_getFileStat(infilename, &statbuf); /* Only need to know whether it is a regular file */ -} - - -UTIL_STATIC U32 UTIL_isDirectory(const char* infilename) -{ - int r; - stat_t statbuf; -#if defined(_MSC_VER) - r = _stat64(infilename, &statbuf); - if (!r && (statbuf.st_mode & _S_IFDIR)) return 1; -#else - r = stat(infilename, &statbuf); - if (!r && S_ISDIR(statbuf.st_mode)) return 1; -#endif - return 0; -} - -UTIL_STATIC U32 UTIL_isLink(const char* infilename) -{ -#if defined(_WIN32) - /* no symlinks on windows */ - (void)infilename; -#else - int r; - stat_t statbuf; - r = lstat(infilename, &statbuf); - if (!r && S_ISLNK(statbuf.st_mode)) return 1; -#endif - return 0; -} - +int UTIL_fileExist(const char* filename); +int UTIL_isRegularFile(const char* infilename); +int UTIL_setFileStat(const char* filename, stat_t* statbuf); +U32 UTIL_isDirectory(const char* infilename); +int UTIL_getFileStat(const char* infilename, stat_t* statbuf); +U32 UTIL_isLink(const char* infilename); #define UTIL_FILESIZE_UNKNOWN ((U64)(-1)) -UTIL_STATIC U64 UTIL_getFileSize(const char* infilename) -{ - if (!UTIL_isRegularFile(infilename)) return UTIL_FILESIZE_UNKNOWN; - { int r; -#if defined(_MSC_VER) - struct __stat64 statbuf; - r = _stat64(infilename, &statbuf); - if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; -#elif defined(__MINGW32__) && defined (__MSVCRT__) - struct _stati64 statbuf; - r = _stati64(infilename, &statbuf); - if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; -#else - struct stat statbuf; - r = stat(infilename, &statbuf); - if (r || !S_ISREG(statbuf.st_mode)) return UTIL_FILESIZE_UNKNOWN; -#endif - return (U64)statbuf.st_size; - } -} - - -UTIL_STATIC U64 UTIL_getTotalFileSize(const char* const * const fileNamesTable, unsigned nbFiles) -{ - U64 total = 0; - int error = 0; - unsigned n; - for (n=0; n= *bufEnd) { - ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; - *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); - *bufEnd = *bufStart + newListSize; - if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; } - } - if (*bufStart + *pos + pathLength < *bufEnd) { - strncpy(*bufStart + *pos, path, *bufEnd - (*bufStart + *pos)); - *pos += pathLength + 1; - nbFiles++; - } - } - free(path); - } while (FindNextFileA(hFile, &cFile)); - - FindClose(hFile); - return nbFiles; -} - #elif defined(__linux__) || (PLATFORM_POSIX_VERSION >= 200112L) /* opendir, readdir require POSIX.1-2001 */ # define UTIL_HAS_CREATEFILELIST # include /* opendir, readdir */ # include /* strerror, memcpy */ - -UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) -{ - DIR *dir; - struct dirent *entry; - char* path; - int dirLength, fnameLength, pathLength, nbFiles = 0; - - if (!(dir = opendir(dirName))) { - UTIL_DISPLAYLEVEL(1, "Cannot open directory '%s': %s\n", dirName, strerror(errno)); - return 0; - } - - dirLength = (int)strlen(dirName); - errno = 0; - while ((entry = readdir(dir)) != NULL) { - if (strcmp (entry->d_name, "..") == 0 || - strcmp (entry->d_name, ".") == 0) continue; - fnameLength = (int)strlen(entry->d_name); - path = (char*) malloc(dirLength + fnameLength + 2); - if (!path) { closedir(dir); return 0; } - memcpy(path, dirName, dirLength); - - path[dirLength] = '/'; - memcpy(path+dirLength+1, entry->d_name, fnameLength); - pathLength = 
dirLength+1+fnameLength; - path[pathLength] = 0; - - if (!followLinks && UTIL_isLink(path)) { - UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path); - continue; - } - - if (UTIL_isDirectory(path)) { - nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks); /* Recursively call "UTIL_prepareFileList" with the new path. */ - if (*bufStart == NULL) { free(path); closedir(dir); return 0; } - } else { - if (*bufStart + *pos + pathLength >= *bufEnd) { - ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; - *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); - *bufEnd = *bufStart + newListSize; - if (*bufStart == NULL) { free(path); closedir(dir); return 0; } - } - if (*bufStart + *pos + pathLength < *bufEnd) { - strncpy(*bufStart + *pos, path, *bufEnd - (*bufStart + *pos)); - *pos += pathLength + 1; - nbFiles++; - } - } - free(path); - errno = 0; /* clear errno after UTIL_isDirectory, UTIL_prepareFileList */ - } - - if (errno != 0) { - UTIL_DISPLAYLEVEL(1, "readdir(%s) error: %s\n", dirName, strerror(errno)); - free(*bufStart); - *bufStart = NULL; - } - closedir(dir); - return nbFiles; -} - #else - -UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) -{ - (void)bufStart; (void)bufEnd; (void)pos; - UTIL_DISPLAYLEVEL(1, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName); - return 0; -} - #endif /* #ifdef _WIN32 */ /* @@ -509,53 +209,10 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_ * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer) * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called. 
*/ -UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks) -{ - size_t pos; - unsigned i, nbFiles; - char* buf = (char*)malloc(LIST_SIZE_INCREASE); - char* bufend = buf + LIST_SIZE_INCREASE; - const char** fileTable; - - if (!buf) return NULL; - - for (i=0, pos=0, nbFiles=0; i= bufend) { - ptrdiff_t newListSize = (bufend - buf) + LIST_SIZE_INCREASE; - buf = (char*)UTIL_realloc(buf, newListSize); - bufend = buf + newListSize; - if (!buf) return NULL; - } - if (buf + pos + len < bufend) { - strncpy(buf + pos, inputNames[i], bufend - (buf + pos)); - pos += len + 1; - nbFiles++; - } - } else { - nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks); - if (buf == NULL) return NULL; - } } - - if (nbFiles == 0) { free(buf); return NULL; } - - fileTable = (const char**)malloc((nbFiles+1) * sizeof(const char*)); - if (!fileTable) { free(buf); return NULL; } - - for (i=0, pos=0; i bufend) { free(buf); free((void*)fileTable); return NULL; } - - *allocatedBuffer = buf; - *allocatedNamesNb = nbFiles; - - return fileTable; -} - +const char** +UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, + char** allocatedBuffer, unsigned* allocatedNamesNb, + int followLinks); UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBuffer) { @@ -563,201 +220,7 @@ UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBu if (filenameTable) free((void*)filenameTable); } -/* count the number of physical cores */ -#if defined(_WIN32) || defined(WIN32) - -#include - -typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - if (numPhysicalCores != 0) return numPhysicalCores; - - { LPFN_GLPI glpi; - BOOL done = FALSE; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL; - DWORD returnLength = 0; - size_t byteOffset = 0; - - glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), - "GetLogicalProcessorInformation"); - - if (glpi == NULL) { - goto failed; - } - - while(!done) { - DWORD rc = glpi(buffer, &returnLength); - if (FALSE == rc) { - if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { - if (buffer) - free(buffer); - buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength); - - if (buffer == NULL) { - perror("zstd"); - exit(1); - } - } else { - /* some other error */ - goto failed; - } - } else { - done = TRUE; - } - } - - ptr = buffer; - - while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { - - if (ptr->Relationship == RelationProcessorCore) { - numPhysicalCores++; - } - - ptr++; - byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - } - - free(buffer); - - return numPhysicalCores; - } - -failed: - /* try to fall back on GetSystemInfo */ - { SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - numPhysicalCores = sysinfo.dwNumberOfProcessors; - if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */ - } - return numPhysicalCores; -} - -#elif defined(__APPLE__) - -#include - -/* Use apple-provided syscall - * see: man 3 sysctl */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static S32 numPhysicalCores = 0; /* apple specifies int32_t */ - if (numPhysicalCores != 0) return numPhysicalCores; - - { size_t size = sizeof(S32); - int const ret = sysctlbyname("hw.physicalcpu", 
&numPhysicalCores, &size, NULL, 0); - if (ret != 0) { - if (errno == ENOENT) { - /* entry not present, fall back on 1 */ - numPhysicalCores = 1; - } else { - perror("zstd: can't get number of physical cpus"); - exit(1); - } - } - - return numPhysicalCores; - } -} - -#elif defined(__linux__) - -/* parse /proc/cpuinfo - * siblings / cpu cores should give hyperthreading ratio - * otherwise fall back on sysconf */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - - if (numPhysicalCores != 0) return numPhysicalCores; - - numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); - if (numPhysicalCores == -1) { - /* value not queryable, fall back on 1 */ - return numPhysicalCores = 1; - } - - /* try to determine if there's hyperthreading */ - { FILE* const cpuinfo = fopen("/proc/cpuinfo", "r"); -#define BUF_SIZE 80 - char buff[BUF_SIZE]; - - int siblings = 0; - int cpu_cores = 0; - int ratio = 1; - - if (cpuinfo == NULL) { - /* fall back on the sysconf value */ - return numPhysicalCores; - } - - /* assume the cpu cores/siblings values will be constant across all - * present processors */ - while (!feof(cpuinfo)) { - if (fgets(buff, BUF_SIZE, cpuinfo) != NULL) { - if (strncmp(buff, "siblings", 8) == 0) { - const char* const sep = strchr(buff, ':'); - if (*sep == '\0') { - /* formatting was broken? */ - goto failed; - } - - siblings = atoi(sep + 1); - } - if (strncmp(buff, "cpu cores", 9) == 0) { - const char* const sep = strchr(buff, ':'); - if (*sep == '\0') { - /* formatting was broken? */ - goto failed; - } - - cpu_cores = atoi(sep + 1); - } - } else if (ferror(cpuinfo)) { - /* fall back on the sysconf value */ - goto failed; - } - } - if (siblings && cpu_cores) { - ratio = siblings / cpu_cores; - } -failed: - fclose(cpuinfo); - return numPhysicalCores = numPhysicalCores / ratio; - } -} - -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) - -/* Use apple-provided syscall - * see: man 3 sysctl */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - - if (numPhysicalCores != 0) return numPhysicalCores; - - numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); - if (numPhysicalCores == -1) { - /* value not queryable, fall back on 1 */ - return numPhysicalCores = 1; - } - return numPhysicalCores; -} - -#else - -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - /* assume 1 */ - return 1; -} - -#endif +int UTIL_countPhysicalCores(void); #if defined (__cplusplus) } diff --git a/CPP/7zip/7zip.mak b/CPP/7zip/7zip.mak index e759328c..582dfd85 100644 --- a/CPP/7zip/7zip.mak +++ b/CPP/7zip/7zip.mak @@ -212,7 +212,7 @@ $(ZSTDMT_OBJS): ../../../../C/zstdmt/$(*B).c !IFDEF FASTLZMA2_OBJS $(FASTLZMA2_OBJS): ../../../../C/fast-lzma2/$(*B).c - $(COMPL_O2) -DNO_XXHASH + $(COMPL_O2) -DNO_XXHASH -DFL2_7ZIP_BUILD !ENDIF @@ -298,7 +298,7 @@ $(FASTLZMA2_OBJS): ../../../../C/fast-lzma2/$(*B).c -I ../../../../C/lz5 \ -I ../../../../C/zstd {../../../../C/fast-lzma2}.c{$O}.obj:: - $(COMPLB_O2) -DNO_XXHASH + $(COMPLB_O2) -DNO_XXHASH -DFL2_7ZIP_BUILD !ENDIF diff --git a/CPP/7zip/Bundles/Alone/makefile b/CPP/7zip/Bundles/Alone/makefile index 43524082..f68a4296 100644 --- a/CPP/7zip/Bundles/Alone/makefile +++ b/CPP/7zip/Bundles/Alone/makefile @@ -322,16 +322,17 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ 
$O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../UI/Console/Console.mak" diff --git a/CPP/7zip/Bundles/Codec_flzma2/makefile b/CPP/7zip/Bundles/Codec_flzma2/makefile index 30b42cb2..ef34c5d4 100644 --- a/CPP/7zip/Bundles/Codec_flzma2/makefile +++ b/CPP/7zip/Bundles/Codec_flzma2/makefile @@ -36,15 +36,16 @@ COMPRESS_OBJS = $(COMPRESS_OBJS) \ $O\FastLzma2Register.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7z/makefile b/CPP/7zip/Bundles/Format7z/makefile index dafcf4ae..1db7800b 100644 --- a/CPP/7zip/Bundles/Format7z/makefile +++ b/CPP/7zip/Bundles/Format7z/makefile @@ -244,16 +244,17 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ diff --git a/CPP/7zip/Bundles/Format7zF/makefile b/CPP/7zip/Bundles/Format7zF/makefile index ecf51975..f23b4250 100644 --- a/CPP/7zip/Bundles/Format7zF/makefile +++ b/CPP/7zip/Bundles/Format7zF/makefile @@ -119,15 +119,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zFO/makefile b/CPP/7zip/Bundles/Format7zFO/makefile index 5cc2b44d..799356c1 100644 --- a/CPP/7zip/Bundles/Format7zFO/makefile +++ b/CPP/7zip/Bundles/Format7zFO/makefile @@ -119,15 +119,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zUSB/makefile b/CPP/7zip/Bundles/Format7zUSB/makefile index 58c45033..68301d24 100644 --- a/CPP/7zip/Bundles/Format7zUSB/makefile +++ b/CPP/7zip/Bundles/Format7zUSB/makefile @@ -236,15 +236,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Compress/Lzma2Encoder.cpp b/CPP/7zip/Compress/Lzma2Encoder.cpp index 5eb88d50..d8c5e0f9 100644 --- a/CPP/7zip/Compress/Lzma2Encoder.cpp +++ b/CPP/7zip/Compress/Lzma2Encoder.cpp @@ -121,23 +121,39 @@ STDMETHODIMP CEncoder::Code(ISequentialInStream *inStream, ISequentialOutStream return 
SResToHRESULT(res); } -CFastEncoder::CFastEncoder() +static HRESULT TranslateError(size_t res) { - _encoder = NULL; - reduceSize = 0; + if (FL2_getErrorCode(res) == FL2_error_memory_allocation) + return E_OUTOFMEMORY; + return S_FALSE; } -CFastEncoder::~CFastEncoder() +#define CHECK_S(f_) do { \ + size_t r_ = f_; \ + if (FL2_isError(r_)) \ + return TranslateError(r_); \ +} while (false) + +#define CHECK_H(f_) do { \ + HRESULT r_ = f_; \ + if (r_ != S_OK) \ + return r_; \ +} while (false) + +#define CHECK_P(f) if (FL2_isError(f)) return E_INVALIDARG; /* check and convert error code */ + +CFastEncoder::FastLzma2::FastLzma2() + : fcs(NULL), + dict_pos(0) { - if (_encoder) - FL2_freeCCtx(_encoder); } +CFastEncoder::FastLzma2::~FastLzma2() +{ + FL2_freeCCtx(fcs); +} -#define CHECK_F(f) if (FL2_isError(f)) return E_INVALIDARG; /* check and convert error code */ - -STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, - const PROPVARIANT *coderProps, UInt32 numProps) +HRESULT CFastEncoder::FastLzma2::SetCoderProperties(const PROPID *propIDs, const PROPVARIANT *coderProps, UInt32 numProps) { CLzma2EncProps lzma2Props; Lzma2EncProps_Init(&lzma2Props); @@ -146,56 +162,165 @@ STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, { RINOK(SetLzma2Prop(propIDs[i], coderProps[i], lzma2Props)); } - if (_encoder == NULL) { - _encoder = FL2_createCCtxMt(lzma2Props.numTotalThreads); - if (_encoder == NULL) + if (fcs == NULL) { + fcs = FL2_createCStreamMt(lzma2Props.numTotalThreads, 1); + if (fcs == NULL) return E_OUTOFMEMORY; } if (lzma2Props.lzmaProps.algo > 2) { if (lzma2Props.lzmaProps.algo > 3) return E_INVALIDARG; lzma2Props.lzmaProps.algo = 2; - FL2_CCtx_setParameter(_encoder, FL2_p_highCompression, 1); - FL2_CCtx_setParameter(_encoder, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); + FL2_CCtx_setParameter(fcs, FL2_p_highCompression, 1); + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); } else { - FL2_CCtx_setParameter(_encoder, FL2_p_7zLevel, lzma2Props.lzmaProps.level); + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); } - dictSize = lzma2Props.lzmaProps.dictSize; + size_t dictSize = lzma2Props.lzmaProps.dictSize; if (!dictSize) { - dictSize = (UInt32)1 << FL2_CCtx_setParameter(_encoder, FL2_p_dictionaryLog, 0); + dictSize = (UInt32)FL2_CCtx_getParameter(fcs, FL2_p_dictionarySize); } - reduceSize = lzma2Props.lzmaProps.reduceSize; + size_t reduceSize = lzma2Props.lzmaProps.reduceSize; reduceSize += (reduceSize < (UInt64)-1); /* prevent extra buffer shift after read */ dictSize = (UInt32)min(dictSize, reduceSize); - unsigned dictLog = FL2_DICTLOG_MIN; - while (((UInt32)1 << dictLog) < dictSize) - ++dictLog; - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_dictionaryLog, dictLog)); + dictSize = max(dictSize, FL2_DICTSIZE_MIN); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_dictionarySize, dictSize)); if (lzma2Props.lzmaProps.algo >= 0) { - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_strategy, (unsigned)lzma2Props.lzmaProps.algo)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_strategy, (unsigned)lzma2Props.lzmaProps.algo)); } if (lzma2Props.lzmaProps.fb > 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_fastLength, lzma2Props.lzmaProps.fb)); - if (lzma2Props.lzmaProps.mc) { - unsigned ml = 0; - while (((UInt32)1 << ml) < lzma2Props.lzmaProps.mc) - ++ml; - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_searchLog, ml)); - } + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_fastLength, lzma2Props.lzmaProps.fb)); + if 
(lzma2Props.lzmaProps.mc > 0) + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_hybridCycles, lzma2Props.lzmaProps.mc)); if (lzma2Props.lzmaProps.lc >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_literalCtxBits, lzma2Props.lzmaProps.lc)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_literalCtxBits, lzma2Props.lzmaProps.lc)); if (lzma2Props.lzmaProps.lp >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_literalPosBits, lzma2Props.lzmaProps.lp)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_literalPosBits, lzma2Props.lzmaProps.lp)); if (lzma2Props.lzmaProps.pb >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_posBits, lzma2Props.lzmaProps.pb)); - FL2_CCtx_setParameter(_encoder, FL2_p_omitProperties, 1); -#ifndef NO_XXHASH - FL2_CCtx_setParameter(_encoder, FL2_p_doXXHash, 0); -#endif + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_posBits, lzma2Props.lzmaProps.pb)); + FL2_CCtx_setParameter(fcs, FL2_p_omitProperties, 1); + FL2_setCStreamTimeout(fcs, 500); return S_OK; } +size_t CFastEncoder::FastLzma2::GetDictSize() const +{ + return FL2_CCtx_getParameter(fcs, FL2_p_dictionarySize); +} + +HRESULT CFastEncoder::FastLzma2::Begin() +{ + CHECK_S(FL2_initCStream(fcs, 0)); + CHECK_S(FL2_getDictionaryBuffer(fcs, &dict)); + dict_pos = 0; + return S_OK; +} + +BYTE* CFastEncoder::FastLzma2::GetAvailableBuffer(unsigned long& size) +{ + size = static_cast(dict.size - dict_pos); + return reinterpret_cast(dict.dst) + dict_pos; +} + +HRESULT CFastEncoder::FastLzma2::WaitAndReport(size_t& res, ICompressProgressInfo *progress) +{ + while (FL2_isTimedOut(res)) { + if (!UpdateProgress(progress)) + return S_FALSE; + res = FL2_waitCStream(fcs); + } + CHECK_S(res); + return S_OK; +} + +HRESULT CFastEncoder::FastLzma2::AddByteCount(size_t count, ISequentialOutStream *outStream, ICompressProgressInfo *progress) +{ + dict_pos += count; + if (dict_pos == dict.size) { + size_t res = FL2_updateDictionary(fcs, dict_pos); + CHECK_H(WaitAndReport(res, progress)); + if (res != 0) + CHECK_H(WriteBuffers(outStream)); + do { + res = FL2_getDictionaryBuffer(fcs, &dict); + } while (FL2_isTimedOut(res)); + CHECK_S(res); + dict_pos = 0; + } + if (!UpdateProgress(progress)) + return S_FALSE; + return S_OK; +} + +bool CFastEncoder::FastLzma2::UpdateProgress(ICompressProgressInfo *progress) +{ + if (progress) { + UInt64 outProcessed; + UInt64 inProcessed = FL2_getCStreamProgress(fcs, &outProcessed); + HRESULT err = progress->SetRatioInfo(&inProcessed, &outProcessed); + if (err != S_OK) { + FL2_cancelCStream(fcs); + return false; + } + } + return true; +} + +HRESULT CFastEncoder::FastLzma2::WriteBuffers(ISequentialOutStream *outStream) +{ + size_t csize; + for (;;) { + FL2_cBuffer cbuf; + // Waits if compression in progress + csize = FL2_getNextCStreamBuffer(fcs, &cbuf); + CHECK_S(csize); + if (csize == 0) + break; + HRESULT err = WriteStream(outStream, cbuf.src, cbuf.size); + if (err != S_OK) + return err; + } + return S_OK; +} + +HRESULT CFastEncoder::FastLzma2::End(ISequentialOutStream *outStream, ICompressProgressInfo *progress) +{ + if (dict_pos) { + size_t res = FL2_updateDictionary(fcs, dict_pos); + CHECK_H(WaitAndReport(res, progress)); + } + + size_t res = FL2_endStream(fcs, nullptr); + CHECK_H(WaitAndReport(res, progress)); + while (res) { + WriteBuffers(outStream); + res = FL2_endStream(fcs, nullptr); + CHECK_H(WaitAndReport(res, progress)); + } + return S_OK; +} + +void CFastEncoder::FastLzma2::Cancel() +{ + FL2_cancelCStream(fcs); +} + +CFastEncoder::CFastEncoder() +{ +} + +CFastEncoder::~CFastEncoder() +{ +} + + 
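Editor's note: the old encoder fed whole blocks to FL2_compressCCtxBlock_toFn() through write/progress callbacks, while the FastLzma2 wrapper above drives the library's streaming, dictionary-buffer interface instead. The following is a minimal sketch of that loop, built only from the FL2 calls visible in this patch (FL2_initCStream, FL2_getDictionaryBuffer, FL2_updateDictionary, FL2_getNextCStreamBuffer, FL2_endStream). It assumes the FL2_CStream was created elsewhere with FL2_createCStreamMt(), omits the 500 ms timeout, progress reporting and cancellation that the wrapper adds via FL2_setCStreamTimeout()/FL2_isTimedOut()/FL2_waitCStream(), and uses hypothetical readBlock/writeBlock callbacks in place of the 7-Zip stream wrappers; treat it as an outline, not the exact implementation.

#include <stddef.h>
#include "fast-lzma2.h"

/* Write out any compressed buffers the worker threads have finished. */
static int DrainOutput(FL2_CStream* fcs,
                       int (*writeBlock)(const void* src, size_t size))
{
    FL2_cBuffer cbuf;
    size_t csize;
    while ((csize = FL2_getNextCStreamBuffer(fcs, &cbuf)) != 0) {
        if (FL2_isError(csize))
            return -1;
        if (writeBlock(cbuf.src, cbuf.size) != 0)
            return -1;
    }
    return 0;
}

/* Minimal streaming loop; fcs comes from FL2_createCStreamMt() with its
   parameters already set. readBlock is assumed to return less than the
   requested capacity only at end of input. Returns 0 on success, -1 on error. */
static int StreamCompress(FL2_CStream* fcs,
                          size_t (*readBlock)(void* dst, size_t capacity),
                          int (*writeBlock)(const void* src, size_t size))
{
    if (FL2_isError(FL2_initCStream(fcs, 0)))       /* 0 = keep the configured parameters */
        return -1;

    for (;;) {
        FL2_dictBuffer dict;
        if (FL2_isError(FL2_getDictionaryBuffer(fcs, &dict)))   /* borrow the input (dictionary) buffer */
            return -1;

        size_t readSize = readBlock(dict.dst, dict.size);
        if (readSize != 0) {
            size_t res = FL2_updateDictionary(fcs, readSize);   /* hand the new bytes to the workers */
            if (FL2_isError(res))
                return -1;
            if (res != 0 && DrainOutput(fcs, writeBlock) != 0)  /* nonzero: compressed output is ready */
                return -1;
        }
        if (readSize < dict.size)
            break;                                   /* short read: end of input */
    }

    /* Flush: FL2_endStream() keeps returning nonzero while output remains. */
    for (;;) {
        size_t res = FL2_endStream(fcs, NULL);
        if (FL2_isError(res))
            return -1;
        if (res == 0)
            break;
        if (DrainOutput(fcs, writeBlock) != 0)
            return -1;
    }
    return 0;
}
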
+STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, + const PROPVARIANT *coderProps, UInt32 numProps) +{ + return _encoder.SetCoderProperties(propIDs, coderProps, numProps); +} + #define LZMA2_DIC_SIZE_FROM_PROP(p) (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)) @@ -203,6 +328,7 @@ STDMETHODIMP CFastEncoder::WriteCoderProperties(ISequentialOutStream *outStream) { Byte prop; unsigned i; + size_t dictSize = _encoder.GetDictSize(); for (i = 0; i < 40; i++) if (dictSize <= LZMA2_DIC_SIZE_FROM_PROP(i)) break; @@ -211,79 +337,29 @@ STDMETHODIMP CFastEncoder::WriteCoderProperties(ISequentialOutStream *outStream) } -typedef struct -{ - ISequentialOutStream* outStream; - ICompressProgressInfo* progress; - UInt64 in_processed; - UInt64 out_processed; - HRESULT res; -} EncodingObjects; - -static int FL2LIB_CALL Progress(size_t done, void* opaque) -{ - EncodingObjects* p = (EncodingObjects*)opaque; - if (p && p->progress) { - UInt64 in_processed = p->in_processed + done; - p->res = p->progress->SetRatioInfo(&in_processed, &p->out_processed); - return p->res != S_OK; - } - return 0; -} - -static int FL2LIB_CALL Write(const void* src, size_t srcSize, void* opaque) -{ - EncodingObjects* p = (EncodingObjects*)opaque; - p->res = WriteStream(p->outStream, src, srcSize); - return p->res != S_OK; -} - STDMETHODIMP CFastEncoder::Code(ISequentialInStream *inStream, ISequentialOutStream *outStream, const UInt64 * /* inSize */, const UInt64 * /* outSize */, ICompressProgressInfo *progress) { - HRESULT err = S_OK; - inBuffer.AllocAtLeast(dictSize); - EncodingObjects objs = { outStream, progress, 0, 0, S_OK }; - FL2_blockBuffer block = { inBuffer, 0, 0, dictSize }; + CHECK_H(_encoder.Begin()); + size_t inSize; + unsigned long dSize; do { - FL2_shiftBlock(_encoder, &block); - size_t inSize = dictSize - block.start; - err = ReadStream(inStream, inBuffer + block.start, &inSize); - if (err != S_OK) - break; - block.end += inSize; - if (inSize) { - size_t cSize = FL2_compressCCtxBlock_toFn(_encoder, Write, &objs, &block, Progress); - if (FL2_isError(cSize)) { - if (FL2_getErrorCode(cSize) == FL2_error_memory_allocation) - return E_OUTOFMEMORY; - return objs.res != S_OK ? 
objs.res : S_FALSE; - } - if (objs.res != S_OK) - return objs.res; - objs.out_processed += cSize; - objs.in_processed += inSize; - if (progress) { - err = progress->SetRatioInfo(&objs.in_processed, &objs.out_processed); - if (err != S_OK) - break; - } - if (block.end < dictSize) - break; + BYTE* dict = _encoder.GetAvailableBuffer(dSize); + + inSize = dSize; + HRESULT err = ReadStream(inStream, dict, &inSize); + if (err != S_OK) { + _encoder.Cancel(); + return err; } - else break; + CHECK_H(_encoder.AddByteCount(inSize, outStream, progress)); - } while (err == S_OK); + } while (inSize == dSize); - if (err == S_OK) { - size_t cSize = FL2_endFrame_toFn(_encoder, Write, &objs); - if (FL2_isError(cSize)) - return S_FALSE; - objs.out_processed += cSize; - err = objs.res; - } - return err; + CHECK_H(_encoder.End(outStream, progress)); + + return S_OK; } }} diff --git a/CPP/7zip/Compress/Lzma2Encoder.h b/CPP/7zip/Compress/Lzma2Encoder.h index 4279825f..734c697b 100644 --- a/CPP/7zip/Compress/Lzma2Encoder.h +++ b/CPP/7zip/Compress/Lzma2Encoder.h @@ -24,13 +24,13 @@ class CEncoder: CLzma2EncHandle _encoder; public: MY_UNKNOWN_IMP4( - ICompressCoder, - ICompressSetCoderProperties, - ICompressWriteCoderProperties, - ICompressSetCoderPropertiesOpt) + ICompressCoder, + ICompressSetCoderProperties, + ICompressWriteCoderProperties, + ICompressSetCoderPropertiesOpt) STDMETHOD(Code)(ISequentialInStream *inStream, ISequentialOutStream *outStream, - const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); + const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); STDMETHOD(SetCoderProperties)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); STDMETHOD(WriteCoderProperties)(ISequentialOutStream *outStream); STDMETHOD(SetCoderPropertiesOpt)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); @@ -45,10 +45,33 @@ class CFastEncoder : public ICompressWriteCoderProperties, public CMyUnknownImp { - FL2_CCtx* _encoder; - CByteBuffer inBuffer; - UInt64 reduceSize; - UInt32 dictSize; + class FastLzma2 + { + public: + FastLzma2(); + ~FastLzma2(); + HRESULT SetCoderProperties(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); + size_t GetDictSize() const; + HRESULT Begin(); + BYTE* GetAvailableBuffer(unsigned long& size); + HRESULT AddByteCount(size_t count, ISequentialOutStream *outStream, ICompressProgressInfo *progress); + HRESULT End(ISequentialOutStream *outStream, ICompressProgressInfo *progress); + void Cancel(); + + private: + bool UpdateProgress(ICompressProgressInfo *progress); + HRESULT WaitAndReport(size_t& res, ICompressProgressInfo *progress); + HRESULT WriteBuffers(ISequentialOutStream *outStream); + + FL2_CStream* fcs; + FL2_dictBuffer dict; + size_t dict_pos; + + FastLzma2(const FastLzma2&) = delete; + FastLzma2& operator=(const FastLzma2&) = delete; + }; + + FastLzma2 _encoder; public: MY_UNKNOWN_IMP3( @@ -57,7 +80,7 @@ public: ICompressWriteCoderProperties) STDMETHOD(Code)(ISequentialInStream *inStream, ISequentialOutStream *outStream, - const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); + const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); STDMETHOD(SetCoderProperties)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); STDMETHOD(WriteCoderProperties)(ISequentialOutStream *outStream); diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp b/CPP/7zip/UI/GUI/CompressDialog.cpp index c47228fb..5e0fda2f 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp 
+++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1410,7 +1410,7 @@ typedef enum { } FL2_strategy; typedef struct { - unsigned dictionaryLog; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory, slower */ + UInt32 dictionarySize; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory per byte, slower */ unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ unsigned chainLog; /* fully searched segment : larger == more compression, slower, more memory; hybrid mode only (ultra) */ unsigned searchLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ @@ -1424,19 +1424,23 @@ typedef struct { #define FL2_MAX_7Z_CLEVEL 9 +#define MB *(1U<<20) + static const FL2_compressionParameters FL2_7zCParameters[FL2_MAX_7Z_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 16, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 16, 32, 1, 8, FL2_opt }, /* 4 */ - { 24, 2, 9, 0, 40, 48, 1, 8, FL2_ultra }, /* 5 */ - { 25, 2, 10, 0, 48, 64, 1, 8, FL2_ultra }, /* 6 */ - { 26, 2, 11, 1, 60, 96, 1, 9, FL2_ultra }, /* 7 */ - { 27, 2, 12, 2, 128, 128, 1, 10, FL2_ultra }, /* 8 */ - { 27, 3, 14, 3, 252, 160, 0, 10, FL2_ultra } /* 9 */ + { 0,0,0,0,0,0,0,0,FL2_fast }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 14, 32, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 5 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 6 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 7 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 8 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 9 */ }; +#undef MB + #define RMF_BUILDER_SIZE (8 * 0x40100U) void CCompressDialog::SetDictionary() @@ -1512,7 +1516,7 @@ void CCompressDialog::SetDictionary() if (level > FL2_MAX_7Z_CLEVEL) level = FL2_MAX_7Z_CLEVEL; if (defaultDict == (UInt32)(Int32)-1) - defaultDict = (UInt32)1 << FL2_7zCParameters[level].dictionaryLog; + defaultDict = FL2_7zCParameters[level].dictionarySize; m_Dictionary.SetCurSel(0); @@ -2020,11 +2024,11 @@ UInt64 CCompressDialog::GetMemoryUsage(UInt32 dict, UInt64 &decompressMemory) { if (level > FL2_MAX_7Z_CLEVEL) level = FL2_MAX_7Z_CLEVEL; - size += dict * 5 + (1UL << 18) * numThreads; - unsigned depth = FL2_7zCParameters[level].searchDepth; - UInt32 bufSize = UInt32(1) << (FL2_7zCParameters[level].dictionaryLog - FL2_7zCParameters[level].bufferLog); + /* dual buffer is enabled in Lzma2Encoder.cpp so size is dict * 6 */ + size += dict * 6 + (1UL << 18) * numThreads; + UInt32 bufSize = dict >> (12 - FL2_7zCParameters[level].bufferLog); size += (bufSize * 12 + RMF_BUILDER_SIZE) * numThreads; - if (dict > (UInt32(1) << 26) || depth > 63) + if (dict > (UInt32(1) << 26)) size += dict; if (FL2_7zCParameters[level].strategy == FL2_ultra) size += (UInt32(4) << 14) + (UInt32(4) << FL2_7zCParameters[level].chainLog);
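
Editor's note, as a worked example of the revised estimate (illustrative only: numThreads is taken as 2 here, and whatever GetMemoryUsage() has already accumulated in size before this fragment is ignored): at level 9 the table above gives a 128 MB dictionary, bufferLog 2, chainLog 14 and FL2_ultra, so dict * 6 contributes 768 MB, (1UL << 18) * numThreads adds 0.5 MB, bufSize becomes 128 MB >> 10 = 128 KB so (bufSize * 12 + RMF_BUILDER_SIZE) * numThreads adds roughly 7 MB, the dict > 64 MB branch adds another 128 MB, and the FL2_ultra term adds about 128 KB, for a little over 900 MB on the compression side.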