From d85962e654a3573129d67b1896e86b6aa1b44918 Mon Sep 17 00:00:00 2001 From: conor42 Date: Mon, 18 Mar 2019 00:05:50 +1000 Subject: [PATCH] Update to Fast LZMA2 1.0.0 --- C/fast-lzma2/compiler.h | 83 +- C/fast-lzma2/count.h | 12 +- C/fast-lzma2/dict_buffer.c | 230 ++ C/fast-lzma2/dict_buffer.h | 81 + C/fast-lzma2/fast-lzma2.h | 558 ++-- C/fast-lzma2/fl2_common.c | 64 +- C/fast-lzma2/fl2_compress.c | 1563 ++++++----- C/fast-lzma2/fl2_compress_internal.h | 55 +- C/fast-lzma2/fl2_error_private.c | 35 - C/fast-lzma2/fl2_error_private.h | 75 - C/fast-lzma2/fl2_errors.h | 28 +- C/fast-lzma2/fl2_internal.h | 17 +- C/fast-lzma2/fl2_pool.c | 198 ++ C/fast-lzma2/{fl2pool.h => fl2_pool.h} | 10 +- .../{fl2threading.c => fl2_threading.c} | 28 +- C/fast-lzma2/fl2_threading.h | 178 ++ C/fast-lzma2/fl2pool.c | 201 -- C/fast-lzma2/fl2threading.h | 120 - C/fast-lzma2/lzma2_enc.c | 2305 ++++++++--------- C/fast-lzma2/lzma2_enc.h | 34 +- C/fast-lzma2/mem.h | 25 +- C/fast-lzma2/platform.h | 100 +- C/fast-lzma2/radix_bitpack.c | 5 +- C/fast-lzma2/radix_engine.h | 562 ++-- C/fast-lzma2/radix_get.h | 210 ++ C/fast-lzma2/radix_internal.h | 56 +- C/fast-lzma2/radix_mf.c | 465 ++-- C/fast-lzma2/radix_mf.h | 19 +- C/fast-lzma2/radix_struct.c | 4 +- C/fast-lzma2/range_enc.c | 216 +- C/fast-lzma2/range_enc.h | 88 +- C/fast-lzma2/util.c | 707 +++++ C/fast-lzma2/util.h | 671 +---- CPP/7zip/7zip.mak | 4 +- CPP/7zip/Bundles/Alone/makefile | 7 +- CPP/7zip/Bundles/Codec_flzma2/makefile | 7 +- CPP/7zip/Bundles/Format7z/makefile | 7 +- CPP/7zip/Bundles/Format7zF/makefile | 7 +- CPP/7zip/Bundles/Format7zFO/makefile | 7 +- CPP/7zip/Bundles/Format7zUSB/makefile | 7 +- CPP/7zip/Compress/Lzma2Encoder.cpp | 282 +- CPP/7zip/Compress/Lzma2Encoder.h | 43 +- CPP/7zip/UI/GUI/CompressDialog.cpp | 36 +- 43 files changed, 5467 insertions(+), 3943 deletions(-) create mode 100644 C/fast-lzma2/dict_buffer.c create mode 100644 C/fast-lzma2/dict_buffer.h delete mode 100644 C/fast-lzma2/fl2_error_private.c delete mode 100644 C/fast-lzma2/fl2_error_private.h create mode 100644 C/fast-lzma2/fl2_pool.c rename C/fast-lzma2/{fl2pool.h => fl2_pool.h} (76%) rename C/fast-lzma2/{fl2threading.c => fl2_threading.c} (73%) create mode 100644 C/fast-lzma2/fl2_threading.h delete mode 100644 C/fast-lzma2/fl2pool.c delete mode 100644 C/fast-lzma2/fl2threading.h create mode 100644 C/fast-lzma2/radix_get.h create mode 100644 C/fast-lzma2/util.c diff --git a/C/fast-lzma2/compiler.h b/C/fast-lzma2/compiler.h index dc3bfff3..b33d18b7 100644 --- a/C/fast-lzma2/compiler.h +++ b/C/fast-lzma2/compiler.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. * All rights reserved. + * Modified for FL2 by Conor McCarthy * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found @@ -8,13 +9,15 @@ * You may select, at your option, one of the above-listed licenses. 
*/ -#ifndef ZSTD_COMPILER_H -#define ZSTD_COMPILER_H +#ifndef FL2_COMPILER_H +#define FL2_COMPILER_H /*-******************************************************* * Compiler specifics *********************************************************/ /* force inlining */ + +#if !defined(FL2_NO_INLINE) #if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # define INLINE_KEYWORD inline #else @@ -29,6 +32,13 @@ # define FORCE_INLINE_ATTR #endif +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif + /** * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant * parameters. They must be inlined for the compiler to eliminate the constant @@ -54,24 +64,69 @@ /* force no inlining */ #ifdef _MSC_VER -# define FORCE_NOINLINE static __declspec(noinline) +# define FORCE_NOINLINE __declspec(noinline) #else # ifdef __GNUC__ -# define FORCE_NOINLINE static __attribute__((__noinline__)) +# define FORCE_NOINLINE __attribute__((__noinline__)) # else -# define FORCE_NOINLINE static +# define FORCE_NOINLINE # endif #endif -/* prefetch */ -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ -# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) -#elif defined(__GNUC__) -# define PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) -#else -# define PREFETCH(ptr) /* disabled */ +/* target attribute */ +#ifndef __has_attribute + #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ #endif +#if defined(__GNUC__) +# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) +#else +# define TARGET_ATTRIBUTE(target) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 
+ */ +#ifndef DYNAMIC_BMI2 + #if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ + && (defined(__x86_64__) || defined(_M_X86)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* prefetch + * can be disabled, by declaring NO_PREFETCH build macro */ +#if defined(NO_PREFETCH) +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# else +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* NO_PREFETCH */ + +#define CACHELINE_SIZE 64 + +#define PREFETCH_AREA(p, s) { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ +} /* disable warnings */ #ifdef _MSC_VER /* Visual Studio */ @@ -83,4 +138,4 @@ # pragma warning(disable : 4324) /* disable: C4324: padded structure */ #endif -#endif /* ZSTD_COMPILER_H */ +#endif /* FL2_COMPILER_H */ diff --git a/C/fast-lzma2/count.h b/C/fast-lzma2/count.h index 77f796a3..11bf1ef3 100644 --- a/C/fast-lzma2/count.h +++ b/C/fast-lzma2/count.h @@ -1,3 +1,13 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + #ifndef ZSTD_COUNT_H_ #define ZSTD_COUNT_H_ @@ -86,7 +96,7 @@ static unsigned ZSTD_NbCommonBytes(register size_t val) } -MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) +static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t) - 1); diff --git a/C/fast-lzma2/dict_buffer.c b/C/fast-lzma2/dict_buffer.c new file mode 100644 index 00000000..06d9a4f0 --- /dev/null +++ b/C/fast-lzma2/dict_buffer.c @@ -0,0 +1,230 @@ +/* +* Copyright (c) 2019, Conor McCarthy +* All rights reserved. +* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. 
+*/ + +#include +#include "dict_buffer.h" +#include "fl2_internal.h" + +#define ALIGNMENT_SIZE 16U +#define ALIGNMENT_MASK (~(size_t)(ALIGNMENT_SIZE-1)) + +/* DICT_buffer functions */ + +int DICT_construct(DICT_buffer * const buf, int const async) +{ + buf->data[0] = NULL; + buf->data[1] = NULL; + buf->size = 0; + + buf->async = (async != 0); + +#ifndef NO_XXHASH + buf->xxh = NULL; +#endif + + return 0; +} + +int DICT_init(DICT_buffer * const buf, size_t const dict_size, size_t const overlap, unsigned const reset_multiplier, int const do_hash) +{ + /* Allocate if not yet allocated or existing dict too small */ + if (buf->data[0] == NULL || dict_size > buf->size) { + /* Free any existing buffers */ + DICT_destruct(buf); + + buf->data[0] = malloc(dict_size); + + buf->data[1] = NULL; + if (buf->async) + buf->data[1] = malloc(dict_size); + + if (buf->data[0] == NULL || (buf->async && buf->data[1] == NULL)) { + DICT_destruct(buf); + return 1; + } + } + buf->index = 0; + buf->overlap = overlap; + buf->start = 0; + buf->end = 0; + buf->size = dict_size; + buf->total = 0; + buf->reset_interval = (reset_multiplier != 0) ? dict_size * reset_multiplier : ((size_t)1 << 31); + +#ifndef NO_XXHASH + if (do_hash) { + if (buf->xxh == NULL) { + buf->xxh = XXH32_createState(); + if (buf->xxh == NULL) { + DICT_destruct(buf); + return 1; + } + } + XXH32_reset(buf->xxh, 0); + } + else { + XXH32_freeState(buf->xxh); + buf->xxh = NULL; + } +#else + (void)do_hash; +#endif + + return 0; +} + +void DICT_destruct(DICT_buffer * const buf) +{ + free(buf->data[0]); + free(buf->data[1]); + buf->data[0] = NULL; + buf->data[1] = NULL; + buf->size = 0; +#ifndef NO_XXHASH + XXH32_freeState(buf->xxh); + buf->xxh = NULL; +#endif +} + +size_t DICT_size(const DICT_buffer * const buf) +{ + return buf->size; +} + +/* Get the dictionary buffer for adding input */ +size_t DICT_get(DICT_buffer * const buf, void **const dict) +{ + DICT_shift(buf); + + DEBUGLOG(5, "Getting dict buffer %u, pos %u, avail %u", (unsigned)buf->index, (unsigned)buf->end, (unsigned)(buf->size - buf->end)); + *dict = buf->data[buf->index] + buf->end; + return buf->size - buf->end; +} + +/* Update with the amount added */ +int DICT_update(DICT_buffer * const buf, size_t const added_size) +{ + DEBUGLOG(5, "Added %u bytes to dict buffer %u", (unsigned)added_size, (unsigned)buf->index); + buf->end += added_size; + assert(buf->end <= buf->size); + return !DICT_availSpace(buf); +} + +/* Read from input and write to the dict */ +void DICT_put(DICT_buffer * const buf, FL2_inBuffer * const input) +{ + size_t const to_read = MIN(buf->size - buf->end, input->size - input->pos); + + DEBUGLOG(5, "CStream : reading %u bytes", (U32)to_read); + + memcpy(buf->data[buf->index] + buf->end, (BYTE*)input->src + input->pos, to_read); + + input->pos += to_read; + buf->end += to_read; +} + +size_t DICT_availSpace(const DICT_buffer * const buf) +{ + return buf->size - buf->end; +} + +/* Get the size of uncompressed data. 
start is set to end after compression */ +int DICT_hasUnprocessed(const DICT_buffer * const buf) +{ + return buf->start < buf->end; +} + +/* Get the buffer, overlap and end for compression */ +void DICT_getBlock(DICT_buffer * const buf, FL2_dataBlock * const block) +{ + block->data = buf->data[buf->index]; + block->start = buf->start; + block->end = buf->end; + +#ifndef NO_XXHASH + if (buf->xxh != NULL) + XXH32_update(buf->xxh, buf->data[buf->index] + buf->start, buf->end - buf->start); +#endif + + buf->total += buf->end - buf->start; + buf->start = buf->end; +} + +/* Shift occurs when all is processed and end is beyond the overlap size */ +int DICT_needShift(DICT_buffer * const buf) +{ + if (buf->start < buf->end) + return 0; + /* Reset the dict if the next compression cycle would exceed the reset interval */ + size_t overlap = (buf->total + buf->size - buf->overlap > buf->reset_interval) ? 0 : buf->overlap; + return buf->start == buf->end && (overlap == 0 || buf->end >= overlap + ALIGNMENT_SIZE); +} + +int DICT_async(const DICT_buffer * const buf) +{ + return (int)buf->async; +} + +/* Shift the overlap amount to the start of either the only dict buffer or the alternate one + * if it exists */ +void DICT_shift(DICT_buffer * const buf) +{ + if (buf->start < buf->end) + return; + + size_t overlap = buf->overlap; + /* Reset the dict if the next compression cycle would exceed the reset interval */ + if (buf->total + buf->size - buf->overlap > buf->reset_interval) { + DEBUGLOG(4, "Resetting dictionary after %u bytes", (unsigned)buf->total); + overlap = 0; + } + + if (overlap == 0) { + /* No overlap means a simple buffer switch */ + buf->start = 0; + buf->end = 0; + buf->index ^= buf->async; + buf->total = 0; + } + else if (buf->end >= overlap + ALIGNMENT_SIZE) { + size_t const from = (buf->end - overlap) & ALIGNMENT_MASK; + const BYTE *const src = buf->data[buf->index]; + /* Copy to the alternate if one exists */ + BYTE *const dst = buf->data[buf->index ^ buf->async]; + + overlap = buf->end - from; + + if (overlap <= from || dst != src) { + DEBUGLOG(5, "Copy overlap data : %u bytes from %u", (unsigned)overlap, (unsigned)from); + memcpy(dst, src + from, overlap); + } + else if (from != 0) { + DEBUGLOG(5, "Move overlap data : %u bytes from %u", (unsigned)overlap, (unsigned)from); + memmove(dst, src + from, overlap); + } + /* New data will be written after the overlap */ + buf->start = overlap; + buf->end = overlap; + /* Switch buffers */ + buf->index ^= buf->async; + } +} + +#ifndef NO_XXHASH +XXH32_hash_t DICT_getDigest(const DICT_buffer * const buf) +{ + return XXH32_digest(buf->xxh); +} +#endif + +size_t DICT_memUsage(const DICT_buffer * const buf) +{ + return (1 + buf->async) * buf->size; +} diff --git a/C/fast-lzma2/dict_buffer.h b/C/fast-lzma2/dict_buffer.h new file mode 100644 index 00000000..436472fb --- /dev/null +++ b/C/fast-lzma2/dict_buffer.h @@ -0,0 +1,81 @@ +/* +* Copyright (c) 2018, Conor McCarthy +* All rights reserved. +* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. +*/ + +#include "fast-lzma2.h" +#include "mem.h" +#include "data_block.h" +#ifndef NO_XXHASH +# include "xxhash.h" +#endif + +#ifndef FL2_DICT_BUFFER_H_ +#define FL2_DICT_BUFFER_H_ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* DICT_buffer structure. 
+ * Maintains one or two dictionary buffers. In a dual dict configuration (asyc==1), when the + * current buffer is full, the overlap region will be copied to the other buffer and it + * becomes the destination for input while the first is compressed. This is useful when I/O + * is much slower than compression. */ +typedef struct { + BYTE* data[2]; + size_t index; + size_t async; + size_t overlap; + size_t start; /* start = 0 (first block) or overlap */ + size_t end; /* never < overlap */ + size_t size; /* allocation size */ + size_t total; /* total size compressed after last dict reset */ + size_t reset_interval; +#ifndef NO_XXHASH + XXH32_state_t *xxh; +#endif +} DICT_buffer; + +int DICT_construct(DICT_buffer *const buf, int const async); + +int DICT_init(DICT_buffer *const buf, size_t const dict_size, size_t const overlap, unsigned const reset_multiplier, int const do_hash); + +void DICT_destruct(DICT_buffer *const buf); + +size_t DICT_size(const DICT_buffer *const buf); + +size_t DICT_get(DICT_buffer *const buf, void **const dict); + +int DICT_update(DICT_buffer *const buf, size_t const added_size); + +void DICT_put(DICT_buffer *const buf, FL2_inBuffer* const input); + +size_t DICT_availSpace(const DICT_buffer *const buf); + +int DICT_hasUnprocessed(const DICT_buffer *const buf); + +void DICT_getBlock(DICT_buffer *const buf, FL2_dataBlock *const block); + +int DICT_needShift(DICT_buffer *const buf); + +int DICT_async(const DICT_buffer *const buf); + +void DICT_shift(DICT_buffer *const buf); + +#ifndef NO_XXHASH +XXH32_hash_t DICT_getDigest(const DICT_buffer *const buf); +#endif + +size_t DICT_memUsage(const DICT_buffer *const buf); + +#if defined (__cplusplus) +} +#endif + +#endif /* FL2_DICT_BUFFER_H_ */ \ No newline at end of file diff --git a/C/fast-lzma2/fast-lzma2.h b/C/fast-lzma2/fast-lzma2.h index a1d479c9..7f90de45 100644 --- a/C/fast-lzma2/fast-lzma2.h +++ b/C/fast-lzma2/fast-lzma2.h @@ -53,9 +53,9 @@ Introduction *********************************************************************************************************/ /*------ Version ------*/ -#define FL2_VERSION_MAJOR 0 -#define FL2_VERSION_MINOR 9 -#define FL2_VERSION_RELEASE 2 +#define FL2_VERSION_MAJOR 1 +#define FL2_VERSION_MINOR 0 +#define FL2_VERSION_RELEASE 0 #define FL2_VERSION_NUMBER (FL2_VERSION_MAJOR *100*100 + FL2_VERSION_MINOR *100 + FL2_VERSION_RELEASE) FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void); /**< useful to check dll version */ @@ -67,12 +67,13 @@ FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void); /**< useful to check FL2LIB_API const char* FL2LIB_CALL FL2_versionString(void); +#define FL2_MAXTHREADS 200 + + /*************************************** * Simple API ***************************************/ -#define FL2_MAXTHREADS 200 - /*! FL2_compress() : * Compresses `src` content as a single LZMA2 compressed stream into already allocated `dst`. * Call FL2_compressMt() to use > 1 thread. Specify nbThreads = 0 to use all cores. @@ -88,20 +89,30 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressMt(void* dst, size_t dstCapacity, unsigned nbThreads); /*! FL2_decompress() : - * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. - * `dstCapacity` is an upper bound of originalSize to regenerate. - * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * Decompresses a single LZMA2 compressed stream from `src` into already allocated `dst`. + * `compressedSize` : must be at least the size of the LZMA2 stream. 
+ * `dstCapacity` is the original, uncompressed size to regenerate, returned by calling + * FL2_findDecompressedSize(). + * Call FL2_decompressMt() to use > 1 thread. Specify nbThreads = 0 to use all cores. The stream + * must contain dictionary resets to use multiple threads. These are inserted during compression by + * default. The frequency can be changed/disabled with the FL2_p_resetInterval parameter setting. * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), * or an errorCode if it fails (which can be tested using FL2_isError()). */ FL2LIB_API size_t FL2LIB_CALL FL2_decompress(void* dst, size_t dstCapacity, const void* src, size_t compressedSize); +FL2LIB_API size_t FL2LIB_CALL FL2_decompressMt(void* dst, size_t dstCapacity, + const void* src, size_t compressedSize, + unsigned nbThreads); + /*! FL2_findDecompressedSize() * `src` should point to the start of a LZMA2 encoded stream. * `srcSize` must be at least as large as the LZMA2 stream including end marker. + * A property byte is assumed to exist at position 0 in `src`. If the stream was created without one, + * subtract 1 byte from `src` when passing it to the function. * @return : - decompressed size of the stream in `src`, if known * - FL2_CONTENTSIZE_ERROR if an error occurred (e.g. corruption, srcSize too small) - * note 1 : a 0 return value means the frame is valid but "empty". + * note 1 : a 0 return value means the stream is valid but "empty". * note 2 : decompressed size can be very large (64-bits value), * potentially larger than what local system can handle as a single memory segment. * In which case, it's necessary to use streaming mode to decompress data. @@ -109,122 +120,80 @@ FL2LIB_API size_t FL2LIB_CALL FL2_decompress(void* dst, size_t dstCapacity, * Always ensure return value fits within application's authorized limits. * Each application can set its own limits. */ #define FL2_CONTENTSIZE_ERROR (size_t)-1 -FL2LIB_API size_t FL2LIB_CALL FL2_findDecompressedSize(const void *src, size_t srcSize); +FL2LIB_API unsigned long long FL2LIB_CALL FL2_findDecompressedSize(const void *src, size_t srcSize); /*====== Helper functions ======*/ -#define FL2_COMPRESSBOUND(srcSize) ((srcSize) + (((srcSize) + 0xFFF) / 0x1000) * 3 + 6) /* this formula calculates the maximum size of data stored in uncompressed chunks */ +#define FL2_COMPRESSBOUND(srcSize) ((srcSize) + (((srcSize) + 0xFFF) / 0x1000) * 3 + 6) /*!< calculates the maximum size of data stored in a sequence of uncompressed chunks */ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */ FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +FL2LIB_API unsigned FL2LIB_CALL FL2_isTimedOut(size_t code); /*!< tells if a `size_t` function result is the timeout code */ FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code); /*!< provides readable string from an error code */ FL2LIB_API int FL2LIB_CALL FL2_maxCLevel(void); /*!< maximum compression level available */ FL2LIB_API int FL2LIB_CALL FL2_maxHighCLevel(void); /*!< maximum compression level available in high mode */ + /*************************************** * Explicit memory management ***************************************/ + /*= Compression context - * When compressing many times, - * it is recommended to allocate a context just once, and re-use it for each successive compression operation. - * This will make workload friendlier for system's memory. 
- * The context may not use the number of threads requested if the library is compiled for single-threaded - * compression or nbThreads > FL2_MAXTHREADS. Call FL2_CCtx_nbThreads to obtain the actual number. */ + * When compressing many times, it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. This will make workload + * friendlier for system's memory. The context may not use the number of threads requested + * if the library is compiled for single-threaded compression or nbThreads > FL2_MAXTHREADS. + * Call FL2_getCCtxThreadCount to obtain the actual number allocated. */ typedef struct FL2_CCtx_s FL2_CCtx; FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void); FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads); FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx); -FL2LIB_API unsigned FL2LIB_CALL FL2_CCtx_nbThreads(const FL2_CCtx* ctx); +FL2LIB_API unsigned FL2LIB_CALL FL2_getCCtxThreadCount(const FL2_CCtx* cctx); /*! FL2_compressCCtx() : - * Same as FL2_compress(), requires an allocated FL2_CCtx (see FL2_createCCtx()). */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* ctx, + * Same as FL2_compress(), but requires an allocated FL2_CCtx (see FL2_createCCtx()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel); -/************************************************ -* Caller-managed data buffer and overlap section -************************************************/ - -typedef struct { - unsigned char *data; - size_t start; /* start = 0 (first block) or overlap */ - size_t end; /* never < overlap */ - size_t bufSize; /* allocation size */ -} FL2_blockBuffer; - -typedef int (FL2LIB_CALL *FL2_progressFn)(size_t done, void* opaque); - -/* Get the size of the overlap section. */ -FL2LIB_API size_t FL2LIB_CALL FL2_blockOverlap(const FL2_CCtx* ctx); - -/* Copy the overlap section to the start to prepare for more data */ -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock(FL2_CCtx* ctx, FL2_blockBuffer *block); -/* Copy the overlap to a different buffer. This allows a dual-buffer configuration where - * data is read into one block while the other is compressed. */ -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock_switch(FL2_CCtx* ctx, FL2_blockBuffer *block, unsigned char *dst); - -FL2LIB_API void FL2LIB_CALL FL2_beginFrame(FL2_CCtx* const cctx); - -/*! FL2_compressCCtxBlock() : - * Same as FL2_compressCCtx except the caller is responsible for supplying an overlap section. - * The FL2_p_overlapFraction parameter will not be used. - * srcStart + srcSize should equal the dictionary size except on the last call. - * Can be called multiple times. FL2_endFrame() must be called when finished. - * For compatibility with this library the caller must write a property byte at - * the beginning of the output. Obtain it by calling FL2_dictSizeProp() before - * compressing the first block or after the last. No hash will be written, but - * the caller can calculate it using the interface in xxhash.h, write it at the end, - * and set bit 7 in the property byte. */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock(FL2_CCtx* ctx, - void* dst, size_t dstCapacity, - const FL2_blockBuffer *block, - FL2_progressFn progress, void* opaque); - -/*! FL2_endFrame() : - * Write the end marker to terminate the LZMA2 stream. 
- * Must be called after compressing with FL2_compressCCtxBlock() */ -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame(FL2_CCtx* ctx, - void* dst, size_t dstCapacity); - -typedef int (FL2LIB_CALL *FL2_writerFn)(const void* src, size_t srcSize, void* opaque); - -/*! FL2_compressCCtxBlock_toFn() : - * Same as FL2_compressCCtx except the caller is responsible for supplying an - * overlap section, and compressed data is written to a callback function. - * The FL2_p_overlapFraction parameter will not be used. - * Can be called multiple times. FL2_endFrame_toFn() must be called when finished. */ -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque, - const FL2_blockBuffer *block, - FL2_progressFn progress); - -/*! FL2_endFrame() : - * Write the end marker to a callback function to terminate the LZMA2 stream. - * Must be called after compressing with FL2_compressCCtxBlock_toFn() */ -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque); - -/*! FL2_dictSizeProp() : +/*! FL2_getCCtxDictProp() : * Get the dictionary size property. * Intended for use with the FL2_p_omitProperties parameter for creating a - * 7-zip compatible LZMA2 stream. */ -FL2LIB_API unsigned char FL2LIB_CALL FL2_dictSizeProp(FL2_CCtx* ctx); + * 7-zip or XZ compatible LZMA2 stream. */ +FL2LIB_API unsigned char FL2LIB_CALL FL2_getCCtxDictProp(FL2_CCtx* cctx); + + +/**************************** +* Decompression +****************************/ /*= Decompression context - * When decompressing many times, - * it is recommended to allocate a context only once, - * and re-use it for each successive compression operation. - * This will make the workload friendlier for the system's memory. - * Use one context per thread for parallel execution. */ -typedef struct CLzma2Dec_s FL2_DCtx; + * When decompressing many times, it is recommended to allocate a context only once, + * and re-use it for each successive decompression operation. This will make the workload + * friendlier for the system's memory. + * The context may not allocate the number of threads requested if the library is + * compiled for single-threaded compression or nbThreads > FL2_MAXTHREADS. + * Call FL2_getDCtxThreadCount to obtain the actual number allocated. + * At least nbThreads dictionary resets must exist in the stream to use all of the + * threads. Dictionary resets are inserted into the stream according to the + * FL2_p_resetInterval parameter used in the compression context. */ +typedef struct FL2_DCtx_s FL2_DCtx; FL2LIB_API FL2_DCtx* FL2LIB_CALL FL2_createDCtx(void); +FL2LIB_API FL2_DCtx* FL2LIB_CALL FL2_createDCtxMt(unsigned nbThreads); FL2LIB_API size_t FL2LIB_CALL FL2_freeDCtx(FL2_DCtx* dctx); +FL2LIB_API unsigned FL2LIB_CALL FL2_getDCtxThreadCount(const FL2_DCtx* dctx); + + +/*! FL2_initDCtx() : + * Use only when a property byte is not present at input byte 0. No init is necessary otherwise. + * The caller must store the result from FL2_getCCtxDictProp() and pass it to this function. */ +FL2LIB_API size_t FL2LIB_CALL FL2_initDCtx(FL2_DCtx* dctx, unsigned char prop); + /*! 
FL2_decompressDCtx() : * Same as FL2_decompress(), requires an allocated FL2_DCtx (see FL2_createDCtx()) */ -FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* ctx, +FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); @@ -232,90 +201,180 @@ FL2LIB_API size_t FL2LIB_CALL FL2_decompressDCtx(FL2_DCtx* ctx, * Streaming ****************************/ -typedef struct FL2_inBuffer_s { +typedef struct { const void* src; /**< start of input buffer */ size_t size; /**< size of input buffer */ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ } FL2_inBuffer; -typedef struct FL2_outBuffer_s { +typedef struct { void* dst; /**< start of output buffer */ size_t size; /**< size of output buffer */ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ } FL2_outBuffer; +/*** Push/pull structs ***/ +typedef struct { + void* dst; /**< start of available dict buffer */ + unsigned long size; /**< size of dict remaining */ +} FL2_dictBuffer; + +typedef struct { + const void* src; /**< start of compressed data */ + size_t size; /**< size of compressed data */ +} FL2_cBuffer; /*-*********************************************************************** - * Streaming compression - HowTo + * Streaming compression * * A FL2_CStream object is required to track streaming operation. * Use FL2_createCStream() and FL2_freeCStream() to create/release resources. * FL2_CStream objects can be reused multiple times on consecutive compression operations. - * It is recommended to re-use FL2_CStream in situations where many streaming operations will be achieved consecutively, - * since it will play nicer with system's memory, by re-using already allocated memory. + * It is recommended to re-use FL2_CStream in situations where many streaming operations will be done + * consecutively, since it will reduce allocation and initialization time. * - * Start a new compression by initializing FL2_CStream. - * Use FL2_initCStream() to start a new compression operation. + * Call FL2_createCStreamMt() with a nonzero dualBuffer parameter to use two input dictionary buffers. + * The stream will not block on FL2_compressStream() and continues to accept data while compression is + * underway, until both buffers are full. Useful when I/O is slow. + * To compress with a single thread with dual buffering, call FL2_createCStreamMt with nbThreads=1. + * + * Use FL2_initCStream() on the FL2_CStream object to start a new compression operation. * * Use FL2_compressStream() repetitively to consume input stream. - * The function will automatically update both `pos` fields. - * It will always consume the entire input unless an error occurs, + * The function will automatically update the `pos` field. + * It will always consume the entire input unless an error occurs or the dictionary buffer is filled, * unlike the decompression function. - * @return : a size hint - remaining capacity to fill before compression occurs, - * or an error code, which can be tested using FL2_isError(). - * Note : it's just a hint, any other value will work fine. * - * At any moment, it's possible, but not recommended, to flush whatever data remains - * within internal buffer using FL2_flushStream(). - * `output->pos` will be updated. - * Note 1 : this will reduce compression ratio because the algorithm is block-based. 
- Note 2 : some content might still be left within internal buffers if `output->size` is too small. - @return : nb of bytes still present within internal buffers (0 if they're empty) - or an error code, which can be tested using FL2_isError(). + * The radix match finder allows compressed data to be stored in its match table during encoding. + * Applications may call streaming compression functions with output == NULL. In this case, + * when the function returns 1, the compressed data must be read from the internal buffers. + * Call FL2_getNextCStreamBuffer() repeatedly until it returns 0. + * Each call returns buffer information in the FL2_cBuffer parameter. Applications typically will + * pass this to an I/O write function or downstream filter. + * Alternatively, applications may pass an FL2_outBuffer object pointer to receive the output. In this + * case the return value is 1 if the buffer is full and more compressed data remains. * - * FL2_endStream() instructs to finish a frame. - * It will perform a flush and write the LZMA2 termination byte (required). - * FL2_endStream() may not be able to flush full data if `output->size` is too small. - * In which case, call again FL2_endStream() to complete the flush. - @return : 0 if stream fully completed and flushed, - or >0 to indicate the nb of bytes still present within the internal buffers, - or an error code, which can be tested using FL2_isError(). + * FL2_endStream() instructs to finish a stream. It will perform a flush and write the LZMA2 + * termination byte (required). Call FL2_endStream() repeatedly until it returns 0. + * + * Most functions may return a size_t error code, which can be tested using FL2_isError(). * * *******************************************************************/ -typedef struct FL2_CStream_s FL2_CStream; +typedef struct FL2_CCtx_s FL2_CStream; /*===== FL2_CStream management functions =====*/ FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void); -FL2LIB_API size_t FL2LIB_CALL FL2_freeCStream(FL2_CStream* fcs); +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStreamMt(unsigned nbThreads, int dualBuffer); +FL2LIB_API void FL2LIB_CALL FL2_freeCStream(FL2_CStream * fcs); /*===== Streaming compression functions =====*/ -FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel); -FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer* output, FL2_inBuffer* input); -FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer* output); -FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer* output); +/*! FL2_initCStream() : + * Call this function before beginning a new compressed data stream. To keep the stream object's + * current parameters, specify zero for the compression level. The object is set to the default + * level upon creation. */ +FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel); + +/*! FL2_setCStreamTimeout() : + * Sets a timeout in milliseconds. Zero disables the timeout (default). If a nonzero timeout is set, functions + * FL2_compressStream(), FL2_updateDictionary(), FL2_getNextCStreamBuffer(), FL2_flushStream(), and + * FL2_endStream() may return a timeout code before compression of the current dictionary of data + * completes. FL2_isError() returns true for the timeout code, so check the code with FL2_isTimedOut() before + * testing for errors. 
With the exception of FL2_updateDictionary(), the above functions may be called again + * to wait for completion. A typical application for timeouts is to update the user on compression progress. */ +FL2LIB_API size_t FL2LIB_CALL FL2_setCStreamTimeout(FL2_CStream * fcs, unsigned timeout); + +/*! FL2_compressStream() : + * Reads data from input into the dictionary buffer. Compression will begin if the buffer fills up. + * A dual buffering stream will fill the second buffer while compression proceeds on the first. + * A call to FL2_compressStream() will wait for ongoing compression to complete if all dictionary space + * is filled. FL2_compressStream() must not be called with output == NULL unless the caller has read all + * compressed data from the CStream object. + * Returns 1 to indicate compressed data must be read (or output is full), or 0 otherwise. */ +FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer *output, FL2_inBuffer* input); + +/*** Push/pull functions ***/ + +/*! FL2_getDictionaryBuffer() : + * Returns a buffer in the FL2_outBuffer object, which the caller can directly read data into. + * Applications will normally pass this buffer to an I/O read function or upstream filter. + * Returns 0, or an error or timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getDictionaryBuffer(FL2_CStream* fcs, FL2_dictBuffer* dict); + +/*! FL2_updateDictionary() : + * Informs the CStream how much data was added to the buffer. Compression begins if the dictionary + * was filled. Returns 1 to indicate compressed data must be read, 0 if not, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_updateDictionary(FL2_CStream* fcs, size_t addedSize); + +/*! FL2_getNextCStreamBuffer() : + * Returns a buffer containing a slice of the compressed data. Call this function and process the data + * until the function returns zero. In most cases it will return a buffer for each compression thread + * used. It is sometimes less but never more than nbThreads. If asynchronous compression is in progress, + * this function will wait for completion before returning, or it will return the timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getNextCStreamBuffer(FL2_CStream* fcs, FL2_cBuffer* cbuf); + +/******/ + +/*! FL2_getCStreamProgress() : + * Returns the number of bytes processed since the stream was initialized. This is a synthetic + * estimate because the match finder does not proceed sequentially through the data. If + * outputSize is not NULL, returns the number of bytes of compressed data generated. */ +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getCStreamProgress(const FL2_CStream * fcs, unsigned long long *outputSize); + +/*! FL2_waitCStream() : + * Waits for compression to end. This function returns after the timeout set using + * FL2_setCStreamTimeout has elapsed. Unnecessary when no timeout is set. + * Returns 1 if compressed output is available, 0 if not, or the timeout code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_waitCStream(FL2_CStream * fcs); + +/*! FL2_cancelCStream() : + * Cancels any compression operation underway. Useful only when dual buffering and/or timeouts + * are enabled. The stream will be returned to an uninitialized state. */ +FL2LIB_API void FL2LIB_CALL FL2_cancelCStream(FL2_CStream *fcs); + +/*! FL2_remainingOutputSize() : + * The amount of compressed data remaining to be read from the CStream object. */ +FL2LIB_API size_t FL2LIB_CALL FL2_remainingOutputSize(const FL2_CStream* fcs); + +/*! 
FL2_flushStream() : + * Compress all data remaining in the dictionary buffer(s). It may be necessary to call + * FL2_flushStream() more than once. If output == NULL the compressed data must be read from the + * CStream object after each call. + * Flushing is not normally useful and produces larger output. + * Returns 1 if input or output still exists in the CStream object, 0 if complete, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer *output); + +/*! FL2_endStream() : + * Compress all data remaining in the dictionary buffer(s) and write the stream end marker. It may + * be necessary to call FL2_endStream() more than once. If output == NULL the compressed data must + * be read from the CStream object after each call. + * Returns 0 when compression is complete and all output has been flushed, 1 if not complete, or + * an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer *output); /*-*************************************************************************** - * Streaming decompression - HowTo + * Streaming decompression * * A FL2_DStream object is required to track streaming operations. * Use FL2_createDStream() and FL2_freeDStream() to create/release resources. * FL2_DStream objects can be re-used multiple times. * * Use FL2_initDStream() to start a new decompression operation. - * @return : recommended first input size + * @return : zero or an error code * * Use FL2_decompressStream() repetitively to consume your input. * The function will update both `pos` fields. * If `input.pos < input.size`, some input has not been consumed. - * It's up to the caller to present again remaining data. - * More data must be loaded if `input.pos + LZMA_REQUIRED_INPUT_MAX >= input.size` + * It's up to the caller to present again the remaining data. + * More data must be loaded if `input.pos + LZMA_REQUIRED_INPUT_MAX >= input.size`. In this case, + * move the remaining input (<= LZMA_REQUIRED_INPUT_MAX bytes) to the start of the buffer and + * load new data after it. * If `output.pos < output.size`, decoder has flushed everything it could. - * @return : 0 when a frame is completely decoded and fully flushed, - * an error code, which can be tested using FL2_isError(), - * 1, which means there is still some decoding to do to complete current frame. + * @return : 0 when a stream is completely decoded and fully flushed, + * 1, which means there is still some decoding to do to complete the stream, + * or an error code, which can be tested using FL2_isError(). * *******************************************************************************/ #define LZMA_REQUIRED_INPUT_MAX 20 @@ -324,101 +383,187 @@ typedef struct FL2_DStream_s FL2_DStream; /*===== FL2_DStream management functions =====*/ FL2LIB_API FL2_DStream* FL2LIB_CALL FL2_createDStream(void); +FL2LIB_API FL2_DStream* FL2LIB_CALL FL2_createDStreamMt(unsigned nbThreads); FL2LIB_API size_t FL2LIB_CALL FL2_freeDStream(FL2_DStream* fds); +/*! FL2_setDStreamMemoryLimitMt() : + * Set a total size limit for multithreaded decoder input and output buffers. MT decoder memory + * usage is unknown until the input is parsed. If the limit is exceeded, the decoder switches to + * using a single thread. + * MT decoding memory usage is typically dictionary_size * 4 * nbThreads for the output + * buffers plus the size of the compressed input for that amount of output. */ +FL2LIB_API void FL2LIB_CALL FL2_setDStreamMemoryLimitMt(FL2_DStream* fds, size_t limit); + +/*! 
FL2_setDStreamTimeout() : + * Sets a timeout in milliseconds. Zero disables the timeout. If a nonzero timeout is set, + * FL2_decompressStream() may return a timeout code before decompression of the available data + * completes. FL2_isError() returns true for the timeout code, so check the code with FL2_isTimedOut() + * before testing for errors. After a timeout occurs, do not call FL2_decompressStream() again unless + * a call to FL2_waitDStream() returns 1. A typical application for timeouts is to update the user on + * decompression progress. */ +FL2LIB_API size_t FL2LIB_CALL FL2_setDStreamTimeout(FL2_DStream * fds, unsigned timeout); + +/*! FL2_waitDStream() : + * Waits for decompression to end after a timeout has occurred. This function returns after the + * timeout set using FL2_setDStreamTimeout() has elapsed, or when decompression of available input is + * complete. Unnecessary when no timeout is set. + * Returns 0 if the stream is complete, 1 if not complete, or an error code. */ +FL2LIB_API size_t FL2LIB_CALL FL2_waitDStream(FL2_DStream * fds); + +/*! FL2_cancelDStream() : + * Frees memory allocated for MT decoding. If a timeout is set and the caller is waiting + * for completion of MT decoding, decompression in progress will be canceled. */ +FL2LIB_API void FL2LIB_CALL FL2_cancelDStream(FL2_DStream *fds); + +/*! FL2_getDStreamProgress() : + * Returns the number of bytes decoded since the stream was initialized. */ +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getDStreamProgress(const FL2_DStream * fds); + /*===== Streaming decompression functions =====*/ + +/*! FL2_initDStream() : + * Call this function before decompressing a stream. FL2_initDStream_withProp() + * must be used for streams which do not include a property byte at position zero. + * The caller is responsible for storing and passing the property byte. + * Returns 0 if okay, or an error if the stream object is still in use from a + * previous call to FL2_decompressStream() (see timeout info above). */ FL2LIB_API size_t FL2LIB_CALL FL2_initDStream(FL2_DStream* fds); +FL2LIB_API size_t FL2LIB_CALL FL2_initDStream_withProp(FL2_DStream* fds, unsigned char prop); + +/*! FL2_decompressStream() : + * Reads data from input and decompresses to output. + * Returns 1 if the stream is unfinished, 0 if the terminator was encountered (he'll be back) + * and all data was written to output, or an error code. Call this function repeatedly if + * necessary, removing data from output and/or loading data into input before each call. + * Note the requirement for LZMA_REQUIRED_INPUT_MAX bytes of input if the input data is + * incomplete (see intro above). */ FL2LIB_API size_t FL2LIB_CALL FL2_decompressStream(FL2_DStream* fds, FL2_outBuffer* output, FL2_inBuffer* input); /*-*************************************************************************** - * Compression parameters - HowTo + * Compression parameters * * Any function that takes a 'compressionLevel' parameter will replace any * parameters affected by compression level that are already set. - * Call FL2_CCtx_setParameter with FL2_p_compressionLevel to set the level, - * then call FL2_CCtx_setParameter again with any other settings to change. - * Specify compressionLevel=0 when calling a compression function. + * To use a preset level and modify it, call FL2_CCtx_setParameter with + * FL2_p_compressionLevel to set the level, then call FL2_CCtx_setParameter again + * with any other settings to change. 
+ * Specify a compressionLevel of 0 when calling a compression function to keep + * the current parameters. * *******************************************************************************/ +#define FL2_DICTLOG_MIN 20 #define FL2_DICTLOG_MAX_32 27 #define FL2_DICTLOG_MAX_64 30 -#define FL2_DICTLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? FL2_DICTLOG_MAX_32 : FL2_DICTLOG_MAX_64)) -#define FL2_DICTLOG_MIN 20 -#define FL2_CHAINLOG_MAX 14 -#define FL2_CHAINLOG_MIN 4 -#define FL2_SEARCHLOG_MAX (FL2_CHAINLOG_MAX-1) -#define FL2_SEARCHLOG_MIN 0 -#define FL2_FASTLENGTH_MIN 6 /* only used by optimizer */ -#define FL2_FASTLENGTH_MAX 273 /* only used by optimizer */ +#define FL2_DICTLOG_MAX ((unsigned)(sizeof(size_t) == 4 ? FL2_DICTLOG_MAX_32 : FL2_DICTLOG_MAX_64)) +#define FL2_DICTSIZE_MAX (1U << FL2_DICTLOG_MAX) +#define FL2_DICTSIZE_MIN (1U << FL2_DICTLOG_MIN) #define FL2_BLOCK_OVERLAP_MIN 0 #define FL2_BLOCK_OVERLAP_MAX 14 -#define FL2_BLOCK_LOG_MIN 12 -#define FL2_BLOCK_LOG_MAX 32 +#define FL2_RESET_INTERVAL_MIN 1 +#define FL2_RESET_INTERVAL_MAX 16 /* small enough to fit FL2_DICTSIZE_MAX * FL2_RESET_INTERVAL_MAX in 32-bit size_t */ +#define FL2_BUFFER_SIZE_LOG_MIN 0 +#define FL2_BUFFER_SIZE_LOG_MAX 6 +#define FL2_CHAINLOG_MIN 4 +#define FL2_CHAINLOG_MAX 14 +#define FL2_HYBRIDCYCLES_MIN 1 +#define FL2_HYBRIDCYCLES_MAX 64 #define FL2_SEARCH_DEPTH_MIN 6 #define FL2_SEARCH_DEPTH_MAX 254 -#define FL2_BUFFER_SIZE_LOG_MIN 6 -#define FL2_BUFFER_SIZE_LOG_MAX 12 +#define FL2_FASTLENGTH_MIN 6 /* only used by optimizer */ +#define FL2_FASTLENGTH_MAX 273 /* only used by optimizer */ #define FL2_LC_MIN 0 #define FL2_LC_MAX 4 #define FL2_LP_MIN 0 #define FL2_LP_MAX 4 #define FL2_PB_MIN 0 #define FL2_PB_MAX 4 +#define FL2_LCLP_MAX 4 + +typedef enum { + FL2_fast, + FL2_opt, + FL2_ultra +} FL2_strategy; + +typedef struct { + size_t dictionarySize; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory per byte, slower */ + unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ + unsigned chainLog; /* HC3 sliding window : larger == more compression, slower; hybrid mode only (ultra) */ + unsigned cyclesLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ + unsigned searchDepth; /* maximum depth for resolving string matches : larger == more compression, slower */ + unsigned fastLength; /* acceptable match size for parser : larger == more compression, slower; fast bytes parameter from 7-zip */ + unsigned divideAndConquer; /* split long chains of 2-byte matches into shorter chains with a small overlap : faster, somewhat less compression; enabled by default */ + unsigned bufferLog; /* buffer size for processing match chains is (dictionarySize >> (12 - bufferLog)) : affects compression when divideAndConquer enabled; */ + /* when divideAndConquer disabled, affects speed in a hardware-dependent manner */ + FL2_strategy strategy; /* encoder strategy : fast, optimized or ultra (hybrid) */ +} FL2_compressionParameters; typedef enum { /* compression parameters */ FL2_p_compressionLevel, /* Update all compression parameters according to pre-defined cLevel table - * Default level is FL2_CLEVEL_DEFAULT==9. - * Setting FL2_p_highCompression to 1 switches to an alternate cLevel table. - * Special: value 0 means "do not change cLevel". */ + * Default level is FL2_CLEVEL_DEFAULT==6. + * Setting FL2_p_highCompression to 1 switches to an alternate cLevel table. 
*/ FL2_p_highCompression, /* Maximize compression ratio for a given dictionary size. - * Has 9 levels instead of 12, with dictionaryLog 20 - 28. */ - FL2_p_7zLevel, /* For use by the 7-zip fork employing this library. 1 - 9 */ + * Levels 1..10 = dictionaryLog 20..29 (1 Mb..512 Mb). + * Typically provides a poor speed/ratio tradeoff. */ FL2_p_dictionaryLog, /* Maximum allowed back-reference distance, expressed as power of 2. - * Must be clamped between FL2_DICTLOG_MIN and FL2_DICTLOG_MAX. - * Special: value 0 means "do not change dictionaryLog". */ + * Must be clamped between FL2_DICTLOG_MIN and FL2_DICTLOG_MAX. + * Default = 24 */ + FL2_p_dictionarySize, /* Same as above but expressed as an absolute value. + * Must be clamped between FL2_DICTSIZE_MIN and FL2_DICTSIZE_MAX. + * Default = 16 Mb */ FL2_p_overlapFraction, /* The radix match finder is block-based, so some overlap is retained from * each block to improve compression of the next. This value is expressed * as n / 16 of the block size (dictionary size). Larger values are slower. - * Values above 2 mostly yield only a small improvement in compression. */ - FL2_p_blockSize, + * Values above 2 mostly yield only a small improvement in compression. + * A large value for a small dictionary may worsen multithreaded compression. + * Default = 2 */ + FL2_p_resetInterval, /* For multithreaded decompression. A dictionary reset will occur + * after each dictionarySize * resetInterval bytes of input. + * Default = 4 */ FL2_p_bufferLog, /* Buffering speeds up the matchfinder. Buffer size is - * 2 ^ (dictionaryLog - bufferLog). Lower number = slower, better compression, - * higher memory usage. */ - FL2_p_chainLog, /* Size of the full-search table, as a power of 2. - * Resulting table size is (1 << (chainLog+2)). - * Larger tables result in better and slower compression. - * This parameter is useless when using "fast" strategy. - * Special: value 0 means "do not change chainLog". */ - FL2_p_searchLog, /* Number of search attempts, as a power of 2, made by the HC3 match finder - * used only in hybrid mode. - * More attempts result in slightly better and slower compression. - * This parameter is not used by the "fast" and "optimize" strategies. - * Special: value 0 means "do not change searchLog". */ - FL2_p_literalCtxBits, /* lc value for LZMA2 encoder */ - FL2_p_literalPosBits, /* lp value for LZMA2 encoder */ - FL2_p_posBits, /* pb value for LZMA2 encoder */ + * (dictionarySize >> (12 - bufferLog)) * 12 bytes. Higher number = slower, + * better compression, higher memory usage. A CPU with a large memory cache + * may make effective use of a larger buffer. + * Default = 4 */ + FL2_p_hybridChainLog, /* Size of the hybrid mode HC3 hash chain, as a power of 2. + * Resulting table size is (1 << (chainLog+2)) bytes. + * Larger tables result in better and slower compression. + * This parameter is only used by the hybrid "ultra" strategy. + * Default = 9 */ + FL2_p_hybridCycles, /* Number of search attempts made by the HC3 match finder. + * Used only by the hybrid "ultra" strategy. + * More attempts result in slightly better and slower compression. + * Default = 1 */ FL2_p_searchDepth, /* Match finder will resolve string matches up to this length. If a longer - * match exists further back in the input, it will not be found. */ + * match exists further back in the input, it will not be found. + * Default = 42 */ FL2_p_fastLength, /* Only useful for strategies >= opt. - * Length of Match considered "good enough" to stop search. 
+ * Length of match considered "good enough" to stop search. * Larger values make compression stronger and slower. - * Special: value 0 means "do not change fastLength". */ + * Default = 48 */ FL2_p_divideAndConquer, /* Split long chains of 2-byte matches into shorter chains with a small overlap - * during further processing. Allows buffering of all chains at length 2. - * Faster, less compression. Generally a good tradeoff. Enabled by default. */ - FL2_p_strategy, /* 1 = fast; 2 = optimize, 3 = ultra (hybrid mode). + * for further processing. Allows buffering of all chains at length 2. + * Faster, less compression. Generally a good tradeoff. + * Default = enabled */ + FL2_p_strategy, /* 1 = fast; 2 = optimized, 3 = ultra (hybrid mode). * The higher the value of the selected strategy, the more complex it is, * resulting in stronger and slower compression. - * Special: value 0 means "do not change strategy". */ + * Default = ultra */ + FL2_p_literalCtxBits, /* lc value for LZMA2 encoder + * Default = 3 */ + FL2_p_literalPosBits, /* lp value for LZMA2 encoder + * Default = 0 */ + FL2_p_posBits, /* pb value for LZMA2 encoder + * Default = 2 */ + FL2_p_omitProperties, /* Omit the property byte at the start of the stream. For use within 7-zip */ + /* or other containers which store the property byte elsewhere. */ + /* A stream compressed under this setting cannot be decoded by this library. */ #ifndef NO_XXHASH FL2_p_doXXHash, /* Calculate a 32-bit xxhash value from the input data and store it * after the stream terminator. The value will be checked on decompression. * 0 = do not calculate; 1 = calculate (default) */ #endif - FL2_p_omitProperties, /* Omit the property byte at the start of the stream. For use within 7-zip */ - /* or other containers which store the property byte elsewhere. */ - /* Cannot be decoded by this library. */ #ifdef RMF_REFERENCE FL2_p_useReferenceMF /* Use the reference matchfinder for development purposes. SLOW. */ #endif @@ -429,8 +574,32 @@ typedef enum { * Set one compression parameter, selected by enum FL2_cParameter. * @result : informational value (typically, the one being set, possibly corrected), * or an error code (which can be tested with FL2_isError()). */ -FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, unsigned value); -FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, unsigned value); +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, size_t value); + +/*! FL2_CCtx_getParameter() : + * Get one compression parameter, selected by enum FL2_cParameter. + * @result : the parameter value, or the parameter_unsupported error code + * (which can be tested with FL2_isError()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_getParameter(FL2_CCtx* cctx, FL2_cParameter param); + +/*! FL2_CStream_setParameter() : + * Set one compression parameter, selected by enum FL2_cParameter. + * @result : informational value (typically, the one being set, possibly corrected), + * or an error code (which can be tested with FL2_isError()). */ +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, size_t value); + +/*! FL2_CStream_getParameter() : + * Get one compression parameter, selected by enum FL2_cParameter. + * @result : the parameter value, or the parameter_unsupported error code + * (which can be tested with FL2_isError()). 
*/ +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_getParameter(FL2_CStream* fcs, FL2_cParameter param); + +/*! FL2_getLevelParameters() : + * Get all compression parameter values defined by the preset compressionLevel. + * @result : the values in a FL2_compressionParameters struct, or the parameter_outOfBound error code + * (which can be tested with FL2_isError()) if compressionLevel is invalid. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getLevelParameters(int compressionLevel, int high, FL2_compressionParameters *params); + /*************************************** * Context memory usage @@ -441,12 +610,29 @@ FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cPa * FL2_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one. * To use FL2_estimateCCtxSize_usingCCtx, set the compression level and any other settings for the context, * then call the function. Some allocation occurs when the context is created, but the large memory buffers -* used for string matching are allocated only when compression begins. */ +* used for string matching are allocated only when compression is initialized. */ FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize(int compressionLevel, unsigned nbThreads); /*!< memory usage determined by level */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_byParams(const FL2_compressionParameters *params, unsigned nbThreads); /*!< memory usage determined by params */ FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_usingCCtx(const FL2_CCtx* cctx); /*!< memory usage determined by settings */ -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads); -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCCtx(const FL2_CStream* fcs); +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads, int dualBuffer); /*!< memory usage determined by level */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_byParams(const FL2_compressionParameters *params, unsigned nbThreads, int dualBuffer); /*!< memory usage determined by params */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCStream(const FL2_CStream* fcs); /*!< memory usage determined by settings */ + +/*! FL2_getDictSizeFromProp() : + * Get the dictionary size from the property byte for a stream. The property byte is the first byte +* in the stream, unless omitProperties was enabled, in which case the caller must store it. */ +FL2LIB_API size_t FL2LIB_CALL FL2_getDictSizeFromProp(unsigned char prop); + +/*! FL2_estimateDCtxSize() : + * The size of a DCtx does not include a dictionary buffer because the caller must supply one. */ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateDCtxSize(unsigned nbThreads); + +/*! FL2_estimateDStreamSize() : + * Estimate decompression memory use from the dictionary size and number of threads. + * For nbThreads == 0 the number of available cores will be used. + * Obtain dictSize by passing the property byte to FL2_getDictSizeFromProp. 
*/ +FL2LIB_API size_t FL2LIB_CALL FL2_estimateDStreamSize(size_t dictSize, unsigned nbThreads); /*!< obtain dictSize from FL2_getDictSizeFromProp() */ #endif /* FAST_LZMA2_H */ diff --git a/C/fast-lzma2/fl2_common.c b/C/fast-lzma2/fl2_common.c index 85780c56..6db70714 100644 --- a/C/fast-lzma2/fl2_common.c +++ b/C/fast-lzma2/fl2_common.c @@ -14,10 +14,8 @@ /*-************************************* * Dependencies ***************************************/ -#include /* malloc, calloc, free */ -#include /* memset */ #include "fast-lzma2.h" -#include "fl2_error_private.h" +#include "fl2_errors.h" #include "fl2_internal.h" @@ -29,6 +27,9 @@ FL2LIB_API unsigned FL2LIB_CALL FL2_versionNumber(void) { return FL2_VERSION_NUM FL2LIB_API const char* FL2LIB_CALL FL2_versionString(void) { return FL2_VERSION_STRING; } +/*-**************************************** +* Compression helpers +******************************************/ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize) { return FL2_COMPRESSBOUND(srcSize); @@ -37,21 +38,70 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressBound(size_t srcSize) /*-**************************************** * FL2 Error Management ******************************************/ +HINT_INLINE +unsigned IsError(size_t code) +{ + return (code > FL2_ERROR(maxCode)); +} + /*! FL2_isError() : * tells if a return value is an error code */ -FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code) { return ERR_isError(code); } +FL2LIB_API unsigned FL2LIB_CALL FL2_isError(size_t code) +{ + return IsError(code); +} + +/*! FL2_isTimedOut() : + * tells if a return value is the timeout code */ +FL2LIB_API unsigned FL2LIB_CALL FL2_isTimedOut(size_t code) +{ + return (code == FL2_ERROR(timedOut)); +} /*! FL2_getErrorName() : * provides error code string from function result (useful for debugging) */ -FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code) { return ERR_getErrorName(code); } +FL2LIB_API const char* FL2LIB_CALL FL2_getErrorName(size_t code) +{ + return FL2_getErrorString(FL2_getErrorCode(code)); +} /*! FL2_getError() : * convert a `size_t` function result into a proper FL2_errorCode enum */ -FL2LIB_API FL2_ErrorCode FL2LIB_CALL FL2_getErrorCode(size_t code) { return ERR_getErrorCode(code); } +FL2LIB_API FL2_ErrorCode FL2LIB_CALL FL2_getErrorCode(size_t code) +{ + if (!IsError(code)) + return (FL2_ErrorCode)0; + + return (FL2_ErrorCode)(0 - code); +} /*! 
FL2_getErrorString() : * provides error code string from enum */ -FL2LIB_API const char* FL2LIB_CALL FL2_getErrorString(FL2_ErrorCode code) { return ERR_getFL2ErrorString(code); } +FL2LIB_API const char* FL2LIB_CALL FL2_getErrorString(FL2_ErrorCode code) +{ + static const char* const notErrorCode = "Unspecified error code"; + switch (code) + { + case PREFIX(no_error): return "No error detected"; + case PREFIX(GENERIC): return "Error (generic)"; + case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(lclpMax_exceeded): return "Parameters lc+lp > 4"; + case PREFIX(stage_wrong): return "Not possible at this stage of encoding"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(canceled): return "Processing was canceled by a call to FL2_cancelCStream() or FL2_cancelDStream()"; + case PREFIX(buffer): return "Streaming progress halted due to buffer(s) full/empty"; + case PREFIX(timedOut): return "Wait timed out. Timeouts should be handled before errors using FL2_isTimedOut()"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(maxCode): + default: return notErrorCode; + } +} /*! g_debuglog_enable : * turn on/off debug traces (global switch) */ diff --git a/C/fast-lzma2/fl2_compress.c b/C/fast-lzma2/fl2_compress.c index 7785364b..2f1af130 100644 --- a/C/fast-lzma2/fl2_compress.c +++ b/C/fast-lzma2/fl2_compress.c @@ -11,29 +11,100 @@ #include #include "fast-lzma2.h" +#include "fl2_errors.h" #include "fl2_internal.h" #include "platform.h" #include "mem.h" #include "util.h" #include "fl2_compress_internal.h" -#include "fl2threading.h" -#include "fl2pool.h" +#include "fl2_threading.h" +#include "fl2_pool.h" #include "radix_mf.h" #include "lzma2_enc.h" -#define MIN_BYTES_PER_THREAD 0x10000 - -#define ALIGNMENT_MASK (~(size_t)15) +#define FL2_MAX_LOOPS 10U /*-===== Pre-defined compression levels =====-*/ -#define FL2_CLEVEL_DEFAULT 9 -#define FL2_MAX_CLEVEL 12 -#define FL2_MAX_7Z_CLEVEL 9 -#define FL2_MAX_HIGH_CLEVEL 9 +#define MB *(1U<<20) + +#define FL2_MAX_HIGH_CLEVEL 10 + +#ifdef FL2_XZ_BUILD + +#define FL2_CLEVEL_DEFAULT 6 +#define FL2_MAX_CLEVEL 9 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 14, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 14, 40, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 26, 40, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 8, 0, 42, 48, 1, 4, FL2_opt }, /* 5 */ + { 16 MB, 2, 9, 1, 42, 48, 1, 4, FL2_ultra }, /* 6 */ + { 32 MB, 2, 10, 1, 50, 64, 1, 4, FL2_ultra }, /* 7 */ + { 64 MB, 2, 11, 2, 62, 96, 1, 3, FL2_ultra }, /* 8 */ + { 64 MB, 4, 12, 3, 90, 273, 0, 3, FL2_ultra }, /* 9 */ +}; + +#elif defined(FL2_7ZIP_BUILD) + +#define FL2_CLEVEL_DEFAULT 5 +#define FL2_MAX_CLEVEL 9 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 
MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 14, 32, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 5 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 6 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 7 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 8 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 9 */ +}; + +#else + +#define FL2_CLEVEL_DEFAULT 6 +#define FL2_MAX_CLEVEL 10 + +static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 26, 40, 1, 4, FL2_opt }, /* 4 */ + { 8 MB, 2, 8, 0, 42, 48, 1, 4, FL2_opt }, /* 5 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 6 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 7 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 8 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 9 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 10 */ +}; + +#endif + +static const FL2_compressionParameters FL2_highCParameters[FL2_MAX_HIGH_CLEVEL + 1] = { + { 0,0,0,0,0,0,0,0,0 }, + { 1 MB, 4, 9, 2, 254, 273, 0, 4, FL2_ultra }, /* 1 */ + { 2 MB, 4, 10, 2, 254, 273, 0, 4, FL2_ultra }, /* 2 */ + { 4 MB, 4, 11, 2, 254, 273, 0, 4, FL2_ultra }, /* 3 */ + { 8 MB, 4, 12, 2, 254, 273, 0, 4, FL2_ultra }, /* 4 */ + { 16 MB, 4, 13, 3, 254, 273, 0, 4, FL2_ultra }, /* 5 */ + { 32 MB, 4, 14, 3, 254, 273, 0, 4, FL2_ultra }, /* 6 */ + { 64 MB, 4, 14, 4, 254, 273, 0, 4, FL2_ultra }, /* 7 */ + { 128 MB, 4, 14, 4, 254, 273, 0, 4, FL2_ultra }, /* 8 */ + { 256 MB, 4, 14, 5, 254, 273, 0, 3, FL2_ultra }, /* 9 */ + { 512 MB, 4, 14, 5, 254, 273, 0, 2, FL2_ultra } /* 10 */ +}; + +#undef MB FL2LIB_API int FL2LIB_CALL FL2_maxCLevel(void) -{ +{ return FL2_MAX_CLEVEL; } @@ -42,135 +113,89 @@ FL2LIB_API int FL2LIB_CALL FL2_maxHighCLevel(void) return FL2_MAX_HIGH_CLEVEL; } -static const FL2_compressionParameters FL2_defaultCParameters[FL2_MAX_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 14, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_opt }, /* 4 */ - { 21, 2, 7, 0, 14, 40, 1, 8, FL2_opt }, /* 5 */ - { 22, 2, 7, 0, 26, 40, 1, 8, FL2_opt }, /* 6 */ - { 23, 2, 8, 0, 42, 48, 1, 8, FL2_opt }, /* 7 */ - { 24, 2, 9, 0, 42, 48, 1, 8, FL2_ultra }, /* 8 */ - { 25, 2, 10, 0, 50, 64, 1, 8, FL2_ultra }, /* 9 */ - { 26, 2, 11, 1, 60, 64, 1, 9, FL2_ultra }, /* 10 */ - { 27, 2, 12, 2, 126, 96, 1, 10, FL2_ultra }, /* 11 */ - { 28, 2, 14, 3, 254, 160, 1, 10, FL2_ultra } /* 12 */ -}; - -static const FL2_compressionParameters FL2_7zCParameters[FL2_MAX_7Z_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 16, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 16, 32, 1, 8, FL2_opt }, /* 4 */ - { 24, 2, 9, 0, 40, 48, 1, 8, FL2_ultra }, /* 5 */ - { 25, 2, 10, 0, 48, 64, 1, 8, FL2_ultra }, /* 6 */ - { 26, 2, 11, 1, 60, 96, 1, 9, FL2_ultra }, /* 7 */ - { 27, 2, 12, 2, 128, 128, 1, 10, FL2_ultra }, /* 8 */ - { 27, 3, 14, 3, 252, 160, 0, 10, FL2_ultra } /* 9 */ -}; - -static const FL2_compressionParameters FL2_highCParameters[FL2_MAX_HIGH_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 3, 9, 1, 60, 128, 0, 8, FL2_ultra }, /* 1 */ - { 21, 3, 10, 1, 60, 128, 0, 8, FL2_ultra }, /* 2 */ - 
{ 22, 3, 11, 2, 60, 128, 0, 8, FL2_ultra }, /* 3 */ - { 23, 3, 12, 2, 60, 128, 0, 8, FL2_ultra }, /* 4 */ - { 24, 3, 13, 3, 60, 128, 0, 8, FL2_ultra }, /* 5 */ - { 25, 3, 14, 3, 60, 160, 0, 8, FL2_ultra }, /* 6 */ - { 26, 3, 14, 4, 60, 160, 0, 8, FL2_ultra }, /* 7 */ - { 27, 3, 14, 4, 128, 160, 0, 8, FL2_ultra }, /* 8 */ - { 28, 3, 14, 5, 128, 160, 0, 9, FL2_ultra } /* 9 */ -}; - -void FL2_fillParameters(FL2_CCtx* const cctx, const FL2_compressionParameters* const params) +static void FL2_fillParameters(FL2_CCtx* const cctx, const FL2_compressionParameters* const params) { FL2_lzma2Parameters* const cParams = &cctx->params.cParams; - RMF_parameters* const rParams = &cctx->params.rParams; cParams->lc = 3; cParams->lp = 0; cParams->pb = 2; cParams->fast_length = params->fastLength; - cParams->match_cycles = 1U << params->searchLog; + cParams->match_cycles = 1U << params->cyclesLog; cParams->strategy = params->strategy; cParams->second_dict_bits = params->chainLog; - cParams->random_filter = 0; - rParams->dictionary_log = MIN(params->dictionaryLog, FL2_DICTLOG_MAX); /* allow for reduced dict in 32-bit version */ - rParams->match_buffer_log = params->bufferLog; + + RMF_parameters* const rParams = &cctx->params.rParams; + rParams->dictionary_size = MIN(params->dictionarySize, FL2_DICTSIZE_MAX); /* allows for reduced dict in 32-bit version */ + rParams->match_buffer_log = RMF_BUFFER_LOG_BASE - params->bufferLog; rParams->overlap_fraction = params->overlapFraction; - rParams->block_size_log = rParams->dictionary_log + 2; rParams->divide_and_conquer = params->divideAndConquer; rParams->depth = params->searchDepth; -} - -FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void) -{ - return FL2_createCCtxMt(1); -} - -FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads) -{ - FL2_CCtx* cctx; - -#ifndef FL2_SINGLETHREAD - if (!nbThreads) { - nbThreads = UTIL_countPhysicalCores(); - nbThreads += !nbThreads; - } - if (nbThreads > FL2_MAXTHREADS) { - nbThreads = FL2_MAXTHREADS; - } -#else - nbThreads = 1; +#ifdef RMF_REFERENCE + rParams->use_ref_mf = 1; #endif +} + +static FL2_CCtx* FL2_createCCtx_internal(unsigned nbThreads, int const dualBuffer) +{ + nbThreads = FL2_checkNbThreads(nbThreads); DEBUGLOG(3, "FL2_createCCtxMt : %u threads", nbThreads); - cctx = malloc(sizeof(FL2_CCtx) + (nbThreads - 1) * sizeof(FL2_job)); + FL2_CCtx* const cctx = calloc(1, sizeof(FL2_CCtx) + (nbThreads - 1) * sizeof(FL2_job)); if (cctx == NULL) return NULL; cctx->jobCount = nbThreads; - for (unsigned u = 0; u < nbThreads; ++u) { + for (unsigned u = 0; u < nbThreads; ++u) cctx->jobs[u].enc = NULL; - } - cctx->params.highCompression = 0; - FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, FL2_CLEVEL_DEFAULT); #ifndef NO_XXHASH cctx->params.doXXH = 1; #endif - cctx->params.omitProp = 0; - -#ifdef RMF_REFERENCE - cctx->params.rParams.use_ref_mf = 0; -#endif cctx->matchTable = NULL; #ifndef FL2_SINGLETHREAD + cctx->compressThread = NULL; cctx->factory = FL2POOL_create(nbThreads - 1); if (nbThreads > 1 && cctx->factory == NULL) { FL2_freeCCtx(cctx); return NULL; } + if (dualBuffer) { + cctx->compressThread = FL2POOL_create(1); + if (cctx->compressThread == NULL) + return NULL; + } #endif for (unsigned u = 0; u < nbThreads; ++u) { - cctx->jobs[u].enc = FL2_lzma2Create(); + cctx->jobs[u].enc = LZMA2_createECtx(); if (cctx->jobs[u].enc == NULL) { FL2_freeCCtx(cctx); return NULL; } cctx->jobs[u].cctx = cctx; } - cctx->dictMax = 0; - cctx->block_total = 0; + + DICT_construct(&cctx->buf, dualBuffer); + + 
FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, FL2_CLEVEL_DEFAULT); + cctx->params.cParams.reset_interval = 4; return cctx; } +FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtx(void) +{ + return FL2_createCCtx_internal(1, 0); +} + +FL2LIB_API FL2_CCtx* FL2LIB_CALL FL2_createCCtxMt(unsigned nbThreads) +{ + return FL2_createCCtx_internal(nbThreads, 0); +} + FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx) { if (cctx == NULL) @@ -178,137 +203,116 @@ FL2LIB_API void FL2LIB_CALL FL2_freeCCtx(FL2_CCtx* cctx) DEBUGLOG(3, "FL2_freeCCtx : %u threads", cctx->jobCount); + DICT_destruct(&cctx->buf); + for (unsigned u = 0; u < cctx->jobCount; ++u) { - FL2_lzma2Free(cctx->jobs[u].enc); + LZMA2_freeECtx(cctx->jobs[u].enc); } #ifndef FL2_SINGLETHREAD FL2POOL_free(cctx->factory); + FL2POOL_free(cctx->compressThread); #endif RMF_freeMatchTable(cctx->matchTable); free(cctx); } -FL2LIB_API unsigned FL2LIB_CALL FL2_CCtx_nbThreads(const FL2_CCtx* cctx) +FL2LIB_API unsigned FL2LIB_CALL FL2_getCCtxThreadCount(const FL2_CCtx* cctx) { return cctx->jobCount; } /* FL2_buildRadixTable() : FL2POOL_function type */ -static void FL2_buildRadixTable(void* const jobDescription, size_t n) +static void FL2_buildRadixTable(void* const jobDescription, ptrdiff_t const n) { - const FL2_job* const job = (FL2_job*)jobDescription; - FL2_CCtx* const cctx = job->cctx; + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; - RMF_buildTable(cctx->matchTable, n, 1, cctx->curBlock, NULL, NULL, 0, 0); + RMF_buildTable(cctx->matchTable, n, 1, cctx->curBlock); } /* FL2_compressRadixChunk() : FL2POOL_function type */ -static void FL2_compressRadixChunk(void* const jobDescription, size_t n) +static void FL2_compressRadixChunk(void* const jobDescription, ptrdiff_t const n) { - const FL2_job* const job = (FL2_job*)jobDescription; - FL2_CCtx* const cctx = job->cctx; + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; - cctx->jobs[n].cSize = FL2_lzma2Encode(cctx->jobs[n].enc, cctx->matchTable, job->block, &cctx->params.cParams, NULL, NULL, 0, 0); + cctx->jobs[n].cSize = LZMA2_encode(cctx->jobs[n].enc, cctx->matchTable, + cctx->jobs[n].block, + &cctx->params.cParams, + -1, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); } static int FL2_initEncoders(FL2_CCtx* const cctx) { for(unsigned u = 0; u < cctx->jobCount; ++u) { - if (FL2_lzma2HashAlloc(cctx->jobs[u].enc, &cctx->params.cParams) != 0) + if (LZMA2_hashAlloc(cctx->jobs[u].enc, &cctx->params.cParams) != 0) return 1; } return 0; } -static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress, void* opaque) +static void FL2_initProgress(FL2_CCtx* const cctx) +{ + RMF_initProgress(cctx->matchTable); + cctx->progressIn = 0; + cctx->streamCsize += cctx->progressOut; + cctx->progressOut = 0; + cctx->canceled = 0; +} + +/* FL2_compressCurBlock_blocking() : + * Compress cctx->curBlock and wait until complete. 
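Example usage (a minimal sketch, assuming that a thread count of 0 selects auto-detection as described for the other entry points in this patch) of creating and releasing a multithreaded context and querying it with the renamed FL2_getCCtxThreadCount():

#include <stdio.h>
#include "fast-lzma2.h"

int main(void)
{
    /* 0 lets the library pick a thread count from the available cores. */
    FL2_CCtx *cctx = FL2_createCCtxMt(0);
    if (cctx == NULL)
        return 1;                       /* allocation failed */

    printf("using %u compression threads\n", FL2_getCCtxThreadCount(cctx));

    /* ... FL2_CCtx_setParameter() / FL2_compressCCtx() calls would go here ... */

    FL2_freeCCtx(cctx);
    return 0;
}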
+ * Write streamProp as the first byte if >= 0 + */ +static size_t FL2_compressCurBlock_blocking(FL2_CCtx* const cctx, int const streamProp) { size_t const encodeSize = (cctx->curBlock.end - cctx->curBlock.start); - size_t init_done; - U32 rmf_weight = ZSTD_highbit32((U32)cctx->curBlock.end); - U32 depth_weight = 2 + (cctx->params.rParams.depth >= 12) + (cctx->params.rParams.depth >= 28); - U32 enc_weight; - int err = 0; #ifndef FL2_SINGLETHREAD size_t mfThreads = cctx->curBlock.end / RMF_MIN_BYTES_PER_THREAD; - size_t nbThreads = MIN(cctx->jobCount, encodeSize / MIN_BYTES_PER_THREAD); + size_t nbThreads = MIN(cctx->jobCount, encodeSize / ENC_MIN_BYTES_PER_THREAD); nbThreads += !nbThreads; #else size_t mfThreads = 1; size_t nbThreads = 1; #endif - if (rmf_weight >= 20) { - rmf_weight = depth_weight * (rmf_weight - 10) + (rmf_weight - 19) * 12; - if (cctx->params.cParams.strategy == 0) - enc_weight = 20; - else if (cctx->params.cParams.strategy == 1) - enc_weight = 50; - else - enc_weight = 60 + cctx->params.cParams.second_dict_bits + ZSTD_highbit32(cctx->params.cParams.fast_length) * 3U; - rmf_weight = (rmf_weight << 4) / (rmf_weight + enc_weight); - enc_weight = 16 - rmf_weight; - } - else { - rmf_weight = 8; - enc_weight = 8; - } - DEBUGLOG(5, "FL2_compressCurBlock : %u threads, %u start, %u bytes", (U32)nbThreads, (U32)cctx->curBlock.start, (U32)encodeSize); - /* Free unsuitable match table before reallocating anything else */ - if (cctx->matchTable && !RMF_compatibleParameters(cctx->matchTable, &cctx->params.rParams, cctx->curBlock.end)) { - RMF_freeMatchTable(cctx->matchTable); - cctx->matchTable = NULL; + size_t sliceStart = cctx->curBlock.start; + size_t const sliceSize = encodeSize / nbThreads; + cctx->jobs[0].block.data = cctx->curBlock.data; + cctx->jobs[0].block.start = sliceStart; + cctx->jobs[0].block.end = sliceStart + sliceSize; + + for (size_t u = 1; u < nbThreads; ++u) { + sliceStart += sliceSize; + cctx->jobs[u].block.data = cctx->curBlock.data; + cctx->jobs[u].block.start = sliceStart; + cctx->jobs[u].block.end = sliceStart + sliceSize; } - - if(FL2_initEncoders(cctx) != 0) /* Create hash objects together, leaving the (large) match table last */ - return FL2_ERROR(memory_allocation); - - if (!cctx->matchTable) { - cctx->matchTable = RMF_createMatchTable(&cctx->params.rParams, cctx->curBlock.end, cctx->jobCount); - if (cctx->matchTable == NULL) - return FL2_ERROR(memory_allocation); - } - else { - DEBUGLOG(5, "Have compatible match table"); - RMF_applyParameters(cctx->matchTable, &cctx->params.rParams, cctx->curBlock.end); - } - - { size_t sliceStart = cctx->curBlock.start; - size_t sliceSize = encodeSize / nbThreads; - cctx->jobs[0].block.data = cctx->curBlock.data; - cctx->jobs[0].block.start = sliceStart; - cctx->jobs[0].block.end = sliceStart + sliceSize; - - for (size_t u = 1; u < nbThreads; ++u) { - sliceStart += sliceSize; - cctx->jobs[u].block.data = cctx->curBlock.data; - cctx->jobs[u].block.start = sliceStart; - cctx->jobs[u].block.end = sliceStart + sliceSize; - } - cctx->jobs[nbThreads - 1].block.end = cctx->curBlock.end; - } - - /* update largest dict size used */ - cctx->dictMax = MAX(cctx->dictMax, cctx->curBlock.end); + cctx->jobs[nbThreads - 1].block.end = cctx->curBlock.end; /* initialize to length 2 */ - init_done = RMF_initTable(cctx->matchTable, cctx->curBlock.data, cctx->curBlock.start, cctx->curBlock.end); + cctx->matchTable->progress = RMF_initTable(cctx->matchTable, cctx->curBlock.data, cctx->curBlock.end); + + if (cctx->canceled) { + 
RMF_resetIncompleteBuild(cctx->matchTable); + return FL2_ERROR(canceled); + } #ifndef FL2_SINGLETHREAD + mfThreads = MIN(RMF_threadCount(cctx->matchTable), mfThreads); - for (size_t u = 1; u < mfThreads; ++u) { - FL2POOL_add(cctx->factory, FL2_buildRadixTable, &cctx->jobs[u], u); - } + FL2POOL_addRange(cctx->factory, FL2_buildRadixTable, cctx, 1, mfThreads); + #endif - err = RMF_buildTable(cctx->matchTable, 0, mfThreads > 1, cctx->curBlock, progress, opaque, rmf_weight, init_done); + int err = RMF_buildTable(cctx->matchTable, 0, mfThreads > 1, cctx->curBlock); #ifndef FL2_SINGLETHREAD - FL2POOL_waitAll(cctx->factory); + FL2POOL_waitAll(cctx->factory, 0); if (err) return FL2_ERROR(canceled); @@ -319,12 +323,14 @@ static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress return FL2_ERROR(internal); #endif - for (size_t u = 1; u < nbThreads; ++u) { - FL2POOL_add(cctx->factory, FL2_compressRadixChunk, &cctx->jobs[u], u); - } + FL2POOL_addRange(cctx->factory, FL2_compressRadixChunk, cctx, 1, nbThreads); - cctx->jobs[0].cSize = FL2_lzma2Encode(cctx->jobs[0].enc, cctx->matchTable, cctx->jobs[0].block, &cctx->params.cParams, progress, opaque, (rmf_weight * encodeSize) >> 4, enc_weight * (U32)nbThreads); - FL2POOL_waitAll(cctx->factory); + cctx->jobs[0].cSize = LZMA2_encode(cctx->jobs[0].enc, cctx->matchTable, + cctx->jobs[0].block, + &cctx->params.cParams, streamProp, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); + + FL2POOL_waitAll(cctx->factory, 0); #else /* FL2_SINGLETHREAD */ @@ -336,88 +342,199 @@ static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, FL2_progressFn progress if (err) return FL2_ERROR(internal); #endif - cctx->jobs[0].cSize = FL2_lzma2Encode(cctx->jobs[0].enc, cctx->matchTable, cctx->jobs[0].block, &cctx->params.cParams, progress, opaque, (rmf_weight * encodeSize) >> 4, enc_weight); + cctx->jobs[0].cSize = LZMA2_encode(cctx->jobs[0].enc, cctx->matchTable, + cctx->jobs[0].block, + &cctx->params.cParams, streamProp, + &cctx->progressIn, &cctx->progressOut, &cctx->canceled); #endif - return nbThreads; + for (size_t u = 0; u < nbThreads; ++u) + if (FL2_isError(cctx->jobs[u].cSize)) + return cctx->jobs[u].cSize; + + cctx->threadCount = nbThreads; + + return FL2_error_no_error; } -FL2LIB_API void FL2LIB_CALL FL2_beginFrame(FL2_CCtx* const cctx) +/* FL2_compressCurBlock_async() : FL2POOL_function type */ +static void FL2_compressCurBlock_async(void* const jobDescription, ptrdiff_t const n) +{ + FL2_CCtx* const cctx = (FL2_CCtx*)jobDescription; + + cctx->asyncRes = FL2_compressCurBlock_blocking(cctx, (int)n); +} + +/* FL2_compressCurBlock() : + * Update total input size. + * Clear the compressed data buffers. + * Init progress info. + * Start compression of cctx->curBlock, and wait for completion if no async compression thread exists. 
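Example usage (a hedged sketch, assuming a nonzero timeout was previously set with FL2_setCStreamTimeout(), which is added later in this file) of polling an asynchronous block compression while reporting approximate progress:

#include <stdio.h>
#include "fast-lzma2.h"

/* Sketch: wait for a background block compression started by the streaming
 * functions, printing an approximate progress figure on each timeout. */
static size_t wait_with_progress(FL2_CStream *fcs)
{
    for (;;) {
        size_t const res = FL2_waitCStream(fcs);   /* returns the timedOut code on expiry */
        if (FL2_isTimedOut(res)) {
            unsigned long long const done = FL2_getCStreamProgress(fcs, NULL);
            printf("processed ~%llu input bytes so far\n", done);
            continue;
        }
        return res;   /* 0/1, or a real error testable with FL2_isError() */
    }
}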
+ */ +static size_t FL2_compressCurBlock(FL2_CCtx* const cctx, int const streamProp) +{ + FL2_initProgress(cctx); + + if (cctx->curBlock.start == cctx->curBlock.end) + return FL2_error_no_error; + + /* update largest dict size used */ + cctx->dictMax = MAX(cctx->dictMax, cctx->curBlock.end); + + cctx->outThread = 0; + cctx->threadCount = 0; + cctx->outPos = 0; + + U32 rmfWeight = ZSTD_highbit32((U32)cctx->curBlock.end); + U32 depthWeight = 2 + (cctx->params.rParams.depth >= 12) + (cctx->params.rParams.depth >= 28); + U32 encWeight; + + if (rmfWeight >= 20) { + rmfWeight = depthWeight * (rmfWeight - 10) + (rmfWeight - 19) * 12; + if (cctx->params.cParams.strategy == 0) + encWeight = 20; + else if (cctx->params.cParams.strategy == 1) + encWeight = 50; + else + encWeight = 60 + cctx->params.cParams.second_dict_bits + ZSTD_highbit32(cctx->params.cParams.fast_length) * 3U; + rmfWeight = (rmfWeight << 4) / (rmfWeight + encWeight); + encWeight = 16 - rmfWeight; + } + else { + rmfWeight = 8; + encWeight = 8; + } + + cctx->rmfWeight = rmfWeight; + cctx->encWeight = encWeight; + +#ifndef FL2_SINGLETHREAD + if(cctx->compressThread != NULL) + FL2POOL_add(cctx->compressThread, FL2_compressCurBlock_async, cctx, streamProp); + else +#endif + cctx->asyncRes = FL2_compressCurBlock_blocking(cctx, streamProp); + + return cctx->asyncRes; +} + +/* FL2_getProp() : + * Get the LZMA2 dictionary size property byte. If xxhash is enabled, includes the xxhash flag bit. + */ +static BYTE FL2_getProp(FL2_CCtx* const cctx, size_t const dictionarySize) +{ +#ifndef NO_XXHASH + return LZMA2_getDictSizeProp(dictionarySize) | (BYTE)((cctx->params.doXXH != 0) << FL2_PROP_HASH_BIT); +#else + (void)cctx; + return LZMA2_getDictSizeProp(dictionarySize); +#endif +} + +static void FL2_preBeginFrame(FL2_CCtx* const cctx, size_t const dictReduce) +{ + /* Free unsuitable match table before reallocating anything else */ + if (cctx->matchTable && !RMF_compatibleParameters(cctx->matchTable, &cctx->params.rParams, dictReduce)) { + RMF_freeMatchTable(cctx->matchTable); + cctx->matchTable = NULL; + } +} + +static size_t FL2_beginFrame(FL2_CCtx* const cctx, size_t const dictReduce) +{ + if (FL2_initEncoders(cctx) != 0) /* Create hash objects together, leaving the (large) match table last */ + return FL2_ERROR(memory_allocation); + + if (!cctx->matchTable) { + cctx->matchTable = RMF_createMatchTable(&cctx->params.rParams, dictReduce, cctx->jobCount); + if (cctx->matchTable == NULL) + return FL2_ERROR(memory_allocation); + } + else { + DEBUGLOG(5, "Have compatible match table"); + RMF_applyParameters(cctx->matchTable, &cctx->params.rParams, dictReduce); + } + + cctx->dictMax = 0; + cctx->streamTotal = 0; + cctx->streamCsize = 0; + cctx->progressIn = 0; + cctx->progressOut = 0; + RMF_initProgress(cctx->matchTable); + cctx->asyncRes = 0; + cctx->outThread = 0; + cctx->threadCount = 0; + cctx->outPos = 0; + cctx->curBlock.start = 0; + cctx->curBlock.end = 0; + cctx->lockParams = 1; + + return FL2_error_no_error; +} + +static void FL2_endFrame(FL2_CCtx* const cctx) { cctx->dictMax = 0; - cctx->block_total = 0; + cctx->asyncRes = 0; + cctx->lockParams = 0; } -static size_t FL2_compressBlock(FL2_CCtx* const cctx, - const void* const src, size_t srcStart, size_t const srcEnd, - void* const dst, size_t dstCapacity, - FL2_writerFn const writeFn, void* const opaque, - FL2_progressFn progress) +/* Compress a memory buffer which may be larger than the dictionary. + * The property byte is written first unless the omit flag is set. 
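Example usage (a hedged sketch; the helper is illustrative and the thread count of 0 assumes auto-detection) of one-shot compression of a whole buffer through the public FL2_compressMt() entry point, with the destination sized by FL2_compressBound():

#include <stdlib.h>
#include <stdio.h>
#include "fast-lzma2.h"

static size_t compress_once(const void *src, size_t srcSize, int level)
{
    size_t const bound = FL2_compressBound(srcSize);
    void *dst = malloc(bound);
    if (dst == NULL)
        return 0;

    /* Multithreaded one-shot compression; 0 threads = auto-detect. */
    size_t const cSize = FL2_compressMt(dst, bound, src, srcSize, level, 0);
    if (FL2_isError(cSize))
        fprintf(stderr, "compression failed: %s\n", FL2_getErrorName(cSize));
    else
        printf("%zu -> %zu bytes\n", srcSize, cSize);

    free(dst);
    return FL2_isError(cSize) ? 0 : cSize;
}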
+ * Return: compressed size. + */ +static size_t FL2_compressBuffer(FL2_CCtx* const cctx, + const void* const src, size_t srcSize, + void* const dst, size_t dstCapacity) { - BYTE* dstBuf = dst; - size_t outSize = 0; - size_t const dictionary_size = (size_t)1 << cctx->params.rParams.dictionary_log; - size_t const block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (srcStart >= srcEnd) + if (srcSize == 0) return 0; + + BYTE* dstBuf = dst; + size_t const dictionarySize = cctx->params.rParams.dictionary_size; + size_t const blockOverlap = OVERLAP_FROM_DICT_SIZE(dictionarySize, cctx->params.rParams.overlap_fraction); + int streamProp = cctx->params.omitProp ? -1 : FL2_getProp(cctx, MIN(srcSize, dictionarySize)); + cctx->curBlock.data = src; - cctx->curBlock.start = srcStart; + cctx->curBlock.start = 0; - while (srcStart < srcEnd) { - size_t nbThreads; + size_t blockTotal = 0; - cctx->curBlock.end = cctx->curBlock.start + MIN(srcEnd - srcStart, dictionary_size - cctx->curBlock.start); + do { + cctx->curBlock.end = cctx->curBlock.start + MIN(srcSize, dictionarySize - cctx->curBlock.start); + blockTotal += cctx->curBlock.end - cctx->curBlock.start; - nbThreads = FL2_compressCurBlock(cctx, progress, opaque); - if (FL2_isError(nbThreads)) - return nbThreads; + CHECK_F(FL2_compressCurBlock(cctx, streamProp)); - for (size_t u = 0; u < nbThreads; ++u) { - const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[u].block.start); - - if (FL2_isError(cctx->jobs[u].cSize)) - return cctx->jobs[u].cSize; + streamProp = -1; + for (size_t u = 0; u < cctx->threadCount; ++u) { DEBUGLOG(5, "Write thread %u : %u bytes", (U32)u, (U32)cctx->jobs[u].cSize); - if (writeFn == NULL && dstCapacity < cctx->jobs[u].cSize) { + if (dstCapacity < cctx->jobs[u].cSize) return FL2_ERROR(dstSize_tooSmall); - } - if (writeFn != NULL) { - if(writeFn(outBuf, cctx->jobs[u].cSize, opaque)) - return FL2_ERROR(write_failed); - outSize += cctx->jobs[u].cSize; - } - else { - memcpy(dstBuf, outBuf, cctx->jobs[u].cSize); - dstBuf += cctx->jobs[u].cSize; - dstCapacity -= cctx->jobs[u].cSize; - } + + const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[u].block.start); + memcpy(dstBuf, outBuf, cctx->jobs[u].cSize); + + dstBuf += cctx->jobs[u].cSize; + dstCapacity -= cctx->jobs[u].cSize; } - srcStart += cctx->curBlock.end - cctx->curBlock.start; - cctx->block_total += cctx->curBlock.end - cctx->curBlock.start; - if (cctx->params.rParams.block_size_log && cctx->block_total + MIN(cctx->curBlock.end - block_overlap, srcEnd - srcStart) > ((U64)1 << cctx->params.rParams.block_size_log)) { + srcSize -= cctx->curBlock.end - cctx->curBlock.start; + if (cctx->params.cParams.reset_interval + && blockTotal + MIN(dictionarySize - blockOverlap, srcSize) > dictionarySize * cctx->params.cParams.reset_interval) { /* periodically reset the dictionary for mt decompression */ + DEBUGLOG(4, "Resetting dictionary after %u bytes", (unsigned)blockTotal); cctx->curBlock.start = 0; - cctx->block_total = 0; + blockTotal = 0; } else { - cctx->curBlock.start = block_overlap; + cctx->curBlock.start = blockOverlap; } cctx->curBlock.data += cctx->curBlock.end - cctx->curBlock.start; - } - return (writeFn != NULL) ? 
outSize : dstBuf - (const BYTE*)dst; -} - -static BYTE FL2_getProp(FL2_CCtx* cctx, size_t dictionary_size) -{ - return FL2_getDictSizeProp(dictionary_size) -#ifndef NO_XXHASH - | (BYTE)((cctx->params.doXXH != 0) << FL2_PROP_HASH_BIT) -#endif - ; + } while (srcSize != 0); + return dstBuf - (const BYTE*)dst; } FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, @@ -425,31 +542,39 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, const void* src, size_t srcSize, int compressionLevel) { - BYTE* dstBuf = dst; - BYTE* const end = dstBuf + dstCapacity; - size_t cSize = 0; + if (dstCapacity < 2U - cctx->params.omitProp) /* empty LZMA2 stream is byte sequence {0, 0} */ + return FL2_ERROR(dstSize_tooSmall); if (compressionLevel > 0) FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, compressionLevel); DEBUGLOG(4, "FL2_compressCCtx : level %u, %u src => %u avail", cctx->params.compressionLevel, (U32)srcSize, (U32)dstCapacity); - if (dstCapacity < 2U - cctx->params.omitProp) /* empty LZMA2 stream is byte sequence {0, 0} */ - return FL2_ERROR(dstSize_tooSmall); +#ifndef FL2_SINGLETHREAD + /* No async compression for in-memory function */ + FL2POOL_free(cctx->compressThread); + cctx->compressThread = NULL; + cctx->timeout = 0; +#endif - FL2_beginFrame(cctx); + FL2_preBeginFrame(cctx, srcSize); + CHECK_F(FL2_beginFrame(cctx, srcSize)); - dstBuf += !cctx->params.omitProp; - cSize = FL2_compressBlock(cctx, src, 0, srcSize, dstBuf, end - dstBuf, NULL, NULL, NULL); - if(!cctx->params.omitProp) - dstBuf[-1] = FL2_getProp(cctx, cctx->dictMax); + size_t const cSize = FL2_compressBuffer(cctx, src, srcSize, dst, dstCapacity); if (FL2_isError(cSize)) return cSize; + BYTE* dstBuf = dst; + BYTE* const end = dstBuf + dstCapacity; + dstBuf += cSize; if(dstBuf >= end) return FL2_ERROR(dstSize_tooSmall); + + if (cSize == 0) + *dstBuf++ = FL2_getProp(cctx, 0); + *dstBuf++ = LZMA2_END_MARKER; #ifndef NO_XXHASH @@ -463,100 +588,25 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtx(FL2_CCtx* cctx, dstBuf += XXHASH_SIZEOF; } #endif + + FL2_endFrame(cctx); + return dstBuf - (BYTE*)dst; } -FL2LIB_API size_t FL2LIB_CALL FL2_blockOverlap(const FL2_CCtx* cctx) -{ - return OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); -} - -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock(FL2_CCtx* cctx, FL2_blockBuffer *block) -{ - FL2_shiftBlock_switch(cctx, block, NULL); -} - -FL2LIB_API void FL2LIB_CALL FL2_shiftBlock_switch(FL2_CCtx* cctx, FL2_blockBuffer *block, unsigned char *dst) -{ - size_t const block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (block_overlap == 0) { - block->start = 0; - block->end = 0; - } - else if (block->end > block_overlap) { - size_t const from = (block->end - block_overlap) & ALIGNMENT_MASK; - size_t overlap = block->end - from; - - cctx->block_total += block->end - block->start; - if (cctx->params.rParams.block_size_log && cctx->block_total + from > ((U64)1 << cctx->params.rParams.block_size_log)) { - /* periodically reset the dictionary for mt decompression */ - overlap = 0; - cctx->block_total = 0; - } - else if (overlap <= from || dst != NULL) { - DEBUGLOG(5, "Copy overlap data : %u bytes", (U32)overlap); - memcpy(dst ? 
dst : block->data, block->data + from, overlap); - } - else if (from != 0) { - DEBUGLOG(5, "Move overlap data : %u bytes", (U32)overlap); - memmove(block->data, block->data + from, overlap); - } - block->start = overlap; - block->end = overlap; - } - else { - block->start = block->end; - } -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock(FL2_CCtx* cctx, - void* dst, size_t dstCapacity, - const FL2_blockBuffer *block, - FL2_progressFn progress, void* opaque) -{ - return FL2_compressBlock(cctx, block->data, block->start, block->end, dst, dstCapacity, NULL, opaque, progress); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame(FL2_CCtx* ctx, - void* dst, size_t dstCapacity) -{ - if (!dstCapacity) - return FL2_ERROR(dstSize_tooSmall); - *(BYTE*)dst = LZMA2_END_MARKER; - return 1; -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressCCtxBlock_toFn(FL2_CCtx* cctx, - FL2_writerFn writeFn, void* opaque, - const FL2_blockBuffer *block, - FL2_progressFn progress) -{ - return FL2_compressBlock(cctx, block->data, block->start, block->end, NULL, 0, writeFn, opaque, progress); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endFrame_toFn(FL2_CCtx* ctx, - FL2_writerFn writeFn, void* opaque) -{ - BYTE c = LZMA2_END_MARKER; - if(writeFn(&c, 1, opaque)) - return FL2_ERROR(write_failed); - return 1; -} - FL2LIB_API size_t FL2LIB_CALL FL2_compressMt(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel, unsigned nbThreads) { - size_t cSize; FL2_CCtx* const cctx = FL2_createCCtxMt(nbThreads); if (cctx == NULL) return FL2_ERROR(memory_allocation); - cSize = FL2_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + size_t const cSize = FL2_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); FL2_freeCCtx(cctx); + return cSize; } @@ -567,462 +617,691 @@ FL2LIB_API size_t FL2LIB_CALL FL2_compress(void* dst, size_t dstCapacity, return FL2_compressMt(dst, dstCapacity, src, srcSize, compressionLevel, 1); } -FL2LIB_API BYTE FL2LIB_CALL FL2_dictSizeProp(FL2_CCtx* cctx) +FL2LIB_API BYTE FL2LIB_CALL FL2_getCCtxDictProp(FL2_CCtx* cctx) { - return FL2_getDictSizeProp(cctx->dictMax ? cctx->dictMax : (size_t)1 << cctx->params.rParams.dictionary_log); + return LZMA2_getDictSizeProp(cctx->dictMax ? 
cctx->dictMax : cctx->params.rParams.dictionary_size); } -#define CLAMPCHECK(val,min,max) { \ +#define MAXCHECK(val,max) do { \ + if ((val)>(max)) { \ + return FL2_ERROR(parameter_outOfBound); \ +} } while(0) + +#define CLAMPCHECK(val,min,max) do { \ if (((val)<(min)) | ((val)>(max))) { \ return FL2_ERROR(parameter_outOfBound); \ -} } +} } while(0) -FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, unsigned value) + +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_setParameter(FL2_CCtx* cctx, FL2_cParameter param, size_t value) +{ + if (cctx->lockParams + && param != FL2_p_literalCtxBits && param != FL2_p_literalPosBits && param != FL2_p_posBits) + return FL2_ERROR(stage_wrong); + + switch (param) + { + case FL2_p_compressionLevel: + if (cctx->params.highCompression) { + CLAMPCHECK(value, 1, FL2_MAX_HIGH_CLEVEL); + FL2_fillParameters(cctx, &FL2_highCParameters[value]); + } + else { + CLAMPCHECK(value, 1, FL2_MAX_CLEVEL); + FL2_fillParameters(cctx, &FL2_defaultCParameters[value]); + } + cctx->params.compressionLevel = (unsigned)value; + break; + + case FL2_p_highCompression: + cctx->params.highCompression = value != 0; + FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, cctx->params.compressionLevel); + break; + + case FL2_p_dictionaryLog: + CLAMPCHECK(value, FL2_DICTLOG_MIN, FL2_DICTLOG_MAX); + cctx->params.rParams.dictionary_size = (size_t)1 << value; + break; + + case FL2_p_dictionarySize: + CLAMPCHECK(value, FL2_DICTSIZE_MIN, FL2_DICTSIZE_MAX); + cctx->params.rParams.dictionary_size = value; + break; + + case FL2_p_overlapFraction: + MAXCHECK(value, FL2_BLOCK_OVERLAP_MAX); + cctx->params.rParams.overlap_fraction = (unsigned)value; + break; + + case FL2_p_resetInterval: + if (value != 0) + CLAMPCHECK(value, FL2_RESET_INTERVAL_MIN, FL2_RESET_INTERVAL_MAX); + cctx->params.cParams.reset_interval = (unsigned)value; + break; + + case FL2_p_bufferLog: + MAXCHECK(value, FL2_BUFFER_SIZE_LOG_MAX); + cctx->params.rParams.match_buffer_log = RMF_BUFFER_LOG_BASE - (unsigned)value; + break; + + case FL2_p_hybridChainLog: + CLAMPCHECK(value, FL2_CHAINLOG_MIN, FL2_CHAINLOG_MAX); + cctx->params.cParams.second_dict_bits = (unsigned)value; + break; + + case FL2_p_hybridCycles: + CLAMPCHECK(value, FL2_HYBRIDCYCLES_MIN, FL2_HYBRIDCYCLES_MAX); + cctx->params.cParams.match_cycles = (unsigned)value; + break; + + case FL2_p_searchDepth: + CLAMPCHECK(value, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); + cctx->params.rParams.depth = (unsigned)value; + break; + + case FL2_p_fastLength: + CLAMPCHECK(value, FL2_FASTLENGTH_MIN, FL2_FASTLENGTH_MAX); + cctx->params.cParams.fast_length = (unsigned)value; + break; + + case FL2_p_divideAndConquer: + cctx->params.rParams.divide_and_conquer = value != 0; + break; + + case FL2_p_strategy: + MAXCHECK(value, (unsigned)FL2_ultra); + cctx->params.cParams.strategy = (FL2_strategy)value; + break; + + /* lc, lp, pb can be changed between encoder chunks. + * A condition where lc+lp > 4 is permitted to allow sequential setting, + * but will return an error code to alert the calling function. + * If lc+lp is still >4 when encoding begins, lc will be reduced. 
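Example usage (a short sketch of the sequential lc/lp setting described in the comment above; only the final call is checked because it re-validates the pair with the updated values):

#include "fast-lzma2.h"

/* Sketch: set lc then lp. A transient lc+lp > 4 is reported by the first
 * call but is harmless as long as the final combination is valid. */
static size_t set_literal_params(FL2_CCtx *cctx, unsigned lc, unsigned lp)
{
    FL2_CCtx_setParameter(cctx, FL2_p_literalCtxBits, lc); /* may report lclpMax_exceeded */
    size_t const res = FL2_CCtx_setParameter(cctx, FL2_p_literalPosBits, lp);
    return FL2_isError(res) ? res : 0;
}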
*/ + case FL2_p_literalCtxBits: + MAXCHECK(value, FL2_LC_MAX); + cctx->params.cParams.lc = (unsigned)value; + if (value + cctx->params.cParams.lp > FL2_LCLP_MAX) + return FL2_ERROR(lclpMax_exceeded); + break; + + case FL2_p_literalPosBits: + MAXCHECK(value, FL2_LP_MAX); + cctx->params.cParams.lp = (unsigned)value; + if (cctx->params.cParams.lc + value > FL2_LCLP_MAX) + return FL2_ERROR(lclpMax_exceeded); + break; + + case FL2_p_posBits: + MAXCHECK(value, FL2_PB_MAX); + cctx->params.cParams.pb = (unsigned)value; + break; + +#ifndef NO_XXHASH + case FL2_p_doXXHash: + cctx->params.doXXH = value != 0; + break; +#endif + + case FL2_p_omitProperties: + cctx->params.omitProp = value != 0; + break; +#ifdef RMF_REFERENCE + case FL2_p_useReferenceMF: + cctx->params.rParams.use_ref_mf = value != 0; + break; +#endif + default: return FL2_ERROR(parameter_unsupported); + } + return value; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_CCtx_getParameter(FL2_CCtx* cctx, FL2_cParameter param) { switch (param) { case FL2_p_compressionLevel: - if (value > 0) { /* 0 : does not change current level */ - if (cctx->params.highCompression) { - if ((int)value > FL2_MAX_HIGH_CLEVEL) value = FL2_MAX_HIGH_CLEVEL; - FL2_fillParameters(cctx, &FL2_highCParameters[value]); - } - else { - if ((int)value > FL2_MAX_CLEVEL) value = FL2_MAX_CLEVEL; - FL2_fillParameters(cctx, &FL2_defaultCParameters[value]); - } - cctx->params.compressionLevel = value; - } return cctx->params.compressionLevel; case FL2_p_highCompression: - if ((int)value >= 0) { /* < 0 : does not change highCompression */ - cctx->params.highCompression = value != 0; - FL2_CCtx_setParameter(cctx, FL2_p_compressionLevel, cctx->params.compressionLevel); - } return cctx->params.highCompression; - case FL2_p_7zLevel: - if (value > 0) { /* 0 : does not change current level */ - if ((int)value > FL2_MAX_7Z_CLEVEL) value = FL2_MAX_7Z_CLEVEL; - FL2_fillParameters(cctx, &FL2_7zCParameters[value]); - cctx->params.compressionLevel = value; - } - return cctx->params.compressionLevel; + case FL2_p_dictionaryLog: { + size_t dictLog = FL2_DICTLOG_MIN; + while (((size_t)1 << dictLog) < cctx->params.rParams.dictionary_size) + ++dictLog; + return dictLog; + } - case FL2_p_dictionaryLog: - if (value) { /* 0 : does not change current dictionaryLog */ - CLAMPCHECK(value, FL2_DICTLOG_MIN, FL2_DICTLOG_MAX); - cctx->params.rParams.dictionary_log = value; - } - return cctx->params.rParams.dictionary_log; + case FL2_p_dictionarySize: + return cctx->params.rParams.dictionary_size; case FL2_p_overlapFraction: - if ((int)value >= 0) { /* < 0 : does not change current overlapFraction */ - CLAMPCHECK(value, FL2_BLOCK_OVERLAP_MIN, FL2_BLOCK_OVERLAP_MAX); - cctx->params.rParams.overlap_fraction = value; - } return cctx->params.rParams.overlap_fraction; - case FL2_p_blockSize: - if ((int)value >= 0) { /* < 0 : does not change current overlapFraction */ - CLAMPCHECK(value, FL2_BLOCK_LOG_MIN, FL2_BLOCK_LOG_MAX); - cctx->params.rParams.block_size_log = value; - } - return cctx->params.rParams.block_size_log; + case FL2_p_resetInterval: + return cctx->params.cParams.reset_interval; case FL2_p_bufferLog: - if (value) { /* 0 : does not change current bufferLog */ - CLAMPCHECK(value, FL2_BUFFER_SIZE_LOG_MIN, FL2_BUFFER_SIZE_LOG_MAX); - cctx->params.rParams.match_buffer_log = value; - } - return cctx->params.rParams.match_buffer_log; + return RMF_BUFFER_LOG_BASE - cctx->params.rParams.match_buffer_log; - case FL2_p_chainLog: - if (value) { /* 0 : does not change current chainLog */ - CLAMPCHECK(value, 
FL2_CHAINLOG_MIN, FL2_CHAINLOG_MAX); - cctx->params.cParams.second_dict_bits = value; - } + case FL2_p_hybridChainLog: return cctx->params.cParams.second_dict_bits; - case FL2_p_searchLog: - if ((int)value >= 0) { /* < 0 : does not change current searchLog */ - CLAMPCHECK(value, FL2_SEARCHLOG_MIN, FL2_SEARCHLOG_MAX); - cctx->params.cParams.match_cycles = 1U << value; - } - return value; + case FL2_p_hybridCycles: + return cctx->params.cParams.match_cycles; case FL2_p_literalCtxBits: - if ((int)value >= 0) { /* < 0 : does not change current lc */ - CLAMPCHECK(value, FL2_LC_MIN, FL2_LC_MAX); - cctx->params.cParams.lc = value; - } return cctx->params.cParams.lc; case FL2_p_literalPosBits: - if ((int)value >= 0) { /* < 0 : does not change current lp */ - CLAMPCHECK(value, FL2_LP_MIN, FL2_LP_MAX); - cctx->params.cParams.lp = value; - } return cctx->params.cParams.lp; case FL2_p_posBits: - if ((int)value >= 0) { /* < 0 : does not change current pb */ - CLAMPCHECK(value, FL2_PB_MIN, FL2_PB_MAX); - cctx->params.cParams.pb = value; - } return cctx->params.cParams.pb; case FL2_p_searchDepth: - if (value) { /* 0 : does not change current depth */ - CLAMPCHECK(value, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); - cctx->params.rParams.depth = value; - } return cctx->params.rParams.depth; case FL2_p_fastLength: - if (value) { /* 0 : does not change current fast_length */ - CLAMPCHECK(value, FL2_FASTLENGTH_MIN, FL2_FASTLENGTH_MAX); - cctx->params.cParams.fast_length = value; - } return cctx->params.cParams.fast_length; case FL2_p_divideAndConquer: - if ((int)value >= 0) { /* < 0 : does not change current divide_and_conquer */ - cctx->params.rParams.divide_and_conquer = value; - } return cctx->params.rParams.divide_and_conquer; case FL2_p_strategy: - if ((int)value >= 0) { /* < 0 : does not change current strategy */ - CLAMPCHECK(value, (unsigned)FL2_fast, (unsigned)FL2_ultra); - cctx->params.cParams.strategy = (FL2_strategy)value; - } return (size_t)cctx->params.cParams.strategy; #ifndef NO_XXHASH case FL2_p_doXXHash: - if ((int)value >= 0) { /* < 0 : does not change doXXHash */ - cctx->params.doXXH = value != 0; - } return cctx->params.doXXH; #endif case FL2_p_omitProperties: - if ((int)value >= 0) { /* < 0 : does not change omitProp */ - cctx->params.omitProp = value != 0; - } return cctx->params.omitProp; #ifdef RMF_REFERENCE case FL2_p_useReferenceMF: - if ((int)value >= 0) { /* < 0 : does not change useRefMF */ - cctx->params.rParams.use_ref_mf = value != 0; - } return cctx->params.rParams.use_ref_mf; #endif default: return FL2_ERROR(parameter_unsupported); } } -FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void) +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, size_t value) { - FL2_CCtx* const cctx = FL2_createCCtx(); - FL2_CStream* const fcs = malloc(sizeof(FL2_CStream)); - - DEBUGLOG(3, "FL2_createCStream"); - - if (cctx == NULL || fcs == NULL) { - free(cctx); - free(fcs); - return NULL; - } - fcs->cctx = cctx; - fcs->inBuff.bufSize = 0; - fcs->inBuff.data = NULL; - fcs->inBuff.start = 0; - fcs->inBuff.end = 0; -#ifndef NO_XXHASH - fcs->xxh = NULL; -#endif - fcs->out_thread = 0; - fcs->thread_count = 0; - fcs->out_pos = 0; - fcs->hash_pos = 0; - fcs->end_marked = 0; - fcs->wrote_prop = 0; - return fcs; + return FL2_CCtx_setParameter(fcs, param, value); } -FL2LIB_API size_t FL2LIB_CALL FL2_freeCStream(FL2_CStream* fcs) +FL2LIB_API size_t FL2LIB_CALL FL2_CStream_getParameter(FL2_CStream* fcs, FL2_cParameter param) { - if (fcs == NULL) - return 
0; + return FL2_CCtx_getParameter(fcs, param); +} - DEBUGLOG(3, "FL2_freeCStream"); +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStream(void) +{ + return FL2_createCCtx_internal(1, 0); +} - free(fcs->inBuff.data); -#ifndef NO_XXHASH - XXH32_freeState(fcs->xxh); -#endif - FL2_freeCCtx(fcs->cctx); - free(fcs); - return 0; +FL2LIB_API FL2_CStream* FL2LIB_CALL FL2_createCStreamMt(unsigned nbThreads, int dualBuffer) +{ + return FL2_createCCtx_internal(nbThreads, dualBuffer); +} + +FL2LIB_API void FL2LIB_CALL FL2_freeCStream(FL2_CStream * fcs) +{ + FL2_freeCCtx(fcs); } FL2LIB_API size_t FL2LIB_CALL FL2_initCStream(FL2_CStream* fcs, int compressionLevel) { DEBUGLOG(4, "FL2_initCStream level %d", compressionLevel); - fcs->inBuff.start = 0; - fcs->inBuff.end = 0; - fcs->out_thread = 0; - fcs->thread_count = 0; - fcs->out_pos = 0; - fcs->hash_pos = 0; - fcs->end_marked = 0; - fcs->wrote_prop = 0; + fcs->endMarked = 0; + fcs->wroteProp = 0; + fcs->loopCount = 0; - FL2_CCtx_setParameter(fcs->cctx, FL2_p_compressionLevel, compressionLevel); + if(compressionLevel > 0) + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, compressionLevel); -#ifndef NO_XXHASH - if (fcs->cctx->params.doXXH && !fcs->cctx->params.omitProp) { - if (fcs->xxh == NULL) { - fcs->xxh = XXH32_createState(); - if (fcs->xxh == NULL) - return FL2_ERROR(memory_allocation); - } - XXH32_reset(fcs->xxh, 0); - } + DICT_buffer *const buf = &fcs->buf; + size_t const dictSize = fcs->params.rParams.dictionary_size; + + /* Free unsuitable objects before reallocating anything new */ + if (DICT_size(buf) < dictSize) + DICT_destruct(buf); + + FL2_preBeginFrame(fcs, 0); + +#ifdef NO_XXHASH + int const doHash = 0; +#else + int const doHash = (fcs->params.doXXH && !fcs->params.omitProp); #endif + size_t dictOverlap = OVERLAP_FROM_DICT_SIZE(fcs->params.rParams.dictionary_size, fcs->params.rParams.overlap_fraction); + if (DICT_init(buf, dictSize, dictOverlap, fcs->params.cParams.reset_interval, doHash) != 0) + return FL2_ERROR(memory_allocation); + + CHECK_F(FL2_beginFrame(fcs, 0)); - FL2_beginFrame(fcs->cctx); return 0; } -static size_t FL2_compressStream_internal(FL2_CStream* const fcs, - FL2_outBuffer* const output, int const ending) +FL2LIB_API size_t FL2LIB_CALL FL2_setCStreamTimeout(FL2_CStream * fcs, unsigned timeout) { - FL2_CCtx* const cctx = fcs->cctx; - - if (output->pos >= output->size) - return 0; - - if (fcs->out_thread == fcs->thread_count) { - if (fcs->inBuff.start < fcs->inBuff.end) { -#ifndef NO_XXHASH - if (cctx->params.doXXH && !cctx->params.omitProp) { - XXH32_update(fcs->xxh, fcs->inBuff.data + fcs->inBuff.start, fcs->inBuff.end - fcs->inBuff.start); - } -#endif - cctx->curBlock.data = fcs->inBuff.data; - cctx->curBlock.start = fcs->inBuff.start; - cctx->curBlock.end = fcs->inBuff.end; - - fcs->out_thread = 0; - fcs->thread_count = FL2_compressCurBlock(cctx, NULL, NULL); - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - fcs->inBuff.start = fcs->inBuff.end; - } - if (!fcs->wrote_prop && !cctx->params.omitProp) { - size_t dictionary_size = ending ? 
cctx->dictMax : (size_t)1 << cctx->params.rParams.dictionary_log; - ((BYTE*)output->dst)[output->pos] = FL2_getProp(cctx, dictionary_size); - DEBUGLOG(4, "Writing property byte : 0x%X", ((BYTE*)output->dst)[output->pos]); - ++output->pos; - fcs->wrote_prop = 1; +#ifndef FL2_SINGLETHREAD + if (timeout != 0) { + if (fcs->compressThread == NULL) { + fcs->compressThread = FL2POOL_create(1); + if (fcs->compressThread == NULL) + return FL2_ERROR(memory_allocation); } } - for (; fcs->out_thread < fcs->thread_count; ++fcs->out_thread) { - const BYTE* const outBuf = RMF_getTableAsOutputBuffer(cctx->matchTable, cctx->jobs[fcs->out_thread].block.start) + fcs->out_pos; + else if (!DICT_async(&fcs->buf) && fcs->dictMax == 0) { + /* Only free the thread if not dual buffering and compression not underway */ + FL2POOL_free(fcs->compressThread); + fcs->compressThread = NULL; + } + fcs->timeout = timeout; +#endif + return FL2_error_no_error; +} + +static size_t FL2_compressStream_internal(FL2_CStream* const fcs, int const ending) +{ + CHECK_F(FL2_waitCStream(fcs)); + + DICT_buffer *const buf = &fcs->buf; + + /* no compression can occur while compressed output exists */ + if (fcs->outThread == fcs->threadCount && DICT_hasUnprocessed(buf)) { + fcs->streamTotal += fcs->curBlock.end - fcs->curBlock.start; + + DICT_getBlock(buf, &fcs->curBlock); + + int streamProp = -1; + + if (!fcs->wroteProp && !fcs->params.omitProp) { + /* If the LZMA2 property byte is required and not already written, + * pass it to the compression function + */ + size_t dictionarySize = ending ? MAX(fcs->dictMax, fcs->curBlock.end) + : fcs->params.rParams.dictionary_size; + streamProp = FL2_getProp(fcs, dictionarySize); + DEBUGLOG(4, "Writing property byte : 0x%X", streamProp); + fcs->wroteProp = 1; + } + + CHECK_F(FL2_compressCurBlock(fcs, streamProp)); + } + return FL2_error_no_error; +} + +/* Copy the compressed output stored in the match table buffer. + * One slice exists per thread. 
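Example usage (a hedged sketch of a conventional streaming loop over FL2_compressStream()/FL2_endStream(); the buffer sizes are arbitrary, and FL2_inBuffer/FL2_outBuffer fields are assigned by the member names used throughout this file rather than by initializer order):

#include <stdio.h>
#include "fast-lzma2.h"

/* Sketch: compress the contents of fin to fout with a single-buffer stream. */
static int stream_compress(FILE *fin, FILE *fout, int level)
{
    static char inBuf[64 * 1024], outBuf[64 * 1024];
    FL2_CStream *fcs = FL2_createCStreamMt(0, 0 /* no dual buffering */);
    if (fcs == NULL)
        return 1;

    size_t res = FL2_initCStream(fcs, level);
    int last = 0;

    while (!FL2_isError(res) && !last) {
        FL2_inBuffer input;
        input.src = inBuf;
        input.size = fread(inBuf, 1, sizeof inBuf, fin);
        input.pos = 0;
        last = input.size < sizeof inBuf;

        do {   /* feed the dictionary; drain compressed output as it appears */
            FL2_outBuffer output;
            output.dst = outBuf;
            output.size = sizeof outBuf;
            output.pos = 0;
            res = FL2_compressStream(fcs, &output, &input);
            fwrite(outBuf, 1, output.pos, fout);
        } while (!FL2_isError(res) && input.pos < input.size);
    }

    while (!FL2_isError(res)) {   /* flush, then write the end marker and hash */
        FL2_outBuffer output;
        output.dst = outBuf;
        output.size = sizeof outBuf;
        output.pos = 0;
        res = FL2_endStream(fcs, &output);
        fwrite(outBuf, 1, output.pos, fout);
        if (res == 0)
            break;
    }

    if (FL2_isError(res))
        fprintf(stderr, "error: %s\n", FL2_getErrorName(res));
    FL2_freeCStream(fcs);
    return FL2_isError(res) != 0;
}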
+ */ +static void FL2_copyCStreamOutput(FL2_CStream* fcs, FL2_outBuffer *output) +{ + for (; fcs->outThread < fcs->threadCount; ++fcs->outThread) { + const BYTE* const outBuf = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[fcs->outThread].block.start) + fcs->outPos; BYTE* const dstBuf = (BYTE*)output->dst + output->pos; size_t const dstCapacity = output->size - output->pos; - size_t to_write = cctx->jobs[fcs->out_thread].cSize; + size_t toWrite = fcs->jobs[fcs->outThread].cSize; - if (FL2_isError(to_write)) - return to_write; + toWrite = MIN(toWrite - fcs->outPos, dstCapacity); - to_write = MIN(to_write - fcs->out_pos, dstCapacity); + DEBUGLOG(5, "CStream : writing %u bytes", (U32)toWrite); - DEBUGLOG(5, "CStream : writing %u bytes", (U32)to_write); + memcpy(dstBuf, outBuf, toWrite); + fcs->outPos += toWrite; + output->pos += toWrite; - memcpy(dstBuf, outBuf, to_write); - fcs->out_pos += to_write; - output->pos += to_write; - - if (fcs->out_pos < cctx->jobs[fcs->out_thread].cSize) + /* If the slice is not flushed, the output is full */ + if (fcs->outPos < fcs->jobs[fcs->outThread].cSize) break; - fcs->out_pos = 0; + fcs->outPos = 0; } - return 0; } -static size_t FL2_remainingOutputSize(FL2_CStream* const fcs) +static size_t FL2_compressStream_input(FL2_CStream* fcs, FL2_inBuffer* input) { - FL2_CCtx* const cctx = fcs->cctx; - size_t pos = fcs->out_pos; - size_t total = 0; + CHECK_F(fcs->asyncRes); - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; + DICT_buffer * const buf = &fcs->buf; - for (size_t u = fcs->out_thread; u < fcs->thread_count; ++u) { - size_t to_write = cctx->jobs[u].cSize; - - if (FL2_isError(to_write)) - return to_write; - total += to_write - pos; - pos = 0; - } - return total; -} - -FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer* output, FL2_inBuffer* input) -{ - FL2_blockBuffer* const inBuff = &fcs->inBuff; - FL2_CCtx* const cctx = fcs->cctx; - size_t block_overlap = OVERLAP_FROM_DICT_LOG(cctx->params.rParams.dictionary_log, cctx->params.rParams.overlap_fraction); - - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - if (output->pos < output->size) while (input->pos < input->size) { - /* read input and/or write output until a buffer is full */ - if (inBuff->data == NULL) { - inBuff->bufSize = (size_t)1 << cctx->params.rParams.dictionary_log; - - DEBUGLOG(3, "Allocating input buffer : %u bytes", (U32)inBuff->bufSize); - - inBuff->data = malloc(inBuff->bufSize); - - if (inBuff->data == NULL) - return FL2_ERROR(memory_allocation); - - inBuff->start = 0; - inBuff->end = 0; + while (input->pos < input->size) { + /* read input until the buffer(s) are full */ + if (DICT_needShift(buf)) { + /* cannot shift single dict during compression */ + if(!DICT_async(buf)) + CHECK_F(FL2_waitCStream(fcs)); + DICT_shift(buf); } - if (inBuff->start > block_overlap && input->pos < input->size) { - FL2_shiftBlock(fcs->cctx, inBuff); + + CHECK_F(fcs->asyncRes); + + DICT_put(buf, input); + + if (!DICT_availSpace(buf)) { + /* break if the compressor is not available */ + if (fcs->outThread < fcs->threadCount) + break; + + CHECK_F(FL2_compressStream_internal(fcs, 0)); } - if (fcs->out_thread == fcs->thread_count) { - /* no compressed output to write, so read */ - size_t const toRead = MIN(input->size - input->pos, inBuff->bufSize - inBuff->end); - DEBUGLOG(5, "CStream : reading %u bytes", (U32)toRead); - - memcpy(inBuff->data + inBuff->end, (char*)input->src + input->pos, toRead); - input->pos += toRead; - inBuff->end += 
toRead; - } - if (inBuff->end == inBuff->bufSize || fcs->out_thread < fcs->thread_count) { - CHECK_F(FL2_compressStream_internal(fcs, output, 0)); - } - /* compressed output remains, so output buffer is full */ - if (fcs->out_thread < fcs->thread_count) - break; - } - return (inBuff->data == NULL) ? (size_t)1 << cctx->params.rParams.dictionary_log : inBuff->bufSize - inBuff->end; -} - -static size_t FL2_flushStream_internal(FL2_CStream* fcs, FL2_outBuffer* output, int ending) -{ - if (FL2_isError(fcs->thread_count)) - return fcs->thread_count; - - DEBUGLOG(4, "FL2_flushStream_internal : %u to compress, %u to write", - (U32)(fcs->inBuff.end - fcs->inBuff.start), - (U32)FL2_remainingOutputSize(fcs)); - - CHECK_F(FL2_compressStream_internal(fcs, output, ending)); - - return FL2_remainingOutputSize(fcs); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer* output) -{ - return FL2_flushStream_internal(fcs, output, 0); -} - -FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer* output) -{ - { size_t cSize = FL2_flushStream_internal(fcs, output, 1); - if (cSize != 0) - return cSize; + CHECK_F(fcs->asyncRes); } - if(!fcs->end_marked) { - if (output->pos >= output->size) - return 1; - DEBUGLOG(4, "Writing end marker"); - ((BYTE*)output->dst)[output->pos] = LZMA2_END_MARKER; - ++output->pos; - fcs->end_marked = 1; + return FL2_error_no_error; +} + +static size_t FL2_loopCheck(FL2_CStream* fcs, int unchanged) +{ + if (unchanged) { + ++fcs->loopCount; + if (fcs->loopCount > FL2_MAX_LOOPS) + return FL2_ERROR(buffer); } + else { + fcs->loopCount = 0; + } + return FL2_error_no_error; +} -#ifndef NO_XXHASH - if (fcs->cctx->params.doXXH && !fcs->cctx->params.omitProp && fcs->hash_pos < XXHASH_SIZEOF) { - size_t const to_write = MIN(output->size - output->pos, XXHASH_SIZEOF - fcs->hash_pos); - XXH32_canonical_t canonical; +FL2LIB_API size_t FL2LIB_CALL FL2_compressStream(FL2_CStream* fcs, FL2_outBuffer *output, FL2_inBuffer* input) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); - if (output->pos >= output->size) - return 1; + size_t const prevIn = input->pos; + size_t const prevOut = (output != NULL) ? 
output->pos : 0; - XXH32_canonicalFromHash(&canonical, XXH32_digest(fcs->xxh)); - DEBUGLOG(4, "Writing XXH32 : %u bytes", (U32)to_write); - memcpy((BYTE*)output->dst + output->pos, canonical.digest + fcs->hash_pos, to_write); - output->pos += to_write; - fcs->hash_pos += to_write; - return fcs->hash_pos < XXHASH_SIZEOF; + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_compressStream_input(fcs, input)); + + if(output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_loopCheck(fcs, prevIn == input->pos && (output == NULL || prevOut == output->pos))); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getDictionaryBuffer(FL2_CStream * fcs, FL2_dictBuffer * dict) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); + + CHECK_F(fcs->asyncRes); + + DICT_buffer *buf = &fcs->buf; + + if (!DICT_availSpace(buf) && DICT_hasUnprocessed(buf)) + CHECK_F(FL2_compressStream_internal(fcs, 0)); + + if (DICT_needShift(buf) && !DICT_async(buf)) + CHECK_F(FL2_waitCStream(fcs)); + + dict->size = (unsigned long)DICT_get(buf, &dict->dst); + + return FL2_error_no_error; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_updateDictionary(FL2_CStream * fcs, size_t addedSize) +{ + if (DICT_update(&fcs->buf, addedSize)) + CHECK_F(FL2_compressStream_internal(fcs, 0)); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getNextCStreamBuffer(FL2_CStream* fcs, FL2_cBuffer* cbuf) +{ + cbuf->src = NULL; + cbuf->size = 0; + +#ifndef FL2_SINGLETHREAD + FL2POOL_waitAll(fcs->compressThread, 0); + CHECK_F(fcs->asyncRes); +#endif + + if (fcs->outThread < fcs->threadCount) { + cbuf->src = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[fcs->outThread].block.start) + fcs->outPos; + cbuf->size = fcs->jobs[fcs->outThread].cSize - fcs->outPos; + ++fcs->outThread; + fcs->outPos = 0; + } + return cbuf->size; +} + +FL2LIB_API unsigned long long FL2LIB_CALL FL2_getCStreamProgress(const FL2_CStream * fcs, unsigned long long *outputSize) +{ + if (outputSize != NULL) + *outputSize = fcs->streamCsize + fcs->progressOut; + + U64 const encodeSize = fcs->curBlock.end - fcs->curBlock.start; + + if (fcs->progressIn == 0 && fcs->curBlock.end != 0) + return fcs->streamTotal + ((fcs->matchTable->progress * encodeSize / fcs->curBlock.end * fcs->rmfWeight) >> 4); + + return fcs->streamTotal + ((fcs->rmfWeight * encodeSize) >> 4) + ((fcs->progressIn * fcs->encWeight) >> 4); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_waitCStream(FL2_CStream * fcs) +{ +#ifndef FL2_SINGLETHREAD + if (FL2POOL_waitAll(fcs->compressThread, fcs->timeout) != 0) + return FL2_ERROR(timedOut); + CHECK_F(fcs->asyncRes); +#endif + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API void FL2LIB_CALL FL2_cancelCStream(FL2_CStream *fcs) +{ +#ifndef FL2_SINGLETHREAD + if (fcs->compressThread != NULL) { + fcs->canceled = 1; + + RMF_cancelBuild(fcs->matchTable); + FL2POOL_waitAll(fcs->compressThread, 0); + + fcs->canceled = 0; } #endif - return 0; + FL2_endFrame(fcs); } -FL2LIB_API size_t FL2LIB_CALL FL2_CStream_setParameter(FL2_CStream* fcs, FL2_cParameter param, unsigned value) +FL2LIB_API size_t FL2LIB_CALL FL2_remainingOutputSize(const FL2_CStream* fcs) { - if (fcs->inBuff.start < fcs->inBuff.end) - return FL2_ERROR(stage_wrong); - return FL2_CCtx_setParameter(fcs->cctx, param, value); + CHECK_F(fcs->asyncRes); + + size_t cSize = 0; + for (size_t u = fcs->outThread; u < fcs->threadCount; ++u) + 
cSize += fcs->jobs[u].cSize; + + return cSize; } - -size_t FL2_memoryUsage_internal(unsigned const dictionaryLog, unsigned const bufferLog, unsigned const searchDepth, - unsigned chainLog, FL2_strategy strategy, - unsigned nbThreads) +/* Write the properties byte (if required), the hash and the end marker + * into the output buffer. + */ +static void FL2_writeEnd(FL2_CStream* const fcs) { - size_t size = RMF_memoryUsage(dictionaryLog, bufferLog, searchDepth, nbThreads); - return size + FL2_lzma2MemoryUsage(chainLog, strategy, nbThreads); + size_t thread = fcs->threadCount - 1; + if (fcs->outThread == fcs->threadCount) { + fcs->outThread = 0; + fcs->threadCount = 1; + fcs->jobs[0].cSize = 0; + thread = 0; + } + BYTE *const dst = RMF_getTableAsOutputBuffer(fcs->matchTable, fcs->jobs[thread].block.start) + + fcs->jobs[thread].cSize; + + size_t pos = 0; + + if (!fcs->wroteProp && !fcs->params.omitProp) { + /* no compression occurred */ + dst[pos] = FL2_getProp(fcs, 0); + DEBUGLOG(4, "Writing property byte : 0x%X", dst[pos]); + ++pos; + fcs->wroteProp = 1; + } + + DEBUGLOG(4, "Writing end marker"); + dst[pos++] = LZMA2_END_MARKER; + +#ifndef NO_XXHASH + if (fcs->params.doXXH && !fcs->params.omitProp) { + XXH32_canonical_t canonical; + + XXH32_canonicalFromHash(&canonical, DICT_getDigest(&fcs->buf)); + DEBUGLOG(4, "Writing XXH32"); + memcpy(dst + pos, &canonical, XXHASH_SIZEOF); + + pos += XXHASH_SIZEOF; + } +#endif + fcs->jobs[thread].cSize += pos; + fcs->endMarked = 1; + + FL2_endFrame(fcs); +} + +static size_t FL2_flushStream_internal(FL2_CStream* fcs, int const ending) +{ + CHECK_F(fcs->asyncRes); + + DEBUGLOG(4, "FL2_flushStream_internal : %u to compress, %u to write", + (U32)(fcs->buf.end - fcs->buf.start), + (U32)FL2_remainingOutputSize(fcs)); + + CHECK_F(FL2_compressStream_internal(fcs, ending)); + + return fcs->outThread < fcs->threadCount; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_flushStream(FL2_CStream* fcs, FL2_outBuffer *output) +{ + if (!fcs->lockParams) + return FL2_ERROR(init_missing); + + size_t const prevOut = (output != NULL) ? output->pos : 0; + + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + size_t res = FL2_flushStream_internal(fcs, 0); + CHECK_F(res); + + if (output != NULL && res != 0) { + FL2_copyCStreamOutput(fcs, output); + res = fcs->outThread < fcs->threadCount; + } + + CHECK_F(FL2_loopCheck(fcs, output != NULL && prevOut == output->pos)); + + return res; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_endStream(FL2_CStream* fcs, FL2_outBuffer *output) +{ + if (!fcs->endMarked && !fcs->lockParams) + return FL2_ERROR(init_missing); + + size_t const prevOut = (output != NULL) ? 
output->pos : 0; + + if (output != NULL && fcs->outThread < fcs->threadCount) + FL2_copyCStreamOutput(fcs, output); + + CHECK_F(FL2_flushStream_internal(fcs, 1)); + + size_t res = FL2_waitCStream(fcs); + CHECK_F(res); + + if (!fcs->endMarked && !DICT_hasUnprocessed(&fcs->buf)) { + FL2_writeEnd(fcs); + res = 1; + } + + if (output != NULL && res != 0) { + FL2_copyCStreamOutput(fcs, output); + res = fcs->outThread < fcs->threadCount || DICT_hasUnprocessed(&fcs->buf); + } + + CHECK_F(FL2_loopCheck(fcs, output != NULL && prevOut == output->pos)); + + return res; +} + +FL2LIB_API size_t FL2LIB_CALL FL2_getLevelParameters(int compressionLevel, int high, FL2_compressionParameters * params) +{ + if (high) { + if (compressionLevel < 0 || compressionLevel > FL2_MAX_HIGH_CLEVEL) + return FL2_ERROR(parameter_outOfBound); + *params = FL2_highCParameters[compressionLevel]; + } + else { + if (compressionLevel < 0 || compressionLevel > FL2_MAX_CLEVEL) + return FL2_ERROR(parameter_outOfBound); + *params = FL2_defaultCParameters[compressionLevel]; + } + return FL2_error_no_error; +} + +static size_t FL2_memoryUsage_internal(size_t const dictionarySize, unsigned const bufferLog, + unsigned const chainLog, + FL2_strategy const strategy, + unsigned const nbThreads) +{ + return RMF_memoryUsage(dictionarySize, bufferLog, nbThreads) + + LZMA2_encMemoryUsage(chainLog, strategy, nbThreads); } FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize(int compressionLevel, unsigned nbThreads) { - return FL2_memoryUsage_internal(FL2_defaultCParameters[compressionLevel].dictionaryLog, - FL2_defaultCParameters[compressionLevel].bufferLog, - FL2_defaultCParameters[compressionLevel].searchDepth, - FL2_defaultCParameters[compressionLevel].chainLog, - FL2_defaultCParameters[compressionLevel].strategy, + if (compressionLevel == 0) + compressionLevel = FL2_CLEVEL_DEFAULT; + + CLAMPCHECK(compressionLevel, 1, FL2_MAX_CLEVEL); + + return FL2_estimateCCtxSize_byParams(FL2_defaultCParameters + compressionLevel, nbThreads); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_byParams(const FL2_compressionParameters * params, unsigned nbThreads) +{ + nbThreads = FL2_checkNbThreads(nbThreads); + return FL2_memoryUsage_internal(params->dictionarySize, + params->bufferLog, + params->chainLog, + params->strategy, nbThreads); } FL2LIB_API size_t FL2LIB_CALL FL2_estimateCCtxSize_usingCCtx(const FL2_CCtx * cctx) { - return FL2_memoryUsage_internal(cctx->params.rParams.dictionary_log, + return FL2_memoryUsage_internal(cctx->params.rParams.dictionary_size, cctx->params.rParams.match_buffer_log, - cctx->params.rParams.depth, cctx->params.cParams.second_dict_bits, cctx->params.cParams.strategy, - cctx->jobCount); + cctx->jobCount) + DICT_memUsage(&cctx->buf); } -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads) +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize(int compressionLevel, unsigned nbThreads, int dualBuffer) { return FL2_estimateCCtxSize(compressionLevel, nbThreads) - + ((size_t)1 << FL2_defaultCParameters[compressionLevel].dictionaryLog); + + (FL2_defaultCParameters[compressionLevel].dictionarySize << (dualBuffer != 0)); } -FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCCtx(const FL2_CStream* fcs) +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_byParams(const FL2_compressionParameters * params, unsigned nbThreads, int dualBuffer) { - return FL2_estimateCCtxSize_usingCCtx(fcs->cctx) - + ((size_t)1 << fcs->cctx->params.rParams.dictionary_log); + return 
FL2_estimateCCtxSize_byParams(params, nbThreads) + + (params->dictionarySize << (dualBuffer != 0)); +} + +FL2LIB_API size_t FL2LIB_CALL FL2_estimateCStreamSize_usingCStream(const FL2_CStream* fcs) +{ + return FL2_estimateCCtxSize_usingCCtx(fcs); } diff --git a/C/fast-lzma2/fl2_compress_internal.h b/C/fast-lzma2/fl2_compress_internal.h index ae69bd8f..166457ea 100644 --- a/C/fast-lzma2/fl2_compress_internal.h +++ b/C/fast-lzma2/fl2_compress_internal.h @@ -20,8 +20,9 @@ #include "radix_internal.h" #include "lzma2_enc.h" #include "fast-lzma2.h" -#include "fl2threading.h" -#include "fl2pool.h" +#include "fl2_threading.h" +#include "fl2_pool.h" +#include "dict_buffer.h" #ifndef NO_XXHASH # include "xxhash.h" #endif @@ -30,19 +31,6 @@ extern "C" { #endif -typedef struct { - unsigned dictionaryLog; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory, slower */ - unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ - unsigned chainLog; /* fully searched segment : larger == more compression, slower, more memory; hybrid mode only (ultra) */ - unsigned searchLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ - unsigned searchDepth; /* maximum depth for resolving string matches : larger == more compression, slower; >= 64 == more memory, slower */ - unsigned fastLength; /* acceptable match size for parser, not less than searchDepth : larger == more compression, slower; fast bytes parameter from 7-zip */ - unsigned divideAndConquer; /* split long chains of 2-byte matches into shorter chains with a small overlap : faster, somewhat less compression; enabled by default */ - unsigned bufferLog; /* buffer size for processing match chains is (dictionaryLog - bufferLog) : when divideAndConquer enabled, affects compression; */ - /* when divideAndConquer disabled, affects speed in a hardware-dependent manner */ - FL2_strategy strategy; /* encoder strategy : fast, optimized or ultra (hybrid) */ -} FL2_compressionParameters; - /*-************************************* * Context memory management ***************************************/ @@ -60,38 +48,43 @@ typedef struct { typedef struct { FL2_CCtx* cctx; - FL2_lzmaEncoderCtx* enc; + LZMA2_ECtx* enc; FL2_dataBlock block; size_t cSize; } FL2_job; struct FL2_CCtx_s { + DICT_buffer buf; FL2_CCtx_params params; #ifndef FL2_SINGLETHREAD FL2POOL_ctx* factory; + FL2POOL_ctx* compressThread; #endif FL2_dataBlock curBlock; + size_t asyncRes; + size_t threadCount; + size_t outThread; + size_t outPos; size_t dictMax; - U64 block_total; + U64 streamTotal; + U64 streamCsize; FL2_matchTable* matchTable; +#ifndef FL2_SINGLETHREAD + U32 timeout; +#endif + U32 rmfWeight; + U32 encWeight; + FL2_atomic progressIn; + FL2_atomic progressOut; + int canceled; + BYTE wroteProp; + BYTE endMarked; + BYTE loopCount; + BYTE lockParams; unsigned jobCount; FL2_job jobs[1]; }; -struct FL2_CStream_s { - FL2_CCtx* cctx; - FL2_blockBuffer inBuff; -#ifndef NO_XXHASH - XXH32_state_t *xxh; -#endif - size_t thread_count; - size_t out_thread; - size_t out_pos; - size_t hash_pos; - BYTE end_marked; - BYTE wrote_prop; -}; - #if defined (__cplusplus) } #endif diff --git a/C/fast-lzma2/fl2_error_private.c b/C/fast-lzma2/fl2_error_private.c deleted file mode 100644 index 66289586..00000000 --- a/C/fast-lzma2/fl2_error_private.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. 
- * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -/* The purpose of this file is to have a single list of error strings embedded in binary */ - -#include "fl2_error_private.h" - -const char* ERR_getFL2ErrorString(ERR_enum code) -{ - static const char* const notErrorCode = "Unspecified error code"; - switch( code ) - { - case PREFIX(no_error): return "No error detected"; - case PREFIX(GENERIC): return "Error (generic)"; - case PREFIX(corruption_detected): return "Corrupted block detected"; - case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; - case PREFIX(parameter_unsupported): return "Unsupported parameter"; - case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; - case PREFIX(init_missing): return "Context should be init first"; - case PREFIX(memory_allocation): return "Allocation error : not enough memory"; - case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; - case PREFIX(srcSize_wrong): return "Src size is incorrect"; - /* following error codes are not stable and may be removed or changed in a future version */ - case PREFIX(maxCode): - default: return notErrorCode; - } -} diff --git a/C/fast-lzma2/fl2_error_private.h b/C/fast-lzma2/fl2_error_private.h deleted file mode 100644 index 32532a9b..00000000 --- a/C/fast-lzma2/fl2_error_private.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. - * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. 
- */ - -/* Note : this module is expected to remain private, do not expose it */ - -#ifndef ERROR_H_MODULE -#define ERROR_H_MODULE - -#if defined (__cplusplus) -extern "C" { -#endif - - -/* **************************************** -* Dependencies -******************************************/ -#include /* size_t */ -#include "fl2_errors.h" /* enum list */ - - -/* **************************************** -* Compiler-specific -******************************************/ -#if defined(__GNUC__) -# define ERR_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define ERR_STATIC static inline -#elif defined(_MSC_VER) -# define ERR_STATIC static __inline -#else -# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif - - -/*-**************************************** -* Customization (error_public.h) -******************************************/ -typedef FL2_ErrorCode ERR_enum; -#define PREFIX(name) FL2_error_##name - - -/*-**************************************** -* Error codes handling -******************************************/ -#define FL2_ERROR(name) ((size_t)-PREFIX(name)) - -ERR_STATIC unsigned ERR_isError(size_t code) { return (code > FL2_ERROR(maxCode)); } - -ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } - - -/*-**************************************** -* Error Strings -******************************************/ - -const char* ERR_getFL2ErrorString(ERR_enum code); /* error_private.c */ - -ERR_STATIC const char* ERR_getErrorName(size_t code) -{ - return ERR_getFL2ErrorString(ERR_getErrorCode(code)); -} - -#if defined (__cplusplus) -} -#endif - -#endif /* ERROR_H_MODULE */ diff --git a/C/fast-lzma2/fl2_errors.h b/C/fast-lzma2/fl2_errors.h index d669618f..1068f463 100644 --- a/C/fast-lzma2/fl2_errors.h +++ b/C/fast-lzma2/fl2_errors.h @@ -28,21 +28,23 @@ extern "C" { * only static linking is allowed ******************************************/ typedef enum { - FL2_error_no_error = 0, - FL2_error_GENERIC = 1, - FL2_error_internal = 2, - FL2_error_corruption_detected = 3, - FL2_error_checksum_wrong = 4, + FL2_error_no_error = 0, + FL2_error_GENERIC = 1, + FL2_error_internal = 2, + FL2_error_corruption_detected = 3, + FL2_error_checksum_wrong = 4, FL2_error_parameter_unsupported = 5, FL2_error_parameter_outOfBound = 6, - FL2_error_stage_wrong = 7, - FL2_error_init_missing = 8, - FL2_error_memory_allocation = 9, - FL2_error_dstSize_tooSmall = 10, - FL2_error_srcSize_wrong = 11, - FL2_error_write_failed = 12, - FL2_error_canceled = 13, - FL2_error_maxCode = 20 /* never EVER use this value directly, it can change in future versions! Use FL2_isError() instead */ + FL2_error_lclpMax_exceeded = 7, + FL2_error_stage_wrong = 8, + FL2_error_init_missing = 9, + FL2_error_memory_allocation = 10, + FL2_error_dstSize_tooSmall = 11, + FL2_error_srcSize_wrong = 12, + FL2_error_canceled = 13, + FL2_error_buffer = 14, + FL2_error_timedOut = 15, + FL2_error_maxCode = 20 /* never EVER use this value directly, it can change in future versions! Use FL2_isError() instead */ } FL2_ErrorCode; /*! 
FL2_getErrorCode() : diff --git a/C/fast-lzma2/fl2_internal.h b/C/fast-lzma2/fl2_internal.h index aedda654..9f666458 100644 --- a/C/fast-lzma2/fl2_internal.h +++ b/C/fast-lzma2/fl2_internal.h @@ -18,19 +18,30 @@ ***************************************/ #include "mem.h" #include "compiler.h" -#include "fl2_error_private.h" #if defined (__cplusplus) extern "C" { #endif + +/*-**************************************** +* Error codes handling +******************************************/ +#define PREFIX(name) FL2_error_##name +#define FL2_ERROR(name) ((size_t)-PREFIX(name)) + + +/*-************************************* +* Stream properties +***************************************/ #define FL2_PROP_HASH_BIT 7 #define FL2_LZMA_PROP_MASK 0x3FU #ifndef NO_XXHASH # define XXHASH_SIZEOF sizeof(XXH32_canonical_t) #endif + /*-************************************* * Debug ***************************************/ @@ -77,8 +88,8 @@ extern int g_debuglog_enable; #undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) -#define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; } /* check and Forward error code */ -#define CHECK_E(f, e) { size_t const errcod = f; if (ERR_isError(errcod)) return FL2_ERROR(e); } /* check and send Error code */ +#define CHECK_F(f) do { size_t const errcod = f; if (FL2_isError(errcod)) return errcod; } while(0) /* check and Forward error code */ +#define CHECK_E(f, e) do { size_t const errcod = f; if (FL2_isError(errcod)) return FL2_ERROR(e); } while(0) /* check and send Error code */ MEM_STATIC U32 ZSTD_highbit32(U32 val) { diff --git a/C/fast-lzma2/fl2_pool.c b/C/fast-lzma2/fl2_pool.c new file mode 100644 index 00000000..8f90b44c --- /dev/null +++ b/C/fast-lzma2/fl2_pool.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * Modified for FL2 by Conor McCarthy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* ====== Dependencies ======= */ +#include /* size_t */ +#include /* malloc, calloc */ +#include "fl2_pool.h" +#include "fl2_internal.h" + + +#ifndef FL2_SINGLETHREAD + +#include "fl2_threading.h" /* pthread adaptation */ + +struct FL2POOL_ctx_s { + /* Keep track of the threads */ + size_t numThreads; + + /* All threads work on the same function and object during a job */ + FL2POOL_function function; + void *opaque; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates the number of threads requested and the values to pass */ + ptrdiff_t queueIndex; + ptrdiff_t queueEnd; + + /* The mutex protects the queue */ + FL2_pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + FL2_pthread_cond_t busyCond; + /* Condition variable for poppers to wait on when the queue is empty */ + FL2_pthread_cond_t newJobsCond; + /* Indicates if the queue is shutting down */ + int shutdown; + + /* The threads. Extras to be calloc'd */ + FL2_pthread_t threads[1]; +}; + +/* FL2POOL_thread() : + Work thread for the thread pool. + Waits for jobs and executes them. + @returns : NULL on failure else non-null. 
+*/ +static void* FL2POOL_thread(void* opaque) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)opaque; + if (!ctx) { return NULL; } + FL2_pthread_mutex_lock(&ctx->queueMutex); + for (;;) { + + /* While the mutex is locked, wait for a non-empty queue or until shutdown */ + while (ctx->queueIndex >= ctx->queueEnd && !ctx->shutdown) { + FL2_pthread_cond_wait(&ctx->newJobsCond, &ctx->queueMutex); + } + /* empty => shutting down: so stop */ + if (ctx->shutdown) { + FL2_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + /* Pop a job off the queue */ + size_t n = ctx->queueIndex; + ++ctx->queueIndex; + ++ctx->numThreadsBusy; + /* Unlock the mutex and run the job */ + FL2_pthread_mutex_unlock(&ctx->queueMutex); + + ctx->function(ctx->opaque, n); + + FL2_pthread_mutex_lock(&ctx->queueMutex); + --ctx->numThreadsBusy; + /* Signal the master thread waiting for jobs to complete */ + FL2_pthread_cond_signal(&ctx->busyCond); + } /* for (;;) */ + /* Unreachable */ +} + +FL2POOL_ctx* FL2POOL_create(size_t numThreads) +{ + FL2POOL_ctx* ctx; + /* Check the parameters */ + if (!numThreads) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = calloc(1, sizeof(FL2POOL_ctx) + (numThreads - 1) * sizeof(FL2_pthread_t)); + if (!ctx) { return NULL; } + /* Initialize the busy count and jobs range */ + ctx->numThreadsBusy = 0; + ctx->queueIndex = 0; + ctx->queueEnd = 0; + (void)FL2_pthread_mutex_init(&ctx->queueMutex, NULL); + (void)FL2_pthread_cond_init(&ctx->busyCond, NULL); + (void)FL2_pthread_cond_init(&ctx->newJobsCond, NULL); + ctx->shutdown = 0; + ctx->numThreads = 0; + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (FL2_pthread_create(&ctx->threads[i], NULL, &FL2POOL_thread, ctx)) { + ctx->numThreads = i; + FL2POOL_free(ctx); + return NULL; + } } + ctx->numThreads = numThreads; + } + return ctx; +} + +/*! FL2POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. 
+*/ +static void FL2POOL_join(FL2POOL_ctx* ctx) +{ + /* Shut down the queue */ + FL2_pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + /* Wake up sleeping threads */ + FL2_pthread_cond_broadcast(&ctx->newJobsCond); + FL2_pthread_mutex_unlock(&ctx->queueMutex); + /* Join all of the threads */ + for (size_t i = 0; i < ctx->numThreads; ++i) + FL2_pthread_join(ctx->threads[i], NULL); +} + +void FL2POOL_free(FL2POOL_ctx *ctx) +{ + if (!ctx) { return; } + FL2POOL_join(ctx); + FL2_pthread_mutex_destroy(&ctx->queueMutex); + FL2_pthread_cond_destroy(&ctx->busyCond); + FL2_pthread_cond_destroy(&ctx->newJobsCond); + free(ctx); +} + +size_t FL2POOL_sizeof(FL2POOL_ctx *ctx) +{ + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + ctx->numThreads * sizeof(FL2_pthread_t); +} + +void FL2POOL_addRange(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t first, ptrdiff_t end) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; + if (!ctx) + return; + + /* Callers always wait for jobs to complete before adding a new set */ + assert(!ctx->numThreadsBusy); + + FL2_pthread_mutex_lock(&ctx->queueMutex); + ctx->function = function; + ctx->opaque = opaque; + ctx->queueIndex = first; + ctx->queueEnd = end; + FL2_pthread_cond_broadcast(&ctx->newJobsCond); + FL2_pthread_mutex_unlock(&ctx->queueMutex); +} + +void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t n) +{ + FL2POOL_addRange(ctxVoid, function, opaque, n, n + 1); +} + +int FL2POOL_waitAll(void *ctxVoid, unsigned timeout) +{ + FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; + if (!ctx || (!ctx->numThreadsBusy && ctx->queueIndex >= ctx->queueEnd) || ctx->shutdown) { return 0; } + + FL2_pthread_mutex_lock(&ctx->queueMutex); + /* Need to test for ctx->queueIndex < ctx->queueEnd in case not all jobs have started */ + if (timeout != 0) { + if ((ctx->numThreadsBusy || ctx->queueIndex < ctx->queueEnd) && !ctx->shutdown) + FL2_pthread_cond_timedwait(&ctx->busyCond, &ctx->queueMutex, timeout); + } + else { + while ((ctx->numThreadsBusy || ctx->queueIndex < ctx->queueEnd) && !ctx->shutdown) + FL2_pthread_cond_wait(&ctx->busyCond, &ctx->queueMutex); + } + FL2_pthread_mutex_unlock(&ctx->queueMutex); + return ctx->numThreadsBusy && !ctx->shutdown; +} + +size_t FL2POOL_threadsBusy(void * ctx) +{ + return ((FL2POOL_ctx*)ctx)->numThreadsBusy; +} + +#endif /* FL2_SINGLETHREAD */ diff --git a/C/fast-lzma2/fl2pool.h b/C/fast-lzma2/fl2_pool.h similarity index 76% rename from C/fast-lzma2/fl2pool.h rename to C/fast-lzma2/fl2_pool.h index 9c99f3c5..ccf1d003 100644 --- a/C/fast-lzma2/fl2pool.h +++ b/C/fast-lzma2/fl2_pool.h @@ -42,16 +42,20 @@ size_t FL2POOL_sizeof(FL2POOL_ctx *ctx); /*! FL2POOL_function : The function type that can be added to a thread pool. */ -typedef void(*FL2POOL_function)(void *, size_t); +typedef void(*FL2POOL_function)(void *, ptrdiff_t); /*! FL2POOL_add() : Add the job `function(opaque)` to the thread pool. +FL2POOL_addRange adds multiple jobs with size_t parameter from first to less than end. Possibly blocks until there is room in the queue. Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed. 
*/ -void FL2POOL_add(void *ctx, FL2POOL_function function, void *opaque, size_t n); +void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, ptrdiff_t n); +void FL2POOL_addRange(void *ctx, FL2POOL_function function, void *opaque, ptrdiff_t first, ptrdiff_t end); -void FL2POOL_waitAll(void *ctx); +int FL2POOL_waitAll(void *ctx, unsigned timeout); + +size_t FL2POOL_threadsBusy(void *ctx); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/fl2threading.c b/C/fast-lzma2/fl2_threading.c similarity index 73% rename from C/fast-lzma2/fl2threading.c rename to C/fast-lzma2/fl2_threading.c index 3372b109..d4ac2e91 100644 --- a/C/fast-lzma2/fl2threading.c +++ b/C/fast-lzma2/fl2_threading.c @@ -17,6 +17,10 @@ /* create fake symbol to avoid empty translation unit warning */ int g_ZSTD_threading_useles_symbol; +#include "fast-lzma2.h" +#include "fl2_threading.h" +#include "util.h" + #if !defined(FL2_SINGLETHREAD) && defined(_WIN32) /** @@ -28,19 +32,18 @@ int g_ZSTD_threading_useles_symbol; /* === Dependencies === */ #include #include -#include "fl2threading.h" /* === Implementation === */ static unsigned __stdcall worker(void *arg) { - ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg; + FL2_pthread_t* const thread = (FL2_pthread_t*) arg; thread->arg = thread->start_routine(thread->arg); return 0; } -int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, +int FL2_pthread_create(FL2_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg) { (void)unused; @@ -54,7 +57,7 @@ int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, return 0; } -int FL2_pthread_join(ZSTD_pthread_t thread, void **value_ptr) +int FL2_pthread_join(FL2_pthread_t thread, void **value_ptr) { DWORD result; @@ -73,3 +76,20 @@ int FL2_pthread_join(ZSTD_pthread_t thread, void **value_ptr) } #endif /* FL2_SINGLETHREAD */ + +unsigned FL2_checkNbThreads(unsigned nbThreads) +{ +#ifndef FL2_SINGLETHREAD + if (nbThreads == 0) { + nbThreads = UTIL_countPhysicalCores(); + nbThreads += !nbThreads; + } + if (nbThreads > FL2_MAXTHREADS) { + nbThreads = FL2_MAXTHREADS; + } +#else + nbThreads = 1; +#endif + return nbThreads; +} + diff --git a/C/fast-lzma2/fl2_threading.h b/C/fast-lzma2/fl2_threading.h new file mode 100644 index 00000000..c9259b0c --- /dev/null +++ b/C/fast-lzma2/fl2_threading.h @@ -0,0 +1,178 @@ +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + */ + +#ifndef THREADING_H_938743 +#define THREADING_H_938743 + +#include "mem.h" + +#ifndef FL2_XZ_BUILD +# ifdef _WIN32 +# define MYTHREAD_VISTA +# else +# define MYTHREAD_POSIX /* posix assumed ; need a better detection method */ +# endif +#elif defined(HAVE_CONFIG_H) +# include +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +unsigned FL2_checkNbThreads(unsigned nbThreads); + + +#if !defined(FL2_SINGLETHREAD) && defined(MYTHREAD_VISTA) + +/** + * Windows minimalist Pthread Wrapper, based on : + * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + */ +#ifdef WINVER +# undef WINVER +#endif +#define WINVER 0x0600 + +#ifdef _WIN32_WINNT +# undef _WIN32_WINNT +#endif +#define _WIN32_WINNT 0x0600 + +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#include +#include + + +/* mutex */ +#define FL2_pthread_mutex_t CRITICAL_SECTION +#define FL2_pthread_mutex_init(a, b) (InitializeCriticalSection((a)), 0) +#define FL2_pthread_mutex_destroy(a) DeleteCriticalSection((a)) +#define FL2_pthread_mutex_lock(a) EnterCriticalSection((a)) +#define FL2_pthread_mutex_unlock(a) LeaveCriticalSection((a)) + +/* condition variable */ +#define FL2_pthread_cond_t CONDITION_VARIABLE +#define FL2_pthread_cond_init(a, b) (InitializeConditionVariable((a)), 0) +#define FL2_pthread_cond_destroy(a) /* No delete */ +#define FL2_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) +#define FL2_pthread_cond_timedwait(a, b, c) SleepConditionVariableCS((a), (b), (c)) +#define FL2_pthread_cond_signal(a) WakeConditionVariable((a)) +#define FL2_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) + +/* FL2_pthread_create() and FL2_pthread_join() */ +typedef struct { + HANDLE handle; + void* (*start_routine)(void*); + void* arg; +} FL2_pthread_t; + +int FL2_pthread_create(FL2_pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg); + +int FL2_pthread_join(FL2_pthread_t thread, void** value_ptr); + +/** + * add here more wrappers as required + */ + + +#elif !defined(FL2_SINGLETHREAD) && defined(MYTHREAD_POSIX) +/* === POSIX Systems === */ +# include +# include + +#define FL2_pthread_mutex_t pthread_mutex_t +#define FL2_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) +#define FL2_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) +#define FL2_pthread_mutex_lock(a) pthread_mutex_lock((a)) +#define FL2_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) + +#define FL2_pthread_cond_t pthread_cond_t +#define FL2_pthread_cond_init(a, b) pthread_cond_init((a), (b)) +#define FL2_pthread_cond_destroy(a) pthread_cond_destroy((a)) +#define FL2_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) +#define FL2_pthread_cond_signal(a) pthread_cond_signal((a)) +#define FL2_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) + +#define FL2_pthread_t pthread_t +#define FL2_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define FL2_pthread_join(a, b) pthread_join((a),(b)) + +/* Timed wait functions from XZ by Lasse Collin +*/ + +/* Sets condtime to the absolute time that is timeout_ms milliseconds + * in the future. 
+ */ +static inline void +mythread_condtime_set(struct timespec *condtime, U32 timeout_ms) +{ + condtime->tv_sec = timeout_ms / 1000; + condtime->tv_nsec = (timeout_ms % 1000) * 1000000; + + struct timeval now; + gettimeofday(&now, NULL); + + condtime->tv_sec += now.tv_sec; + condtime->tv_nsec += now.tv_usec * 1000L; + + /* tv_nsec must stay in the range [0, 999_999_999]. */ + if (condtime->tv_nsec >= 1000000000L) { + condtime->tv_nsec -= 1000000000L; + ++condtime->tv_sec; + } +} + +/* Waits on a condition or until a timeout expires. If the timeout expires, + * non-zero is returned, otherwise zero is returned. + */ +static inline void +FL2_pthread_cond_timedwait(FL2_pthread_cond_t *cond, FL2_pthread_mutex_t *mutex, + U32 timeout_ms) +{ + struct timespec condtime; + mythread_condtime_set(&condtime, timeout_ms); + pthread_cond_timedwait(cond, mutex, &condtime); +} + + +#elif defined(FL2_SINGLETHREAD) +/* No multithreading support */ + +typedef int FL2_pthread_mutex_t; +#define FL2_pthread_mutex_init(a, b) ((void)a, 0) +#define FL2_pthread_mutex_destroy(a) +#define FL2_pthread_mutex_lock(a) +#define FL2_pthread_mutex_unlock(a) + +typedef int FL2_pthread_cond_t; +#define FL2_pthread_cond_init(a, b) ((void)a, 0) +#define FL2_pthread_cond_destroy(a) +#define FL2_pthread_cond_wait(a, b) +#define FL2_pthread_cond_signal(a) +#define FL2_pthread_cond_broadcast(a) + +/* do not use FL2_pthread_t */ + +#else +# error FL2_SINGLETHREAD not defined but no threading support found +#endif /* FL2_SINGLETHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ diff --git a/C/fast-lzma2/fl2pool.c b/C/fast-lzma2/fl2pool.c deleted file mode 100644 index 24d4f9e6..00000000 --- a/C/fast-lzma2/fl2pool.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. - * All rights reserved. - * Modified for FL2 by Conor McCarthy - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - - -/* ====== Dependencies ======= */ -#include /* size_t */ -#include /* malloc, calloc */ -#include "fl2pool.h" -#include "fl2_internal.h" - -/* ====== Compiler specifics ====== */ -#if defined(_MSC_VER) -# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ -#endif - - -#ifndef FL2_SINGLETHREAD - -#include "fl2threading.h" /* pthread adaptation */ - -/* A job is a function and an opaque argument */ -typedef struct FL2POOL_job_s { - FL2POOL_function function; - void *opaque; - size_t n; -} FL2POOL_job; - -struct FL2POOL_ctx_s { - /* Keep track of the threads */ - ZSTD_pthread_t *threads; - size_t numThreads; - - /* The queue is a single job */ - FL2POOL_job queue; - - /* The number of threads working on jobs */ - size_t numThreadsBusy; - /* Indicates if the queue is empty */ - int queueEmpty; - - /* The mutex protects the queue */ - ZSTD_pthread_mutex_t queueMutex; - /* Condition variable for pushers to wait on when the queue is full */ - ZSTD_pthread_cond_t queuePushCond; - /* Condition variables for poppers to wait on when the queue is empty */ - ZSTD_pthread_cond_t queuePopCond; - /* Indicates if the queue is shutting down */ - int shutdown; -}; - -/* FL2POOL_thread() : - Work thread for the thread pool. - Waits for jobs and executes them. - @returns : NULL on failure else non-null. 
-*/ -static void* FL2POOL_thread(void* opaque) { - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)opaque; - if (!ctx) { return NULL; } - for (;;) { - /* Lock the mutex and wait for a non-empty queue or until shutdown */ - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - - while (ctx->queueEmpty && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); - } - /* empty => shutting down: so stop */ - if (ctx->queueEmpty) { - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - return opaque; - } - /* Pop a job off the queue */ - { FL2POOL_job const job = ctx->queue; - ctx->queueEmpty = 1; - /* Unlock the mutex, signal a pusher, and run the job */ - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePushCond); - - job.function(job.opaque, job.n); - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - ctx->numThreadsBusy--; - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePushCond); - } - } /* for (;;) */ - /* Unreachable */ -} - -FL2POOL_ctx* FL2POOL_create(size_t numThreads) { - FL2POOL_ctx* ctx; - /* Check the parameters */ - if (!numThreads) { return NULL; } - /* Allocate the context and zero initialize */ - ctx = (FL2POOL_ctx*)calloc(1, sizeof(FL2POOL_ctx)); - if (!ctx) { return NULL; } - /* Initialize the job queue. - * It needs one extra space since one space is wasted to differentiate empty - * and full queues. - */ - ctx->numThreadsBusy = 0; - ctx->queueEmpty = 1; - (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); - (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); - (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); - ctx->shutdown = 0; - /* Allocate space for the thread handles */ - ctx->threads = (ZSTD_pthread_t*)malloc(numThreads * sizeof(ZSTD_pthread_t)); - ctx->numThreads = 0; - /* Check for errors */ - if (!ctx->threads) { FL2POOL_free(ctx); return NULL; } - /* Initialize the threads */ - { size_t i; - for (i = 0; i < numThreads; ++i) { - if (FL2_pthread_create(&ctx->threads[i], NULL, &FL2POOL_thread, ctx)) { - ctx->numThreads = i; - FL2POOL_free(ctx); - return NULL; - } } - ctx->numThreads = numThreads; - } - return ctx; -} - -/*! FL2POOL_join() : - Shutdown the queue, wake any sleeping threads, and join all of the threads. 
-*/ -static void FL2POOL_join(FL2POOL_ctx* ctx) { - /* Shut down the queue */ - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - ctx->shutdown = 1; - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - /* Wake up sleeping threads */ - ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); - ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); - /* Join all of the threads */ - { size_t i; - for (i = 0; i < ctx->numThreads; ++i) { - FL2_pthread_join(ctx->threads[i], NULL); - } } -} - -void FL2POOL_free(FL2POOL_ctx *ctx) { - if (!ctx) { return; } - FL2POOL_join(ctx); - ZSTD_pthread_mutex_destroy(&ctx->queueMutex); - ZSTD_pthread_cond_destroy(&ctx->queuePushCond); - ZSTD_pthread_cond_destroy(&ctx->queuePopCond); - free(ctx->threads); - free(ctx); -} - -size_t FL2POOL_sizeof(FL2POOL_ctx *ctx) { - if (ctx==NULL) return 0; /* supports sizeof NULL */ - return sizeof(*ctx) - + ctx->numThreads * sizeof(ZSTD_pthread_t); -} - -void FL2POOL_add(void* ctxVoid, FL2POOL_function function, void *opaque, size_t n) { - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; - if (!ctx) - return; - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - { FL2POOL_job const job = {function, opaque, n}; - - /* Wait until there is space in the queue for the new job */ - while (!ctx->queueEmpty && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); - } - /* The queue is still going => there is space */ - if (!ctx->shutdown) { - ctx->numThreadsBusy++; - ctx->queueEmpty = 0; - ctx->queue = job; - } - } - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); - ZSTD_pthread_cond_signal(&ctx->queuePopCond); -} - -void FL2POOL_waitAll(void *ctxVoid) -{ - FL2POOL_ctx* const ctx = (FL2POOL_ctx*)ctxVoid; - if (!ctx) { return; } - - ZSTD_pthread_mutex_lock(&ctx->queueMutex); - while (ctx->numThreadsBusy && !ctx->shutdown) { - ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); - } - ZSTD_pthread_mutex_unlock(&ctx->queueMutex); -} - -#endif /* FL2_SINGLETHREAD */ diff --git a/C/fast-lzma2/fl2threading.h b/C/fast-lzma2/fl2threading.h deleted file mode 100644 index 9f6ff3b1..00000000 --- a/C/fast-lzma2/fl2threading.h +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (c) 2016 Tino Reichardt - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). 
- * - * You can contact the author at: - * - zstdmt source repository: https://github.com/mcmilk/zstdmt - */ - -#ifndef THREADING_H_938743 -#define THREADING_H_938743 - -#if defined (__cplusplus) -extern "C" { -#endif - -#if !defined(FL2_SINGLETHREAD) && defined(_WIN32) - -/** - * Windows minimalist Pthread Wrapper, based on : - * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html - */ -#ifdef WINVER -# undef WINVER -#endif -#define WINVER 0x0600 - -#ifdef _WIN32_WINNT -# undef _WIN32_WINNT -#endif -#define _WIN32_WINNT 0x0600 - -#ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif - -#include - - -/* mutex */ -#define ZSTD_pthread_mutex_t CRITICAL_SECTION -#define ZSTD_pthread_mutex_init(a, b) (InitializeCriticalSection((a)), 0) -#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a)) -#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a)) -#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a)) - -/* condition variable */ -#define ZSTD_pthread_cond_t CONDITION_VARIABLE -#define ZSTD_pthread_cond_init(a, b) (InitializeConditionVariable((a)), 0) -#define ZSTD_pthread_cond_destroy(a) /* No delete */ -#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE) -#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a)) -#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) - -/* FL2_pthread_create() and FL2_pthread_join() */ -typedef struct { - HANDLE handle; - void* (*start_routine)(void*); - void* arg; -} ZSTD_pthread_t; - -int FL2_pthread_create(ZSTD_pthread_t* thread, const void* unused, - void* (*start_routine) (void*), void* arg); - -int FL2_pthread_join(ZSTD_pthread_t thread, void** value_ptr); - -/** - * add here more wrappers as required - */ - - -#elif !defined(FL2_SINGLETHREAD) /* posix assumed ; need a better detection method */ -/* === POSIX Systems === */ -# include - -#define ZSTD_pthread_mutex_t pthread_mutex_t -#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b)) -#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) -#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a)) -#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) - -#define ZSTD_pthread_cond_t pthread_cond_t -#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b)) -#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a)) -#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) -#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a)) -#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) - -#define ZSTD_pthread_t pthread_t -#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) -#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) - -#else /* FL2_SINGLETHREAD defined */ -/* No multithreading support */ - -typedef int ZSTD_pthread_mutex_t; -#define ZSTD_pthread_mutex_init(a, b) ((void)a, 0) -#define ZSTD_pthread_mutex_destroy(a) -#define ZSTD_pthread_mutex_lock(a) -#define ZSTD_pthread_mutex_unlock(a) - -typedef int ZSTD_pthread_cond_t; -#define ZSTD_pthread_cond_init(a, b) ((void)a, 0) -#define ZSTD_pthread_cond_destroy(a) -#define ZSTD_pthread_cond_wait(a, b) -#define ZSTD_pthread_cond_signal(a) -#define ZSTD_pthread_cond_broadcast(a) - -/* do not use ZSTD_pthread_t */ - -#endif /* FL2_SINGLETHREAD */ - -#if defined (__cplusplus) -} -#endif - -#endif /* THREADING_H_938743 */ diff --git a/C/fast-lzma2/lzma2_enc.c b/C/fast-lzma2/lzma2_enc.c index 2201ef45..b68084ef 100644 --- a/C/fast-lzma2/lzma2_enc.c +++ b/C/fast-lzma2/lzma2_enc.c @@ -7,13 
+7,24 @@ Public domain #include #include +#include "fl2_errors.h" #include "fl2_internal.h" -#include "mem.h" #include "lzma2_enc.h" #include "fl2_compress_internal.h" +#include "mem.h" +#include "count.h" #include "radix_mf.h" #include "range_enc.h" -#include "count.h" + +#ifdef FL2_XZ_BUILD +# include "tuklib_integer.h" +# define MEM_readLE32(a) unaligned_read32le(a) + +# ifdef TUKLIB_FAST_UNALIGNED_ACCESS +# define MEM_read16(a) (*(const U16*)(a)) +# endif + +#endif #define kNumReps 4U #define kNumStates 12U @@ -30,7 +41,8 @@ Public domain #define kNumAlignBits 4U #define kAlignTableSize (1U << kNumAlignBits) #define kAlignMask (kAlignTableSize - 1U) -#define kAlignRepriceFrequency kAlignTableSize +#define kMatchRepriceFrequency 64U +#define kRepLenRepriceFrequency 64U #define kStartPosModelIndex 4U #define kEndPosModelIndex 14U @@ -38,7 +50,6 @@ Public domain #define kNumFullDistancesBits (kEndPosModelIndex >> 1U) #define kNumFullDistances (1U << kNumFullDistancesBits) -#define kDistanceRepriceFrequency (1U << 7U) #define kNumPositionBitsMax 4U #define kNumPositionStatesMax (1U << kNumPositionBitsMax) @@ -49,23 +60,28 @@ Public domain #define kLenNumLowBits 3U #define kLenNumLowSymbols (1U << kLenNumLowBits) -#define kLenNumMidBits 3U -#define kLenNumMidSymbols (1U << kLenNumMidBits) #define kLenNumHighBits 8U #define kLenNumHighSymbols (1U << kLenNumHighBits) -#define kLenNumSymbolsTotal (kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols) +#define kLenNumSymbolsTotal (kLenNumLowSymbols * 2 + kLenNumHighSymbols) #define kMatchLenMin 2U #define kMatchLenMax (kMatchLenMin + kLenNumSymbolsTotal - 1U) -#define kOptimizerBufferSize (1U << 12U) +#define kMatchesMax 65U /* Doesn't need to be larger than FL2_HYBRIDCYCLES_MAX + 1 */ + +#define kOptimizerEndSize 32U +#define kOptimizerBufferSize (kMatchLenMax * 2U + kOptimizerEndSize) +#define kOptimizerSkipSize 16U #define kInfinityPrice (1UL << 30U) #define kNullDist (U32)-1 #define kChunkSize ((1UL << 16U) - 8192U) -#define kChunkBufferSize (1UL << 16U) +#define kSqrtChunkSize 239U +#define kTempMinOutput (LZMA_REQUIRED_INPUT_MAX * 4U) +#define kTempBufferSize (kTempMinOutput + kOptimizerBufferSize + kOptimizerBufferSize / 16U) #define kMaxChunkUncompressedSize ((1UL << 21U) - kMatchLenMax) +#define kMaxChunkCompressedSize (1UL << 16U) #define kChunkHeaderSize 5U #define kChunkResetShift 5U #define kChunkUncompressedDictReset 1U @@ -83,80 +99,93 @@ Public domain #define kMinTestChunkSize 0x4000U #define kRandomFilterMarginBits 8U +#define kState_LitAfterMatch 4 +#define kState_LitAfterRep 5 +#define kState_MatchAfterLit 7 +#define kState_RepAfterLit 8 + static const BYTE kLiteralNextStates[kNumStates] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 }; -#define LiteralNextState(s) kLiteralNextStates[s] +#define LIT_NEXT_STATE(s) kLiteralNextStates[s] static const BYTE kMatchNextStates[kNumStates] = { 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10 }; -#define MatchNextState(s) kMatchNextStates[s] +#define MATCH_NEXT_STATE(s) kMatchNextStates[s] static const BYTE kRepNextStates[kNumStates] = { 8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11 }; -#define RepNextState(s) kRepNextStates[s] +#define REP_NEXT_STATE(s) kRepNextStates[s] static const BYTE kShortRepNextStates[kNumStates] = { 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11 }; -#define ShortRepNextState(s) kShortRepNextStates[s] +#define SHORT_REP_NEXT_STATE(s) kShortRepNextStates[s] #include "fastpos_table.h" +#include "radix_get.h" +/* Probabilities and prices for encoding match lengths. 
+ * Two objects of this type are needed, one for normal matches + * and another for rep matches. + */ typedef struct { size_t table_size; unsigned prices[kNumPositionStatesMax][kLenNumSymbolsTotal]; - unsigned counters[kNumPositionStatesMax]; - Probability choice; - Probability choice_2; - Probability low[kNumPositionStatesMax << kLenNumLowBits]; - Probability mid[kNumPositionStatesMax << kLenNumMidBits]; + Probability choice; /* low[0] is choice_2. Must be consecutive for speed */ + Probability low[kNumPositionStatesMax << (kLenNumLowBits + 1)]; Probability high[kLenNumHighSymbols]; } LengthStates; +/* All probabilities for the encoder. This is a separate from the encoder object + * so the state can be saved and restored in case a chunk is not compressible. + */ typedef struct { - U32 reps[kNumReps]; - size_t state; + /* Fields are ordered for speed */ + LengthStates rep_len_states; + Probability is_rep0_long[kNumStates][kNumPositionStatesMax]; + size_t state; + U32 reps[kNumReps]; + + Probability is_match[kNumStates][kNumPositionStatesMax]; Probability is_rep[kNumStates]; Probability is_rep_G0[kNumStates]; Probability is_rep_G1[kNumStates]; Probability is_rep_G2[kNumStates]; - Probability is_rep0_long[kNumStates][kNumPositionStatesMax]; - Probability is_match[kNumStates][kNumPositionStatesMax]; + + LengthStates len_states; Probability dist_slot_encoders[kNumLenToPosStates][1 << kNumPosSlotBits]; Probability dist_align_encoders[1 << kNumAlignBits]; Probability dist_encoders[kNumFullDistances - kEndPosModelIndex]; - LengthStates len_states; - LengthStates rep_len_states; - Probability literal_probs[(kNumLiterals * kNumLitTables) << kLcLpMax]; } EncoderStates; +/* + * Linked list item for optimal parsing + */ typedef struct { size_t state; - U32 reps[kNumReps]; U32 price; - unsigned prev_index; - U32 prev_dist; - unsigned prev_index_2; - U32 prev_dist_2; - BYTE is_combination; - BYTE prev_2; - + unsigned extra; /* 0 : normal + * 1 : LIT : MATCH + * > 1 : MATCH (extra-1) : LIT : REP0 (len) */ + unsigned len; + U32 dist; + U32 reps[kNumReps]; } OptimalNode; -#define MakeAsLiteral(node) (node).prev_dist = kNullDist; (node).is_combination = 0; -#define MakeAsShortRep(node) (node).prev_dist = 0; (node).is_combination = 0; +#define MARK_LITERAL(node) (node).dist = kNullDist; (node).extra = 0; +#define MARK_SHORT_REP(node) (node).dist = 0; (node).extra = 0; +/* + * Table and chain for 3-byte hash. Extra elements in hash_chain_3 are malloced. + */ typedef struct { S32 table_3[1 << kHash3Bits]; S32 hash_chain_3[1]; } HashChains; -typedef struct -{ - U32 length; - U32 dist; -} Match; - -struct FL2_lzmaEncoderCtx_s +/* + * LZMA2 encoder. 
+ */ +struct LZMA2_ECtx_s { unsigned lc; unsigned lp; @@ -173,39 +202,35 @@ struct FL2_lzmaEncoderCtx_s EncoderStates states; unsigned match_price_count; - unsigned align_price_count; + unsigned rep_len_price_count; size_t dist_price_table_size; unsigned align_prices[kAlignTableSize]; unsigned dist_slot_prices[kNumLenToPosStates][kDistTableSizeMax]; unsigned distance_prices[kNumLenToPosStates][kNumFullDistances]; - Match matches[kMatchLenMax-kMatchLenMin]; + RMF_match base_match; /* Allows access to matches[-1] in LZMA_optimalParse */ + RMF_match matches[kMatchesMax]; size_t match_count; OptimalNode opt_buf[kOptimizerBufferSize]; - BYTE* out_buf; - HashChains* hash_buf; ptrdiff_t chain_mask_2; ptrdiff_t chain_mask_3; ptrdiff_t hash_dict_3; ptrdiff_t hash_prev_index; ptrdiff_t hash_alloc_3; + + BYTE out_buf[kTempBufferSize]; }; -FL2_lzmaEncoderCtx* FL2_lzma2Create() +LZMA2_ECtx* LZMA2_createECtx(void) { - FL2_lzmaEncoderCtx* enc = malloc(sizeof(FL2_lzmaEncoderCtx)); - DEBUGLOG(3, "FL2_lzma2Create"); + LZMA2_ECtx *const enc = malloc(sizeof(LZMA2_ECtx)); + DEBUGLOG(3, "LZMA2_createECtx"); if (enc == NULL) return NULL; - enc->out_buf = malloc(kChunkBufferSize); - if (enc->out_buf == NULL) { - free(enc); - return NULL; - } enc->lc = 3; enc->lp = 0; enc->pb = 2; @@ -215,8 +240,8 @@ FL2_lzmaEncoderCtx* FL2_lzma2Create() enc->pos_mask = (1 << enc->pb) - 1; enc->match_cycles = 1; enc->strategy = FL2_ultra; - enc->match_price_count = kDistanceRepriceFrequency; - enc->align_price_count = kAlignRepriceFrequency; + enc->match_price_count = 0; + enc->rep_len_price_count = 0; enc->dist_price_table_size = kDistTableSizeMax; enc->hash_buf = NULL; enc->hash_dict_3 = 0; @@ -225,399 +250,268 @@ FL2_lzmaEncoderCtx* FL2_lzma2Create() return enc; } -void FL2_lzma2Free(FL2_lzmaEncoderCtx* enc) +void LZMA2_freeECtx(LZMA2_ECtx *const enc) { if (enc == NULL) return; free(enc->hash_buf); - free(enc->out_buf); free(enc); } -#define GetLiteralProbs(enc, pos, prev_symbol) (enc->states.literal_probs + ((((pos) & enc->lit_pos_mask) << enc->lc) + ((prev_symbol) >> (8 - enc->lc))) * kNumLiterals * kNumLitTables) +#define LITERAL_PROBS(enc, pos, prev_symbol) (enc->states.literal_probs + ((((pos) & enc->lit_pos_mask) << enc->lc) + ((prev_symbol) >> (8 - enc->lc))) * kNumLiterals * kNumLitTables) -#define GetLenToDistState(len) (((len) < kNumLenToPosStates + 1) ? (len) - 2 : kNumLenToPosStates - 1) +#define LEN_TO_DIST_STATE(len) (((len) < kNumLenToPosStates + 1) ? 
(len) - 2 : kNumLenToPosStates - 1) -#define IsCharState(state) ((state) < 7) +#define IS_LIT_STATE(state) ((state) < 7) HINT_INLINE -unsigned GetRepLen1Price(FL2_lzmaEncoderCtx* enc, size_t state, size_t pos_state) +unsigned LZMA_getRepLen1Price(LZMA2_ECtx* const enc, size_t const state, size_t const pos_state) { - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; - return GET_PRICE_0(enc->rc, rep_G0_prob) + GET_PRICE_0(enc->rc, rep0_long_prob); + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + return GET_PRICE_0(rep_G0_prob) + GET_PRICE_0(rep0_long_prob); } -static unsigned GetRepPrice(FL2_lzmaEncoderCtx* enc, size_t rep_index, size_t state, size_t pos_state) +static unsigned LZMA_getRepPrice(LZMA2_ECtx* const enc, size_t const rep_index, size_t const state, size_t const pos_state) { unsigned price; - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; if (rep_index == 0) { - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; - price = GET_PRICE_0(enc->rc, rep_G0_prob); - price += GET_PRICE_1(enc->rc, rep0_long_prob); + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + price = GET_PRICE_0(rep_G0_prob); + price += GET_PRICE_1(rep0_long_prob); } else { - unsigned rep_G1_prob = enc->states.is_rep_G1[state]; - price = GET_PRICE_1(enc->rc, rep_G0_prob); + unsigned const rep_G1_prob = enc->states.is_rep_G1[state]; + price = GET_PRICE_1(rep_G0_prob); if (rep_index == 1) { - price += GET_PRICE_0(enc->rc, rep_G1_prob); + price += GET_PRICE_0(rep_G1_prob); } else { - unsigned rep_G2_prob = enc->states.is_rep_G2[state]; - price += GET_PRICE_1(enc->rc, rep_G1_prob); - price += GET_PRICE(enc->enc->rc, rep_G2_prob, (U32)(rep_index) - 2); + unsigned const rep_G2_prob = enc->states.is_rep_G2[state]; + price += GET_PRICE_1(rep_G1_prob); + price += GET_PRICE(rep_G2_prob, rep_index - 2); } } return price; } -static unsigned GetRepMatch0Price(FL2_lzmaEncoderCtx* enc, size_t len, size_t state, size_t pos_state) +static unsigned LZMA_getRepMatch0Price(LZMA2_ECtx *const enc, size_t const len, size_t const state, size_t const pos_state) { - unsigned rep_G0_prob = enc->states.is_rep_G0[state]; - unsigned rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; + unsigned const rep_G0_prob = enc->states.is_rep_G0[state]; + unsigned const rep0_long_prob = enc->states.is_rep0_long[state][pos_state]; return enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin] - + GET_PRICE_0(enc->rc, rep_G0_prob) - + GET_PRICE_1(enc->rc, rep0_long_prob); + + GET_PRICE_0(rep_G0_prob) + + GET_PRICE_1(rep0_long_prob); } -static unsigned GetLiteralPriceMatched(RangeEncoder* rc, const Probability *prob_table, U32 symbol, unsigned match_byte) +static unsigned LZMA_getLiteralPriceMatched(const Probability *const prob_table, U32 symbol, unsigned match_byte) { unsigned price = 0; unsigned offs = 0x100; symbol |= 0x100; do { match_byte <<= 1; - price += GET_PRICE(enc->rc, prob_table[offs + (match_byte & offs) + (symbol >> 8)], (symbol >> 7) & 1); + price += GET_PRICE(prob_table[offs + (match_byte & offs) + (symbol >> 8)], (symbol >> 7) & 1); symbol <<= 1; offs &= ~(match_byte ^ symbol); } while (symbol < 0x10000); return price; } -static void EncodeLiteral(FL2_lzmaEncoderCtx* enc, size_t index, U32 symbol, unsigned prev_symbol) +HINT_INLINE +void 
LZMA_encodeLiteral(LZMA2_ECtx *const enc, size_t const index, U32 symbol, unsigned const prev_symbol) { - EncodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); - enc->states.state = LiteralNextState(enc->states.state); + RC_encodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); + enc->states.state = LIT_NEXT_STATE(enc->states.state); - { Probability* prob_table = GetLiteralProbs(enc, index, prev_symbol); - symbol |= 0x100; - do { - EncodeBit(&enc->rc, prob_table + (symbol >> 8), symbol & (1 << 7)); - symbol <<= 1; - } while (symbol < 0x10000); + Probability* const prob_table = LITERAL_PROBS(enc, index, prev_symbol); + symbol |= 0x100; + do { + RC_encodeBit(&enc->rc, prob_table + (symbol >> 8), symbol & (1 << 7)); + symbol <<= 1; + } while (symbol < 0x10000); +} + +HINT_INLINE +void LZMA_encodeLiteralMatched(LZMA2_ECtx *const enc, const BYTE* const data_block, size_t const index, U32 symbol) +{ + RC_encodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); + enc->states.state = LIT_NEXT_STATE(enc->states.state); + + unsigned match_symbol = data_block[index - enc->states.reps[0] - 1]; + Probability* const prob_table = LITERAL_PROBS(enc, index, data_block[index - 1]); + unsigned offset = 0x100; + symbol |= 0x100; + do { + match_symbol <<= 1; + size_t prob_index = offset + (match_symbol & offset) + (symbol >> 8); + RC_encodeBit(&enc->rc, prob_table + prob_index, symbol & (1 << 7)); + symbol <<= 1; + offset &= ~(match_symbol ^ symbol); + } while (symbol < 0x10000); +} + +HINT_INLINE +void LZMA_encodeLiteralBuf(LZMA2_ECtx *const enc, const BYTE* const data_block, size_t const index) +{ + U32 const symbol = data_block[index]; + if (IS_LIT_STATE(enc->states.state)) { + unsigned const prev_symbol = data_block[index - 1]; + LZMA_encodeLiteral(enc, index, symbol, prev_symbol); + } + else { + LZMA_encodeLiteralMatched(enc, data_block, index, symbol); } } -static void EncodeLiteralMatched(FL2_lzmaEncoderCtx* enc, const BYTE* data_block, size_t index, U32 symbol) +static void LZMA_lengthStates_SetPrices(const Probability *probs, U32 start_price, unsigned *prices) { - EncodeBit0(&enc->rc, &enc->states.is_match[enc->states.state][index & enc->pos_mask]); - enc->states.state = LiteralNextState(enc->states.state); + for (size_t i = 0; i < 8; i += 2) { + U32 prob = probs[4 + (i >> 1)]; + U32 price = start_price + GET_PRICE(probs[1], (i >> 2)) + + GET_PRICE(probs[2 + (i >> 2)], (i >> 1) & 1); + prices[i] = price + GET_PRICE_0(prob); + prices[i + 1] = price + GET_PRICE_1(prob); + } +} - { unsigned match_symbol = data_block[index - enc->states.reps[0] - 1]; - Probability* prob_table = GetLiteralProbs(enc, index, data_block[index - 1]); - unsigned offset = 0x100; - symbol |= 0x100; +FORCE_NOINLINE +static void LZMA_lengthStates_updatePrices(LZMA2_ECtx *const enc, LengthStates* const ls) +{ + U32 b; + + { + unsigned const prob = ls->choice; + U32 a, c; + b = GET_PRICE_1(prob); + a = GET_PRICE_0(prob); + c = b + GET_PRICE_0(ls->low[0]); + for (size_t pos_state = 0; pos_state <= enc->pos_mask; pos_state++) { + unsigned *const prices = ls->prices[pos_state]; + const Probability *const probs = ls->low + (pos_state << (1 + kLenNumLowBits)); + LZMA_lengthStates_SetPrices(probs, a, prices); + LZMA_lengthStates_SetPrices(probs + kLenNumLowSymbols, c, prices + kLenNumLowSymbols); + } + } + + size_t i = ls->table_size; + + if (i > kLenNumLowSymbols * 2) { + const Probability *const probs = ls->high; + unsigned *const prices = 
ls->prices[0] + kLenNumLowSymbols * 2; + i = (i - (kLenNumLowSymbols * 2 - 1)) >> 1; + b += GET_PRICE_1(ls->low[0]); do { - match_symbol <<= 1; - size_t prob_index = offset + (match_symbol & offset) + (symbol >> 8); - EncodeBit(&enc->rc, prob_table + prob_index, symbol & (1 << 7)); - symbol <<= 1; - offset &= ~(match_symbol ^ symbol); - } while (symbol < 0x10000); + --i; + size_t sym = i + (1 << (kLenNumHighBits - 1)); + U32 price = b; + do { + size_t bit = sym & 1; + sym >>= 1; + price += GET_PRICE(probs[sym], bit); + } while (sym >= 2); + + unsigned const prob = probs[i + (1 << (kLenNumHighBits - 1))]; + prices[i * 2] = price + GET_PRICE_0(prob); + prices[i * 2 + 1] = price + GET_PRICE_1(prob); + } while (i); + + size_t const size = (ls->table_size - kLenNumLowSymbols * 2) * sizeof(ls->prices[0][0]); + for (size_t pos_state = 1; pos_state <= enc->pos_mask; pos_state++) + memcpy(ls->prices[pos_state] + kLenNumLowSymbols * 2, ls->prices[0] + kLenNumLowSymbols * 2, size); + } +} + +/* Rare enough that not inlining is faster overall */ +FORCE_NOINLINE +static void LZMA_encodeLength_MidHigh(LZMA2_ECtx *const enc, LengthStates* const len_prob_table, unsigned const len, size_t const pos_state) +{ + RC_encodeBit1(&enc->rc, &len_prob_table->choice); + if (len < kLenNumLowSymbols * 2) { + RC_encodeBit0(&enc->rc, &len_prob_table->low[0]); + RC_encodeBitTree(&enc->rc, len_prob_table->low + kLenNumLowSymbols + (pos_state << (1 + kLenNumLowBits)), kLenNumLowBits, len - kLenNumLowSymbols); + } + else { + RC_encodeBit1(&enc->rc, &len_prob_table->low[0]); + RC_encodeBitTree(&enc->rc, len_prob_table->high, kLenNumHighBits, len - kLenNumLowSymbols * 2); } } HINT_INLINE -void EncodeLiteralBuf(FL2_lzmaEncoderCtx* enc, const BYTE* data_block, size_t index) -{ - U32 symbol = data_block[index]; - if (IsCharState(enc->states.state)) { - unsigned prev_symbol = data_block[index - 1]; - EncodeLiteral(enc, index, symbol, prev_symbol); - } - else { - EncodeLiteralMatched(enc, data_block, index, symbol); - } -} - -static size_t RMF_bitpackExtendMatch(const BYTE* const data, - const U32* const table, - ptrdiff_t const start_index, - ptrdiff_t limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t dist = start_index - link; - if (limit > start_index + (ptrdiff_t)kMatchLenMax) - limit = start_index + kMatchLenMax; - while (end_index < limit && end_index - (ptrdiff_t)(table[end_index] & RADIX_LINK_MASK) == dist) { - end_index += table[end_index] >> RADIX_LINK_BITS; - } - if (end_index >= limit) { - DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); - return end_index - start_index; -} - -#define GetMatchLink(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] - -#define GetMatchLength(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] - -static size_t RMF_structuredExtendMatch(const BYTE* const data, - const U32* const table, - ptrdiff_t const start_index, - ptrdiff_t limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t dist = start_index - link; - if 
(limit > start_index + (ptrdiff_t)kMatchLenMax) - limit = start_index + kMatchLenMax; - while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(table, end_index) == dist) { - end_index += GetMatchLength(table, end_index); - } - if (end_index >= limit) { - DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); - return end_index - start_index; -} - -FORCE_INLINE_TEMPLATE -Match FL2_radixGetMatch(FL2_dataBlock block, - FL2_matchTable* tbl, - unsigned max_depth, - int structTbl, - size_t index) -{ - if (structTbl) - { - Match match; - U32 link = GetMatchLink(tbl->table, index); - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = GetMatchLength(tbl->table, index); - dist = index - link - 1; - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } - else { - Match match; - U32 link = tbl->table[index]; - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = link >> RADIX_LINK_BITS; - link &= RADIX_LINK_MASK; - dist = index - link - 1; - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } -} - -FORCE_INLINE_TEMPLATE -Match FL2_radixGetNextMatch(FL2_dataBlock block, - FL2_matchTable* tbl, - unsigned max_depth, - int structTbl, - size_t index) -{ - if (structTbl) - { - Match match; - U32 link = GetMatchLink(tbl->table, index); - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = GetMatchLength(tbl->table, index); - dist = index - link - 1; - if (link - 1 == GetMatchLink(tbl->table, index - 1)) { - /* same as the previous match, one byte shorter */ - return match; - } - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } - else { - Match match; - U32 link = tbl->table[index]; - size_t length; - size_t dist; - match.length = 0; - if (link == RADIX_NULL_LINK) - return match; - length = link >> RADIX_LINK_BITS; - link &= RADIX_LINK_MASK; - dist = index - link - 1; - if (link - 1 == (tbl->table[index - 1] & RADIX_LINK_MASK)) { - /* same distance, one byte shorter */ - return match; - } - if (length > block.end - index) { - match.length = (U32)(block.end - index); - } - else if (length == max_depth - 
|| length == BITPACK_MAX_LENGTH /* from HandleRepeat */) - { - match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); - } - else { - match.length = (U32)length; - } - match.dist = (U32)dist; - return match; - } -} - -static void LengthStates_SetPrices(RangeEncoder* rc, LengthStates* ls, size_t pos_state) -{ - unsigned prob = ls->choice; - unsigned a0 = GET_PRICE_0(rc, prob); - unsigned a1 = GET_PRICE_1(rc, prob); - unsigned b0, b1; - size_t i = 0; - prob = ls->choice_2; - b0 = a1 + GET_PRICE_0(rc, prob); - b1 = a1 + GET_PRICE_1(rc, prob); - for (; i < kLenNumLowSymbols && i < ls->table_size; ++i) { - ls->prices[pos_state][i] = a0 + GetTreePrice(rc, ls->low + (pos_state << kLenNumLowBits), kLenNumLowBits, i); - } - for (; i < kLenNumLowSymbols + kLenNumMidSymbols && i < ls->table_size; ++i) { - ls->prices[pos_state][i] = b0 + GetTreePrice(rc, ls->mid + (pos_state << kLenNumMidBits), kLenNumMidBits, i - kLenNumLowSymbols); - } - for (; i < ls->table_size; ++i) { - ls->prices[pos_state][i] = b1 + GetTreePrice(rc, ls->high, kLenNumHighBits, i - kLenNumLowSymbols - kLenNumMidSymbols); - } - ls->counters[pos_state] = (unsigned)(ls->table_size); -} - -static void EncodeLength(FL2_lzmaEncoderCtx* enc, LengthStates* len_prob_table, unsigned len, size_t pos_state) +void LZMA_encodeLength(LZMA2_ECtx *const enc, LengthStates* const len_prob_table, unsigned len, size_t const pos_state) { len -= kMatchLenMin; if (len < kLenNumLowSymbols) { - EncodeBit0(&enc->rc, &len_prob_table->choice); - EncodeBitTree(&enc->rc, len_prob_table->low + (pos_state << kLenNumLowBits), kLenNumLowBits, len); + RC_encodeBit0(&enc->rc, &len_prob_table->choice); + RC_encodeBitTree(&enc->rc, len_prob_table->low + (pos_state << (1 + kLenNumLowBits)), kLenNumLowBits, len); } else { - EncodeBit1(&enc->rc, &len_prob_table->choice); - if (len < kLenNumLowSymbols + kLenNumMidSymbols) { - EncodeBit0(&enc->rc, &len_prob_table->choice_2); - EncodeBitTree(&enc->rc, len_prob_table->mid + (pos_state << kLenNumMidBits), kLenNumMidBits, len - kLenNumLowSymbols); - } - else { - EncodeBit1(&enc->rc, &len_prob_table->choice_2); - EncodeBitTree(&enc->rc, len_prob_table->high, kLenNumHighBits, len - kLenNumLowSymbols - kLenNumMidSymbols); - } - } - if (enc->strategy != FL2_fast && --len_prob_table->counters[pos_state] == 0) { - LengthStates_SetPrices(&enc->rc, len_prob_table, pos_state); + LZMA_encodeLength_MidHigh(enc, len_prob_table, len, pos_state); } } -static void EncodeRepMatch(FL2_lzmaEncoderCtx* enc, unsigned len, unsigned rep, size_t pos_state) +FORCE_NOINLINE +static void LZMA_encodeRepMatchShort(LZMA2_ECtx *const enc, size_t const pos_state) { - DEBUGLOG(7, "EncodeRepMatch : length %u, rep %u", len, rep); - EncodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); - EncodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); + DEBUGLOG(7, "LZMA_encodeRepMatchShort"); + RC_encodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state]); + enc->states.state = SHORT_REP_NEXT_STATE(enc->states.state); +} + +FORCE_NOINLINE +static void LZMA_encodeRepMatchLong(LZMA2_ECtx *const enc, unsigned const len, unsigned const rep, size_t const pos_state) +{ + DEBUGLOG(7, "LZMA_encodeRepMatchLong : length %u, rep %u", len, rep); + RC_encodeBit1(&enc->rc, 
&enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep[enc->states.state]); if (rep == 0) { - EncodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); - EncodeBit(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state], ((len == 1) ? 0 : 1)); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + RC_encodeBit1(&enc->rc, &enc->states.is_rep0_long[enc->states.state][pos_state]); } else { - U32 distance = enc->states.reps[rep]; - EncodeBit1(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); + U32 const distance = enc->states.reps[rep]; + RC_encodeBit1(&enc->rc, &enc->states.is_rep_G0[enc->states.state]); if (rep == 1) { - EncodeBit0(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); } else { - EncodeBit1(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); - EncodeBit(&enc->rc, &enc->states.is_rep_G2[enc->states.state], rep - 2); - if (rep == 3) { + RC_encodeBit1(&enc->rc, &enc->states.is_rep_G1[enc->states.state]); + RC_encodeBit(&enc->rc, &enc->states.is_rep_G2[enc->states.state], rep - 2); + if (rep == 3) enc->states.reps[3] = enc->states.reps[2]; - } enc->states.reps[2] = enc->states.reps[1]; } enc->states.reps[1] = enc->states.reps[0]; enc->states.reps[0] = distance; } - if (len == 1) { - enc->states.state = ShortRepNextState(enc->states.state); - } - else { - EncodeLength(enc, &enc->states.rep_len_states, len, pos_state); - enc->states.state = RepNextState(enc->states.state); - } + LZMA_encodeLength(enc, &enc->states.rep_len_states, len, pos_state); + enc->states.state = REP_NEXT_STATE(enc->states.state); + ++enc->rep_len_price_count; } -/* *****************************************/ -/* Distance slot functions based on fastpos.h from XZ*/ + +/* + * Distance slot functions based on fastpos.h from XZ + */ HINT_INLINE -unsigned FastDistShift(unsigned n) +unsigned LZMA_fastDistShift(unsigned const n) { return n * (kFastDistBits - 1); } HINT_INLINE -unsigned FastDistResult(U32 dist, unsigned n) +unsigned LZMA_fastDistResult(U32 const dist, unsigned const n) { - return distance_table[dist >> FastDistShift(n)] - + 2 * FastDistShift(n); + return distance_table[dist >> LZMA_fastDistShift(n)] + + 2 * LZMA_fastDistShift(n); } -static size_t GetDistSlot(U32 distance) +static size_t LZMA_getDistSlot(U32 const distance) { U32 limit = 1UL << kFastDistBits; /* If it is small enough, we can pick the result directly from */ @@ -625,193 +519,192 @@ static size_t GetDistSlot(U32 distance) if (distance < limit) { return distance_table[distance]; } - limit <<= FastDistShift(1); + limit <<= LZMA_fastDistShift(1); if (distance < limit) { - return FastDistResult(distance, 1); + return LZMA_fastDistResult(distance, 1); } - return FastDistResult(distance, 2); + return LZMA_fastDistResult(distance, 2); } -/* **************************************** */ +/* * */ -static void EncodeNormalMatch(FL2_lzmaEncoderCtx* enc, unsigned len, U32 dist, size_t pos_state) + +HINT_INLINE +void LZMA_encodeNormalMatch(LZMA2_ECtx *const enc, unsigned const len, U32 const dist, size_t const pos_state) { - DEBUGLOG(7, "EncodeNormalMatch : length %u, dist %u", len, dist); - EncodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); - EncodeBit0(&enc->rc, &enc->states.is_rep[enc->states.state]); - enc->states.state = MatchNextState(enc->states.state); - EncodeLength(enc, &enc->states.len_states, len, pos_state); + DEBUGLOG(7, "LZMA_encodeNormalMatch : length %u, dist 
%u", len, dist); + RC_encodeBit1(&enc->rc, &enc->states.is_match[enc->states.state][pos_state]); + RC_encodeBit0(&enc->rc, &enc->states.is_rep[enc->states.state]); + enc->states.state = MATCH_NEXT_STATE(enc->states.state); - { size_t dist_slot = GetDistSlot(dist); - EncodeBitTree(&enc->rc, enc->states.dist_slot_encoders[GetLenToDistState(len)], kNumPosSlotBits, (unsigned)(dist_slot)); - if (dist_slot >= kStartPosModelIndex) { - unsigned footerBits = ((unsigned)(dist_slot >> 1) - 1); - size_t base = ((2 | (dist_slot & 1)) << footerBits); - unsigned posReduced = (unsigned)(dist - base); - if (dist_slot < kEndPosModelIndex) { - EncodeBitTreeReverse(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, footerBits, posReduced); - } - else { - EncodeDirect(&enc->rc, posReduced >> kNumAlignBits, footerBits - kNumAlignBits); - EncodeBitTreeReverse(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, posReduced & kAlignMask); - ++enc->align_price_count; - } + LZMA_encodeLength(enc, &enc->states.len_states, len, pos_state); + + size_t const dist_slot = LZMA_getDistSlot(dist); + RC_encodeBitTree(&enc->rc, enc->states.dist_slot_encoders[LEN_TO_DIST_STATE(len)], kNumPosSlotBits, (unsigned)dist_slot); + if (dist_slot >= kStartPosModelIndex) { + unsigned const footer_bits = ((unsigned)(dist_slot >> 1) - 1); + size_t const base = ((2 | (dist_slot & 1)) << footer_bits); + unsigned const dist_reduced = (unsigned)(dist - base); + if (dist_slot < kEndPosModelIndex) { + RC_encodeBitTreeReverse(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, footer_bits, dist_reduced); + } + else { + RC_encodeDirect(&enc->rc, dist_reduced >> kNumAlignBits, footer_bits - kNumAlignBits); + RC_encodeBitTreeReverse(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, dist_reduced & kAlignMask); } } enc->states.reps[3] = enc->states.reps[2]; enc->states.reps[2] = enc->states.reps[1]; enc->states.reps[1] = enc->states.reps[0]; enc->states.reps[0] = dist; + ++enc->match_price_count; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - FORCE_INLINE_TEMPLATE -size_t EncodeChunkFast(FL2_lzmaEncoderCtx* enc, +size_t LZMA_encodeChunkFast(LZMA2_ECtx *const enc, FL2_dataBlock const block, - FL2_matchTable* tbl, - int structTbl, + FL2_matchTable* const tbl, + int const struct_tbl, size_t index, - size_t uncompressed_end) + size_t const uncompressed_end) { size_t const pos_mask = enc->pos_mask; size_t prev = index; - unsigned search_depth = tbl->params.depth; - while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) - { + unsigned const search_depth = tbl->params.depth; + while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) { size_t max_len; const BYTE* data; /* Table of distance restrictions for short matches */ static const U32 max_dist_table[] = { 0, 0, 0, 1 << 6, 1 << 14 }; /* Get a match from the table, extended to its full length */ - Match bestMatch = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); - if (bestMatch.length < kMatchLenMin) { + RMF_match best_match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); + if (best_match.length < kMatchLenMin) { ++index; continue; } /* Use if near enough */ - if (bestMatch.length >= 5 || bestMatch.dist < max_dist_table[bestMatch.length]) { - bestMatch.dist += kNumReps; - } - else { - bestMatch.length = 0; - } + if (best_match.length >= 5 || best_match.dist < max_dist_table[best_match.length]) + best_match.dist += kNumReps; + 
else + best_match.length = 0; + max_len = MIN(kMatchLenMax, block.end - index); data = block.data + index; - { Match bestRep; - Match repMatch; - bestRep.length = 0; - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { - continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length >= max_len) { - bestMatch = repMatch; - goto _encode; - } - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } - } - if (bestMatch.length >= max_len) + RMF_match best_rep = { 0, 0 }; + RMF_match rep_match; + /* Search all of the rep distances */ + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) + continue; + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length >= max_len) { + best_match = rep_match; goto _encode; - if (bestRep.length >= 2) { - int const gain2 = (int)(bestRep.length * 3 - bestRep.dist); - int const gain1 = (int)(bestMatch.length * 3 - ZSTD_highbit32(bestMatch.dist + 1) + 1); + } + if (rep_match.length > best_rep.length) + best_rep = rep_match; + } + /* Encode if it is kMatchLenMax or completes the block */ + if (best_match.length >= max_len) + goto _encode; + + if (best_rep.length >= 2) { + if (best_rep.length > best_match.length) { + best_match = best_rep; + } + else { + /* Modified ZSTD scheme for estimating cost */ + int const gain2 = (int)(best_rep.length * 3 - best_rep.dist); + int const gain1 = (int)(best_match.length * 3 - ZSTD_highbit32(best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; } } } - if (bestMatch.length < kMatchLenMin) { + if (best_match.length < kMatchLenMin) { ++index; continue; } - for (size_t next = index + 1; bestMatch.length < kMatchLenMax && next < uncompressed_end; ++next) { + for (size_t next = index + 1; best_match.length < kMatchLenMax && next < uncompressed_end; ++next) { /* lazy matching scheme from ZSTD */ - Match next_match = FL2_radixGetNextMatch(block, tbl, search_depth, structTbl, next); + RMF_match next_match = RMF_getNextMatch(block, tbl, search_depth, struct_tbl, next); if (next_match.length >= kMatchLenMin) { - Match bestRep; - Match repMatch; - bestRep.length = 0; + best_rep.length = 0; data = block.data + next; max_len = MIN(kMatchLenMax, block.end - next); - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length > best_rep.length) + best_rep = rep_match; } - if (bestRep.length >= 3) { - int const gain2 
= (int)(bestRep.length * 3 - bestRep.dist); - int const gain1 = (int)(bestMatch.length * 3 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 1); + if (best_rep.length >= 3) { + int const gain2 = (int)(best_rep.length * 3 - best_rep.dist); + int const gain1 = (int)(best_match.length * 3 - ZSTD_highbit32((U32)best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; index = next; } } - if (next_match.length >= 3 && next_match.dist != bestMatch.dist) { + if (next_match.length >= 3 && next_match.dist != best_match.dist) { int const gain2 = (int)(next_match.length * 4 - ZSTD_highbit32((U32)next_match.dist + 1)); /* raw approx */ - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 4); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 4); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", bestMatch.length, bestMatch.dist, next_match.length, next_match.dist + kNumReps); - bestMatch = next_match; - bestMatch.dist += kNumReps; + DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", best_match.length, best_match.dist, next_match.length, next_match.dist + kNumReps); + best_match = next_match; + best_match.dist += kNumReps; index = next; continue; } } } if (next < uncompressed_end - 4) { - Match bestRep; - Match repMatch; ++next; - next_match = FL2_radixGetNextMatch(block, tbl, search_depth, structTbl, next); + + next_match = RMF_getNextMatch(block, tbl, search_depth, struct_tbl, next); if (next_match.length < 4) break; + data = block.data + next; max_len = MIN(kMatchLenMax, block.end - next); - bestRep.length = 0; - for (repMatch.dist = 0; repMatch.dist < kNumReps; ++repMatch.dist) { - const BYTE *data_2 = data - enc->states.reps[repMatch.dist] - 1; - if (MEM_read16(data) != MEM_read16(data_2)) { + best_rep.length = 0; + + for (rep_match.dist = 0; rep_match.dist < kNumReps; ++rep_match.dist) { + const BYTE *data_2 = data - enc->states.reps[rep_match.dist] - 1; + if (MEM_read16(data) != MEM_read16(data_2)) continue; - } - repMatch.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); - if (repMatch.length > bestRep.length) { - bestRep = repMatch; - } + + rep_match.length = (U32)(ZSTD_count(data + 2, data_2 + 2, data + max_len) + 2); + if (rep_match.length > best_rep.length) + best_rep = rep_match; } - if (bestRep.length >= 4) { - int const gain2 = (int)(bestRep.length * 4 - (bestRep.dist >> 1)); - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 1); + if (best_rep.length >= 4) { + int const gain2 = (int)(best_rep.length * 4 - (best_rep.dist >> 1)); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 1); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", bestMatch.length, bestMatch.dist, bestRep.length, bestRep.dist); - bestMatch = bestRep; + DEBUGLOG(7, "Replace match (%u, %u) with rep (%u, %u)", best_match.length, best_match.dist, best_rep.length, best_rep.dist); + best_match = best_rep; index = next; } } - if (next_match.length >= 4 && next_match.dist != bestMatch.dist) { + if (next_match.length >= 4 && next_match.dist != best_match.dist) { int const gain2 = (int)(next_match.length * 4 - 
ZSTD_highbit32((U32)next_match.dist + 1)); - int const gain1 = (int)(bestMatch.length * 4 - ZSTD_highbit32((U32)bestMatch.dist + 1) + 7); + int const gain1 = (int)(best_match.length * 4 - ZSTD_highbit32((U32)best_match.dist + 1) + 7); if (gain2 > gain1) { - DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", bestMatch.length, bestMatch.dist, next_match.length, next_match.dist + kNumReps); - bestMatch = next_match; - bestMatch.dist += kNumReps; + DEBUGLOG(7, "Replace match (%u, %u) with match (%u, %u)", best_match.length, best_match.dist, next_match.length, next_match.dist + kNumReps); + best_match = next_match; + best_match.dist += kNumReps; index = next; continue; } @@ -821,134 +714,165 @@ size_t EncodeChunkFast(FL2_lzmaEncoderCtx* enc, break; } _encode: - assert(index + bestMatch.length <= block.end); - while (prev < index && enc->rc.out_index < enc->rc.chunk_size) { - if (block.data[prev] == block.data[prev - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, prev & pos_mask); + assert(index + best_match.length <= block.end); + + /* Chunk overflow size is kOptimizerBufferSize + extra. + * Unlikely for this limit to be hit. */ + size_t rc_end = enc->rc.chunk_size + kOptimizerBufferSize; + while (prev < index && enc->rc.out_index < rc_end) { + if (block.data[prev] != block.data[prev - enc->states.reps[0] - 1]) { + LZMA_encodeLiteralBuf(enc, block.data, prev); + ++prev; } else { - EncodeLiteralBuf(enc, block.data, prev); + LZMA_encodeRepMatchShort(enc, prev & pos_mask); + ++prev; } - ++prev; } - if (enc->rc.out_index >= enc->rc.chunk_size) { + if (prev < index) break; - } - if(bestMatch.length >= kMatchLenMin) { - if (bestMatch.dist < kNumReps) { - EncodeRepMatch(enc, bestMatch.length, bestMatch.dist, index & pos_mask); + + if(best_match.length >= kMatchLenMin) { + if (best_match.dist >= kNumReps) { + LZMA_encodeNormalMatch(enc, best_match.length, best_match.dist - kNumReps, index & pos_mask); + index += best_match.length; + prev = index; } else { - EncodeNormalMatch(enc, bestMatch.length, bestMatch.dist - kNumReps, index & pos_mask); + LZMA_encodeRepMatchLong(enc, best_match.length, best_match.dist, index & pos_mask); + index += best_match.length; + prev = index; } - index += bestMatch.length; - prev = index; } } while (prev < index && enc->rc.out_index < enc->rc.chunk_size) { - if (block.data[prev] == block.data[prev - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, prev & pos_mask); - } - else { - EncodeLiteralBuf(enc, block.data, prev); - } + if (block.data[prev] != block.data[prev - enc->states.reps[0] - 1]) + LZMA_encodeLiteralBuf(enc, block.data, prev); + else + LZMA_encodeRepMatchShort(enc, prev & pos_mask); ++prev; } - Flush(&enc->rc); return prev; } -/* Reverse the direction of the linked list generated by the optimal parser */ -static void ReverseOptimalChain(OptimalNode* opt_buf, size_t cur) +/* + * Reverse the direction of the linked list generated by the optimal parser + */ +FORCE_NOINLINE +static void LZMA_reverseOptimalChain(OptimalNode* const opt_buf, size_t cur) { - size_t next_index = opt_buf[cur].prev_index; - U32 next_dist = opt_buf[cur].prev_dist; - do - { - if (opt_buf[cur].is_combination) - { - MakeAsLiteral(opt_buf[next_index]); - opt_buf[next_index].prev_index = (unsigned)(next_index - 1); - if (opt_buf[cur].prev_2) - { - opt_buf[next_index - 1].is_combination = 0; - opt_buf[next_index - 1].prev_index = opt_buf[cur].prev_index_2; - opt_buf[next_index - 1].prev_dist = opt_buf[cur].prev_dist_2; + unsigned len = (unsigned)opt_buf[cur].len; + U32 
dist = opt_buf[cur].dist; + + for(;;) { + unsigned const extra = (unsigned)opt_buf[cur].extra; + cur -= len; + + if (extra) { + opt_buf[cur].len = (U32)len; + len = extra; + if (extra == 1) { + opt_buf[cur].dist = dist; + dist = kNullDist; + --cur; + } + else { + opt_buf[cur].dist = 0; + --cur; + --len; + opt_buf[cur].dist = kNullDist; + opt_buf[cur].len = 1; + cur -= len; } } - { U32 temp = opt_buf[next_index].prev_dist; - opt_buf[next_index].prev_dist = next_dist; - next_dist = temp; - } + unsigned const next_len = opt_buf[cur].len; + U32 const next_dist = opt_buf[cur].dist; - { size_t prev_index = next_index; - next_index = opt_buf[prev_index].prev_index; - opt_buf[prev_index].prev_index = (unsigned)(cur); - cur = prev_index; - } - } while (cur != 0); + opt_buf[cur].dist = dist; + opt_buf[cur].len = (U32)len; + + if (cur == 0) + break; + + len = next_len; + dist = next_dist; + } } -static unsigned GetLiteralPrice(FL2_lzmaEncoderCtx* enc, size_t index, size_t state, unsigned prev_symbol, U32 symbol, unsigned match_byte) +static unsigned LZMA_getLiteralPrice(LZMA2_ECtx *const enc, size_t const index, size_t const state, unsigned const prev_symbol, U32 symbol, unsigned const match_byte) { - const Probability* prob_table = GetLiteralProbs(enc, index, prev_symbol); - if (IsCharState(state)) { + const Probability* const prob_table = LITERAL_PROBS(enc, index, prev_symbol); + if (IS_LIT_STATE(state)) { unsigned price = 0; symbol |= 0x100; do { - price += GET_PRICE(enc->rc, prob_table[symbol >> 8], (symbol >> 7) & 1); + price += GET_PRICE(prob_table[symbol >> 8], (symbol >> 7) & 1); symbol <<= 1; } while (symbol < 0x10000); return price; } - return GetLiteralPriceMatched(&enc->rc, prob_table, symbol, match_byte); + return LZMA_getLiteralPriceMatched(prob_table, symbol, match_byte); } -static void HashReset(FL2_lzmaEncoderCtx* enc, unsigned dictionary_bits_3) +/* + * Reset the hash object for encoding a new slice of a block + */ +static void LZMA_hashReset(LZMA2_ECtx *const enc, unsigned const dictionary_bits_3) { enc->hash_dict_3 = (ptrdiff_t)1 << dictionary_bits_3; enc->chain_mask_3 = enc->hash_dict_3 - 1; memset(enc->hash_buf->table_3, 0xFF, sizeof(enc->hash_buf->table_3)); } -static int HashCreate(FL2_lzmaEncoderCtx* enc, unsigned dictionary_bits_3) +/* + * Create hash table and chain with dict size dictionary_bits_3. Frees any existing object. + */ +static int LZMA_hashCreate(LZMA2_ECtx *const enc, unsigned const dictionary_bits_3) { DEBUGLOG(3, "Create hash chain : dict bits %u", dictionary_bits_3); - if (enc->hash_buf) { + + if (enc->hash_buf) free(enc->hash_buf); - } + enc->hash_alloc_3 = (ptrdiff_t)1 << dictionary_bits_3; enc->hash_buf = malloc(sizeof(HashChains) + (enc->hash_alloc_3 - 1) * sizeof(S32)); + if (enc->hash_buf == NULL) return 1; - HashReset(enc, dictionary_bits_3); + + LZMA_hashReset(enc, dictionary_bits_3); + return 0; } -/* Create a hash chain for hybrid mode */ -int FL2_lzma2HashAlloc(FL2_lzmaEncoderCtx* enc, const FL2_lzma2Parameters* options) +/* Create a hash chain for hybrid mode if options require one. + * Used for allocating before compression begins. Any existing table will be reused if + * it is at least as large as required. 
+ */ +int LZMA2_hashAlloc(LZMA2_ECtx *const enc, const FL2_lzma2Parameters* const options) { - if (enc->strategy == FL2_ultra && enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) { - return HashCreate(enc, options->second_dict_bits); - } + if (enc->strategy == FL2_ultra && enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) + return LZMA_hashCreate(enc, options->second_dict_bits); + return 0; } #define GET_HASH_3(data) ((((MEM_readLE32(data)) << 8) * 506832829U) >> (32 - kHash3Bits)) +/* Find matches nearer than the match from the RMF. If none is at least as long as + * the RMF match (most likely), insert that match at the end of the list. + */ HINT_INLINE -size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - ptrdiff_t index, - size_t length_limit, - Match match) +size_t LZMA_hashGetMatches(LZMA2_ECtx *const enc, FL2_dataBlock const block, + ptrdiff_t const index, + size_t const length_limit, + RMF_match const match) { ptrdiff_t const hash_dict_3 = enc->hash_dict_3; const BYTE* data = block.data; - HashChains* tbl = enc->hash_buf; + HashChains* const tbl = enc->hash_buf; ptrdiff_t const chain_mask_3 = enc->chain_mask_3; - size_t max_len; - ptrdiff_t first_3; enc->match_count = 0; enc->hash_prev_index = MAX(enc->hash_prev_index, index - hash_dict_3); @@ -959,15 +883,16 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, tbl->table_3[hash] = (S32)enc->hash_prev_index; } data += index; - max_len = 2; - { size_t hash = GET_HASH_3(data); - first_3 = tbl->table_3[hash]; - tbl->table_3[hash] = (S32)(index); - } + size_t const hash = GET_HASH_3(data); + ptrdiff_t const first_3 = tbl->table_3[hash]; + tbl->table_3[hash] = (S32)index; + + size_t max_len = 2; + if (first_3 >= 0) { int cycles = enc->match_cycles; - ptrdiff_t end_index = index - (((ptrdiff_t)match.dist < hash_dict_3) ? match.dist : hash_dict_3); + ptrdiff_t const end_index = index - (((ptrdiff_t)match.dist < hash_dict_3) ? match.dist : hash_dict_3); ptrdiff_t match_3 = first_3; if (match_3 >= end_index) { do { @@ -979,9 +904,8 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, enc->matches[enc->match_count].dist = (U32)(index - match_3 - 1); ++enc->match_count; max_len = len_test; - if (len_test >= length_limit) { + if (len_test >= length_limit) break; - } } if (cycles <= 0) break; @@ -990,7 +914,8 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, } } tbl->hash_chain_3[index & chain_mask_3] = (S32)first_3; - if ((unsigned)(max_len) < match.length) { + if ((unsigned)max_len < match.length) { + /* Insert the match from the RMF */ enc->matches[enc->match_count] = match; ++enc->match_count; return match.length; @@ -998,181 +923,167 @@ size_t HashGetMatches(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, return max_len; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - -/* The speed of this function is critical and the sections have so many variables -* in common that breaking it up would be inefficient. +/* The speed of this function is critical. The sections have many variables +* in common, so breaking it up into shorter functions is not feasible. * For each position cur, starting at 1, check some or all possible * encoding choices - a literal, 1-byte rep 0 match, all rep match lengths, and * all match lengths at available distances. It also checks the combined -* sequences literal+rep0, rep+rep0 and match+rep0. 
+* sequences literal+rep0, rep+lit+rep0 and match+lit+rep0. * If is_hybrid != 0, this method works in hybrid mode, using the * hash chain to find shorter matches at near distances. */ FORCE_INLINE_TEMPLATE -size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, - Match match, +size_t LZMA_optimalParse(LZMA2_ECtx* const enc, FL2_dataBlock const block, + RMF_match match, size_t const index, size_t const cur, size_t len_end, int const is_hybrid, U32* const reps) { - OptimalNode* cur_opt = &enc->opt_buf[cur]; - size_t prev_index = cur_opt->prev_index; - size_t state = enc->opt_buf[prev_index].state; + OptimalNode* const cur_opt = &enc->opt_buf[cur]; size_t const pos_mask = enc->pos_mask; - size_t pos_state = (index & pos_mask); - const BYTE* data = block.data + index; + size_t const pos_state = (index & pos_mask); + const BYTE* const data = block.data + index; size_t const fast_length = enc->fast_length; + size_t prev_index = cur - cur_opt->len; + size_t state; size_t bytes_avail; - size_t max_length; - size_t start_len; U32 match_price; U32 rep_match_price; - Probability is_rep_prob; - if (cur_opt->is_combination) { - --prev_index; - if (cur_opt->prev_2) { - state = enc->opt_buf[cur_opt->prev_index_2].state; - if (cur_opt->prev_dist_2 < kNumReps) { - state = RepNextState(state); - } - else { - state = MatchNextState(state); - } + /* Update the states according to how this location was reached */ + if (cur_opt->len == 1) { + /* Literal or 1-byte rep */ + const BYTE *next_state = (cur_opt->dist == 0) ? kShortRepNextStates : kLiteralNextStates; + state = next_state[enc->opt_buf[prev_index].state]; + } + else { + /* Match or rep match */ + size_t const dist = cur_opt->dist; + + if (cur_opt->extra) { + prev_index -= cur_opt->extra; + state = kState_RepAfterLit - ((dist >= kNumReps) & (cur_opt->extra == 1)); } else { state = enc->opt_buf[prev_index].state; + state = MATCH_NEXT_STATE(state) + (dist < kNumReps); } - state = LiteralNextState(state); - } - if (prev_index == cur - 1) { - if (cur_opt->prev_dist == 0) { - state = ShortRepNextState(state); - } - else { - state = LiteralNextState(state); - } - } - else { - size_t dist; - if (cur_opt->is_combination && cur_opt->prev_2) { - prev_index = cur_opt->prev_index_2; - dist = cur_opt->prev_dist_2; - state = RepNextState(state); - } - else { - dist = cur_opt->prev_dist; - if (dist < kNumReps) { - state = RepNextState(state); - } - else { - state = MatchNextState(state); - } - } - const OptimalNode* prev_opt = &enc->opt_buf[prev_index]; + const OptimalNode *const prev_opt = &enc->opt_buf[prev_index]; if (dist < kNumReps) { - size_t i = 1; + /* Move the chosen rep to the front. 
+ * The table is hideous but faster than branching :D */ reps[0] = prev_opt->reps[dist]; - for (; i <= dist; ++i) { - reps[i] = prev_opt->reps[i - 1]; - } - for (; i < kNumReps; ++i) { - reps[i] = prev_opt->reps[i]; - } + size_t table = 1 | (2 << 2) | (3 << 4) + | (0 << 8) | (2 << 10) | (3 << 12) + | (0L << 16) | (1L << 18) | (3L << 20) + | (0L << 24) | (1L << 26) | (2L << 28); + table >>= (dist << 3); + reps[1] = prev_opt->reps[table & 3]; + table >>= 2; + reps[2] = prev_opt->reps[table & 3]; + table >>= 2; + reps[3] = prev_opt->reps[table & 3]; } else { reps[0] = (U32)(dist - kNumReps); - for (size_t i = 1; i < kNumReps; ++i) { - reps[i] = prev_opt->reps[i - 1]; - } + reps[1] = prev_opt->reps[0]; + reps[2] = prev_opt->reps[1]; + reps[3] = prev_opt->reps[2]; } } cur_opt->state = state; memcpy(cur_opt->reps, reps, sizeof(cur_opt->reps)); - is_rep_prob = enc->states.is_rep[state]; + Probability const is_rep_prob = enc->states.is_rep[state]; - { Probability is_match_prob = enc->states.is_match[state][pos_state]; - unsigned cur_byte = *data; - unsigned match_byte = *(data - reps[0] - 1); - U32 cur_price = cur_opt->price; - U32 cur_and_lit_price = cur_price + GET_PRICE_0(rc, is_match_prob) + - GetLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); - OptimalNode* next_opt = &enc->opt_buf[cur + 1]; - BYTE next_is_char = 0; - /* Try literal */ - if (cur_and_lit_price < next_opt->price) { - next_opt->price = cur_and_lit_price; - next_opt->prev_index = (unsigned)cur; - MakeAsLiteral(*next_opt); - next_is_char = 1; + { OptimalNode *const next_opt = &enc->opt_buf[cur + 1]; + U32 const cur_price = cur_opt->price; + U32 const next_price = next_opt->price; + Probability const is_match_prob = enc->states.is_match[state][pos_state]; + unsigned const cur_byte = *data; + unsigned const match_byte = *(data - reps[0] - 1); + + U32 cur_and_lit_price = cur_price + GET_PRICE_0(is_match_prob); + /* This is a compromise to try to filter out cases where literal + rep0 is unlikely to be cheaper */ + BYTE try_lit = cur_and_lit_price + kMinLitPrice / 2U <= next_price; + if (try_lit) { + /* cur_and_lit_price is used later for the literal + rep0 test */ + cur_and_lit_price += LZMA_getLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); + /* Try literal */ + if (cur_and_lit_price < next_price) { + next_opt->price = cur_and_lit_price; + next_opt->len = 1; + MARK_LITERAL(*next_opt); + if (is_hybrid) /* Evaluates as a constant expression due to inlining */ + try_lit = 0; + } } - match_price = cur_price + GET_PRICE_1(rc, is_match_prob); - rep_match_price = match_price + GET_PRICE_1(rc, is_rep_prob); + match_price = cur_price + GET_PRICE_1(is_match_prob); + rep_match_price = match_price + GET_PRICE_1(is_rep_prob); if (match_byte == cur_byte) { /* Try 1-byte rep0 */ - U32 short_rep_price = rep_match_price + GetRepLen1Price(enc, state, pos_state); + U32 short_rep_price = rep_match_price + LZMA_getRepLen1Price(enc, state, pos_state); if (short_rep_price <= next_opt->price) { next_opt->price = short_rep_price; - next_opt->prev_index = (unsigned)cur; - MakeAsShortRep(*next_opt); - next_is_char = 1; + next_opt->len = 1; + MARK_SHORT_REP(*next_opt); } } bytes_avail = MIN(block.end - index, kOptimizerBufferSize - 1 - cur); if (bytes_avail < 2) return len_end; - if (!next_is_char && match_byte != cur_byte) { + + /* If match_byte == cur_byte a rep0 begins at the current position */ + if (is_hybrid && try_lit && match_byte != cur_byte) { /* Try literal + rep0 */ - const BYTE *data_2 = data - reps[0]; + const 
BYTE *const data_2 = data - reps[0]; size_t limit = MIN(bytes_avail - 1, fast_length); size_t len_test_2 = ZSTD_count(data + 1, data_2, data + 1 + limit); if (len_test_2 >= 2) { - size_t state_2 = LiteralNextState(state); - size_t pos_state_next = (index + 1) & pos_mask; - U32 next_rep_match_price = cur_and_lit_price + - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - size_t offset = cur + 1 + len_test_2; - U32 cur_and_len_price = next_rep_match_price + GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + size_t const state_2 = LIT_NEXT_STATE(state); + size_t const pos_state_next = (index + 1) & pos_mask; + U32 const next_rep_match_price = cur_and_lit_price + + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + U32 const cur_and_len_price = next_rep_match_price + LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + size_t const offset = cur + 1 + len_test_2; if (cur_and_len_price < enc->opt_buf[offset].price) { len_end = MAX(len_end, offset); enc->opt_buf[offset].price = cur_and_len_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + 1); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 0; + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].dist = 0; + enc->opt_buf[offset].extra = 1; } } } } - max_length = MIN(bytes_avail, fast_length); - start_len = 2; + size_t const max_length = MIN(bytes_avail, fast_length); + size_t start_len = 2; + if (match.length > 0) { size_t len_test; size_t len; U32 cur_rep_price; for (size_t rep_index = 0; rep_index < kNumReps; ++rep_index) { - const BYTE *data_2 = data - reps[rep_index] - 1; + const BYTE *const data_2 = data - reps[rep_index] - 1; if (MEM_read16(data) != MEM_read16(data_2)) continue; + /* Test is limited to fast_length, but it is rare for the RMF to miss the longest match, + * therefore this function is rarely called when a rep len > fast_length exists */ len_test = ZSTD_count(data + 2, data_2 + 2, data + max_length) + 2; len_end = MAX(len_end, cur + len_test); - cur_rep_price = rep_match_price + GetRepPrice(enc, rep_index, state, pos_state); + cur_rep_price = rep_match_price + LZMA_getRepPrice(enc, rep_index, state, pos_state); len = 2; /* Try rep match */ do { - U32 cur_and_len_price = cur_rep_price + enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin]; - OptimalNode* opt = &enc->opt_buf[cur + len]; + U32 const cur_and_len_price = cur_rep_price + enc->states.rep_len_states.prices[pos_state][len - kMatchLenMin]; + OptimalNode *const opt = &enc->opt_buf[cur + len]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(rep_index); - opt->is_combination = 0; + opt->len = (unsigned)len; + opt->dist = (U32)rep_index; + opt->extra = 0; } } while (++len <= len_test); @@ -1180,69 +1091,64 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, /* Save time by exluding normal matches not longer than the rep */ start_len = len_test + 1; } + /* rep + literal + rep0 is not common so this test is skipped for faster, non-hybrid encoding */ if (is_hybrid && len_test + 3 <= bytes_avail && MEM_read16(data + len_test + 1) == MEM_read16(data_2 + len_test + 1)) { - /* Try rep + literal + rep0 */ - size_t len_test_2 = ZSTD_count(data + len_test + 3, + /* Try rep + literal + rep0. 
+ * The second rep may be > fast_length, but it is not worth the extra time to handle this case + * and the price table is not filled for it */ + size_t const len_test_2 = ZSTD_count(data + len_test + 3, data_2 + len_test + 3, data + MIN(len_test + 1 + fast_length, bytes_avail)) + 2; - size_t state_2 = RepNextState(state); + size_t state_2 = REP_NEXT_STATE(state); size_t pos_state_next = (index + len_test) & pos_mask; U32 rep_lit_rep_total_price = - cur_rep_price + enc->states.rep_len_states.prices[pos_state][len_test - kMatchLenMin] + - GET_PRICE_0(rc, enc->states.is_match[state_2][pos_state_next]) + - GetLiteralPriceMatched(&enc->rc, GetLiteralProbs(enc, index + len_test, data[len_test - 1]), + cur_rep_price + enc->states.rep_len_states.prices[pos_state][len_test - kMatchLenMin] + + GET_PRICE_0(enc->states.is_match[state_2][pos_state_next]) + + LZMA_getLiteralPriceMatched(LITERAL_PROBS(enc, index + len_test, data[len_test - 1]), data[len_test], data_2[len_test]); - size_t offset; - state_2 = LiteralNextState(state_2); + state_2 = kState_LitAfterRep; pos_state_next = (index + len_test + 1) & pos_mask; rep_lit_rep_total_price += - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - offset = cur + len_test + 1 + len_test_2; - rep_lit_rep_total_price += GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + size_t const offset = cur + len_test + 1 + len_test_2; + rep_lit_rep_total_price += LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); if (rep_lit_rep_total_price < enc->opt_buf[offset].price) { len_end = MAX(len_end, offset); enc->opt_buf[offset].price = rep_lit_rep_total_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + len_test + 1); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 1; - enc->opt_buf[offset].prev_index_2 = (unsigned)cur; - enc->opt_buf[offset].prev_dist_2 = (U32)(rep_index); + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].dist = (U32)rep_index; + enc->opt_buf[offset].extra = (unsigned)(len_test + 1); } } } } if (match.length >= start_len && max_length >= start_len) { /* Try normal match */ - U32 normal_match_price = match_price + GET_PRICE_0(rc, is_rep_prob); + U32 const normal_match_price = match_price + GET_PRICE_0(is_rep_prob); if (!is_hybrid) { /* Normal mode - single match */ - size_t length = MIN(match.length, max_length); - size_t cur_dist = match.dist; - size_t dist_slot = GetDistSlot(match.dist); + size_t const length = MIN(match.length, max_length); + size_t const cur_dist = match.dist; + size_t const dist_slot = LZMA_getDistSlot(match.dist); size_t len_test = length; len_end = MAX(len_end, cur + length); - /* Pre-load rep0 data bytes */ -/* unsigned rep_0_bytes = MEM_read16(data - cur_dist + length); */ for (; len_test >= start_len; --len_test) { - OptimalNode *opt; U32 cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len_test); + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); - if (cur_dist < kNumFullDistances) { + if (cur_dist < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][cur_dist]; - } - else { + else cur_and_len_price += enc->dist_slot_prices[len_to_dist_state][dist_slot] + enc->align_prices[cur_dist & 
kAlignMask]; - } - opt = &enc->opt_buf[cur + len_test]; + + OptimalNode *const opt = &enc->opt_buf[cur + len_test]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(cur_dist + kNumReps); - opt->is_combination = 0; + opt->len = (unsigned)len_test; + opt->dist = (U32)(cur_dist + kNumReps); + opt->extra = 0; } else break; } @@ -1250,90 +1156,80 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, else { /* Hybrid mode */ size_t main_len; - ptrdiff_t match_index; - ptrdiff_t start_match; match.length = MIN(match.length, (U32)max_length); - if (match.length < 3 || match.dist < 256) { + /* Need to test max_length < 4 because the hash fn reads a U32 */ + if (match.length < 3 || max_length < 4) { enc->matches[0] = match; enc->match_count = 1; main_len = match.length; } else { - main_len = HashGetMatches(enc, block, index, max_length, match); - } - match_index = enc->match_count - 1; - if (main_len == max_length - && match_index > 0 - && enc->matches[match_index - 1].length == main_len) - { - --match_index; + main_len = LZMA_hashGetMatches(enc, block, index, max_length, match); } + ptrdiff_t match_index = enc->match_count - 1; len_end = MAX(len_end, cur + main_len); - start_match = 0; - while (start_len > enc->matches[start_match].length) { + + /* Start with a match longer than the best rep if one exists */ + ptrdiff_t start_match = 0; + while (start_len > enc->matches[start_match].length) ++start_match; - } + + enc->matches[start_match - 1].length = (U32)start_len - 1; /* Avoids an if..else branch in the loop. [-1] is ok */ + for (; match_index >= start_match; --match_index) { size_t len_test = enc->matches[match_index].length; - size_t cur_dist = enc->matches[match_index].dist; - size_t dist_slot = GetDistSlot((U32)cur_dist); + size_t const cur_dist = enc->matches[match_index].dist; + const BYTE *const data_2 = data - cur_dist - 1; + size_t const rep_0_pos = len_test + 1; + size_t dist_slot = LZMA_getDistSlot((U32)cur_dist); U32 cur_and_len_price; - size_t base_len = (match_index > start_match) ? 
enc->matches[match_index - 1].length + 1 : start_len; - unsigned rep_0_bytes = MEM_read16(data - cur_dist + len_test); + /* Test from the full length down to 1 more than the next shorter match */ + size_t base_len = enc->matches[match_index - 1].length + 1; for (; len_test >= base_len; --len_test) { - size_t len_to_dist_state; - OptimalNode *opt; - cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; - len_to_dist_state = GetLenToDistState(len_test); - if (cur_dist < kNumFullDistances) { + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); + if (cur_dist < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][cur_dist]; - } - else { + else cur_and_len_price += enc->dist_slot_prices[len_to_dist_state][dist_slot] + enc->align_prices[cur_dist & kAlignMask]; - } - opt = &enc->opt_buf[cur + len_test]; + + BYTE const sub_len = len_test < enc->matches[match_index].length; + + OptimalNode *const opt = &enc->opt_buf[cur + len_test]; if (cur_and_len_price < opt->price) { opt->price = cur_and_len_price; - opt->prev_index = (unsigned)cur; - opt->prev_dist = (U32)(cur_dist + kNumReps); - opt->is_combination = 0; + opt->len = (unsigned)len_test; + opt->dist = (U32)(cur_dist + kNumReps); + opt->extra = 0; } - else if(len_test < main_len) - break; - if (len_test == enc->matches[match_index].length) { - size_t rep_0_pos = len_test + 1; - if (rep_0_pos + 2 <= bytes_avail && rep_0_bytes == MEM_read16(data + rep_0_pos)) { - /* Try match + literal + rep0 */ - const BYTE *data_2 = data - cur_dist - 1; - size_t limit = MIN(rep_0_pos + fast_length, bytes_avail); - size_t len_test_2 = ZSTD_count(data + rep_0_pos + 2, data_2 + rep_0_pos + 2, data + limit) + 2; - size_t state_2 = MatchNextState(state); - size_t pos_state_next = (index + len_test) & pos_mask; - U32 match_lit_rep_total_price = cur_and_len_price + - GET_PRICE_0(rc, enc->states.is_match[state_2][pos_state_next]) + - GetLiteralPriceMatched(&enc->rc, GetLiteralProbs(enc, index + len_test, data[len_test - 1]), - data[len_test], data_2[len_test]); - size_t offset; + else if(sub_len) + break; /* End the tests if prices for shorter lengths are not lower than those already recorded */ - state_2 = LiteralNextState(state_2); - pos_state_next = (pos_state_next + 1) & pos_mask; - match_lit_rep_total_price += - GET_PRICE_1(rc, enc->states.is_match[state_2][pos_state_next]) + - GET_PRICE_1(rc, enc->states.is_rep[state_2]); - offset = cur + rep_0_pos + len_test_2; - match_lit_rep_total_price += GetRepMatch0Price(enc, len_test_2, state_2, pos_state_next); - if (match_lit_rep_total_price < enc->opt_buf[offset].price) { - len_end = MAX(len_end, offset); - enc->opt_buf[offset].price = match_lit_rep_total_price; - enc->opt_buf[offset].prev_index = (unsigned)(cur + rep_0_pos); - enc->opt_buf[offset].prev_dist = 0; - enc->opt_buf[offset].is_combination = 1; - enc->opt_buf[offset].prev_2 = 1; - enc->opt_buf[offset].prev_index_2 = (unsigned)cur; - enc->opt_buf[offset].prev_dist_2 = (U32)(cur_dist + kNumReps); - } + if (!sub_len && rep_0_pos + 2 <= bytes_avail && MEM_read16(data + rep_0_pos) == MEM_read16(data_2 + rep_0_pos)) { + /* Try match + literal + rep0 */ + size_t const limit = MIN(rep_0_pos + fast_length, bytes_avail); + size_t const len_test_2 = ZSTD_count(data + rep_0_pos + 2, data_2 + rep_0_pos + 2, data + limit) + 2; + size_t state_2 = MATCH_NEXT_STATE(state); + size_t pos_state_next = (index + len_test) & pos_mask; + U32 match_lit_rep_total_price = cur_and_len_price + + 
GET_PRICE_0(enc->states.is_match[state_2][pos_state_next]) + + LZMA_getLiteralPriceMatched(LITERAL_PROBS(enc, index + len_test, data[len_test - 1]), + data[len_test], data_2[len_test]); + + state_2 = kState_LitAfterMatch; + pos_state_next = (pos_state_next + 1) & pos_mask; + match_lit_rep_total_price += + GET_PRICE_1(enc->states.is_match[state_2][pos_state_next]) + + GET_PRICE_1(enc->states.is_rep[state_2]); + size_t const offset = cur + rep_0_pos + len_test_2; + match_lit_rep_total_price += LZMA_getRepMatch0Price(enc, len_test_2, state_2, pos_state_next); + if (match_lit_rep_total_price < enc->opt_buf[offset].price) { + len_end = MAX(len_end, offset); + enc->opt_buf[offset].price = match_lit_rep_total_price; + enc->opt_buf[offset].len = (unsigned)len_test_2; + enc->opt_buf[offset].extra = (unsigned)rep_0_pos; + enc->opt_buf[offset].dist = (U32)(cur_dist + kNumReps); } } } @@ -1343,92 +1239,87 @@ size_t OptimalParse(FL2_lzmaEncoderCtx* const enc, const FL2_dataBlock block, return len_end; } -HINT_INLINE -void InitMatchesPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t pos_state, +FORCE_NOINLINE +static void LZMA_initMatchesPos0(LZMA2_ECtx *const enc, + RMF_match const match, + size_t const pos_state, size_t len, - unsigned normal_match_price) + unsigned const normal_match_price) { if ((unsigned)len <= match.length) { - size_t distance = match.dist; - size_t slot = GetDistSlot(match.dist); + size_t const distance = match.dist; + size_t const slot = LZMA_getDistSlot(match.dist); /* Test every available length of the match */ - do - { + do { unsigned cur_and_len_price = normal_match_price + enc->states.len_states.prices[pos_state][len - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len); - if (distance < kNumFullDistances) { + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len); + + if (distance < kNumFullDistances) cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; - } - else { + else cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; - } + if (cur_and_len_price < enc->opt_buf[len].price) { enc->opt_buf[len].price = cur_and_len_price; - enc->opt_buf[len].prev_index = 0; - enc->opt_buf[len].prev_dist = (U32)(distance + kNumReps); - enc->opt_buf[len].is_combination = 0; + enc->opt_buf[len].len = (unsigned)len; + enc->opt_buf[len].dist = (U32)(distance + kNumReps); + enc->opt_buf[len].extra = 0; } ++len; - } while ((unsigned)len <= match.length); + } while ((U32)len <= match.length); } } -static size_t InitMatchesPos0Best(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t index, - size_t len, - unsigned normal_match_price) +FORCE_NOINLINE +static size_t LZMA_initMatchesPos0Best(LZMA2_ECtx *const enc, FL2_dataBlock const block, + RMF_match const match, + size_t const index, + size_t start_len, + unsigned const normal_match_price) { - if (len <= match.length) { + if (start_len <= match.length) { size_t main_len; - size_t match_index; - size_t pos_state; - size_t distance; - size_t slot; - - if (match.length < 3 || match.dist < 256) { + if (match.length < 3 || block.end - index < 4) { enc->matches[0] = match; enc->match_count = 1; main_len = match.length; } else { - main_len = HashGetMatches(enc, block, index, MIN(block.end - index, enc->fast_length), match); + main_len = LZMA_hashGetMatches(enc, block, index, MIN(block.end - index, enc->fast_length), match); } - match_index = 0; - while (len > enc->matches[match_index].length) { - 
++match_index; - } - pos_state = index & enc->pos_mask; - distance = enc->matches[match_index].dist; - slot = GetDistSlot(enc->matches[match_index].dist); - /* Test every available match length at the shortest distance. The buffer is sorted */ - /* in order of increasing length, and therefore increasing distance too. */ - for (;; ++len) { - unsigned cur_and_len_price = normal_match_price - + enc->states.len_states.prices[pos_state][len - kMatchLenMin]; - size_t len_to_dist_state = GetLenToDistState(len); - if (distance < kNumFullDistances) { - cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; - } - else { - cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; - } - if (cur_and_len_price < enc->opt_buf[len].price) { - enc->opt_buf[len].price = cur_and_len_price; - enc->opt_buf[len].prev_index = 0; - enc->opt_buf[len].prev_dist = (U32)(distance + kNumReps); - enc->opt_buf[len].is_combination = 0; - } - if (len == enc->matches[match_index].length) { - /* Run out of length for this match. Get the next if any. */ - if (len == main_len) { - break; + + ptrdiff_t start_match = 0; + while (start_len > enc->matches[start_match].length) + ++start_match; + + enc->matches[start_match - 1].length = (U32)start_len - 1; /* Avoids an if..else branch in the loop. [-1] is ok */ + + size_t pos_state = index & enc->pos_mask; + + for (ptrdiff_t match_index = enc->match_count - 1; match_index >= start_match; --match_index) { + size_t len_test = enc->matches[match_index].length; + size_t const distance = enc->matches[match_index].dist; + size_t const slot = LZMA_getDistSlot((U32)distance); + size_t const base_len = enc->matches[match_index - 1].length + 1; + /* Test every available match length at the shortest distance. The buffer is sorted */ + /* in order of increasing length, and therefore increasing distance too. */ + for (; len_test >= base_len; --len_test) { + unsigned cur_and_len_price = normal_match_price + + enc->states.len_states.prices[pos_state][len_test - kMatchLenMin]; + size_t const len_to_dist_state = LEN_TO_DIST_STATE(len_test); + + if (distance < kNumFullDistances) + cur_and_len_price += enc->distance_prices[len_to_dist_state][distance]; + else + cur_and_len_price += enc->align_prices[distance & kAlignMask] + enc->dist_slot_prices[len_to_dist_state][slot]; + + if (cur_and_len_price < enc->opt_buf[len_test].price) { + enc->opt_buf[len_test].price = cur_and_len_price; + enc->opt_buf[len_test].len = (unsigned)len_test; + enc->opt_buf[len_test].dist = (U32)(distance + kNumReps); + enc->opt_buf[len_test].extra = 0; } - ++match_index; - distance = enc->matches[match_index].dist; - slot = GetDistSlot(enc->matches[match_index].dist); + else break; } } return main_len; @@ -1441,14 +1332,14 @@ static size_t InitMatchesPos0Best(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock b * This function must not be called at a position where no match is * available. 
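 * Position 0 is seeded here: the four repeat distances are measured first,
 * and a repeat or match reaching fast_length ends the parse immediately;
 * otherwise the literal, short rep0, rep and normal match prices are all
 * written into opt_buf before the main parse loop runs.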
*/ FORCE_INLINE_TEMPLATE -size_t InitOptimizerPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - Match match, - size_t index, +size_t LZMA_initOptimizerPos0(LZMA2_ECtx *const enc, FL2_dataBlock const block, + RMF_match const match, + size_t const index, int const is_hybrid, - U32* reps) + U32* const reps) { - size_t max_length = MIN(block.end - index, kMatchLenMax); - const BYTE *data = block.data + index; + size_t const max_length = MIN(block.end - index, kMatchLenMax); + const BYTE *const data = block.data + index; const BYTE *data_2; size_t rep_max_index = 0; size_t rep_lens[kNumReps]; @@ -1462,288 +1353,352 @@ size_t InitOptimizerPos0(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, continue; } rep_lens[i] = ZSTD_count(data + 2, data_2 + 2, data + max_length) + 2; - if (rep_lens[i] > rep_lens[rep_max_index]) { + if (rep_lens[i] > rep_lens[rep_max_index]) rep_max_index = i; - } } if (rep_lens[rep_max_index] >= enc->fast_length) { - enc->opt_buf[0].prev_index = (unsigned)(rep_lens[rep_max_index]); - enc->opt_buf[0].prev_dist = (U32)(rep_max_index); + enc->opt_buf[0].len = (unsigned)(rep_lens[rep_max_index]); + enc->opt_buf[0].dist = (U32)rep_max_index; return 0; } if (match.length >= enc->fast_length) { - enc->opt_buf[0].prev_index = match.length; - enc->opt_buf[0].prev_dist = match.dist + kNumReps; + enc->opt_buf[0].len = match.length; + enc->opt_buf[0].dist = match.dist + kNumReps; return 0; } - { unsigned cur_byte = *data; - unsigned match_byte = *(data - reps[0] - 1); - unsigned match_price; - unsigned normal_match_price; - unsigned rep_match_price; - size_t len; - size_t state = enc->states.state; - size_t pos_state = index & enc->pos_mask; - Probability is_match_prob = enc->states.is_match[state][pos_state]; - Probability is_rep_prob = enc->states.is_rep[state]; + unsigned const cur_byte = *data; + unsigned const match_byte = *(data - reps[0] - 1); + size_t const state = enc->states.state; + size_t const pos_state = index & enc->pos_mask; + Probability const is_match_prob = enc->states.is_match[state][pos_state]; + Probability const is_rep_prob = enc->states.is_rep[state]; - enc->opt_buf[0].state = state; - /* Set the price for literal */ - enc->opt_buf[1].price = GET_PRICE_0(rc, is_match_prob) + - GetLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); - MakeAsLiteral(enc->opt_buf[1]); + enc->opt_buf[0].state = state; + /* Set the price for literal */ + enc->opt_buf[1].price = GET_PRICE_0(is_match_prob) + + LZMA_getLiteralPrice(enc, index, state, data[-1], cur_byte, match_byte); + MARK_LITERAL(enc->opt_buf[1]); - match_price = GET_PRICE_1(rc, is_match_prob); - rep_match_price = match_price + GET_PRICE_1(rc, is_rep_prob); - if (match_byte == cur_byte) { - /* Try 1-byte rep0 */ - unsigned short_rep_price = rep_match_price + GetRepLen1Price(enc, state, pos_state); - if (short_rep_price < enc->opt_buf[1].price) { - enc->opt_buf[1].price = short_rep_price; - MakeAsShortRep(enc->opt_buf[1]); + unsigned const match_price = GET_PRICE_1(is_match_prob); + unsigned const rep_match_price = match_price + GET_PRICE_1(is_rep_prob); + if (match_byte == cur_byte) { + /* Try 1-byte rep0 */ + unsigned const short_rep_price = rep_match_price + LZMA_getRepLen1Price(enc, state, pos_state); + if (short_rep_price < enc->opt_buf[1].price) { + enc->opt_buf[1].price = short_rep_price; + MARK_SHORT_REP(enc->opt_buf[1]); + } + } + memcpy(enc->opt_buf[0].reps, reps, sizeof(enc->opt_buf[0].reps)); + enc->opt_buf[1].len = 1; + /* Test the rep match prices */ + for (size_t i = 0; i < 
kNumReps; ++i) { + size_t rep_len = rep_lens[i]; + if (rep_len < 2) + continue; + + unsigned const price = rep_match_price + LZMA_getRepPrice(enc, i, state, pos_state); + /* Test every available length of the rep */ + do { + unsigned const cur_and_len_price = price + enc->states.rep_len_states.prices[pos_state][rep_len - kMatchLenMin]; + if (cur_and_len_price < enc->opt_buf[rep_len].price) { + enc->opt_buf[rep_len].price = cur_and_len_price; + enc->opt_buf[rep_len].len = (unsigned)rep_len; + enc->opt_buf[rep_len].dist = (U32)i; + enc->opt_buf[rep_len].extra = 0; } - } - memcpy(enc->opt_buf[0].reps, reps, sizeof(enc->opt_buf[0].reps)); - enc->opt_buf[1].prev_index = 0; - /* Test the rep match prices */ - for (size_t i = 0; i < kNumReps; ++i) { - unsigned price; - size_t rep_len = rep_lens[i]; - if (rep_len < 2) { - continue; - } - price = rep_match_price + GetRepPrice(enc, i, state, pos_state); - /* Test every available length of the rep */ - do { - unsigned cur_and_len_price = price + enc->states.rep_len_states.prices[pos_state][rep_len - kMatchLenMin]; - if (cur_and_len_price < enc->opt_buf[rep_len].price) { - enc->opt_buf[rep_len].price = cur_and_len_price; - enc->opt_buf[rep_len].prev_index = 0; - enc->opt_buf[rep_len].prev_dist = (U32)(i); - enc->opt_buf[rep_len].is_combination = 0; - } - } while (--rep_len >= kMatchLenMin); - } - normal_match_price = match_price + GET_PRICE_0(rc, is_rep_prob); - len = (rep_lens[0] >= 2) ? rep_lens[0] + 1 : 2; - /* Test the match prices */ - if (!is_hybrid) { - /* Normal mode */ - InitMatchesPos0(enc, block, match, pos_state, len, normal_match_price); - return MAX(match.length, rep_lens[rep_max_index]); - } - else { - /* Hybrid mode */ - size_t main_len = InitMatchesPos0Best(enc, block, match, index, len, normal_match_price); - return MAX(main_len, rep_lens[rep_max_index]); - } + } while (--rep_len >= kMatchLenMin); + } + unsigned const normal_match_price = match_price + GET_PRICE_0(is_rep_prob); + size_t const len = (rep_lens[0] >= 2) ? 
rep_lens[0] + 1 : 2; + /* Test the match prices */ + if (!is_hybrid) { + /* Normal mode */ + LZMA_initMatchesPos0(enc, match, pos_state, len, normal_match_price); + return MAX(match.length, rep_lens[rep_max_index]); + } + else { + /* Hybrid mode */ + size_t main_len = LZMA_initMatchesPos0Best(enc, block, match, index, len, normal_match_price); + return MAX(main_len, rep_lens[rep_max_index]); } } FORCE_INLINE_TEMPLATE -size_t EncodeOptimumSequence(FL2_lzmaEncoderCtx* enc, const FL2_dataBlock block, - FL2_matchTable* tbl, - int const structTbl, +size_t LZMA_encodeOptimumSequence(LZMA2_ECtx *const enc, FL2_dataBlock const block, + FL2_matchTable* const tbl, + int const struct_tbl, int const is_hybrid, size_t start_index, - size_t uncompressed_end, - Match match) + size_t const uncompressed_end, + RMF_match match) { size_t len_end = enc->len_end_max; - unsigned search_depth = tbl->params.depth; + unsigned const search_depth = tbl->params.depth; do { - U32 reps[kNumReps]; - size_t index; - size_t cur; - unsigned prev_index; - size_t i; size_t const pos_mask = enc->pos_mask; - for (; (len_end & 3) != 0; --len_end) { + + /* Reset all prices that were set last time */ + for (; (len_end & 3) != 0; --len_end) enc->opt_buf[len_end].price = kInfinityPrice; - } for (; len_end >= 4; len_end -= 4) { enc->opt_buf[len_end].price = kInfinityPrice; enc->opt_buf[len_end - 1].price = kInfinityPrice; enc->opt_buf[len_end - 2].price = kInfinityPrice; enc->opt_buf[len_end - 3].price = kInfinityPrice; } - index = start_index; + /* Set everything up at position 0 */ - len_end = InitOptimizerPos0(enc, block, match, index, is_hybrid, reps); + size_t index = start_index; + U32 reps[kNumReps]; + len_end = LZMA_initOptimizerPos0(enc, block, match, index, is_hybrid, reps); match.length = 0; - cur = 1; + size_t cur = 1; + /* len_end == 0 if a match of fast_length was found */ if (len_end > 0) { ++index; - /* Lazy termination of the optimal parser. In the second half of the buffer */ - /* a resolution within one byte is enough */ - for (; cur < (len_end - cur / (kOptimizerBufferSize / 2U)); ++cur, ++index) { - if (enc->opt_buf[cur + 1].price < enc->opt_buf[cur].price) - continue; - match = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); - if (match.length >= enc->fast_length) { + for (; cur < len_end; ++cur, ++index) { + /* Terminate if the farthest calculated price is too near the buffer end */ + if (len_end >= kOptimizerBufferSize - kOptimizerEndSize) { + U32 price = enc->opt_buf[cur].price; + /* This is a compromise to favor more distant end points + * even if the price is a bit higher */ + U32 const delta = price / (U32)cur / 2U; + for (size_t j = cur + 1; j <= len_end; j++) { + U32 const price2 = enc->opt_buf[j].price; + if (price >= price2) { + price = price2; + cur = j; + } + price += delta; + } break; } - len_end = OptimalParse(enc, block, match, index, cur, len_end, is_hybrid, reps); - } - if (cur < len_end && match.length < enc->fast_length) { - /* Adjust the end point base on scaling up the price. 
*/ - cur += (enc->opt_buf[cur].price + enc->opt_buf[cur].price / cur) >= enc->opt_buf[cur + 1].price; - } - DEBUGLOG(6, "End optimal parse at %u", (U32)cur); - ReverseOptimalChain(enc->opt_buf, cur); - } - /* Encode the selections in the buffer */ - prev_index = 0; - i = 0; - do { - unsigned len = enc->opt_buf[i].prev_index - prev_index; - prev_index = enc->opt_buf[i].prev_index; - if (len == 1 && enc->opt_buf[i].prev_dist == kNullDist) - { - EncodeLiteralBuf(enc, block.data, start_index + i); - } - else { - size_t match_index = start_index + i; - U32 dist = enc->opt_buf[i].prev_dist; - /* The last match will be truncated to fit in the optimal buffer so get the full length */ - if (i + len >= kOptimizerBufferSize - 1 && dist >= kNumReps) { - Match lastmatch = FL2_radixGetMatch(block, tbl, search_depth, tbl->isStruct, match_index); - if (lastmatch.length > len) { - len = lastmatch.length; - dist = lastmatch.dist + kNumReps; + + /* Skip ahead if a lower or equal price is available at greater distance */ + size_t const end = MIN(cur + kOptimizerSkipSize, len_end); + U32 price = enc->opt_buf[cur].price; + for (size_t j = cur + 1; j <= end; j++) { + U32 const price2 = enc->opt_buf[j].price; + if (price >= price2) { + price = price2; + index += j - cur; + cur = j; + if (cur == len_end) + goto reverse; } } - if (dist < kNumReps) { - EncodeRepMatch(enc, len, dist, match_index & pos_mask); + + match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); + if (match.length >= enc->fast_length) + break; + + len_end = LZMA_optimalParse(enc, block, match, index, cur, len_end, is_hybrid, reps); + } +reverse: + DEBUGLOG(6, "End optimal parse at %u", (U32)cur); + LZMA_reverseOptimalChain(enc->opt_buf, cur); + } + /* Encode the selections in the buffer */ + size_t i = 0; + do { + unsigned const len = enc->opt_buf[i].len; + + if (len == 1 && enc->opt_buf[i].dist == kNullDist) { + LZMA_encodeLiteralBuf(enc, block.data, start_index + i); + ++i; + } + else { + size_t const pos_state = (start_index + i) & pos_mask; + U32 const dist = enc->opt_buf[i].dist; + /* Updating i separately for each case may allow a branch to be eliminated */ + if (dist >= kNumReps) { + LZMA_encodeNormalMatch(enc, len, dist - kNumReps, pos_state); + i += len; + } + else if(len == 1) { + LZMA_encodeRepMatchShort(enc, pos_state); + ++i; } else { - EncodeNormalMatch(enc, len, dist - kNumReps, match_index & pos_mask); + LZMA_encodeRepMatchLong(enc, len, dist, pos_state); + i += len; } } - i += len; } while (i < cur); start_index += i; - /* Do another round if there is a long match pending, because the reps must be checked */ - /* and the match encoded. */ + /* Do another round if there is a long match pending, + * because the reps must be checked and the match encoded. 
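+         * A match of fast_length or more ends the optimal parse early, so the
+         * next round restarts at that position with the match still pending.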
*/ } while (match.length >= enc->fast_length && start_index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size); enc->len_end_max = len_end; return start_index; } -static void UpdateLengthPrices(FL2_lzmaEncoderCtx* enc, LengthStates* len_states) +static void FORCE_NOINLINE LZMA_fillAlignPrices(LZMA2_ECtx *const enc) { - for (size_t pos_state = 0; pos_state <= enc->pos_mask; ++pos_state) { - LengthStates_SetPrices(&enc->rc, len_states, pos_state); + unsigned i; + const Probability *const probs = enc->states.dist_align_encoders; + for (i = 0; i < kAlignTableSize / 2; i++) { + U32 price = 0; + unsigned sym = i; + unsigned m = 1; + unsigned bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[m], bit); m = (m << 1) + bit; + U32 const prob = probs[m]; + enc->align_prices[i] = price + GET_PRICE_0(prob); + enc->align_prices[i + 8] = price + GET_PRICE_1(prob); } } -static void FillAlignPrices(FL2_lzmaEncoderCtx* enc) +static void FORCE_NOINLINE LZMA_fillDistancesPrices(LZMA2_ECtx *const enc) { - for (size_t i = 0; i < kAlignTableSize; ++i) { - enc->align_prices[i] = GetReverseTreePrice(&enc->rc, enc->states.dist_align_encoders, kNumAlignBits, i); - } - enc->align_price_count = 0; -} + U32 * const temp_prices = enc->distance_prices[kNumLenToPosStates - 1]; -static void FillDistancesPrices(FL2_lzmaEncoderCtx* enc) -{ - static const size_t kLastLenToPosState = kNumLenToPosStates - 1; - for (size_t i = kStartPosModelIndex; i < kNumFullDistances; ++i) { - size_t dist_slot = distance_table[i]; - unsigned footerBits = (unsigned)((dist_slot >> 1) - 1); - size_t base = ((2 | (dist_slot & 1)) << footerBits); - enc->distance_prices[kLastLenToPosState][i] = GetReverseTreePrice(&enc->rc, enc->states.dist_encoders + base - dist_slot - 1, - footerBits, - i - base); - } - for (size_t lenToPosState = 0; lenToPosState < kNumLenToPosStates; ++lenToPosState) { - const Probability* encoder = enc->states.dist_slot_encoders[lenToPosState]; - for (size_t dist_slot = 0; dist_slot < enc->dist_price_table_size; ++dist_slot) { - enc->dist_slot_prices[lenToPosState][dist_slot] = GetTreePrice(&enc->rc, encoder, kNumPosSlotBits, dist_slot); - } - for (size_t dist_slot = kEndPosModelIndex; dist_slot < enc->dist_price_table_size; ++dist_slot) { - enc->dist_slot_prices[lenToPosState][dist_slot] += (((unsigned)(dist_slot >> 1) - 1) - kNumAlignBits) << kNumBitPriceShiftBits; - } - size_t i = 0; - for (; i < kStartPosModelIndex; ++i) { - enc->distance_prices[lenToPosState][i] = enc->dist_slot_prices[lenToPosState][i]; - } - for (; i < kNumFullDistances; ++i) { - enc->distance_prices[lenToPosState][i] = enc->dist_slot_prices[lenToPosState][distance_table[i]] - + enc->distance_prices[kLastLenToPosState][i]; - } - } enc->match_price_count = 0; + + for (size_t i = kStartPosModelIndex / 2; i < kNumFullDistances / 2; i++) { + unsigned const dist_slot = distance_table[i]; + unsigned footer_bits = (dist_slot >> 1) - 1; + size_t base = ((2 | (dist_slot & 1)) << footer_bits); + const Probability *probs = enc->states.dist_encoders + base * 2U; + base += i; + probs = probs - distance_table[base] - 1; + U32 price = 0; + unsigned m = 1; + unsigned sym = (unsigned)i; + unsigned const offset = (unsigned)1 << footer_bits; + + for (; footer_bits != 0; --footer_bits) { + unsigned bit = sym & 1; + sym >>= 1; + price += GET_PRICE(probs[m], bit); + m = (m << 1) + bit; + }; + + 
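+        /* The loop above prices the low footer bits through the reverse bit tree;
+         * the last probability below covers the top footer bit, so each pass fills
+         * the prices for a pair of distances at once. */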
unsigned const prob = probs[m]; + temp_prices[base] = price + GET_PRICE_0(prob); + temp_prices[base + offset] = price + GET_PRICE_1(prob); + } + + for (unsigned lps = 0; lps < kNumLenToPosStates; lps++) { + size_t slot; + size_t const dist_table_size2 = (enc->dist_price_table_size + 1) >> 1; + U32 *const dist_slot_prices = enc->dist_slot_prices[lps]; + const Probability *const probs = enc->states.dist_slot_encoders[lps]; + + for (slot = 0; slot < dist_table_size2; slot++) { + /* dist_slot_prices[slot] = RcTree_GetPrice(encoder, kNumPosSlotBits, slot, p->ProbPrices); */ + U32 price; + unsigned bit; + unsigned sym = (unsigned)slot + (1 << (kNumPosSlotBits - 1)); + bit = sym & 1; sym >>= 1; price = GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + bit = sym & 1; sym >>= 1; price += GET_PRICE(probs[sym], bit); + unsigned const prob = probs[slot + (1 << (kNumPosSlotBits - 1))]; + dist_slot_prices[slot * 2] = price + GET_PRICE_0(prob); + dist_slot_prices[slot * 2 + 1] = price + GET_PRICE_1(prob); + } + + { + U32 delta = ((U32)((kEndPosModelIndex / 2 - 1) - kNumAlignBits) << kNumBitPriceShiftBits); + for (slot = kEndPosModelIndex / 2; slot < dist_table_size2; slot++) { + dist_slot_prices[slot * 2] += delta; + dist_slot_prices[slot * 2 + 1] += delta; + delta += ((U32)1 << kNumBitPriceShiftBits); + } + } + + { + U32 *const dp = enc->distance_prices[lps]; + + dp[0] = dist_slot_prices[0]; + dp[1] = dist_slot_prices[1]; + dp[2] = dist_slot_prices[2]; + dp[3] = dist_slot_prices[3]; + + for (size_t i = 4; i < kNumFullDistances; i += 2) { + U32 slot_price = dist_slot_prices[distance_table[i]]; + dp[i] = slot_price + temp_prices[i]; + dp[i + 1] = slot_price + temp_prices[i + 1]; + } + } + } } FORCE_INLINE_TEMPLATE -size_t EncodeChunkBest(FL2_lzmaEncoderCtx* enc, +size_t LZMA_encodeChunkBest(LZMA2_ECtx *const enc, FL2_dataBlock const block, - FL2_matchTable* tbl, - int const structTbl, + FL2_matchTable* const tbl, + int const struct_tbl, size_t index, - size_t uncompressed_end) + size_t const uncompressed_end) { - unsigned search_depth = tbl->params.depth; - FillDistancesPrices(enc); - FillAlignPrices(enc); - UpdateLengthPrices(enc, &enc->states.len_states); - UpdateLengthPrices(enc, &enc->states.rep_len_states); + unsigned const search_depth = tbl->params.depth; + LZMA_fillDistancesPrices(enc); + LZMA_fillAlignPrices(enc); + LZMA_lengthStates_updatePrices(enc, &enc->states.len_states); + LZMA_lengthStates_updatePrices(enc, &enc->states.rep_len_states); while (index < uncompressed_end && enc->rc.out_index < enc->rc.chunk_size) { - Match match = FL2_radixGetMatch(block, tbl, search_depth, structTbl, index); + RMF_match const match = RMF_getMatch(block, tbl, search_depth, struct_tbl, index); if (match.length > 1) { - if (enc->strategy != FL2_ultra) { - index = EncodeOptimumSequence(enc, block, tbl, structTbl, 0, index, uncompressed_end, match); + /* Template-like inline function */ + if (enc->strategy == FL2_ultra) { + index = LZMA_encodeOptimumSequence(enc, block, tbl, struct_tbl, 1, index, uncompressed_end, match); } else { - index = EncodeOptimumSequence(enc, block, tbl, structTbl, 1, index, uncompressed_end, match); + index = LZMA_encodeOptimumSequence(enc, block, tbl, struct_tbl, 0, index, uncompressed_end, match); } - if (enc->match_price_count >= kDistanceRepriceFrequency) { - FillDistancesPrices(enc); + if 
(enc->match_price_count >= kMatchRepriceFrequency) { + LZMA_fillAlignPrices(enc); + LZMA_fillDistancesPrices(enc); + LZMA_lengthStates_updatePrices(enc, &enc->states.len_states); } - if (enc->align_price_count >= kAlignRepriceFrequency) { - FillAlignPrices(enc); + if (enc->rep_len_price_count >= kRepLenRepriceFrequency) { + enc->rep_len_price_count = 0; + LZMA_lengthStates_updatePrices(enc, &enc->states.rep_len_states); } } else { - if (block.data[index] == block.data[index - enc->states.reps[0] - 1]) { - EncodeRepMatch(enc, 1, 0, index & enc->pos_mask); + if (block.data[index] != block.data[index - enc->states.reps[0] - 1]) { + LZMA_encodeLiteralBuf(enc, block.data, index); + ++index; } else { - EncodeLiteralBuf(enc, block.data, index); + LZMA_encodeRepMatchShort(enc, index & enc->pos_mask); + ++index; } - ++index; } } - Flush(&enc->rc); return index; } -static void LengthStates_Reset(LengthStates* ls, unsigned fast_length) +static void LZMA_lengthStates_Reset(LengthStates* const ls, unsigned const fast_length) { ls->choice = kProbInitValue; - ls->choice_2 = kProbInitValue; - for (size_t i = 0; i < (kNumPositionStatesMax << kLenNumLowBits); ++i) { + + for (size_t i = 0; i < (kNumPositionStatesMax << (kLenNumLowBits + 1)); ++i) ls->low[i] = kProbInitValue; - } - for (size_t i = 0; i < (kNumPositionStatesMax << kLenNumMidBits); ++i) { - ls->mid[i] = kProbInitValue; - } - for (size_t i = 0; i < kLenNumHighSymbols; ++i) { + + for (size_t i = 0; i < kLenNumHighSymbols; ++i) ls->high[i] = kProbInitValue; - } + ls->table_size = fast_length + 1 - kMatchLenMin; } -static void EncoderStates_Reset(EncoderStates* es, unsigned lc, unsigned lp, unsigned fast_length) +static void LZMA_encoderStates_Reset(EncoderStates* const es, unsigned const lc, unsigned const lp, unsigned fast_length) { es->state = 0; - for (size_t i = 0; i < kNumReps; ++i) { + + for (size_t i = 0; i < kNumReps; ++i) es->reps[i] = 0; - } + for (size_t i = 0; i < kNumStates; ++i) { for (size_t j = 0; j < kNumPositionStatesMax; ++j) { es->is_match[i][j] = kProbInitValue; @@ -1754,27 +1709,26 @@ static void EncoderStates_Reset(EncoderStates* es, unsigned lc, unsigned lp, uns es->is_rep_G1[i] = kProbInitValue; es->is_rep_G2[i] = kProbInitValue; } - size_t num = (size_t)(kNumLiterals * kNumLitTables) << (lp + lc); - for (size_t i = 0; i < num; ++i) { + size_t const num = (size_t)(kNumLiterals * kNumLitTables) << (lp + lc); + for (size_t i = 0; i < num; ++i) es->literal_probs[i] = kProbInitValue; - } + for (size_t i = 0; i < kNumLenToPosStates; ++i) { Probability *probs = es->dist_slot_encoders[i]; - for (size_t j = 0; j < (1 << kNumPosSlotBits); ++j) { + for (size_t j = 0; j < (1 << kNumPosSlotBits); ++j) probs[j] = kProbInitValue; - } } - for (size_t i = 0; i < kNumFullDistances - kEndPosModelIndex; ++i) { + for (size_t i = 0; i < kNumFullDistances - kEndPosModelIndex; ++i) es->dist_encoders[i] = kProbInitValue; - } - LengthStates_Reset(&es->len_states, fast_length); - LengthStates_Reset(&es->rep_len_states, fast_length); - for (size_t i = 0; i < (1 << kNumAlignBits); ++i) { + + LZMA_lengthStates_Reset(&es->len_states, fast_length); + LZMA_lengthStates_Reset(&es->rep_len_states, fast_length); + + for (size_t i = 0; i < (1 << kNumAlignBits); ++i) es->dist_align_encoders[i] = kProbInitValue; - } } -BYTE FL2_getDictSizeProp(size_t dictionary_size) +BYTE LZMA2_getDictSizeProp(size_t const dictionary_size) { BYTE dict_size_prop = 0; for (BYTE bit = 11; bit < 32; ++bit) { @@ -1790,34 +1744,54 @@ BYTE FL2_getDictSizeProp(size_t dictionary_size) 
return dict_size_prop; } -size_t FL2_lzma2MemoryUsage(unsigned chain_log, FL2_strategy strategy, unsigned thread_count) +size_t LZMA2_encMemoryUsage(unsigned const chain_log, FL2_strategy const strategy, unsigned const thread_count) { - size_t size = sizeof(FL2_lzmaEncoderCtx) + kChunkBufferSize; + size_t size = sizeof(LZMA2_ECtx); if(strategy == FL2_ultra) size += sizeof(HashChains) + (sizeof(U32) << chain_log) - sizeof(U32); return size * thread_count; } -static void Reset(FL2_lzmaEncoderCtx* enc, size_t max_distance) +static void LZMA2_reset(LZMA2_ECtx *const enc, size_t const max_distance) { DEBUGLOG(5, "LZMA encoder reset : max_distance %u", (unsigned)max_distance); - U32 i = 0; - RangeEncReset(&enc->rc); - EncoderStates_Reset(&enc->states, enc->lc, enc->lp, enc->fast_length); + RC_reset(&enc->rc); + LZMA_encoderStates_Reset(&enc->states, enc->lc, enc->lp, enc->fast_length); enc->pos_mask = (1 << enc->pb) - 1; enc->lit_pos_mask = (1 << enc->lp) - 1; + U32 i = 0; for (; max_distance > (size_t)1 << i; ++i) { } enc->dist_price_table_size = i * 2; + enc->rep_len_price_count = 0; + enc->match_price_count = 0; } -static BYTE GetLcLpPbCode(FL2_lzmaEncoderCtx* enc) +static BYTE LZMA_getLcLpPbCode(LZMA2_ECtx *const enc) { return (BYTE)((enc->pb * 5 + enc->lp) * 9 + enc->lc); } -BYTE IsChunkRandom(const FL2_matchTable* const tbl, - const FL2_dataBlock block, size_t const start, +/* Integer square root from https://stackoverflow.com/a/1101217 */ +static U32 LZMA2_isqrt(U32 op) +{ + U32 res = 0; + /* "one" starts at the highest power of four <= than the argument. */ + U32 one = (U32)1 << (ZSTD_highbit32(op) & ~1); + + while (one != 0) { + if (op >= res + one) { + op -= res + one; + res = res + 2U * one; + } + res >>= 1; + one >>= 2; + } + return res; +} + +static BYTE LZMA2_chunkNotCompressible(const FL2_matchTable* const tbl, + FL2_dataBlock const block, size_t const start, unsigned const strategy) { if (block.end - start >= kMinTestChunkSize) { @@ -1826,7 +1800,7 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, { 0, 0, 1U << 6, 1U << 14, 1U << 22 }, /* opt */ { 0, 0, 1U << 6, 1U << 14, 1U << 22 } }; /* ultra */ static const size_t margin_divisor[3] = { 60U, 45U, 120U }; - static const double dev_table[3] = { 6.0, 6.0, 5.0 }; + static const U32 dev_table[3] = { 24, 24, 20}; size_t const end = MIN(start + kChunkSize, block.end); size_t const chunk_size = end - start; @@ -1834,7 +1808,7 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, size_t const margin = chunk_size / margin_divisor[strategy]; size_t const terminator = start + margin; - if (tbl->isStruct) { + if (tbl->is_struct) { size_t prev_dist = 0; for (size_t index = start; index < end; ) { U32 const link = GetMatchLink(tbl->table, index); @@ -1844,12 +1818,16 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, prev_dist = 0; } else { - size_t length = GetMatchLength(tbl->table, index); - size_t dist = index - GetMatchLink(tbl->table, index); - if (length > 4) - count += dist != prev_dist; - else - count += (dist < max_dist_table[strategy][length]) ? 1 : length; + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - GetMatchLink(tbl->table, index); + if (length > 4) { + /* Increase the cost if it's not the same match */ + count += dist != prev_dist; + } + else { + /* Increment the cost for a short match. The cost is the entire length if it's too far */ + count += (dist < max_dist_table[strategy][length]) ? 
1 : length; + } index += length; prev_dist = dist; } @@ -1867,8 +1845,8 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, prev_dist = 0; } else { - size_t length = link >> RADIX_LINK_BITS; - size_t dist = index - (link & RADIX_LINK_MASK); + size_t const length = link >> RADIX_LINK_BITS; + size_t const dist = index - (link & RADIX_LINK_MASK); if (length > 4) count += dist != prev_dist; else @@ -1881,166 +1859,187 @@ BYTE IsChunkRandom(const FL2_matchTable* const tbl, } } - { U32 char_count[256]; - double char_total = 0.0; - /* Expected normal character count */ - double const avg = (double)chunk_size / 256.0; + U32 char_count[256]; + U32 char_total = 0; + /* Expected normal character count * 4 */ + U32 const avg = (U32)(chunk_size / 64U); - memset(char_count, 0, sizeof(char_count)); - for (size_t index = start; index < end; ++index) - ++char_count[block.data[index]]; - /* Sum the deviations */ - for (size_t i = 0; i < 256; ++i) { - double delta = (double)char_count[i] - avg; - char_total += delta * delta; - } - return sqrt(char_total) / sqrt((double)chunk_size) <= dev_table[strategy]; - } + memset(char_count, 0, sizeof(char_count)); + for (size_t index = start; index < end; ++index) + char_count[block.data[index]] += 4; + /* Sum the deviations */ + for (size_t i = 0; i < 256; ++i) { + S32 delta = char_count[i] - avg; + char_total += delta * delta; + } + U32 sqrt_chunk = (chunk_size == kChunkSize) ? kSqrtChunkSize : LZMA2_isqrt((U32)chunk_size); + /* Result base on character count std dev */ + return LZMA2_isqrt(char_total) / sqrt_chunk <= dev_table[strategy]; } return 0; } -#ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#else -__pragma(warning(disable:4701)) -#endif +static size_t LZMA2_encodeChunk(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + size_t const index, size_t const end) +{ + /* Template-like inline functions */ + if (enc->strategy == FL2_fast) { + if (tbl->is_struct) { + return LZMA_encodeChunkFast(enc, block, tbl, 1, + index, end); + } + else { + return LZMA_encodeChunkFast(enc, block, tbl, 0, + index, end); + } + } + else { + if (tbl->is_struct) { + return LZMA_encodeChunkBest(enc, block, tbl, 1, + index, end); + } + else { + return LZMA_encodeChunkBest(enc, block, tbl, 0, + index, end); + } + } +} -size_t FL2_lzma2Encode(FL2_lzmaEncoderCtx* enc, - FL2_matchTable* tbl, - const FL2_dataBlock block, - const FL2_lzma2Parameters* options, - FL2_progressFn progress, void* opaque, size_t base, U32 weight) +size_t LZMA2_encode(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + const FL2_lzma2Parameters* const options, + int stream_prop, + FL2_atomic *const progress_in, + FL2_atomic *const progress_out, + int *const canceled) { size_t const start = block.start; BYTE* out_dest = enc->out_buf; /* Each encoder writes a properties byte because the upstream encoder(s) could */ /* write only uncompressed chunks with no properties. 
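   Only a compressed chunk whose control byte signals a state + properties
   reset carries the lc/lp/pb byte, so this encoder sends it with its first
   compressed chunk.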
*/ BYTE encode_properties = 1; - BYTE next_is_random = 0; + BYTE not_compressible = 0; - if (block.end <= block.start) { + if (block.end <= block.start) return 0; - } + enc->lc = options->lc; - enc->lp = options->lp; - if (enc->lc + enc->lp > 4) { - enc->lc = 3; - enc->lp = 0; - } + enc->lp = MIN(options->lp, 4); + + if (enc->lc + enc->lp > 4) + enc->lc = 4 - enc->lp; + enc->pb = options->pb; enc->strategy = options->strategy; enc->fast_length = options->fast_length; - enc->match_cycles = options->match_cycles; - Reset(enc, block.end); + enc->match_cycles = MIN(options->match_cycles, kMatchesMax - 1); + + LZMA2_reset(enc, block.end); + if (enc->strategy == FL2_ultra) { /* Create a hash chain to put the encoder into hybrid mode */ if (enc->hash_alloc_3 < ((ptrdiff_t)1 << options->second_dict_bits)) { - if(HashCreate(enc, options->second_dict_bits) != 0) + if(LZMA_hashCreate(enc, options->second_dict_bits) != 0) return FL2_ERROR(memory_allocation); } else { - HashReset(enc, options->second_dict_bits); + LZMA_hashReset(enc, options->second_dict_bits); } - enc->hash_prev_index = (start >= (size_t)enc->hash_dict_3) ? start - enc->hash_dict_3 : -1; + enc->hash_prev_index = (start >= (size_t)enc->hash_dict_3) ? (ptrdiff_t)(start - enc->hash_dict_3) : (ptrdiff_t)-1; } enc->len_end_max = kOptimizerBufferSize - 1; RMF_limitLengths(tbl, block.end); for (size_t index = start; index < block.end;) { - unsigned header_size = encode_properties ? kChunkHeaderSize + 1 : kChunkHeaderSize; + size_t header_size = (stream_prop >= 0) + (encode_properties ? kChunkHeaderSize + 1 : kChunkHeaderSize); EncoderStates saved_states; size_t next_index; - size_t compressed_size; - size_t uncompressed_size; - RangeEncReset(&enc->rc); - SetOutputBuffer(&enc->rc, out_dest + header_size, kChunkSize); - if (!next_is_random) { + RC_reset(&enc->rc); + RC_setOutputBuffer(&enc->rc, out_dest + header_size, kChunkSize); + if (!not_compressible) { + size_t cur = index; + size_t const end = (enc->strategy == FL2_fast) ? MIN(block.end, index + kMaxChunkUncompressedSize) + : MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize); saved_states = enc->states; if (index == 0) { - EncodeLiteral(enc, 0, block.data[0], 0); + /* First byte of the dictionary */ + LZMA_encodeLiteral(enc, 0, block.data[0], 0); + ++cur; } - if (enc->strategy == FL2_fast) { - if (tbl->isStruct) { - next_index = EncodeChunkFast(enc, block, tbl, 1, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize)); - } - else { - next_index = EncodeChunkFast(enc, block, tbl, 0, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize)); - } - } - else { - if (tbl->isStruct) { - next_index = EncodeChunkBest(enc, block, tbl, 1, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize)); - } - else { - next_index = EncodeChunkBest(enc, block, tbl, 0, - index + (index == 0), - MIN(block.end, index + kMaxChunkUncompressedSize - kOptimizerBufferSize)); - } + if (index == start) { + /* After four bytes we can write data to the match table because the */ + /* compressed data will never catch up with the table position being read. 
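+             * The first chunk therefore starts in the small temporary buffer and is
+             * copied into the match table once it is safe to write there.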
*/ + enc->rc.chunk_size = kTempMinOutput; + cur = LZMA2_encodeChunk(enc, tbl, block, cur, end); + enc->rc.chunk_size = kChunkSize; + out_dest = RMF_getTableAsOutputBuffer(tbl, start); + memcpy(out_dest, enc->out_buf, header_size + enc->rc.out_index); + enc->rc.out_buffer = out_dest + header_size; } + next_index = LZMA2_encodeChunk(enc, tbl, block, cur, end); + RC_flush(&enc->rc); } else { next_index = MIN(index + kChunkSize, block.end); } - compressed_size = enc->rc.out_index; - uncompressed_size = next_index - index; - out_dest[1] = (BYTE)((uncompressed_size - 1) >> 8); - out_dest[2] = (BYTE)(uncompressed_size - 1); + size_t compressed_size = enc->rc.out_index; + size_t uncompressed_size = next_index - index; + + if (compressed_size > kMaxChunkCompressedSize) + return FL2_ERROR(internal); + + BYTE* header = out_dest; + + if (stream_prop >= 0) + *header++ = (BYTE)stream_prop; + stream_prop = -1; + + header[1] = (BYTE)((uncompressed_size - 1) >> 8); + header[2] = (BYTE)(uncompressed_size - 1); /* Output an uncompressed chunk if necessary */ - if (next_is_random || uncompressed_size + 3 <= compressed_size + header_size) { - DEBUGLOG(5, "Storing chunk : was %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); - if (index == 0) { - out_dest[0] = kChunkUncompressedDictReset; - } - else { - out_dest[0] = kChunkUncompressed; - } - memcpy(out_dest + 3, block.data + index, uncompressed_size); + if (not_compressible || uncompressed_size + 3 <= compressed_size + header_size) { + DEBUGLOG(6, "Storing chunk : was %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); + + header[0] = (index == 0) ? kChunkUncompressedDictReset : kChunkUncompressed; + + /* Copy uncompressed data into the output */ + memcpy(header + 3, block.data + index, uncompressed_size); + compressed_size = uncompressed_size; - header_size = 3; - if (!next_is_random) { + header_size = 3 + (header - out_dest); + if (!not_compressible) enc->states = saved_states; - } } else { - DEBUGLOG(5, "Compressed chunk : %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); - if (index == 0) { - out_dest[0] = kChunkCompressedFlag | kChunkAllReset; - } - else if (encode_properties) { - out_dest[0] = kChunkCompressedFlag | kChunkStatePropertiesReset; - } - else { - out_dest[0] = kChunkCompressedFlag | kChunkNothingReset; - } - out_dest[0] |= (BYTE)((uncompressed_size - 1) >> 16); - out_dest[3] = (BYTE)((compressed_size - 1) >> 8); - out_dest[4] = (BYTE)(compressed_size - 1); + DEBUGLOG(6, "Compressed chunk : %u => %u", (unsigned)uncompressed_size, (unsigned)compressed_size); + + if (index == 0) + header[0] = kChunkCompressedFlag | kChunkAllReset; + else if (encode_properties) + header[0] = kChunkCompressedFlag | kChunkStatePropertiesReset; + else + header[0] = kChunkCompressedFlag | kChunkNothingReset; + + header[0] |= (BYTE)((uncompressed_size - 1) >> 16); + header[3] = (BYTE)((compressed_size - 1) >> 8); + header[4] = (BYTE)(compressed_size - 1); if (encode_properties) { - out_dest[5] = GetLcLpPbCode(enc); + header[5] = LZMA_getLcLpPbCode(enc); encode_properties = 0; } } - if (next_is_random || uncompressed_size + 3 <= compressed_size + (compressed_size >> kRandomFilterMarginBits) + header_size) - { + if (not_compressible || uncompressed_size + 3 <= compressed_size + (compressed_size >> kRandomFilterMarginBits) + header_size) { /* Test the next chunk for compressibility */ - next_is_random = IsChunkRandom(tbl, block, next_index, enc->strategy); - } - if (index == start) { - /* After the first chunk we can 
write data to the match table because the */ - /* compressed data will never catch up with the table position being read. */ - out_dest = RMF_getTableAsOutputBuffer(tbl, start); - memcpy(out_dest, enc->out_buf, compressed_size + header_size); + not_compressible = LZMA2_chunkNotCompressible(tbl, block, next_index, enc->strategy); } out_dest += compressed_size + header_size; + FL2_atomic_add(*progress_in, (long)(next_index - index)); + FL2_atomic_add(*progress_out, (long)(compressed_size + header_size)); index = next_index; - if (progress && progress(base + (((index - start) * weight) >> 4), opaque) != 0) + if (*canceled) return FL2_ERROR(canceled); } return out_dest - RMF_getTableAsOutputBuffer(tbl, start); diff --git a/C/fast-lzma2/lzma2_enc.h b/C/fast-lzma2/lzma2_enc.h index 9fbda523..aa821f78 100644 --- a/C/fast-lzma2/lzma2_enc.h +++ b/C/fast-lzma2/lzma2_enc.h @@ -10,6 +10,7 @@ Public domain #include "mem.h" #include "data_block.h" #include "radix_mf.h" +#include "atomic.h" #if defined (__cplusplus) extern "C" { @@ -19,14 +20,10 @@ extern "C" { #define LZMA2_END_MARKER '\0' #define LZMA_MIN_DICT_BITS 12 +#define ENC_MIN_BYTES_PER_THREAD 0x20000 -typedef struct FL2_lzmaEncoderCtx_s FL2_lzmaEncoderCtx; -typedef enum { - FL2_fast, - FL2_opt, - FL2_ultra -} FL2_strategy; +typedef struct LZMA2_ECtx_s LZMA2_ECtx; typedef struct { @@ -37,25 +34,28 @@ typedef struct unsigned match_cycles; FL2_strategy strategy; unsigned second_dict_bits; - unsigned random_filter; + unsigned reset_interval; } FL2_lzma2Parameters; -FL2_lzmaEncoderCtx* FL2_lzma2Create(); +LZMA2_ECtx* LZMA2_createECtx(void); -void FL2_lzma2Free(FL2_lzmaEncoderCtx* enc); +void LZMA2_freeECtx(LZMA2_ECtx *const enc); -int FL2_lzma2HashAlloc(FL2_lzmaEncoderCtx* enc, const FL2_lzma2Parameters* options); +int LZMA2_hashAlloc(LZMA2_ECtx *const enc, const FL2_lzma2Parameters* const options); -size_t FL2_lzma2Encode(FL2_lzmaEncoderCtx* enc, - FL2_matchTable* tbl, - const FL2_dataBlock block, - const FL2_lzma2Parameters* options, - FL2_progressFn progress, void* opaque, size_t base, U32 weight); +size_t LZMA2_encode(LZMA2_ECtx *const enc, + FL2_matchTable* const tbl, + FL2_dataBlock const block, + const FL2_lzma2Parameters* const options, + int stream_prop, + FL2_atomic *const progress_in, + FL2_atomic *const progress_out, + int *const canceled); -BYTE FL2_getDictSizeProp(size_t dictionary_size); +BYTE LZMA2_getDictSizeProp(size_t const dictionary_size); -size_t FL2_lzma2MemoryUsage(unsigned chain_log, FL2_strategy strategy, unsigned thread_count); +size_t LZMA2_encMemoryUsage(unsigned const chain_log, FL2_strategy const strategy, unsigned const thread_count); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/mem.h b/C/fast-lzma2/mem.h index f54a45ce..5da24875 100644 --- a/C/fast-lzma2/mem.h +++ b/C/fast-lzma2/mem.h @@ -28,9 +28,6 @@ extern "C" { #if defined(_MSC_VER) /* Visual Studio */ # include /* _byteswap_ulong */ # include /* _byteswap_* */ -# pragma warning(disable : 4389) /* disable: C4389: '==' : signed/unsigned mismatch */ -#endif - #endif #if defined(__GNUC__) # define MEM_STATIC static __inline __attribute__((unused)) @@ -42,6 +39,10 @@ extern "C" { # define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ #endif +#ifndef __has_builtin +# define __has_builtin(x) 0 /* compat. 
with non-clang compilers */ +#endif + /* code only tested on 32 and 64 bits systems */ #define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } @@ -60,11 +61,23 @@ MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (size typedef uint64_t U64; typedef int64_t S64; #else +# include +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif typedef unsigned short U16; typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif typedef unsigned int U32; typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, is preferred */ typedef unsigned long long U64; typedef signed long long S64; #endif @@ -189,7 +202,8 @@ MEM_STATIC U32 MEM_swap32(U32 in) { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_ulong(in); -#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) return __builtin_bswap32(in); #else return ((in << 24) & 0xff000000 ) | @@ -203,7 +217,8 @@ MEM_STATIC U64 MEM_swap64(U64 in) { #if defined(_MSC_VER) /* Visual Studio */ return _byteswap_uint64(in); -#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403) +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) return __builtin_bswap64(in); #else return ((in << 56) & 0xff00000000000000ULL) | diff --git a/C/fast-lzma2/platform.h b/C/fast-lzma2/platform.h index a4d7850f..155ebcd1 100644 --- a/C/fast-lzma2/platform.h +++ b/C/fast-lzma2/platform.h @@ -21,10 +21,10 @@ extern "C" { * Compiler Options ****************************************/ #if defined(_MSC_VER) -# define _CRT_SECURE_NO_WARNINGS /* Disable Visual Studio warning messages for fopen, strncpy, strerror */ -# define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before and */ -# if (_MSC_VER <= 1800) /* (1800 = Visual Studio 2013) */ -# define snprintf sprintf_s /* snprintf unsupported by Visual <= 2013 */ +# define _CRT_SECURE_NO_WARNINGS /* Disable Visual Studio warning messages for fopen, strncpy, strerror */ +# if (_MSC_VER <= 1800) /* 1800 == Visual Studio 2013 */ +# define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before and */ +# define snprintf sprintf_s /* snprintf unsupported by Visual <= 2013 */ # endif #endif @@ -50,53 +50,70 @@ extern "C" { /* ********************************************************* * Turn on Large Files support (>4GB) for 32-bit Linux/Unix ***********************************************************/ -#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ +#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ # if !defined(_FILE_OFFSET_BITS) -# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ +# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ # endif -# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with 
_FILE_OFFSET_BITS */ -# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ +# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */ +# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ # endif # if defined(_AIX) || defined(__hpux) -# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ +# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ # endif #endif /* ************************************************************ * Detect POSIX version -* PLATFORM_POSIX_VERSION = -1 for non-Unix e.g. Windows -* PLATFORM_POSIX_VERSION = 0 for Unix-like non-POSIX -* PLATFORM_POSIX_VERSION >= 1 is equal to found _POSIX_VERSION +* PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows +* PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX +* PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION +* Value of PLATFORM_POSIX_VERSION can be forced on command line ***************************************************************/ -#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \ - || defined(__midipix__) || defined(__VMS)) -# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1–2001 (SUSv3) conformant */ \ +#ifndef PLATFORM_POSIX_VERSION + +# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \ || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* BSD distros */ + /* exception rule : force posix version to 200112L, + * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */ # define PLATFORM_POSIX_VERSION 200112L -# else + +/* try to determine posix version through official unistd.h's _POSIX_VERSION (http://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html). + * note : there is no simple way to know in advance if is present or not on target system, + * Posix specification mandates its presence and its content, but target system must respect this spec. + * It's necessary to _not_ #include whenever target OS is not unix-like + * otherwise it will block preprocessing stage. 
+ * The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include + */ +# elif !defined(_WIN32) \ + && (defined(__unix__) || defined(__unix) \ + || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__)) + # if defined(__linux__) || defined(__linux) # ifndef _POSIX_C_SOURCE -# define _POSIX_C_SOURCE 200112L /* use feature test macro */ +# define _POSIX_C_SOURCE 200112L /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */ # endif # endif # include /* declares _POSIX_VERSION */ # if defined(_POSIX_VERSION) /* POSIX compliant */ # define PLATFORM_POSIX_VERSION _POSIX_VERSION # else -# define PLATFORM_POSIX_VERSION 0 +# define PLATFORM_POSIX_VERSION 1 # endif -# endif -#endif -#if !defined(PLATFORM_POSIX_VERSION) -# define PLATFORM_POSIX_VERSION -1 -#endif +# else /* non-unix target platform (like Windows) */ +# define PLATFORM_POSIX_VERSION 0 +# endif + +#endif /* PLATFORM_POSIX_VERSION */ /*-********************************************* * Detect if isatty() and fileno() are available ************************************************/ -#if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) || (PLATFORM_POSIX_VERSION >= 200112L) || defined(__DJGPP__) +#if (defined(__linux__) && (PLATFORM_POSIX_VERSION > 1)) \ + || (PLATFORM_POSIX_VERSION >= 200112L) \ + || defined(__DJGPP__) \ + || defined(__MSYS__) # include /* isatty */ # define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) #elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__) @@ -106,8 +123,7 @@ extern "C" { # include /* _isatty */ # include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ # include /* FILE */ -static __inline int IS_CONSOLE(FILE* stdStream) -{ +static __inline int IS_CONSOLE(FILE* stdStream) { DWORD dummy; return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy); } @@ -117,7 +133,7 @@ static __inline int IS_CONSOLE(FILE* stdStream) /****************************** -* OS-specific Includes +* OS-specific IO behaviors ******************************/ #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) # include /* _O_BINARY */ @@ -125,7 +141,7 @@ static __inline int IS_CONSOLE(FILE* stdStream) # if !defined(__DJGPP__) # include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ # include /* FSCTL_SET_SPARSE */ -# define SET_BINARY_MODE(file) { int unused=_setmode(_fileno(file), _O_BINARY); (void)unused; } +# define SET_BINARY_MODE(file) { int const unused=_setmode(_fileno(file), _O_BINARY); (void)unused; } # define SET_SPARSE_FILE_MODE(file) { DWORD dw; DeviceIoControl((HANDLE) _get_osfhandle(_fileno(file)), FSCTL_SET_SPARSE, 0, 0, 0, 0, &dw, 0); } # else # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) @@ -146,6 +162,34 @@ static __inline int IS_CONSOLE(FILE* stdStream) #endif +#ifndef ZSTD_START_SYMBOLLIST_FRAME +# ifdef __linux__ +# define ZSTD_START_SYMBOLLIST_FRAME 2 +# elif defined __APPLE__ +# define ZSTD_START_SYMBOLLIST_FRAME 4 +# else +# define ZSTD_START_SYMBOLLIST_FRAME 0 +# endif +#endif + + +#ifndef ZSTD_SETPRIORITY_SUPPORT + /* mandates presence of and support for setpriority() : http://man7.org/linux/man-pages/man2/setpriority.2.html */ +# define ZSTD_SETPRIORITY_SUPPORT (PLATFORM_POSIX_VERSION >= 200112L) +#endif + + +#ifndef ZSTD_NANOSLEEP_SUPPORT + /* mandates support of nanosleep() within : http://man7.org/linux/man-pages/man2/nanosleep.2.html */ +# if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) \ + 
|| (PLATFORM_POSIX_VERSION >= 200112L) +# define ZSTD_NANOSLEEP_SUPPORT 1 +# else +# define ZSTD_NANOSLEEP_SUPPORT 0 +# endif +#endif + + #if defined (__cplusplus) } #endif diff --git a/C/fast-lzma2/radix_bitpack.c b/C/fast-lzma2/radix_bitpack.c index a20b0d60..c7e5484c 100644 --- a/C/fast-lzma2/radix_bitpack.c +++ b/C/fast-lzma2/radix_bitpack.c @@ -9,7 +9,7 @@ */ #include "mem.h" /* U32, U64 */ -#include "fl2threading.h" +#include "fl2_threading.h" #include "fl2_internal.h" #include "radix_internal.h" @@ -52,9 +52,8 @@ void RMF_bitpackLimitLengths(FL2_matchTable* const tbl, size_t const index) SetNull(index - 1); for (U32 length = 2; length < RADIX_MAX_LENGTH && length <= index; ++length) { U32 const link = tbl->table[index - length]; - if (link != RADIX_NULL_LINK) { + if (link != RADIX_NULL_LINK) tbl->table[index - length] = (MIN(length, link >> RADIX_LINK_BITS) << RADIX_LINK_BITS) | (link & RADIX_LINK_MASK); - } } } diff --git a/C/fast-lzma2/radix_engine.h b/C/fast-lzma2/radix_engine.h index 1697f942..0886d87d 100644 --- a/C/fast-lzma2/radix_engine.h +++ b/C/fast-lzma2/radix_engine.h @@ -9,80 +9,82 @@ */ #include -#include "count.h" #define MAX_READ_BEYOND_DEPTH 2 /* If a repeating byte is found, fill that section of the table with matches of distance 1 */ -static size_t HandleRepeat(FL2_matchTable* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t const block_size, ptrdiff_t i, size_t const radix_16) +static size_t RMF_handleRepeat(RMF_builder* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t i, U32 depth) { - ptrdiff_t const rpt_index = i - (MAX_REPEAT / 2 - 2); - ptrdiff_t rpt_end; - /* Set the head to the first byte of the repeat and adjust the count */ - tbl->list_heads[radix_16].head = (U32)(rpt_index - 1); - tbl->list_heads[radix_16].count -= MAX_REPEAT / 2 - 2; - /* Find the end */ - i += ZSTD_count(data_block + i + 2, data_block + i + 1, data_block + block_size); - rpt_end = i; + /* Normally the last 2 bytes, but may be 4 if depth == 4 */ + ptrdiff_t const last_2 = i + MAX_REPEAT / 2 - 1; + + /* Find the start */ + i += (4 - (i & 3)) & 3; + U32 u = *(U32*)(data_block + i); + while (i != 0 && *(U32*)(data_block + i - 4) == u) + i -= 4; + while (i != 0 && data_block[i - 1] == (BYTE)u) + --i; + + ptrdiff_t const rpt_index = i; /* No point if it's in the overlap region */ - if (i >= (ptrdiff_t)start) { - U32 len = 2; + if (last_2 >= (ptrdiff_t)start) { + U32 len = depth; /* Set matches at distance 1 and available length */ - for (; i >= rpt_index && len <= RADIX_MAX_LENGTH; --i) { + for (i = last_2; i > rpt_index && len <= RADIX_MAX_LENGTH; --i) { SetMatchLinkAndLength(i, (U32)(i - 1), len); ++len; } /* Set matches at distance 1 and max length */ - for (; i >= rpt_index; --i) { + for (; i > rpt_index; --i) SetMatchLinkAndLength(i, (U32)(i - 1), RADIX_MAX_LENGTH); - } } - return rpt_end; + return rpt_index; } /* If a 2-byte repeat is found, fill that section of the table with matches of distance 2 */ -static size_t HandleRepeat2(FL2_matchTable* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t const block_size, ptrdiff_t i, size_t const radix_16) +static size_t RMF_handleRepeat2(RMF_builder* const tbl, const BYTE* const data_block, size_t const start, ptrdiff_t i, U32 depth) { - size_t radix_16_rev; - ptrdiff_t const rpt_index = i - (MAX_REPEAT - 3); - ptrdiff_t rpt_end; + /* Normally the last 2 bytes, but may be 4 if depth == 4 */ + ptrdiff_t const last_2 = i + MAX_REPEAT * 2U - 4; - /* Set the head to the first byte 
of the repeat and adjust the count */ - tbl->list_heads[radix_16].head = (U32)(rpt_index - 1); - tbl->list_heads[radix_16].count -= MAX_REPEAT / 2 - 2; - radix_16_rev = ((radix_16 >> 8) | (radix_16 << 8)) & 0xFFFF; - tbl->list_heads[radix_16_rev].head = (U32)(rpt_index - 2); - tbl->list_heads[radix_16_rev].count -= MAX_REPEAT / 2 - 1; - /* Find the end */ - i += ZSTD_count(data_block + i + 2, data_block + i, data_block + block_size); - rpt_end = i; + /* Find the start */ + ptrdiff_t realign = i & 1; + i += (4 - (i & 3)) & 3; + U32 u = *(U32*)(data_block + i); + while (i != 0 && *(U32*)(data_block + i - 4) == u) + i -= 4; + while (i != 0 && data_block[i - 1] == data_block[i + 1]) + --i; + i += (i & 1) ^ realign; + + ptrdiff_t const rpt_index = i; /* No point if it's in the overlap region */ if (i >= (ptrdiff_t)start) { - U32 len = 2; + U32 len = depth + (data_block[last_2 + depth] == data_block[last_2]); /* Set matches at distance 2 and available length */ - for (; i >= rpt_index && len <= RADIX_MAX_LENGTH; --i) { + for (i = last_2; i > rpt_index && len <= RADIX_MAX_LENGTH; i -= 2) { SetMatchLinkAndLength(i, (U32)(i - 2), len); - ++len; + len += 2; } /* Set matches at distance 2 and max length */ - for (; i >= rpt_index; --i) { + for (; i > rpt_index; i -= 2) SetMatchLinkAndLength(i, (U32)(i - 2), RADIX_MAX_LENGTH); - } } - return rpt_end; + return rpt_index; } /* Initialization for the reference algortithm */ #ifdef RMF_REFERENCE -static void RadixInitReference(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +static void RMF_initReference(FL2_matchTable* const tbl, const void* const data, size_t const end) { const BYTE* const data_block = (const BYTE*)data; ptrdiff_t const block_size = end - 1; size_t st_index = 0; for (ptrdiff_t i = 0; i < block_size; ++i) { - size_t radix_16 = ((size_t)data_block[i] << 8) | data_block[i + 1]; - U32 prev = tbl->list_heads[radix_16].head; + size_t const radix_16 = ((size_t)data_block[i] << 8) | data_block[i + 1]; + U32 const prev = tbl->list_heads[radix_16].head; if (prev != RADIX_NULL_LINK) { SetMatchLinkAndLength(i, prev, 2U); tbl->list_heads[radix_16].head = (U32)i; @@ -98,7 +100,6 @@ static void RadixInitReference(FL2_matchTable* const tbl, const void* const data SetNull(end - 1); tbl->end_index = (U32)st_index; tbl->st_index = ATOMIC_INITIAL_VALUE; - (void)start; } #endif @@ -108,82 +109,50 @@ RMF_bitpackInit #else RMF_structuredInit #endif -(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +(FL2_matchTable* const tbl, const void* const data, size_t const end) { - const BYTE* const data_block = (const BYTE*)data; - size_t st_index = 0; - size_t radix_16; - ptrdiff_t const block_size = end - 2; - ptrdiff_t rpt_total = 0; - U32 count = 0; - if (end <= 2) { - for (size_t i = 0; i < end; ++i) { + for (size_t i = 0; i < end; ++i) SetNull(i); - } + tbl->end_index = 0; return 0; } #ifdef RMF_REFERENCE if (tbl->params.use_ref_mf) { - RadixInitReference(tbl, data, start, end); + RMF_initReference(tbl, data, end); return 0; } #endif + SetNull(0); + + const BYTE* const data_block = (const BYTE*)data; + size_t st_index = 0; /* Initial 2-byte radix value */ - radix_16 = ((size_t)data_block[0] << 8) | data_block[1]; + size_t radix_16 = ((size_t)data_block[0] << 8) | data_block[1]; tbl->stack[st_index++] = (U32)radix_16; tbl->list_heads[radix_16].head = 0; tbl->list_heads[radix_16].count = 1; radix_16 = ((size_t)((BYTE)radix_16) << 8) | data_block[2]; - ptrdiff_t i = 1; + ptrdiff_t 
rpt_total = 0; + ptrdiff_t i = 1; + ptrdiff_t const block_size = end - 2; for (; i < block_size; ++i) { - /* Pre-load the next value for speed increase */ + /* Pre-load the next value for speed increase on some hardware. Execution can continue while memory read is pending */ size_t const next_radix = ((size_t)((BYTE)radix_16) << 8) | data_block[i + 2]; U32 const prev = tbl->list_heads[radix_16].head; if (prev != RADIX_NULL_LINK) { - S32 dist = (S32)i - prev; - /* Check for repeat */ - if (dist > 2) { - count = 0; - /* Link this position to the previous occurance */ - InitMatchLink(i, prev); - /* Set the previous to this position */ - tbl->list_heads[radix_16].head = (U32)i; - ++tbl->list_heads[radix_16].count; - radix_16 = next_radix; - } - else { - count += 3 - dist; - /* Do the usual if the repeat is too short */ - if (count < MAX_REPEAT - 2) { - InitMatchLink(i, prev); - tbl->list_heads[radix_16].head = (U32)i; - ++tbl->list_heads[radix_16].count; - radix_16 = next_radix; - } - else { - ptrdiff_t const prev_i = i; - /* Eliminate the repeat from the linked list to save time */ - if (dist == 1) { - i = HandleRepeat(tbl, data_block, start, end, i, radix_16); - rpt_total += i - prev_i + MAX_REPEAT / 2U - 1; - } - else { - i = HandleRepeat2(tbl, data_block, start, end, i, radix_16); - rpt_total += i - prev_i + MAX_REPEAT - 2; - } - if (i < block_size) - radix_16 = ((size_t)data_block[i + 1] << 8) | data_block[i + 2]; - count = 0; - } - } + /* Link this position to the previous occurrence */ + InitMatchLink(i, prev); + /* Set the previous to this position */ + tbl->list_heads[radix_16].head = (U32)i; + ++tbl->list_heads[radix_16].count; + radix_16 = next_radix; } else { - count = 0; SetNull(i); tbl->list_heads[radix_16].head = (U32)i; tbl->list_heads[radix_16].count = 1; @@ -192,65 +161,100 @@ RMF_structuredInit } } /* Handle the last value */ - if (i <= block_size && tbl->list_heads[radix_16].head != RADIX_NULL_LINK) { + if (tbl->list_heads[radix_16].head != RADIX_NULL_LINK) SetMatchLinkAndLength(block_size, tbl->list_heads[radix_16].head, 2); - } - else { + else SetNull(block_size); - } + /* Never a match at the last byte */ SetNull(end - 1); tbl->end_index = (U32)st_index; - tbl->st_index = ATOMIC_INITIAL_VALUE; return rpt_total; } -#if defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ -#endif - - /* Copy the list into a buffer and recurse it there. 
This decreases cache misses and allows */ /* data characters to be loaded every fourth pass and stored for use in the next 4 passes */ -static void RecurseListsBuffered(RMF_builder* const tbl, +static void RMF_recurseListsBuffered(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, - BYTE depth, - BYTE const max_depth, + U32 depth, + U32 const max_depth, U32 orig_list_count, size_t const stack_base) { + if (orig_list_count < 2 || tbl->match_buffer_limit < 2) + return; + /* Create an offset data buffer pointer for reading the next bytes */ const BYTE* data_src = data_block + depth; size_t start = 0; - if (orig_list_count < 2 || tbl->match_buffer_limit < 2) - return; do { - size_t count = start; U32 list_count = (U32)(start + orig_list_count); - U32 overlap; - if (list_count > tbl->match_buffer_limit) { + if (list_count > tbl->match_buffer_limit) list_count = (U32)tbl->match_buffer_limit; - } + + size_t count = start; + size_t prev_link = (size_t)-1; + size_t rpt = 0; + size_t rpt_tail = link; for (; count < list_count; ++count) { /* Pre-load next link */ size_t const next_link = GetMatchLink(link); - /* Get 4 data characters for later. This doesn't block on a cache miss. */ - tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); - /* Record the actual location of this suffix */ - tbl->match_buffer[count].from = (U32)link; - /* Initialize the next link */ - tbl->match_buffer[count].next = (U32)(count + 1) | ((U32)depth << 24); - link = next_link; + size_t dist = prev_link - link; + if (dist > 2) { + /* Get 4 data characters for later. This doesn't block on a cache miss. */ + tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); + /* Record the actual location of this suffix */ + tbl->match_buffer[count].from = (U32)link; + /* Initialize the next link */ + tbl->match_buffer[count].next = (U32)(count + 1) | (depth << 24); + rpt = 0; + prev_link = link; + rpt_tail = link; + link = next_link; + } + else { + rpt += 3 - dist; + /* Do the usual if the repeat is too short */ + if (rpt < MAX_REPEAT - 2) { + /* Get 4 data characters for later. This doesn't block on a cache miss. */ + tbl->match_buffer[count].src.u32 = MEM_read32(data_src + link); + /* Record the actual location of this suffix */ + tbl->match_buffer[count].from = (U32)link; + /* Initialize the next link */ + tbl->match_buffer[count].next = (U32)(count + 1) | (depth << 24); + prev_link = link; + link = next_link; + } + else { + /* Eliminate the repeat from the linked list to save time */ + if (dist == 1) { + link = RMF_handleRepeat(tbl, data_block, block_start, link, depth); + count -= MAX_REPEAT / 2; + orig_list_count -= (U32)(rpt_tail - link); + } + else { + link = RMF_handleRepeat2(tbl, data_block, block_start, link, depth); + count -= MAX_REPEAT - 1; + orig_list_count -= (U32)(rpt_tail - link) >> 1; + } + rpt = 0; + list_count = (U32)(start + orig_list_count); + + if (list_count > tbl->match_buffer_limit) + list_count = (U32)tbl->match_buffer_limit; + } + } } + count = list_count; /* Make the last element circular so pre-loading doesn't read past the end. 
*/ - tbl->match_buffer[count - 1].next = (U32)(count - 1) | ((U32)depth << 24); - overlap = 0; + tbl->match_buffer[count - 1].next = (U32)(count - 1) | (depth << 24); + U32 overlap = 0; if (list_count < (U32)(start + orig_list_count)) { overlap = list_count >> MATCH_BUFFER_OVERLAP; overlap += !overlap; @@ -259,15 +263,25 @@ static void RecurseListsBuffered(RMF_builder* const tbl, orig_list_count -= (U32)(list_count - start); /* Copy everything back, except the last link which never changes, and any extra overlap */ count -= overlap + (overlap == 0); - for (size_t index = 0; index < count; ++index) { +#ifdef RMF_BITPACK + if (max_depth > RADIX_MAX_LENGTH) for (size_t index = 0; index < count; ++index) { size_t const from = tbl->match_buffer[index].from; if (from < block_start) return; - - { U32 length = tbl->match_buffer[index].next >> 24; - size_t next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; - SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); - } + U32 length = tbl->match_buffer[index].next >> 24; + length = (length > RADIX_MAX_LENGTH) ? RADIX_MAX_LENGTH : length; + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); + } + else +#endif + for (size_t index = 0; index < count; ++index) { + size_t const from = tbl->match_buffer[index].from; + if (from < block_start) + return; + U32 const length = tbl->match_buffer[index].next >> 24; + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); } start = 0; if (overlap) { @@ -275,7 +289,7 @@ static void RecurseListsBuffered(RMF_builder* const tbl, for (size_t src = list_count - overlap; src < list_count; ++src) { tbl->match_buffer[dest].from = tbl->match_buffer[src].from; tbl->match_buffer[dest].src.u32 = MEM_read32(data_src + tbl->match_buffer[src].from); - tbl->match_buffer[dest].next = (U32)(dest + 1) | ((U32)depth << 24); + tbl->match_buffer[dest].next = (U32)(dest + 1) | (depth << 24); ++dest; } start = dest; @@ -283,30 +297,23 @@ static void RecurseListsBuffered(RMF_builder* const tbl, } while (orig_list_count != 0); } -/* Parse the list with bounds checks on data reads. Stop at the point where bound checks are not required. */ +/* Parse the list with an upper bound check on data reads. Stop at the point where bound checks are not required. */ /* Buffering is used so that parsing can continue below the bound to find a few matches without altering the main table. 
*/ -static void RecurseListsBound(RMF_builder* const tbl, +static void RMF_recurseListsBound(RMF_builder* const tbl, const BYTE* const data_block, ptrdiff_t const block_size, RMF_tableHead* const list_head, - U32 const max_depth) + U32 max_depth) { U32 list_count = list_head->count; + if (list_count < 2) + return; + ptrdiff_t link = list_head->head; ptrdiff_t const bounded_size = max_depth + MAX_READ_BEYOND_DEPTH; ptrdiff_t const bounded_start = block_size - MIN(block_size, bounded_size); - /* Create an offset data buffer pointer for reading the next bytes */ size_t count = 0; size_t extra_count = (max_depth >> 4) + 4; - ptrdiff_t limit; - const BYTE* data_src; - U32 depth; - size_t index; - size_t st_index; - RMF_listTail* tails_8; - - if (list_count < 2) - return; list_count = MIN((U32)bounded_size, list_count); list_count = MIN(list_count, (U32)tbl->match_buffer_size); @@ -314,9 +321,8 @@ static void RecurseListsBound(RMF_builder* const tbl, ptrdiff_t next_link = GetMatchLink(link); if (link >= bounded_start) { --list_head->count; - if (next_link < bounded_start) { + if (next_link < bounded_start) list_head->head = (U32)next_link; - } } else { --extra_count; @@ -328,18 +334,20 @@ static void RecurseListsBound(RMF_builder* const tbl, link = next_link; } list_count = (U32)count; - limit = block_size - 2; - data_src = data_block + 2; - depth = 3; - index = 0; - st_index = 0; - tails_8 = tbl->tails_8; + ptrdiff_t limit = block_size - 2; + /* Create an offset data buffer pointer for reading the next bytes */ + const BYTE* data_src = data_block + 2; + U32 depth = 3; + size_t index = 0; + size_t st_index = 0; + RMF_listTail* const tails_8 = tbl->tails_8; do { link = tbl->match_buffer[index].from; if (link < limit) { size_t const radix_8 = data_src[link]; /* Seen this char before? 
*/ - const U32 prev = tails_8[radix_8].prev_index; + U32 const prev = tails_8[radix_8].prev_index; + tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tails_8[radix_8].list_count; /* Link the previous occurrence to this one and record the new length */ @@ -353,7 +361,6 @@ static void RecurseListsBound(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tails_8[radix_8].prev_index = (U32)index; } ++index; } while (index < list_count); @@ -368,10 +375,9 @@ static void RecurseListsBound(RMF_builder* const tbl, /* Pop an item off the stack */ --st_index; list_count = tbl->stack[st_index].count; - if (list_count < 2) { - /* Nothing to match with */ + if (list_count < 2) /* Nothing to match with */ continue; - } + index = tbl->stack[st_index].head; depth = (tbl->match_buffer[index].next >> 24); if (depth >= max_depth) @@ -390,9 +396,10 @@ static void RecurseListsBound(RMF_builder* const tbl, if (link < limit) { size_t const radix_8 = data_src[link]; U32 const prev = tails_8[radix_8].prev_index; + tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tails_8[radix_8].list_count = 1; @@ -400,7 +407,6 @@ static void RecurseListsBound(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tails_8[radix_8].prev_index = (U32)index; } index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; } while (--list_count != 0); @@ -413,20 +419,20 @@ static void RecurseListsBound(RMF_builder* const tbl, --count; for (index = 0; index < count; ++index) { ptrdiff_t const from = tbl->match_buffer[index].from; - size_t next; - U32 length; - if (from < bounded_start) break; - length = tbl->match_buffer[index].next >> 24; + + U32 length = tbl->match_buffer[index].next >> 24; length = MIN(length, (U32)(block_size - from)); - next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; + length = MIN(length, RADIX_MAX_LENGTH); + + size_t const next = tbl->match_buffer[index].next & BUFFER_LINK_MASK; SetMatchLinkAndLength(from, tbl->match_buffer[next].from, length); } } /* Compare each string with all others to find the best match */ -static void BruteForce(RMF_builder* const tbl, +static void RMF_bruteForce(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, @@ -445,6 +451,7 @@ static void BruteForce(RMF_builder* const tbl, link = GetMatchLink(link); buffer[i] = link; } while (++i < list_count); + i = 0; do { size_t longest = 0; @@ -454,34 +461,37 @@ static void BruteForce(RMF_builder* const tbl, do { const BYTE* data_2 = data_src + buffer[j]; size_t len_test = 0; - while (data[len_test] == data_2[len_test] && len_test < limit) { + while (data[len_test] == data_2[len_test] && len_test < limit) ++len_test; - } + if (len_test > longest) { longest_index = j; longest = len_test; - if (len_test >= limit) { + if (len_test >= limit) break; - } } } while (++j < list_count); - if (longest > 0) { - SetMatchLinkAndLength(buffer[i], - (U32)buffer[longest_index], - depth + (U32)longest); - } + + if (longest > 0) + SetMatchLinkAndLength(buffer[i], (U32)buffer[longest_index], depth + (U32)longest); + ++i; + /* Test with block_start to avoid wasting time matching strings in the overlap region with each other */ } while (i < list_count - 1 && buffer[i] >= block_start); } -static void RecurseLists16(RMF_builder* const tbl, +/* 
RMF_recurseLists16() : + * Match strings at depth 2 using a 16-bit radix to lengthen to depth 4 + */ +static void RMF_recurseLists16(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, U32 count, U32 const max_depth) { - /* Offset data pointer. This method is only called at depth 2 */ + U32 const table_max_depth = MIN(max_depth, RADIX_MAX_LENGTH); + /* Offset data pointer. This function is only called at depth 2 */ const BYTE* const data_src = data_block + 2; /* Load radix values from the data chars */ size_t next_radix_8 = data_src[link]; @@ -489,7 +499,6 @@ static void RecurseLists16(RMF_builder* const tbl, size_t reset_list[RADIX8_TABLE_SIZE]; size_t reset_count = 0; size_t st_index = 0; - U32 prev; /* Last one is done separately */ --count; do @@ -504,7 +513,8 @@ static void RecurseLists16(RMF_builder* const tbl, next_radix_8 = data_src[next_link]; next_radix_16 = next_radix_8 + ((size_t)(data_src[next_link + 1]) << 8); - prev = tbl->tails_8[radix_8].prev_index; + U32 prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)link; if (prev != RADIX_NULL_LINK) { /* Link the previous occurrence to this one at length 3. */ /* This will be overwritten if a 4 is found. */ @@ -513,9 +523,9 @@ static void RecurseLists16(RMF_builder* const tbl, else { reset_list[reset_count++] = radix_8; } - tbl->tails_8[radix_8].prev_index = (U32)link; prev = tbl->tails_16[radix_16].prev_index; + tbl->tails_16[radix_16].prev_index = (U32)link; if (prev != RADIX_NULL_LINK) { ++tbl->tails_16[radix_16].list_count; /* Link at length 4, overwriting the 3 */ @@ -524,35 +534,35 @@ static void RecurseLists16(RMF_builder* const tbl, else { tbl->tails_16[radix_16].list_count = 1; tbl->stack[st_index].head = (U32)link; + /* Store a reference to this table location to retrieve the count at the end */ tbl->stack[st_index].count = (U32)radix_16; ++st_index; } - tbl->tails_16[radix_16].prev_index = (U32)link; link = next_link; } while (--count > 0); + /* Do the last location */ - prev = tbl->tails_8[next_radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { + U32 prev = tbl->tails_8[next_radix_8].prev_index; + if (prev != RADIX_NULL_LINK) SetMatchLinkAndLength(prev, (U32)link, 3); - } + prev = tbl->tails_16[next_radix_16].prev_index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_16[next_radix_16].list_count; SetMatchLinkAndLength(prev, (U32)link, 4); } - for (size_t i = 0; i < reset_count; ++i) { + + for (size_t i = 0; i < reset_count; ++i) tbl->tails_8[reset_list[i]].prev_index = RADIX_NULL_LINK; - } + for (size_t i = 0; i < st_index; ++i) { tbl->tails_16[tbl->stack[i].count].prev_index = RADIX_NULL_LINK; tbl->stack[i].count = tbl->tails_16[tbl->stack[i].count].list_count; } - while (st_index > 0) { - U32 list_count; - U32 depth; + while (st_index > 0) { --st_index; - list_count = tbl->stack[st_index].count; + U32 const list_count = tbl->stack[st_index].count; if (list_count < 2) { /* Nothing to do */ continue; @@ -567,19 +577,19 @@ static void RecurseLists16(RMF_builder* const tbl, continue; } /* The current depth */ - depth = GetMatchLength(link); + U32 const depth = GetMatchLength(link); if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForce(tbl, data_block, + RMF_bruteForce(tbl, data_block, block_start, link, list_count, depth, - max_depth); + table_max_depth); continue; } /* Send to the buffer at depth 4 */ - RecurseListsBuffered(tbl, + RMF_recurseListsBuffered(tbl, 
data_block, block_start, link, @@ -591,7 +601,10 @@ static void RecurseLists16(RMF_builder* const tbl, } #if 0 -static void RecurseListsUnbuf16(RMF_builder* const tbl, +/* Unbuffered complete processing to max_depth. + * This may be faster on CPUs without a large memory cache. + */ +static void RMF_recurseListsUnbuf16(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t link, @@ -607,7 +620,6 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, size_t reset_list[RADIX8_TABLE_SIZE]; size_t reset_count = 0; size_t st_index = 0; - U32 prev; /* Last one is done separately */ --count; do @@ -620,7 +632,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, size_t radix_16 = next_radix_16; next_radix_8 = data_src[next_link]; next_radix_16 = next_radix_8 + ((size_t)(data_src[next_link + 1]) << 8); - prev = tails_8[radix_8].prev_index; + U32 prev = tails_8[radix_8].prev_index; if (prev != RADIX_NULL_LINK) { /* Link the previous occurrence to this one at length 3. */ /* This will be overwritten if a 4 is found. */ @@ -646,7 +658,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, link = next_link; } while (--count > 0); /* Do the last location */ - prev = tails_8[next_radix_8].prev_index; + U32 prev = tails_8[next_radix_8].prev_index; if (prev != RADIX_NULL_LINK) { SetMatchLinkAndLength(prev, (U32)link, 3); } @@ -683,7 +695,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, U32 depth = GetMatchLength(link); if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForce(tbl, data_block, + RMF_bruteForce(tbl, data_block, block_start, link, list_count, @@ -800,7 +812,7 @@ static void RecurseListsUnbuf16(RMF_builder* const tbl, #ifdef RMF_REFERENCE /* Simple, slow, complete parsing for reference */ -static void RecurseListsReference(RMF_builder* const tbl, +static void RMF_recurseListsReference(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_size, size_t link, @@ -836,12 +848,8 @@ static void RecurseListsReference(RMF_builder* const tbl, } memset(tbl->tails_8, 0xFF, sizeof(tbl->tails_8)); while (st_index > 0) { - U32 list_count; - U32 depth; - size_t prev_st_index; - --st_index; - list_count = tbl->stack[st_index].count; + U32 list_count = tbl->stack[st_index].count; if (list_count < 2) { /* Nothing to do */ continue; @@ -854,14 +862,14 @@ static void RecurseListsReference(RMF_builder* const tbl, } link = tbl->stack[st_index].head; /* The current depth */ - depth = GetMatchLength(link); + U32 depth = GetMatchLength(link); if (depth >= max_depth) continue; data_src = data_block + depth; limit = block_size - depth; /* Next depth for 1 extra char */ ++depth; - prev_st_index = st_index; + size_t prev_st_index = st_index; do { if (link < limit) { size_t const radix_8 = data_src[link]; @@ -890,21 +898,29 @@ static void RecurseListsReference(RMF_builder* const tbl, #endif /* RMF_REFERENCE */ /* Atomically take a list from the head table */ -static ptrdiff_t RMF_getNextList(FL2_matchTable* const tbl, unsigned const multi_thread) +static ptrdiff_t RMF_getNextList_mt(FL2_matchTable* const tbl) { if (tbl->st_index < tbl->end_index) { - long index = multi_thread ? 
FL2_atomic_increment(tbl->st_index) : FL2_nonAtomic_increment(tbl->st_index); - if (index < tbl->end_index) { + long index = FL2_atomic_increment(tbl->st_index); + if (index < tbl->end_index) return index; - } } return -1; } -#define UPDATE_INTERVAL 0x40000U +/* Non-atomically take a list from the head table */ +static ptrdiff_t RMF_getNextList_st(FL2_matchTable* const tbl) +{ + if (tbl->st_index < tbl->end_index) { + long index = FL2_nonAtomic_increment(tbl->st_index); + if (index < tbl->end_index) + return index; + } + return -1; +} /* Iterate the head table concurrently with other threads, and recurse each list until max_depth is reached */ -int +void #ifdef RMF_BITPACK RMF_bitpackBuildTable #else @@ -913,69 +929,58 @@ RMF_structuredBuildTable (FL2_matchTable* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done) + FL2_dataBlock const block) { - if (!block.end) - return 0; - U64 const enc_size = block.end - block.start; + if (block.end == 0) + return; + unsigned const best = !tbl->params.divide_and_conquer; - unsigned const max_depth = MIN(tbl->params.depth, RADIX_MAX_LENGTH) & ~1; - size_t const bounded_start = block.end - max_depth - MAX_READ_BEYOND_DEPTH; - ptrdiff_t next_progress = 0; - size_t update = UPDATE_INTERVAL; - size_t total = init_done; + unsigned const max_depth = MIN(tbl->params.depth, STRUCTURED_MAX_LENGTH) & ~1; + size_t bounded_start = max_depth + MAX_READ_BEYOND_DEPTH; + bounded_start = block.end - MIN(block.end, bounded_start); + ptrdiff_t next_progress = (job == 0) ? 0 : RADIX16_TABLE_SIZE; + ptrdiff_t(*getNextList)(FL2_matchTable* const tbl) + = multi_thread ? RMF_getNextList_mt : RMF_getNextList_st; for (;;) { /* Get the next to process */ - ptrdiff_t index = RMF_getNextList(tbl, multi_thread); - RMF_tableHead list_head; + ptrdiff_t index = getNextList(tbl); - if (index < 0) { + if (index < 0) break; - } - if (progress) { - while (next_progress < index) { - total += tbl->list_heads[tbl->stack[next_progress]].count; - ++next_progress; - } - if (total >= update) { - if (progress((size_t)((total * enc_size / block.end * weight) >> 4), opaque)) { - FL2_atomic_add(tbl->st_index, RADIX16_TABLE_SIZE); - return 1; - } - update = total + UPDATE_INTERVAL; - } + + while (next_progress < index) { + /* initial value of next_progress ensures only thread 0 executes this */ + tbl->progress += tbl->list_heads[tbl->stack[next_progress]].count; + ++next_progress; } index = tbl->stack[index]; - list_head = tbl->list_heads[index]; + RMF_tableHead list_head = tbl->list_heads[index]; tbl->list_heads[index].head = RADIX_NULL_LINK; - if (list_head.count < 2 || list_head.head < block.start) { + if (list_head.count < 2 || list_head.head < block.start) continue; - } + #ifdef RMF_REFERENCE if (tbl->params.use_ref_mf) { - RecurseListsReference(tbl->builders[job], block.data, block.end, list_head.head, list_head.count, max_depth); + RMF_recurseListsReference(tbl->builders[job], block.data, block.end, list_head.head, list_head.count, max_depth); continue; } #endif if (list_head.head >= bounded_start) { - RecurseListsBound(tbl->builders[job], block.data, block.end, &list_head, (BYTE)max_depth); - if (list_head.count < 2 || list_head.head < block.start) { + RMF_recurseListsBound(tbl->builders[job], block.data, block.end, &list_head, max_depth); + if (list_head.count < 2 || list_head.head < block.start) continue; - } } if (best && list_head.count > tbl->builders[job]->match_buffer_limit) { /* 
Not worth buffering or too long */ - RecurseLists16(tbl->builders[job], block.data, block.start, list_head.head, list_head.count, max_depth); + RMF_recurseLists16(tbl->builders[job], block.data, block.start, list_head.head, list_head.count, max_depth); } else { - RecurseListsBuffered(tbl->builders[job], block.data, block.start, list_head.head, 2, (BYTE)max_depth, list_head.count, 0); + RMF_recurseListsBuffered(tbl->builders[job], block.data, block.start, list_head.head, 2, (BYTE)max_depth, list_head.count, 0); } } - return 0; } int @@ -984,28 +989,24 @@ RMF_bitpackIntegrityCheck #else RMF_structuredIntegrityCheck #endif -(const FL2_matchTable* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth) +(const FL2_matchTable* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth) { + max_depth &= ~1; int err = 0; for (index += !index; index < end; ++index) { - U32 link; - U32 length; - U32 len_test; - U32 limit; - if (IsNull(index)) continue; - link = GetMatchLink(index); + U32 const link = GetMatchLink(index); if (link >= index) { printf("Forward link at %X to %u\r\n", (U32)index, link); err = 1; continue; } - length = GetMatchLength(index); + U32 const length = GetMatchLength(index); if (index && length < RADIX_MAX_LENGTH && link - 1 == GetMatchLink(index - 1) && length + 1 == GetMatchLength(index - 1)) continue; - len_test = 0; - limit = MIN((U32)(end - index), RADIX_MAX_LENGTH); + U32 len_test = 0; + U32 const limit = MIN((U32)(end - index), RADIX_MAX_LENGTH); for (; len_test < limit && data[link + len_test] == data[index + len_test]; ++len_test) { } if (len_test < length) { @@ -1013,63 +1014,8 @@ RMF_structuredIntegrityCheck err = 1; } if (length < max_depth && len_test > length) + /* These occur occasionally due to splitting of chains in the buffer when long repeats are present */ printf("Shortened match at %X: %u of %u\r\n", (U32)index, length, len_test); } return err; } - - -static size_t ExtendMatch(const FL2_matchTable* const tbl, - const BYTE* const data, - ptrdiff_t const start_index, - ptrdiff_t const limit, - U32 const link, - size_t const length) -{ - ptrdiff_t end_index = start_index + length; - ptrdiff_t const dist = start_index - link; - while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(end_index) == dist) { - end_index += GetMatchLength(end_index); - } - if (end_index >= limit) { - return limit - start_index; - } - while (end_index < limit && data[end_index - dist] == data[end_index]) { - ++end_index; - } - return end_index - start_index; -} - -size_t -#ifdef RMF_BITPACK -RMF_bitpackGetMatch -#else -RMF_structuredGetMatch -#endif -(const FL2_matchTable* const tbl, - const BYTE* const data, - size_t const index, - size_t const limit, - unsigned const max_depth, - size_t* const offset_ptr) -{ - size_t length; - size_t dist; - U32 link; - if (IsNull(index)) - return 0; - link = GetMatchLink(index); - length = GetMatchLength(index); - if (length < 2) - return 0; - dist = index - link; - *offset_ptr = dist; - if (length > limit - index) - return limit - index; - if (length == max_depth - || length == RADIX_MAX_LENGTH /* from HandleRepeat */) - { - length = ExtendMatch(tbl, data, index, limit, link, length); - } - return length; -} diff --git a/C/fast-lzma2/radix_get.h b/C/fast-lzma2/radix_get.h new file mode 100644 index 00000000..8696fdae --- /dev/null +++ b/C/fast-lzma2/radix_get.h @@ -0,0 +1,210 @@ +/* +* Copyright (c) 2018, Conor McCarthy +* All rights reserved. 
+* +* This source code is licensed under both the BSD-style license (found in the +* LICENSE file in the root directory of this source tree) and the GPLv2 (found +* in the COPYING file in the root directory of this source tree). +* You may select, at your option, one of the above-listed licenses. +*/ + +#ifndef FL2_RADIX_GET_H_ +#define FL2_RADIX_GET_H_ + +#if defined (__cplusplus) +extern "C" { +#endif + +typedef struct +{ + U32 length; + U32 dist; +} RMF_match; + +static size_t RMF_bitpackExtendMatch(const BYTE* const data, + const U32* const table, + ptrdiff_t const start_index, + ptrdiff_t limit, + U32 const link, + size_t const length) +{ + ptrdiff_t end_index = start_index + length; + ptrdiff_t const dist = start_index - link; + + if (limit > start_index + (ptrdiff_t)kMatchLenMax) + limit = start_index + kMatchLenMax; + + while (end_index < limit && end_index - (ptrdiff_t)(table[end_index] & RADIX_LINK_MASK) == dist) + end_index += table[end_index] >> RADIX_LINK_BITS; + + if (end_index >= limit) { + DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); + return limit - start_index; + } + + while (end_index < limit && data[end_index - dist] == data[end_index]) + ++end_index; + + DEBUGLOG(7, "RMF_bitpackExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); + return end_index - start_index; +} + +#define GetMatchLink(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] + +#define GetMatchLength(table, index) ((const RMF_unit*)(table))[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] + +static size_t RMF_structuredExtendMatch(const BYTE* const data, + const U32* const table, + ptrdiff_t const start_index, + ptrdiff_t limit, + U32 const link, + size_t const length) +{ + ptrdiff_t end_index = start_index + length; + ptrdiff_t const dist = start_index - link; + + if (limit > start_index + (ptrdiff_t)kMatchLenMax) + limit = start_index + kMatchLenMax; + + while (end_index < limit && end_index - (ptrdiff_t)GetMatchLink(table, end_index) == dist) + end_index += GetMatchLength(table, end_index); + + if (end_index >= limit) { + DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(limit - start_index)); + return limit - start_index; + } + + while (end_index < limit && data[end_index - dist] == data[end_index]) + ++end_index; + + DEBUGLOG(7, "RMF_structuredExtendMatch : pos %u, link %u, init length %u, full length %u", (U32)start_index, link, (U32)length, (U32)(end_index - start_index)); + return end_index - start_index; +} + +FORCE_INLINE_TEMPLATE +RMF_match RMF_getMatch(FL2_dataBlock block, + FL2_matchTable* tbl, + unsigned max_depth, + int structTbl, + size_t index) +{ + if (structTbl) + { + U32 const link = GetMatchLink(tbl->table, index); + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - link - 1; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } + else { + U32 
link = tbl->table[index]; + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = link >> RADIX_LINK_BITS; + link &= RADIX_LINK_MASK; + size_t const dist = index - link - 1; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } +} + +FORCE_INLINE_TEMPLATE +RMF_match RMF_getNextMatch(FL2_dataBlock block, + FL2_matchTable* tbl, + unsigned max_depth, + int structTbl, + size_t index) +{ + if (structTbl) + { + U32 const link = GetMatchLink(tbl->table, index); + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = GetMatchLength(tbl->table, index); + size_t const dist = index - link - 1; + + /* same distance, one byte shorter */ + if (link - 1 == GetMatchLink(tbl->table, index - 1)) + return match; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == STRUCTURED_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_structuredExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } + else { + U32 link = tbl->table[index]; + + RMF_match match; + match.length = 0; + + if (link == RADIX_NULL_LINK) + return match; + + size_t const length = link >> RADIX_LINK_BITS; + link &= RADIX_LINK_MASK; + size_t const dist = index - link - 1; + + /* same distance, one byte shorter */ + if (link - 1 == (tbl->table[index - 1] & RADIX_LINK_MASK)) + return match; + + if (length > block.end - index) + match.length = (U32)(block.end - index); + else if (length == max_depth || length == BITPACK_MAX_LENGTH /* from HandleRepeat */) + match.length = (U32)RMF_bitpackExtendMatch(block.data, tbl->table, index, block.end, link, length); + else + match.length = (U32)length; + + match.dist = (U32)dist; + + return match; + } +} + +#if defined (__cplusplus) +} +#endif + +#endif /* FL2_RADIX_GET_H_ */ \ No newline at end of file diff --git a/C/fast-lzma2/radix_internal.h b/C/fast-lzma2/radix_internal.h index 4a9ba359..36431939 100644 --- a/C/fast-lzma2/radix_internal.h +++ b/C/fast-lzma2/radix_internal.h @@ -14,6 +14,10 @@ #include "atomic.h" #include "radix_mf.h" +#if defined(FL2_XZ_BUILD) && defined(TUKLIB_FAST_UNALIGNED_ACCESS) +# define MEM_read32(a) (*(const U32*)(a)) +#endif + #if defined (__cplusplus) extern "C" { #endif @@ -21,26 +25,27 @@ extern "C" { #define DICTIONARY_LOG_MIN 12U #define DICTIONARY_LOG_MAX_64 30U #define DICTIONARY_LOG_MAX_32 27U -#define DEFAULT_BUFFER_LOG 8U -#define DEFAULT_BLOCK_OVERLAP 2U -#define DEFAULT_SEARCH_DEPTH 32U -#define DEFAULT_DIVIDEANDCONQUER 1 -#define MAX_REPEAT 32 -#define RADIX16_TABLE_SIZE (1UL << 16) -#define RADIX8_TABLE_SIZE (1UL << 8) +#define DICTIONARY_SIZE_MIN ((size_t)1 << DICTIONARY_LOG_MIN) +#define DICTIONARY_SIZE_MAX_64 ((size_t)1 << DICTIONARY_LOG_MAX_64) +#define DICTIONARY_SIZE_MAX_32 ((size_t)1 << DICTIONARY_LOG_MAX_32) +#define MAX_REPEAT 24 +#define RADIX16_TABLE_SIZE ((size_t)1 << 16) +#define RADIX8_TABLE_SIZE ((size_t)1 << 8) #define STACK_SIZE (RADIX16_TABLE_SIZE * 3) #define MAX_BRUTE_FORCE_LIST_SIZE 5 #define BUFFER_LINK_MASK 0xFFFFFFU #define MATCH_BUFFER_OVERLAP 6 
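/* For orientation, a minimal sketch of how one bitpacked match-table entry decodes,
 * drawn from the RADIX_LINK_* constants defined just below and the accessors in
 * radix_get.h; it is an editor's illustration, not part of the upstream change, and
 * the local names entry/length/link are illustrative only. The match length occupies
 * the top 6 bits of the U32 and the link to the previous occurrence the low 26 bits,
 * which is why BITPACK_MAX_LENGTH is 63 and why the bitpacked layout is only used
 * when the dictionary fits in (1 << RADIX_LINK_BITS) bytes (see RMF_isStruct() in
 * radix_mf.c).
 *
 *     U32 const entry = tbl->table[index];              // one bitpacked entry
 *     if (entry != RADIX_NULL_LINK) {                   // 0xFFFFFFFF means no match here
 *         U32 const length = entry >> RADIX_LINK_BITS;  // 0..BITPACK_MAX_LENGTH
 *         U32 const link   = entry & RADIX_LINK_MASK;   // previous occurrence of this string
 *     }
 */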
-#define BITPACK_MAX_LENGTH 63UL -#define STRUCTURED_MAX_LENGTH 255UL +#define BITPACK_MAX_LENGTH 63U +#define STRUCTURED_MAX_LENGTH 255U #define RADIX_LINK_BITS 26 -#define RADIX_LINK_MASK ((1UL << RADIX_LINK_BITS) - 1) -#define RADIX_NULL_LINK 0xFFFFFFFFUL +#define RADIX_LINK_MASK ((1U << RADIX_LINK_BITS) - 1) +#define RADIX_NULL_LINK 0xFFFFFFFFU #define UNIT_BITS 2 -#define UNIT_MASK ((1UL << UNIT_BITS) - 1) +#define UNIT_MASK ((1U << UNIT_BITS) - 1) + +#define RADIX_CANCEL_INDEX (long)(RADIX16_TABLE_SIZE + FL2_MAXTHREADS + 2) typedef struct { @@ -88,9 +93,10 @@ struct FL2_matchTable_s { FL2_atomic st_index; long end_index; - int isStruct; - int allocStruct; + int is_struct; + int alloc_struct; unsigned thread_count; + size_t progress; RMF_parameters params; RMF_builder** builders; U32 stack[RADIX16_TABLE_SIZE]; @@ -98,27 +104,25 @@ struct FL2_matchTable_s U32 table[1]; }; -size_t RMF_bitpackInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const start, size_t const end); -size_t RMF_structuredInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const start, size_t const end); -int RMF_bitpackBuildTable(struct FL2_matchTable_s* const tbl, +size_t RMF_bitpackInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const end); +size_t RMF_structuredInit(struct FL2_matchTable_s* const tbl, const void* data, size_t const end); +void RMF_bitpackBuildTable(struct FL2_matchTable_s* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); -int RMF_structuredBuildTable(struct FL2_matchTable_s* const tbl, + FL2_dataBlock const block); +void RMF_structuredBuildTable(struct FL2_matchTable_s* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); + FL2_dataBlock const block); void RMF_recurseListChunk(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE const depth, - BYTE const max_depth, + U32 const depth, + U32 const max_depth, U32 const list_count, size_t const stack_base); -int RMF_bitpackIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth); -int RMF_structuredIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned const max_depth); +int RMF_bitpackIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth); +int RMF_structuredIntegrityCheck(const struct FL2_matchTable_s* const tbl, const BYTE* const data, size_t index, size_t const end, unsigned max_depth); void RMF_bitpackLimitLengths(struct FL2_matchTable_s* const tbl, size_t const index); void RMF_structuredLimitLengths(struct FL2_matchTable_s* const tbl, size_t const index); BYTE* RMF_bitpackAsOutputBuffer(struct FL2_matchTable_s* const tbl, size_t const index); diff --git a/C/fast-lzma2/radix_mf.c b/C/fast-lzma2/radix_mf.c index 55187638..ebfa4d33 100644 --- a/C/fast-lzma2/radix_mf.c +++ b/C/fast-lzma2/radix_mf.c @@ -11,21 +11,20 @@ #include /* size_t, ptrdiff_t */ #include /* malloc, free */ #include "fast-lzma2.h" +#include "fl2_errors.h" #include "mem.h" /* U32, U64, MEM_64bits */ #include "fl2_internal.h" #include "radix_internal.h" #ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* warning: 'rpt_head_next' may be 
used uninitialized in this function */ +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* warning: 'rpt_head_next' may be used uninitialized in this function */ #elif defined(_MSC_VER) -# pragma warning(disable : 4701) /* disable: C4701: potentially uninitialized local variable */ +# pragma warning(disable : 4701) /* warning: 'rpt_head_next' may be used uninitialized in this function */ #endif #define MIN_MATCH_BUFFER_SIZE 256U /* min buffer size at least FL2_SEARCH_DEPTH_MAX + 2 for bounded build */ #define MAX_MATCH_BUFFER_SIZE (1UL << 24) /* max buffer size constrained by 24-bit link values */ -#define REPEAT_CHECK_TABLE ((1 << 1) | (1 << 2) | (1 << 4) | (1 << 8) | (1 << 16) | (1ULL << 32)) - static void RMF_initTailTable(RMF_builder* const tbl) { for (size_t i = 0; i < RADIX8_TABLE_SIZE; i += 2) { @@ -43,146 +42,175 @@ static RMF_builder* RMF_createBuilder(size_t match_buffer_size) match_buffer_size = MIN(match_buffer_size, MAX_MATCH_BUFFER_SIZE); match_buffer_size = MAX(match_buffer_size, MIN_MATCH_BUFFER_SIZE); - { RMF_builder* const builder = (RMF_builder*)malloc( - sizeof(RMF_builder) + (match_buffer_size - 1) * sizeof(RMF_buildMatch)); - builder->match_buffer_size = match_buffer_size; - builder->match_buffer_limit = match_buffer_size; - RMF_initTailTable(builder); - return builder; - } + RMF_builder* const builder = malloc( + sizeof(RMF_builder) + (match_buffer_size - 1) * sizeof(RMF_buildMatch)); + + if (builder == NULL) + return NULL; + + builder->match_buffer_size = match_buffer_size; + builder->match_buffer_limit = match_buffer_size; + + RMF_initTailTable(builder); + + return builder; } static void RMF_freeBuilderTable(RMF_builder** const builders, unsigned const size) { if (builders == NULL) return; - for (unsigned i = 0; i < size; ++i) { + + for (unsigned i = 0; i < size; ++i) free(builders[i]); - } + free(builders); } -static RMF_builder** RMF_createBuilderTable(U32* const matchTable, size_t const match_buffer_size, unsigned const max_len, unsigned const size) +/* RMF_createBuilderTable() : + * Create one match table builder object per thread. + * max_len : maximum match length supported by the table structure + * size : number of threads + */ +static RMF_builder** RMF_createBuilderTable(U32* const match_table, size_t const match_buffer_size, unsigned const max_len, unsigned const size) { - RMF_builder** builders = (RMF_builder**)malloc(size * sizeof(RMF_builder*)); DEBUGLOG(3, "RMF_createBuilderTable : match_buffer_size %u, builders %u", (U32)match_buffer_size, size); + + RMF_builder** const builders = malloc(size * sizeof(RMF_builder*)); + if (builders == NULL) return NULL; + for (unsigned i = 0; i < size; ++i) builders[i] = NULL; + for (unsigned i = 0; i < size; ++i) { builders[i] = RMF_createBuilder(match_buffer_size); if (builders[i] == NULL) { RMF_freeBuilderTable(builders, i); return NULL; } - builders[i]->table = matchTable; + builders[i]->table = match_table; builders[i]->max_len = max_len; } return builders; } -static int RMF_isStruct(unsigned dictionary_log, unsigned depth) +static int RMF_isStruct(size_t const dictionary_size) { - return dictionary_log > RADIX_LINK_BITS || depth > BITPACK_MAX_LENGTH; + return dictionary_size > ((size_t)1 << RADIX_LINK_BITS); } -static int RMF_isStructParam(const RMF_parameters* const params) -{ - return RMF_isStruct(params->dictionary_log, params->depth); -} - -/** RMF_clampCParams() : -* make CParam values within valid range. -* @return : valid CParams */ +/* RMF_clampParams() : +* Make param values within valid range. 
+* Return : valid RMF_parameters */ static RMF_parameters RMF_clampParams(RMF_parameters params) { # define CLAMP(val,min,max) { \ if (val<(min)) val=(min); \ else if (val>(max)) val=(max); \ } - CLAMP(params.dictionary_log, DICTIONARY_LOG_MIN, MEM_64bits() ? DICTIONARY_LOG_MAX_64 : DICTIONARY_LOG_MAX_32); - CLAMP(params.match_buffer_log, FL2_BUFFER_SIZE_LOG_MIN, FL2_BUFFER_SIZE_LOG_MAX); - CLAMP(params.overlap_fraction, FL2_BLOCK_OVERLAP_MIN, FL2_BLOCK_OVERLAP_MAX); + CLAMP(params.dictionary_size, DICTIONARY_SIZE_MIN, MEM_64bits() ? DICTIONARY_SIZE_MAX_64 : DICTIONARY_SIZE_MAX_32); + CLAMP(params.match_buffer_log, RMF_BUFFER_LOG_MIN, RMF_BUFFER_LOG_MAX); + if (params.overlap_fraction > FL2_BLOCK_OVERLAP_MAX) + params.overlap_fraction = FL2_BLOCK_OVERLAP_MAX; CLAMP(params.depth, FL2_SEARCH_DEPTH_MIN, FL2_SEARCH_DEPTH_MAX); return params; +# undef CLAMP } +/* RMF_applyParameters_internal() : + * Set parameters to those specified. + * Create a builder table if none exists. Free an existing one if incompatible. + * Set match_buffer_limit and max supported match length. + * Returns an error if dictionary won't fit. + */ static size_t RMF_applyParameters_internal(FL2_matchTable* const tbl, const RMF_parameters* const params) { - int const isStruct = RMF_isStructParam(params); - unsigned const dictionary_log = tbl->params.dictionary_log; + int const is_struct = RMF_isStruct(params->dictionary_size); + size_t const dictionary_size = tbl->params.dictionary_size; /* dictionary is allocated with the struct and is immutable */ - if (params->dictionary_log > tbl->params.dictionary_log - || (params->dictionary_log == tbl->params.dictionary_log && isStruct > tbl->allocStruct)) + if (params->dictionary_size > tbl->params.dictionary_size + || (params->dictionary_size == tbl->params.dictionary_size && is_struct > tbl->alloc_struct)) return FL2_ERROR(parameter_unsupported); - { size_t const match_buffer_size = (size_t)1 << (params->dictionary_log - params->match_buffer_log); - tbl->params = *params; - tbl->params.dictionary_log = dictionary_log; - tbl->isStruct = isStruct; - if (tbl->builders == NULL - || match_buffer_size > tbl->builders[0]->match_buffer_size) - { - RMF_freeBuilderTable(tbl->builders, tbl->thread_count); - tbl->builders = RMF_createBuilderTable(tbl->table, match_buffer_size, tbl->isStruct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH, tbl->thread_count); - if (tbl->builders == NULL) { - return FL2_ERROR(memory_allocation); - } + size_t const match_buffer_size = params->dictionary_size >> params->match_buffer_log; + tbl->params = *params; + tbl->params.dictionary_size = dictionary_size; + tbl->is_struct = is_struct; + if (tbl->builders == NULL + || match_buffer_size > tbl->builders[0]->match_buffer_size) + { + RMF_freeBuilderTable(tbl->builders, tbl->thread_count); + tbl->builders = RMF_createBuilderTable(tbl->table, match_buffer_size, tbl->is_struct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH, tbl->thread_count); + if (tbl->builders == NULL) { + return FL2_ERROR(memory_allocation); } - else { - for (unsigned i = 0; i < tbl->thread_count; ++i) { - tbl->builders[i]->match_buffer_limit = match_buffer_size; - tbl->builders[i]->max_len = tbl->isStruct ? STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH; - } + } + else { + for (unsigned i = 0; i < tbl->thread_count; ++i) { + tbl->builders[i]->match_buffer_limit = match_buffer_size; + tbl->builders[i]->max_len = tbl->is_struct ? 
STRUCTURED_MAX_LENGTH : BITPACK_MAX_LENGTH; } } return 0; } +/* RMF_reduceDict() : + * Reduce dictionary and match buffer size if the total input size is known and < dictionary_size. + */ static void RMF_reduceDict(RMF_parameters* const params, size_t const dict_reduce) { - if (dict_reduce) - while (params->dictionary_log > DICTIONARY_LOG_MIN && (size_t)1 << (params->dictionary_log - 1) >= dict_reduce) { - --params->dictionary_log; - params->match_buffer_log = MAX(params->match_buffer_log - 1, FL2_BUFFER_SIZE_LOG_MIN); + if (dict_reduce) { + for (size_t dict_size = params->dictionary_size; dict_size > DICTIONARY_SIZE_MIN && (dict_size >> 1) >= dict_reduce; dict_size >>= 1) { + /* Use unchanged match buffer size for reduced dict */ + params->match_buffer_log = MAX(params->match_buffer_log - 1, RMF_BUFFER_LOG_MIN); } + params->dictionary_size = MIN(params->dictionary_size, MAX(dict_reduce, DICTIONARY_SIZE_MIN)); + } } -FL2_matchTable* RMF_createMatchTable(const RMF_parameters* const p, size_t const dict_reduce, unsigned const thread_count) +static void RMF_initListHeads(FL2_matchTable* const tbl) { - int isStruct; - size_t dictionary_size; - size_t table_bytes; - FL2_matchTable* tbl; - RMF_parameters params = RMF_clampParams(*p); - - RMF_reduceDict(¶ms, dict_reduce); - isStruct = RMF_isStructParam(¶ms); - dictionary_size = (size_t)1 << params.dictionary_log; - - DEBUGLOG(3, "RMF_createMatchTable : isStruct %d, dict %u", isStruct, (U32)dictionary_size); - - table_bytes = isStruct ? ((dictionary_size + 3U) / 4U) * sizeof(RMF_unit) - : dictionary_size * sizeof(U32); - tbl = (FL2_matchTable*)malloc( - sizeof(FL2_matchTable) + table_bytes - sizeof(U32)); - if (!tbl) return NULL; - - tbl->isStruct = isStruct; - tbl->allocStruct = isStruct; - tbl->thread_count = thread_count + !thread_count; - tbl->params = params; - tbl->builders = NULL; - - RMF_applyParameters_internal(tbl, ¶ms); - for (size_t i = 0; i < RADIX16_TABLE_SIZE; i += 2) { tbl->list_heads[i].head = RADIX_NULL_LINK; tbl->list_heads[i].count = 0; tbl->list_heads[i + 1].head = RADIX_NULL_LINK; tbl->list_heads[i + 1].count = 0; } +} + +/* RMF_createMatchTable() : + * Create a match table. Reduce the dict size to input size if possible. + * A thread_count of 0 will be raised to 1. + */ +FL2_matchTable* RMF_createMatchTable(const RMF_parameters* const p, size_t const dict_reduce, unsigned const thread_count) +{ + RMF_parameters params = RMF_clampParams(*p); + RMF_reduceDict(¶ms, dict_reduce); + + int const is_struct = RMF_isStruct(params.dictionary_size); + size_t dictionary_size = params.dictionary_size; + + DEBUGLOG(3, "RMF_createMatchTable : is_struct %d, dict %u", is_struct, (U32)dictionary_size); + + size_t const table_bytes = is_struct ? 
((dictionary_size + 3U) / 4U) * sizeof(RMF_unit) + : dictionary_size * sizeof(U32); + FL2_matchTable* const tbl = malloc(sizeof(FL2_matchTable) + table_bytes - sizeof(U32)); + if (!tbl) return NULL; + + tbl->is_struct = is_struct; + tbl->alloc_struct = is_struct; + tbl->thread_count = thread_count + !thread_count; + tbl->params = params; + tbl->builders = NULL; + + RMF_applyParameters_internal(tbl, ¶ms); + + RMF_initListHeads(tbl); + + RMF_initProgress(tbl); + return tbl; } @@ -190,7 +218,9 @@ void RMF_freeMatchTable(FL2_matchTable* const tbl) { if (tbl == NULL) return; + DEBUGLOG(3, "RMF_freeMatchTable"); + RMF_freeBuilderTable(tbl->builders, tbl->thread_count); free(tbl); } @@ -199,8 +229,8 @@ BYTE RMF_compatibleParameters(const FL2_matchTable* const tbl, const RMF_paramet { RMF_parameters params = RMF_clampParams(*p); RMF_reduceDict(¶ms, dict_reduce); - return tbl->params.dictionary_log > params.dictionary_log - || (tbl->params.dictionary_log == params.dictionary_log && tbl->allocStruct >= RMF_isStructParam(¶ms)); + return tbl->params.dictionary_size > params.dictionary_size + || (tbl->params.dictionary_size == params.dictionary_size && tbl->alloc_struct >= RMF_isStruct(params.dictionary_size)); } size_t RMF_applyParameters(FL2_matchTable* const tbl, const RMF_parameters* const p, size_t const dict_reduce) @@ -215,18 +245,25 @@ size_t RMF_threadCount(const FL2_matchTable* const tbl) return tbl->thread_count; } -size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end) +void RMF_initProgress(FL2_matchTable * const tbl) { - DEBUGLOG(5, "RMF_initTable : start %u, size %u", (U32)start, (U32)end); - if (tbl->isStruct) { - return RMF_structuredInit(tbl, data, start, end); - } - else { - return RMF_bitpackInit(tbl, data, start, end); - } + if (tbl != NULL) + tbl->progress = 0; } -static void HandleRepeat(RMF_buildMatch* const match_buffer, +size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const end) +{ + DEBUGLOG(5, "RMF_initTable : size %u", (U32)end); + + tbl->st_index = ATOMIC_INITIAL_VALUE; + + if (tbl->is_struct) + return RMF_structuredInit(tbl, data, end); + else + return RMF_bitpackInit(tbl, data, end); +} + +static void RMF_handleRepeat(RMF_buildMatch* const match_buffer, const BYTE* const data_block, size_t const next, U32 count, @@ -235,20 +272,22 @@ static void HandleRepeat(RMF_buildMatch* const match_buffer, U32 const max_len) { size_t index = next; - size_t next_i; U32 length = depth + rpt_len; + const BYTE* const data = data_block + match_buffer[index].from; const BYTE* const data_2 = data - rpt_len; + while (data[length] == data_2[length] && length < max_len) ++length; + for (; length <= max_len && count; --count) { - next_i = match_buffer[index].next & 0xFFFFFF; + size_t next_i = match_buffer[index].next & 0xFFFFFF; match_buffer[index].next = (U32)next_i | (length << 24); length += rpt_len; index = next_i; } for (; count; --count) { - next_i = match_buffer[index].next & 0xFFFFFF; + size_t next_i = match_buffer[index].next & 0xFFFFFF; match_buffer[index].next = (U32)next_i | (max_len << 24); index = next_i; } @@ -261,27 +300,29 @@ typedef struct union src_data_u src; } BruteForceMatch; -static void BruteForceBuffered(RMF_builder* const tbl, +static void RMF_bruteForceBuffered(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, size_t index, - size_t list_count, + size_t const list_count, size_t const slot, size_t const depth, size_t const max_depth) { BruteForceMatch 
buffer[MAX_BRUTE_FORCE_LIST_SIZE + 1]; - const BYTE* data_src = data_block + depth; - size_t limit = max_depth - depth; - const BYTE* start = data_src + block_start; + const BYTE* const data_src = data_block + depth; + size_t const limit = max_depth - depth; + const BYTE* const start = data_src + block_start; size_t i = 0; for (;;) { + /* Load all locations from the match buffer */ buffer[i].index = index; buffer[i].data_src = data_src + tbl->match_buffer[index].from; buffer[i].src.u32 = tbl->match_buffer[index].src.u32; - if (++i >= list_count) { + + if (++i >= list_count) break; - } + index = tbl->match_buffer[index].next & 0xFFFFFF; } i = 0; @@ -289,28 +330,29 @@ static void BruteForceBuffered(RMF_builder* const tbl, size_t longest = 0; size_t j = i + 1; size_t longest_index = j; - const BYTE* data = buffer[i].data_src; + const BYTE* const data = buffer[i].data_src; do { + /* Begin with the remaining chars pulled from the match buffer */ size_t len_test = slot; - while (len_test < 4 && buffer[i].src.chars[len_test] == buffer[j].src.chars[len_test] && len_test - slot < limit) { + while (len_test < 4 && buffer[i].src.chars[len_test] == buffer[j].src.chars[len_test] && len_test - slot < limit) ++len_test; - } + len_test -= slot; if (len_test) { + /* Complete the match length count in the raw input buffer */ const BYTE* data_2 = buffer[j].data_src; - while (data[len_test] == data_2[len_test] && len_test < limit) { + while (data[len_test] == data_2[len_test] && len_test < limit) ++len_test; - } } if (len_test > longest) { longest_index = j; longest = len_test; - if (len_test >= limit) { + if (len_test >= limit) break; - } } } while (++j < list_count); if (longest > 0) { + /* If the existing match was extended, store the new link and length info in the match buffer */ index = buffer[i].index; tbl->match_buffer[index].next = (U32)(buffer[longest_index].index | ((depth + longest) << 24)); } @@ -318,17 +360,19 @@ static void BruteForceBuffered(RMF_builder* const tbl, } while (i < list_count - 1 && buffer[i].data_src >= start); } +/* Lengthen and divide buffered chains into smaller chains, save them on a stack and process in turn. + * The match finder spends most of its time here. + */ FORCE_INLINE_TEMPLATE void RMF_recurseListChunk_generic(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE depth, - BYTE const max_depth, + U32 depth, + U32 const max_depth, U32 list_count, size_t const stack_base) { - /* Create an offset data buffer pointer for reading the next bytes */ - const BYTE base_depth = depth; + U32 const base_depth = depth; size_t st_index = stack_base; size_t index = 0; ++depth; @@ -338,10 +382,11 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, size_t const radix_8 = tbl->match_buffer[index].src.chars[0]; /* Seen this char before? 
*/ U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; /* Link the previous occurrence to this one and record the new length */ - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -351,7 +396,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; ++index; } while (index < list_count); @@ -361,7 +405,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, U32 const prev = tbl->tails_8[radix_8].prev_index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } } /* Convert radix values on the stack to counts and reset any used tail slots */ @@ -370,11 +414,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[j].count = (U32)tbl->tails_8[tbl->stack[j].count].list_count; } while (st_index > stack_base) { - const BYTE* data_src; - size_t link; - size_t slot; - U32 test; - /* Pop an item off the stack */ --st_index; list_count = tbl->stack[st_index].count; @@ -383,7 +422,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, continue; } index = tbl->stack[st_index].head; - link = tbl->match_buffer[index].from; + size_t link = tbl->match_buffer[index].from; if (link < block_start) { /* Chain starts in the overlap region which is already encoded */ continue; @@ -396,10 +435,11 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, continue; } depth = tbl->match_buffer[index].next >> 24; - slot = (depth - base_depth) & 3; + /* Index into the 4-byte pre-loaded input char cache */ + size_t slot = (depth - base_depth) & 3; if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) { /* Quicker to use brute force, each string compared with all previous strings */ - BruteForceBuffered(tbl, + RMF_bruteForceBuffered(tbl, data_block, block_start, index, @@ -409,35 +449,41 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, max_depth); continue; } - /* check for repeats at depth 4,8,16,32 etc */ - test = max_depth != 6 && ((depth & 3) == 0) && ((REPEAT_CHECK_TABLE >> ((depth >> 2) & 31)) & 1) && (max_depth >= depth + (depth >> 1)); + /* check for repeats at depth 4,8,16,32 etc unless depth is near max_depth */ + U32 const test = max_depth != 6 && ((depth & 3) == 0) + && (depth & (depth - 1)) == 0 + && (max_depth >= depth + (depth >> 1)); ++depth; - /* Update the offset data buffer pointer */ - data_src = data_block + depth; + /* Create an offset data buffer pointer for reading the next bytes */ + const BYTE* const data_src = data_block + depth; /* Last pass is done separately */ if (!test && depth < max_depth) { size_t const prev_st_index = st_index; /* Last element done separately */ --list_count; - /* slot is the char cache index. If 3 then chars need to be loaded. */ + /* If slot is 3 then chars need to be loaded. */ if (slot == 3 && max_depth != 6) do { size_t const radix_8 = tbl->match_buffer[index].src.chars[3]; size_t const next_index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; - /* Pre-load the next link and data bytes to avoid waiting for RAM access */ + /* Pre-load the next link and data bytes. 
On some hardware execution can continue + * ahead while the data is retrieved if no operations except move are done on the data. */ tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev!=RADIX_NULL_LINK) { + tbl->tails_8[radix_8].prev_index = (U32)index; + if (prev != RADIX_NULL_LINK) { + /* This char has occurred before in the chain. Link the previous (> index) occurance with this */ ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { + /* First occurrence in the chain */ tbl->tails_8[radix_8].list_count = 1; tbl->stack[st_index].head = (U32)index; + /* Save the char as a reference to load the count at the end */ tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); @@ -447,9 +493,10 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, /* Pre-load the next link to avoid waiting for RAM access */ size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -457,20 +504,18 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); - { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; - U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { - if (slot == 3) { - tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); - } - ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); - } + size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; + U32 const prev = tbl->tails_8[radix_8].prev_index; + if (prev != RADIX_NULL_LINK) { + if (slot == 3) + tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); + + ++tbl->tails_8[radix_8].list_count; + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } for (size_t j = prev_st_index; j < st_index; ++j) { tbl->tails_8[tbl->stack[j].count].prev_index = RADIX_NULL_LINK; @@ -490,14 +535,15 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, size_t const next_index = tbl->match_buffer[index].next & BUFFER_LINK_MASK; size_t const next_link = tbl->match_buffer[next_index].from; if ((link - next_link) > rpt_depth) { - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + rpt = -1; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -505,23 +551,23 @@ void 
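
/* Editor's note: illustrative sketch, not part of the patch. The repeat scan
 * above is only worth doing at depths 4, 8, 16, 32, ... and only when enough
 * depth budget remains to amortize it; the power-of-two test below expresses
 * the same condition the new code uses in place of the old lookup table. */
static int should_check_repeats(unsigned depth, unsigned max_depth)
{
    return (depth & 3) == 0                   /* multiple of 4 */
        && (depth & (depth - 1)) == 0         /* exactly one bit set: 4, 8, 16, ... */
        && max_depth >= depth + (depth >> 1); /* at least 1.5x depth still to go */
}
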
RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } else { U32 const dist = (U32)(link - next_link); if (rpt < 0 || dist != rpt_dist) { - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + rpt = 0; rpt_head_next = next_index; rpt_dist = dist; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->tails_8[radix_8].list_count = 1; @@ -529,7 +575,6 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; } else { ++rpt; @@ -538,19 +583,18 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, link = next_link; } } while (--list_count != 0); - if (rpt > 0) { - HandleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); - } - { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; - U32 const prev = tbl->tails_8[radix_8].prev_index; - if (prev != RADIX_NULL_LINK) { - if (slot == 3) { - tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); - } - ++tbl->tails_8[radix_8].list_count; - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + if (rpt > 0) + RMF_handleRepeat(tbl->match_buffer, data_block, rpt_head_next, rpt, rpt_dist, rpt_depth, tbl->max_len); + + size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; + U32 const prev = tbl->tails_8[radix_8].prev_index; + if (prev != RADIX_NULL_LINK) { + if (slot == 3) { + tbl->match_buffer[index].src.u32 = MEM_read32(data_src + link); } + ++tbl->tails_8[radix_8].list_count; + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } for (size_t j = prev_st_index; j < st_index; ++j) { tbl->tails_8[tbl->stack[j].count].prev_index = RADIX_NULL_LINK; @@ -558,7 +602,7 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, } } else { - size_t prev_st_index = st_index; + size_t const prev_st_index = st_index; /* The last pass at max_depth */ do { size_t const radix_8 = tbl->match_buffer[index].src.chars[slot]; @@ -567,14 +611,14 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, /* The last element in tbl->match_buffer is circular so this is never an access violation. 
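
/* Editor's note: illustrative sketch, not part of the patch. It shows the run
 * detection feeding RMF_handleRepeat above: while walking a chain whose links
 * decrease, consecutive links separated by the same small distance are counted
 * as one repeat run and flushed in a single call. flush_run() is a hypothetical
 * stand-in for RMF_handleRepeat. */
#include <stddef.h>

static void scan_for_runs(const size_t* links, size_t count, size_t rpt_depth,
                          void (*flush_run)(size_t head, int len, size_t dist))
{
    int rpt = -1;                          /* -1: no run in progress */
    size_t rpt_dist = 0, rpt_head = 0;
    for (size_t i = 0; i + 1 < count; ++i) {
        size_t const dist = links[i] - links[i + 1];
        if (dist > rpt_depth) {            /* gap too large: close any open run */
            if (rpt > 0)
                flush_run(rpt_head, rpt, rpt_dist);
            rpt = -1;
        } else if (rpt < 0 || dist != rpt_dist) {
            if (rpt > 0)                   /* distance changed: close the old run */
                flush_run(rpt_head, rpt, rpt_dist);
            rpt = 0;                       /* start a new candidate run */
            rpt_head = i + 1;
            rpt_dist = dist;
        } else {
            ++rpt;                         /* same distance again: extend the run */
        }
    }
    if (rpt > 0)
        flush_run(rpt_head, rpt, rpt_dist);
}
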
*/ size_t const next_link = tbl->match_buffer[next_index].from; U32 const prev = tbl->tails_8[radix_8].prev_index; + tbl->tails_8[radix_8].prev_index = (U32)index; if (prev != RADIX_NULL_LINK) { - tbl->match_buffer[prev].next = (U32)index | ((U32)depth << 24); + tbl->match_buffer[prev].next = (U32)index | (depth << 24); } else { tbl->stack[st_index].count = (U32)radix_8; ++st_index; } - tbl->tails_8[radix_8].prev_index = (U32)index; index = next_index; link = next_link; } while (--list_count != 0); @@ -589,84 +633,81 @@ void RMF_recurseListChunk_generic(RMF_builder* const tbl, void RMF_recurseListChunk(RMF_builder* const tbl, const BYTE* const data_block, size_t const block_start, - BYTE const depth, - BYTE const max_depth, + U32 const depth, + U32 const max_depth, U32 const list_count, size_t const stack_base) { - if (max_depth > 6) { + if (list_count < 2) + return; + /* Template-like inline functions */ + if (list_count <= MAX_BRUTE_FORCE_LIST_SIZE) + RMF_bruteForceBuffered(tbl, data_block, block_start, 0, list_count, 0, depth, max_depth); + else if (max_depth > 6) RMF_recurseListChunk_generic(tbl, data_block, block_start, depth, max_depth, list_count, stack_base); - } - else { + else RMF_recurseListChunk_generic(tbl, data_block, block_start, depth, 6, list_count, stack_base); - } } /* Iterate the head table concurrently with other threads, and recurse each list until max_depth is reached */ int RMF_buildTable(FL2_matchTable* const tbl, - size_t const job, + size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done) + FL2_dataBlock const block) { DEBUGLOG(5, "RMF_buildTable : thread %u", (U32)job); - if (tbl->isStruct) { - return RMF_structuredBuildTable(tbl, job, multi_thread, block, progress, opaque, weight, init_done); - } - else { - return RMF_bitpackBuildTable(tbl, job, multi_thread, block, progress, opaque, weight, init_done); + + if (tbl->is_struct) + RMF_structuredBuildTable(tbl, job, multi_thread, block); + else + RMF_bitpackBuildTable(tbl, job, multi_thread, block); + + if (job == 0 && tbl->st_index >= RADIX_CANCEL_INDEX) { + RMF_initListHeads(tbl); + return 1; } + return 0; +} + +void RMF_cancelBuild(FL2_matchTable * const tbl) +{ + if(tbl != NULL) + FL2_atomic_add(tbl->st_index, RADIX_CANCEL_INDEX - ATOMIC_INITIAL_VALUE); +} + +void RMF_resetIncompleteBuild(FL2_matchTable * const tbl) +{ + RMF_initListHeads(tbl); } int RMF_integrityCheck(const FL2_matchTable* const tbl, const BYTE* const data, size_t const index, size_t const end, unsigned const max_depth) { - if (tbl->isStruct) { + if (tbl->is_struct) return RMF_structuredIntegrityCheck(tbl, data, index, end, max_depth); - } - else { + else return RMF_bitpackIntegrityCheck(tbl, data, index, end, max_depth); - } -} - -size_t RMF_getMatch(FL2_matchTable* const tbl, - const BYTE* const data, - size_t const index, - size_t const limit, - unsigned max_depth, - size_t* const offset_ptr) -{ - if (tbl->isStruct) { - return RMF_structuredGetMatch(tbl, data, index, limit, max_depth, offset_ptr); - } - else { - return RMF_bitpackGetMatch(tbl, data, index, limit, max_depth, offset_ptr); - } } void RMF_limitLengths(FL2_matchTable* const tbl, size_t const index) { - if (tbl->isStruct) { + if (tbl->is_struct) RMF_structuredLimitLengths(tbl, index); - } - else { + else RMF_bitpackLimitLengths(tbl, index); - } } BYTE* RMF_getTableAsOutputBuffer(FL2_matchTable* const tbl, size_t const index) { - if (tbl->isStruct) { + if (tbl->is_struct) return 
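
/* Editor's note: illustrative sketch, not part of the patch. It shows the
 * cancellation pattern behind RMF_cancelBuild above, using C11 atomics as a
 * stand-in for FL2_atomic_add: bumping the shared work index past a sentinel
 * makes every worker's next claim fail, so all threads drain out quickly. */
#include <stdatomic.h>

#define CANCEL_SENTINEL 0x40000000L   /* plays the role of RADIX_CANCEL_INDEX */

static atomic_long work_index;

static void cancel_build(void)
{
    atomic_fetch_add(&work_index, CANCEL_SENTINEL);
}

static long claim_next_job(void)
{
    long const i = atomic_fetch_add(&work_index, 1);
    return (i >= CANCEL_SENTINEL) ? -1 : i;   /* -1 means: cancelled, stop */
}
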
RMF_structuredAsOutputBuffer(tbl, index); - } - else { + else return RMF_bitpackAsOutputBuffer(tbl, index); - } } -size_t RMF_memoryUsage(unsigned const dict_log, unsigned const buffer_log, unsigned const depth, unsigned thread_count) +size_t RMF_memoryUsage(size_t const dict_size, unsigned const buffer_log, unsigned const thread_count) { - size_t size = (size_t)(4U + RMF_isStruct(dict_log, depth)) << dict_log; - U32 buf_size = (U32)1 << (dict_log - buffer_log); + size_t size = (size_t)(4U + RMF_isStruct(dict_size)) * dict_size; + size_t const buf_size = dict_size >> buffer_log; size += ((buf_size - 1) * sizeof(RMF_buildMatch) + sizeof(RMF_builder)) * thread_count; return size; } diff --git a/C/fast-lzma2/radix_mf.h b/C/fast-lzma2/radix_mf.h index c5bf943d..e6b7711b 100644 --- a/C/fast-lzma2/radix_mf.h +++ b/C/fast-lzma2/radix_mf.h @@ -20,16 +20,19 @@ extern "C" { typedef struct FL2_matchTable_s FL2_matchTable; -#define OVERLAP_FROM_DICT_LOG(d, o) (((size_t)1 << ((d) - 4)) * (o)) +#define OVERLAP_FROM_DICT_SIZE(d, o) (((d) >> 4) * (o)) #define RMF_MIN_BYTES_PER_THREAD 1024 +#define RMF_BUFFER_LOG_BASE 12 +#define RMF_BUFFER_LOG_MIN 6 +#define RMF_BUFFER_LOG_MAX 12 + typedef struct { - unsigned dictionary_log; + size_t dictionary_size; unsigned match_buffer_log; unsigned overlap_fraction; - unsigned block_size_log; unsigned divide_and_conquer; unsigned depth; #ifdef RMF_REFERENCE @@ -42,16 +45,18 @@ void RMF_freeMatchTable(FL2_matchTable* const tbl); BYTE RMF_compatibleParameters(const FL2_matchTable* const tbl, const RMF_parameters* const params, size_t const dict_reduce); size_t RMF_applyParameters(FL2_matchTable* const tbl, const RMF_parameters* const params, size_t const dict_reduce); size_t RMF_threadCount(const FL2_matchTable * const tbl); -size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const start, size_t const end); +void RMF_initProgress(FL2_matchTable * const tbl); +size_t RMF_initTable(FL2_matchTable* const tbl, const void* const data, size_t const end); int RMF_buildTable(FL2_matchTable* const tbl, size_t const job, unsigned const multi_thread, - FL2_dataBlock const block, - FL2_progressFn progress, void* opaque, U32 weight, size_t init_done); + FL2_dataBlock const block); +void RMF_cancelBuild(FL2_matchTable* const tbl); +void RMF_resetIncompleteBuild(FL2_matchTable* const tbl); int RMF_integrityCheck(const FL2_matchTable* const tbl, const BYTE* const data, size_t const index, size_t const end, unsigned const max_depth); void RMF_limitLengths(FL2_matchTable* const tbl, size_t const index); BYTE* RMF_getTableAsOutputBuffer(FL2_matchTable* const tbl, size_t const index); -size_t RMF_memoryUsage(unsigned const dict_log, unsigned const buffer_log, unsigned const depth, unsigned thread_count); +size_t RMF_memoryUsage(size_t const dict_size, unsigned const buffer_log, unsigned const thread_count); #if defined (__cplusplus) } diff --git a/C/fast-lzma2/radix_struct.c b/C/fast-lzma2/radix_struct.c index 2aac9093..ce8b6ee1 100644 --- a/C/fast-lzma2/radix_struct.c +++ b/C/fast-lzma2/radix_struct.c @@ -9,7 +9,7 @@ */ #include "mem.h" /* U32, U64 */ -#include "fl2threading.h" +#include "fl2_threading.h" #include "fl2_internal.h" #include "radix_internal.h" @@ -34,7 +34,7 @@ typedef struct FL2_matchTable_s FL2_matchTable; #define SetMatchLength(index, link, length) ((RMF_unit*)tbl->table)[(index) >> UNIT_BITS].lengths[(index) & UNIT_MASK] = (BYTE)(length) -#define SetMatchLinkAndLength(index, link, length) { size_t i_ = (index) >> UNIT_BITS, u_ = (index) & 
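
/* Editor's note: illustrative arithmetic only, not part of the patch. With the
 * new size-based API, OVERLAP_FROM_DICT_SIZE(d, o) = (d >> 4) * o expresses the
 * overlap fraction in 1/16 units, and the match table costs 4 bytes per
 * dictionary byte in bitpack mode (5 when the structured table is selected),
 * before the per-thread builder buffers are added. */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t const dict_size = (size_t)64 << 20;          /* 64 MiB dictionary */
    unsigned const overlap_fraction = 2;                /* out of 16 */
    size_t const overlap = (dict_size >> 4) * overlap_fraction;
    size_t const table_bytes = 4 * dict_size;           /* bitpack table */
    printf("overlap = %zu MiB, table = %zu MiB\n",
           overlap >> 20, table_bytes >> 20);           /* 8 MiB, 256 MiB */
    return 0;
}
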
UNIT_MASK; ((RMF_unit*)tbl->table)[i_].links[u_] = (U32)(link); ((RMF_unit*)tbl->table)[i_].lengths[u_] = (BYTE)(length); } +#define SetMatchLinkAndLength(index, link, length) do { size_t i_ = (index) >> UNIT_BITS, u_ = (index) & UNIT_MASK; ((RMF_unit*)tbl->table)[i_].links[u_] = (U32)(link); ((RMF_unit*)tbl->table)[i_].lengths[u_] = (BYTE)(length); } while(0) #define SetNull(index) ((RMF_unit*)tbl->table)[(index) >> UNIT_BITS].links[(index) & UNIT_MASK] = RADIX_NULL_LINK diff --git a/C/fast-lzma2/range_enc.c b/C/fast-lzma2/range_enc.c index aff9ab80..1da8a1c5 100644 --- a/C/fast-lzma2/range_enc.c +++ b/C/fast-lzma2/range_enc.c @@ -7,84 +7,194 @@ #include "fl2_internal.h" #include "mem.h" +#include "platform.h" #include "range_enc.h" -const unsigned price_table[kBitModelTotal >> kNumMoveReducingBits] = { - 128, 103, 91, 84, 78, 73, 69, 66, - 63, 61, 58, 56, 54, 52, 51, 49, - 48, 46, 45, 44, 43, 42, 41, 40, - 39, 38, 37, 36, 35, 34, 34, 33, - 32, 31, 31, 30, 29, 29, 28, 28, - 27, 26, 26, 25, 25, 24, 24, 23, - 23, 22, 22, 22, 21, 21, 20, 20, - 19, 19, 19, 18, 18, 17, 17, 17, - 16, 16, 16, 15, 15, 15, 14, 14, - 14, 13, 13, 13, 12, 12, 12, 11, - 11, 11, 11, 10, 10, 10, 10, 9, - 9, 9, 9, 8, 8, 8, 8, 7, - 7, 7, 7, 6, 6, 6, 6, 5, - 5, 5, 5, 5, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 2, 2, 2, - 2, 2, 2, 1, 1, 1, 1, 1 -}; +/* The first and last elements of these tables are never used */ +BYTE price_table[2][kPriceTableSize] = { { + 0, 193, 182, 166, 154, 145, 137, 131, + 125, 120, 115, 111, 107, 103, 100, 97, + 94, 91, 89, 86, 84, 82, 80, 78, + 76, 74, 72, 71, 69, 67, 66, 64, + 63, 61, 60, 59, 57, 56, 55, 54, + 53, 52, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 42, 41, 40, 39, 38, + 37, 36, 36, 35, 34, 33, 33, 32, + 31, 30, 30, 29, 28, 28, 27, 26, + 26, 25, 25, 24, 23, 23, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + 9, 8, 8, 8, 7, 7, 6, 6, + 5, 5, 5, 4, 4, 3, 3, 3, + 2, 2, 2, 1, 1, 0, 0, 0 +}, { + 0, 0, 0, 1, 1, 2, 2, 2, + 3, 3, 3, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 8, 9, + 9, 10, 10, 11, 11, 12, 12, 13, + 13, 13, 14, 14, 15, 15, 16, 17, + 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 23, 23, 24, 24, 25, 26, + 26, 27, 28, 28, 29, 30, 30, 31, + 32, 33, 33, 34, 35, 36, 36, 37, + 38, 39, 40, 41, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 53, + 54, 55, 56, 57, 59, 60, 61, 63, + 64, 66, 67, 69, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 89, 91, 94, + 97, 100, 103, 107, 111, 115, 119, 125, + 130, 137, 145, 154, 165, 181, 192, 0 +} }; -void SetOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size) +#if 0 + +#include + +/* Generates price_table */ +void RC_printPriceTable() +{ + static const unsigned test_size = 0x4000; + const unsigned test_div = test_size >> 8; + BYTE buf[0x3062]; + unsigned table0[kPriceTableSize]; + unsigned table1[kPriceTableSize]; + unsigned count[kPriceTableSize]; + memset(table0, 0, sizeof(table0)); + memset(table1, 0, sizeof(table1)); + memset(count, 0, sizeof(count)); + for (Probability i = 31; i <= kBitModelTotal - 31; ++i) { + RangeEncoder rc; + RC_reset(&rc); + RC_setOutputBuffer(&rc, buf, sizeof(buf)); + for (unsigned j = 0; j < test_size; ++j) { + Probability prob = i; + RC_encodeBit0(&rc, &prob); + } + RC_flush(&rc); + table0[i >> kNumMoveReducingBits] += (unsigned)rc.out_index - 5; + RC_reset(&rc); + RC_setOutputBuffer(&rc, buf, sizeof(buf)); + for (unsigned j = 0; j < test_size; ++j) { + Probability prob = i; + RC_encodeBit1(&rc, &prob); + } + RC_flush(&rc); + table1[i >> kNumMoveReducingBits] += 
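
/* Editor's note: illustrative sketch, not part of the patch. Wrapping a
 * multi-statement macro in do { ... } while (0), as SetMatchLinkAndLength now
 * is, makes it behave like a single statement, so it composes safely with an
 * unbraced if/else: */
#define SET_BOTH(a, b) do { (a) = 1; (b) = 2; } while (0)

static void example(int cond, int* x, int* y)
{
    if (cond)
        SET_BOTH(*x, *y);   /* expands to one statement; the else still binds */
    else
        *x = 0;
}
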
(unsigned)rc.out_index - 5; + ++count[i >> kNumMoveReducingBits]; + } + for (int i = 0; i < kPriceTableSize; ++i) if (count[i]) { + table0[i] = (table0[i] / count[i]) / test_div; + table1[i] = (table1[i] / count[i]) / test_div; + } + fputs("const BYTE price_table[2][kPriceTableSize] = {\r\n", stdout); + for (int i = 0; i < kPriceTableSize;) { + for (int j = 0; j < 8; ++j, ++i) + printf("%4d,", table0[i]); + fputs("\r\n", stdout); + } + fputs("}, {\r\n", stdout); + for (int i = 0; i < kPriceTableSize;) { + for (int j = 0; j < 8; ++j, ++i) + printf("%4d,", table1[i]); + fputs("\r\n", stdout); + } + fputs("} };\r\n", stdout); +} + +#endif + +void RC_setOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size) { rc->out_buffer = out_buffer; rc->chunk_size = chunk_size; rc->out_index = 0; } -void RangeEncReset(RangeEncoder* const rc) +void RC_reset(RangeEncoder* const rc) { rc->low = 0; rc->range = (U32)-1; - rc->cache_size = 1; + rc->cache_size = 0; rc->cache = 0; } -void ShiftLow(RangeEncoder* const rc) +#ifdef __64BIT__ + +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc) { - if (rc->low < 0xFF000000 || rc->low > 0xFFFFFFFF) - { - BYTE temp = rc->cache; - do { - assert (rc->out_index < rc->chunk_size - 4096); - rc->out_buffer[rc->out_index++] = temp + (BYTE)(rc->low >> 32); - temp = 0xFF; - } while (--rc->cache_size != 0); - rc->cache = (BYTE)(rc->low >> 24); - } - ++rc->cache_size; - rc->low = (rc->low << 8) & 0xFFFFFFFF; + U64 low = rc->low; + rc->low = (U32)(low << 8); + if (low < 0xFF000000 || low > 0xFFFFFFFF) { + BYTE high = (BYTE)(low >> 32); + rc->out_buffer[rc->out_index++] = rc->cache + high; + rc->cache = (BYTE)(low >> 24); + if (rc->cache_size != 0) { + high += 0xFF; + do { + rc->out_buffer[rc->out_index++] = high; + } while (--rc->cache_size != 0); + } + } + else { + rc->cache_size++; + } } -void EncodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +#else + +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc) { - size_t tree_index = 1; - assert(bit_count > 0); + U32 low = (U32)rc->low; + unsigned high = (unsigned)(rc->low >> 32); + rc->low = low << 8; + if (low < (U32)0xFF000000 || high != 0) { + rc->out_buffer[rc->out_index++] = rc->cache + (BYTE)high; + rc->cache = (BYTE)(low >> 24); + if (rc->cache_size != 0) { + high += 0xFF; + do { + rc->out_buffer[rc->out_index++] = (BYTE)high; + } while (--rc->cache_size != 0); + } + } + else { + rc->cache_size++; + } +} + +#endif + +void RC_encodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +{ + assert(bit_count > 1); + --bit_count; + unsigned bit = symbol >> bit_count; + RC_encodeBit(rc, &probs[1], bit); + size_t tree_index = 1; do { - unsigned bit; - --bit_count; - bit = (symbol >> bit_count) & 1; - EncodeBit(rc, &probs[tree_index], bit); - tree_index = (tree_index << 1) | bit; - } while (bit_count != 0); + --bit_count; + tree_index = (tree_index << 1) | bit; + bit = (symbol >> bit_count) & 1; + RC_encodeBit(rc, &probs[tree_index], bit); + } while (bit_count != 0); } -void EncodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) +void RC_encodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol) { - unsigned tree_index = 1; assert(bit_count != 0); - do { - unsigned bit = symbol & 1; - EncodeBit(rc, &probs[tree_index], bit); - tree_index = (tree_index << 1) + bit; - symbol >>= 1; - } while (--bit_count != 0); + 
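
/* Editor's note: illustrative check, not part of the patch. The generator above
 * encodes 0x4000 identical bits, so output_bytes is about 0x4000 * b / 8 where
 * b is the cost of one bit in bits; dividing by test_div (0x4000 >> 8 = 64)
 * leaves roughly 32 * b, i.e. the tables store prices in 1/32-bit units, which
 * matches kNumBitPriceShiftBits = 5. A direct -log2 computation gives values
 * close to the measured rows above: */
#include <math.h>

static unsigned price_in_32nds(double probability_of_bit)
{
    return (unsigned)(-log2(probability_of_bit) * 32.0 + 0.5);
    /* e.g. p = 0.5 -> 32; p = 0.25 -> 64; compare with the table entries */
}
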
unsigned bit = symbol & 1; + RC_encodeBit(rc, &probs[1], bit); + unsigned tree_index = 1; + while (--bit_count != 0) { + tree_index = (tree_index << 1) + bit; + symbol >>= 1; + bit = symbol & 1; + RC_encodeBit(rc, &probs[tree_index], bit); + } } -void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) +void FORCE_NOINLINE RC_encodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) { assert(bit_count > 0); do { @@ -93,7 +203,7 @@ void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count) rc->low += rc->range & -((int)(value >> bit_count) & 1); if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } while (bit_count != 0); } diff --git a/C/fast-lzma2/range_enc.h b/C/fast-lzma2/range_enc.h index 54672f4e..159449ad 100644 --- a/C/fast-lzma2/range_enc.h +++ b/C/fast-lzma2/range_enc.h @@ -28,9 +28,13 @@ typedef U16 Probability; #define kNumMoveBits 5U #define kProbInitValue (kBitModelTotal >> 1U) #define kNumMoveReducingBits 4U -#define kNumBitPriceShiftBits 4U +#define kNumBitPriceShiftBits 5U +#define kPriceTableSize (kBitModelTotal >> kNumMoveReducingBits) -extern const unsigned price_table[kBitModelTotal >> kNumMoveReducingBits]; +extern BYTE price_table[2][kPriceTableSize]; +#if 0 +void RC_printPriceTable(); +#endif typedef struct { @@ -43,22 +47,20 @@ typedef struct BYTE cache; } RangeEncoder; -void RangeEncReset(RangeEncoder* const rc); +void RC_reset(RangeEncoder* const rc); -void SetOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size); +void RC_setOutputBuffer(RangeEncoder* const rc, BYTE *const out_buffer, size_t chunk_size); -void RangeEncReset(RangeEncoder* const rc); +void FORCE_NOINLINE RC_shiftLow(RangeEncoder* const rc); -void ShiftLow(RangeEncoder* const rc); +void RC_encodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); -void EncodeBitTree(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); +void RC_encodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); -void EncodeBitTreeReverse(RangeEncoder* const rc, Probability *const probs, unsigned bit_count, unsigned symbol); - -void EncodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count); +void FORCE_NOINLINE RC_encodeDirect(RangeEncoder* const rc, unsigned value, unsigned bit_count); HINT_INLINE -void EncodeBit0(RangeEncoder* const rc, Probability *const rprob) +void RC_encodeBit0(RangeEncoder* const rc, Probability *const rprob) { unsigned prob = *rprob; rc->range = (rc->range >> kNumBitModelTotalBits) * prob; @@ -66,12 +68,12 @@ void EncodeBit0(RangeEncoder* const rc, Probability *const rprob) *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } HINT_INLINE -void EncodeBit1(RangeEncoder* const rc, Probability *const rprob) +void RC_encodeBit1(RangeEncoder* const rc, Probability *const rprob) { unsigned prob = *rprob; U32 new_bound = (rc->range >> kNumBitModelTotalBits) * prob; @@ -81,16 +83,16 @@ void EncodeBit1(RangeEncoder* const rc, Probability *const rprob) *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } HINT_INLINE -void EncodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const bit) +void RC_encodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const bit) { unsigned prob = *rprob; if (bit != 0) { - U32 new_bound 
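
/* Editor's note: illustrative sketch, not part of the patch. It shows the
 * probability slots a bit-tree coder touches for a given symbol: the tree is a
 * flat array indexed like a heap, the walk starts at node 1, codes the symbol
 * MSB-first, and shifts each coded bit into the index to descend. emit_bit()
 * is a hypothetical stand-in for RC_encodeBit. */
static void encode_bit_tree(unsigned short* probs, unsigned bit_count,
                            unsigned symbol,
                            void (*emit_bit)(unsigned short* prob, unsigned bit))
{
    unsigned tree_index = 1;
    while (bit_count-- != 0) {
        unsigned const bit = (symbol >> bit_count) & 1;  /* MSB first */
        emit_bit(&probs[tree_index], bit);
        tree_index = (tree_index << 1) | bit;            /* descend */
    }
}
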
= (rc->range >> kNumBitModelTotalBits) * prob; + U32 const new_bound = (rc->range >> kNumBitModelTotalBits) * prob; rc->low += new_bound; rc->range -= new_bound; prob -= prob >> kNumMoveBits; @@ -102,52 +104,56 @@ void EncodeBit(RangeEncoder* const rc, Probability *const rprob, unsigned const *rprob = (Probability)prob; if (rc->range < kTopValue) { rc->range <<= 8; - ShiftLow(rc); + RC_shiftLow(rc); } } -#define GET_PRICE(rc, prob, symbol) \ - price_table[((prob) ^ ((-(int)(symbol)) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; +#define GET_PRICE(prob, symbol) \ + price_table[symbol][(prob) >> kNumMoveReducingBits] -#define GET_PRICE_0(rc, prob) price_table[(prob) >> kNumMoveReducingBits] +#define GET_PRICE_0(prob) price_table[0][(prob) >> kNumMoveReducingBits] -#define GET_PRICE_1(rc, prob) price_table[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] +#define GET_PRICE_1(prob) price_table[1][(prob) >> kNumMoveReducingBits] + +#define kMinLitPrice 8U HINT_INLINE -unsigned GetTreePrice(RangeEncoder* const rc, const Probability* const prob_table, unsigned const bit_count, size_t symbol) +unsigned RC_getTreePrice(const Probability* const prob_table, unsigned bit_count, size_t symbol) { unsigned price = 0; - symbol |= ((size_t)1 << bit_count); - while (symbol != 1) { - size_t next_symbol = symbol >> 1; + symbol |= ((size_t)1 << bit_count); + do { + size_t const next_symbol = symbol >> 1; unsigned prob = prob_table[next_symbol]; - unsigned bit = (unsigned)symbol & 1; - price += GET_PRICE(rc, prob, bit); + size_t bit = symbol & 1; + price += GET_PRICE(prob, bit); symbol = next_symbol; - } + } while (symbol != 1); return price; } HINT_INLINE -unsigned GetReverseTreePrice(RangeEncoder* const rc, const Probability* const prob_table, unsigned const bit_count, size_t symbol) +unsigned RC_getReverseTreePrice(const Probability* const prob_table, unsigned bit_count, size_t symbol) { - unsigned price = 0; - size_t m = 1; - for (unsigned i = bit_count; i != 0; --i) { - unsigned prob = prob_table[m]; - unsigned bit = symbol & 1; - symbol >>= 1; - price += GET_PRICE(rc, prob, bit); - m = (m << 1) | bit; - } - return price; + unsigned prob = prob_table[1]; + size_t bit = symbol & 1; + unsigned price = GET_PRICE(prob, bit); + size_t m = 1; + while (--bit_count != 0) { + m = (m << 1) | bit; + symbol >>= 1; + prob = prob_table[m]; + bit = symbol & 1; + price += GET_PRICE(prob, bit); + } + return price; } HINT_INLINE -void Flush(RangeEncoder* const rc) +void RC_flush(RangeEncoder* const rc) { for (int i = 0; i < 5; ++i) - ShiftLow(rc); + RC_shiftLow(rc); } #if defined (__cplusplus) diff --git a/C/fast-lzma2/util.c b/C/fast-lzma2/util.c new file mode 100644 index 00000000..d6466063 --- /dev/null +++ b/C/fast-lzma2/util.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#if defined (__cplusplus) +extern "C" { +#endif + + +/*-**************************************** +* Dependencies +******************************************/ +#include "util.h" /* note : ensure that platform.h is included first ! 
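
/* Editor's note: illustrative usage only, not part of the patch. With the new
 * two-row table, the price of coding a bit is a plain lookup on the bit value
 * (no XOR trick), and prices are fixed-point bit counts, so an encoder can
 * compare candidate codings by summing them. 128 equals kPriceTableSize and
 * the >> 4 is kNumMoveReducingBits for the 11-bit probability model. */
extern unsigned char price_table[2][128];

static unsigned price_of_two_bits(unsigned short p0, unsigned short p1)
{
    /* cost of coding a 0 with model p0, then a 1 with model p1 */
    return price_table[0][p0 >> 4] + price_table[1][p1 >> 4];
}
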
*/ +#include +#include + + +int UTIL_fileExist(const char* filename) +{ + stat_t statbuf; +#if defined(_MSC_VER) + int const stat_error = _stat64(filename, &statbuf); +#else + int const stat_error = stat(filename, &statbuf); +#endif + return !stat_error; +} + +int UTIL_isRegularFile(const char* infilename) +{ + stat_t statbuf; + return UTIL_getFileStat(infilename, &statbuf); /* Only need to know whether it is a regular file */ +} + +int UTIL_getFileStat(const char* infilename, stat_t *statbuf) +{ + int r; +#if defined(_MSC_VER) + r = _stat64(infilename, statbuf); + if (r || !(statbuf->st_mode & S_IFREG)) return 0; /* No good... */ +#else + r = stat(infilename, statbuf); + if (r || !S_ISREG(statbuf->st_mode)) return 0; /* No good... */ +#endif + return 1; +} + +int UTIL_setFileStat(const char *filename, stat_t *statbuf) +{ + int res = 0; + struct utimbuf timebuf; + + if (!UTIL_isRegularFile(filename)) + return -1; + + timebuf.actime = time(NULL); + timebuf.modtime = statbuf->st_mtime; + res += utime(filename, &timebuf); /* set access and modification times */ + +#if !defined(_WIN32) + res += chown(filename, statbuf->st_uid, statbuf->st_gid); /* Copy ownership */ +#endif + + res += chmod(filename, statbuf->st_mode & 07777); /* Copy file permissions */ + + errno = 0; + return -res; /* number of errors is returned */ +} + +U32 UTIL_isDirectory(const char* infilename) +{ + int r; + stat_t statbuf; +#if defined(_MSC_VER) + r = _stat64(infilename, &statbuf); + if (!r && (statbuf.st_mode & _S_IFDIR)) return 1; +#else + r = stat(infilename, &statbuf); + if (!r && S_ISDIR(statbuf.st_mode)) return 1; +#endif + return 0; +} + +U32 UTIL_isLink(const char* infilename) +{ +/* macro guards, as defined in : https://linux.die.net/man/2/lstat */ +#ifndef __STRICT_ANSI__ +#if defined(_BSD_SOURCE) \ + || (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE >= 500)) \ + || (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) \ + || (defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) \ + || (defined(__APPLE__) && defined(__MACH__)) \ + || defined(__OpenBSD__) \ + || defined(__FreeBSD__) + int r; + stat_t statbuf; + r = lstat(infilename, &statbuf); + if (!r && S_ISLNK(statbuf.st_mode)) return 1; +#endif +#endif + (void)infilename; + return 0; +} + +U64 UTIL_getFileSize(const char* infilename) +{ + if (!UTIL_isRegularFile(infilename)) return UTIL_FILESIZE_UNKNOWN; + { int r; +#if defined(_MSC_VER) + struct __stat64 statbuf; + r = _stat64(infilename, &statbuf); + if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; +#elif defined(__MINGW32__) && defined (__MSVCRT__) + struct _stati64 statbuf; + r = _stati64(infilename, &statbuf); + if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; +#else + struct stat statbuf; + r = stat(infilename, &statbuf); + if (r || !S_ISREG(statbuf.st_mode)) return UTIL_FILESIZE_UNKNOWN; +#endif + return (U64)statbuf.st_size; + } +} + + +U64 UTIL_getTotalFileSize(const char* const * const fileNamesTable, unsigned nbFiles) +{ + U64 total = 0; + int error = 0; + unsigned n; + for (n=0; n= *bufEnd) { + ptrdiff_t const newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; + *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); + if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; } + *bufEnd = *bufStart + newListSize; + } + if (*bufStart + *pos + pathLength < *bufEnd) { + memcpy(*bufStart + *pos, path, pathLength+1 /* include final \0 */); + *pos += pathLength + 1; + nbFiles++; + } + } + free(path); + } while (FindNextFileA(hFile, 
&cFile)); + + FindClose(hFile); + return nbFiles; +} + +#elif defined(__linux__) || (PLATFORM_POSIX_VERSION >= 200112L) /* opendir, readdir require POSIX.1-2001 */ + +int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) +{ + DIR *dir; + struct dirent *entry; + char* path; + int dirLength, fnameLength, pathLength, nbFiles = 0; + + if (!(dir = opendir(dirName))) { + UTIL_DISPLAYLEVEL(1, "Cannot open directory '%s': %s\n", dirName, strerror(errno)); + return 0; + } + + dirLength = (int)strlen(dirName); + errno = 0; + while ((entry = readdir(dir)) != NULL) { + if (strcmp (entry->d_name, "..") == 0 || + strcmp (entry->d_name, ".") == 0) continue; + fnameLength = (int)strlen(entry->d_name); + path = (char*) malloc(dirLength + fnameLength + 2); + if (!path) { closedir(dir); return 0; } + memcpy(path, dirName, dirLength); + + path[dirLength] = '/'; + memcpy(path+dirLength+1, entry->d_name, fnameLength); + pathLength = dirLength+1+fnameLength; + path[pathLength] = 0; + + if (!followLinks && UTIL_isLink(path)) { + UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path); + continue; + } + + if (UTIL_isDirectory(path)) { + nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks); /* Recursively call "UTIL_prepareFileList" with the new path. */ + if (*bufStart == NULL) { free(path); closedir(dir); return 0; } + } else { + if (*bufStart + *pos + pathLength >= *bufEnd) { + ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; + *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); + *bufEnd = *bufStart + newListSize; + if (*bufStart == NULL) { free(path); closedir(dir); return 0; } + } + if (*bufStart + *pos + pathLength < *bufEnd) { + memcpy(*bufStart + *pos, path, pathLength + 1); /* with final \0 */ + *pos += pathLength + 1; + nbFiles++; + } + } + free(path); + errno = 0; /* clear errno after UTIL_isDirectory, UTIL_prepareFileList */ + } + + if (errno != 0) { + UTIL_DISPLAYLEVEL(1, "readdir(%s) error: %s\n", dirName, strerror(errno)); + free(*bufStart); + *bufStart = NULL; + } + closedir(dir); + return nbFiles; +} + +#else + +int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) +{ + (void)bufStart; (void)bufEnd; (void)pos; (void)followLinks; + UTIL_DISPLAYLEVEL(1, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName); + return 0; +} + +#endif /* #ifdef _WIN32 */ + +/* + * UTIL_createFileList - takes a list of files and directories (params: inputNames, inputNamesNb), scans directories, + * and returns a new list of files (params: return value, allocatedBuffer, allocatedNamesNb). + * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer) + * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called. 
+ */ +const char** +UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, + char** allocatedBuffer, unsigned* allocatedNamesNb, + int followLinks) +{ + size_t pos; + unsigned i, nbFiles; + char* buf = (char*)malloc(LIST_SIZE_INCREASE); + char* bufend = buf + LIST_SIZE_INCREASE; + const char** fileTable; + + if (!buf) return NULL; + + for (i=0, pos=0, nbFiles=0; i= bufend) { + ptrdiff_t newListSize = (bufend - buf) + LIST_SIZE_INCREASE; + buf = (char*)UTIL_realloc(buf, newListSize); + bufend = buf + newListSize; + if (!buf) return NULL; + } + if (buf + pos + len < bufend) { + memcpy(buf+pos, inputNames[i], len+1); /* with final \0 */ + pos += len + 1; + nbFiles++; + } + } else { + nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks); + if (buf == NULL) return NULL; + } } + + if (nbFiles == 0) { free(buf); return NULL; } + + fileTable = (const char**)malloc((nbFiles+1) * sizeof(const char*)); + if (!fileTable) { free(buf); return NULL; } + + for (i=0, pos=0; i bufend) { free(buf); free((void*)fileTable); return NULL; } + + *allocatedBuffer = buf; + *allocatedNamesNb = nbFiles; + + return fileTable; +} + +/*-**************************************** +* Console log +******************************************/ +int g_utilDisplayLevel; + + +/*-**************************************** +* Time functions +******************************************/ +#if defined(_WIN32) /* Windows */ + +UTIL_time_t UTIL_getTime(void) { UTIL_time_t x; QueryPerformanceCounter(&x); return x; } + +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static LARGE_INTEGER ticksPerSecond; + static int init = 0; + if (!init) { + if (!QueryPerformanceFrequency(&ticksPerSecond)) + UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); + init = 1; + } + return 1000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static LARGE_INTEGER ticksPerSecond; + static int init = 0; + if (!init) { + if (!QueryPerformanceFrequency(&ticksPerSecond)) + UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); + init = 1; + } + return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; +} + +#elif defined(__APPLE__) && defined(__MACH__) + +UTIL_time_t UTIL_getTime(void) { return mach_absolute_time(); } + +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static mach_timebase_info_data_t rate; + static int init = 0; + if (!init) { + mach_timebase_info(&rate); + init = 1; + } + return (((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom))/1000ULL; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) +{ + static mach_timebase_info_data_t rate; + static int init = 0; + if (!init) { + mach_timebase_info(&rate); + init = 1; + } + return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom); +} + +#elif (PLATFORM_POSIX_VERSION >= 200112L) \ + && (defined(__UCLIBC__) \ + || (defined(__GLIBC__) \ + && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) \ + || (__GLIBC__ > 2)))) + +UTIL_time_t UTIL_getTime(void) +{ + UTIL_time_t time; + if (clock_gettime(CLOCK_MONOTONIC, &time)) + UTIL_DISPLAYLEVEL(1, "ERROR: Failed to get time\n"); /* we could also exit() */ + return time; +} + +UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t diff; + if (end.tv_nsec < begin.tv_nsec) { + diff.tv_sec = (end.tv_sec - 1) - begin.tv_sec; + diff.tv_nsec = 
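
/* Editor's note: illustrative usage only, not part of the patch. The pair
 * UTIL_createFileList / UTIL_freeFileList expands directories into a flat file
 * list as described in the comment above; on error the create call returns
 * NULL and the free helper must not be called. */
#include "util.h"

static void list_inputs(const char** names, unsigned count)
{
    char* buffer = NULL;
    unsigned expanded_count = 0;
    const char** files = UTIL_createFileList(names, count, &buffer,
                                             &expanded_count, 0 /* no links */);
    if (files == NULL)
        return;
    for (unsigned i = 0; i < expanded_count; ++i)
        UTIL_DISPLAY("%s\n", files[i]);
    UTIL_freeFileList(files, buffer);
}
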
(end.tv_nsec + 1000000000ULL) - begin.tv_nsec; + } else { + diff.tv_sec = end.tv_sec - begin.tv_sec; + diff.tv_nsec = end.tv_nsec - begin.tv_nsec; + } + return diff; +} + +U64 UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t const diff = UTIL_getSpanTime(begin, end); + U64 micro = 0; + micro += 1000000ULL * diff.tv_sec; + micro += diff.tv_nsec / 1000ULL; + return micro; +} + +U64 UTIL_getSpanTimeNano(UTIL_time_t begin, UTIL_time_t end) +{ + UTIL_time_t const diff = UTIL_getSpanTime(begin, end); + U64 nano = 0; + nano += 1000000000ULL * diff.tv_sec; + nano += diff.tv_nsec; + return nano; +} + +#else /* relies on standard C (note : clock_t measurements can be wrong when using multi-threading) */ + +UTIL_time_t UTIL_getTime(void) { return clock(); } +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } + +#endif + +/* returns time span in microseconds */ +U64 UTIL_clockSpanMicro(UTIL_time_t clockStart ) +{ + UTIL_time_t const clockEnd = UTIL_getTime(); + return UTIL_getSpanTimeMicro(clockStart, clockEnd); +} + +/* returns time span in microseconds */ +U64 UTIL_clockSpanNano(UTIL_time_t clockStart ) +{ + UTIL_time_t const clockEnd = UTIL_getTime(); + return UTIL_getSpanTimeNano(clockStart, clockEnd); +} + +void UTIL_waitForNextTick(void) +{ + UTIL_time_t const clockStart = UTIL_getTime(); + UTIL_time_t clockEnd; + do { + clockEnd = UTIL_getTime(); + } while (UTIL_getSpanTimeNano(clockStart, clockEnd) == 0); +} + +/* count the number of physical cores */ +#if defined(_WIN32) || defined(WIN32) + +#include + +typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + if (numPhysicalCores != 0) return numPhysicalCores; + + { LPFN_GLPI glpi; + BOOL done = FALSE; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL; + DWORD returnLength = 0; + size_t byteOffset = 0; + + glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), + "GetLogicalProcessorInformation"); + + if (glpi == NULL) { + goto failed; + } + + while(!done) { + DWORD rc = glpi(buffer, &returnLength); + if (FALSE == rc) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + if (buffer) + free(buffer); + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength); + + if (buffer == NULL) { + perror("zstd"); + exit(1); + } + } else { + /* some other error */ + goto failed; + } + } else { + done = TRUE; + } + } + + ptr = buffer; + + while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { + + if (ptr->Relationship == RelationProcessorCore) { + numPhysicalCores++; + } + + ptr++; + byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + } + + free(buffer); + + return numPhysicalCores; + } + +failed: + /* try to fall back on GetSystemInfo */ + { SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + numPhysicalCores = sysinfo.dwNumberOfProcessors; + if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */ + } + return numPhysicalCores; +} + +#elif defined(__APPLE__) + +#include + +/* Use apple-provided syscall + * see: man 3 sysctl */ +int UTIL_countPhysicalCores(void) +{ + static S32 numPhysicalCores = 0; /* apple specifies int32_t */ + if (numPhysicalCores != 0) return 
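
/* Editor's note: illustrative usage only, not part of the patch. The timing
 * helpers now defined in util.c are used like this: take a start stamp, do the
 * work, then ask for the elapsed span in microseconds. */
#include "util.h"

static void time_something(void (*work)(void))
{
    UTIL_time_t const start = UTIL_getTime();
    work();
    {   U64 const micros = UTIL_clockSpanMicro(start);
        UTIL_DISPLAYLEVEL(2, "work took %llu us\n", (unsigned long long)micros);
    }
}
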
numPhysicalCores; + + { size_t size = sizeof(S32); + int const ret = sysctlbyname("hw.physicalcpu", &numPhysicalCores, &size, NULL, 0); + if (ret != 0) { + if (errno == ENOENT) { + /* entry not present, fall back on 1 */ + numPhysicalCores = 1; + } else { + perror("zstd: can't get number of physical cpus"); + exit(1); + } + } + + return numPhysicalCores; + } +} + +#elif defined(__linux__) + +/* parse /proc/cpuinfo + * siblings / cpu cores should give hyperthreading ratio + * otherwise fall back on sysconf */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + + if (numPhysicalCores != 0) return numPhysicalCores; + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + return numPhysicalCores = 1; + } + + /* try to determine if there's hyperthreading */ + { FILE* const cpuinfo = fopen("/proc/cpuinfo", "r"); +#define BUF_SIZE 80 + char buff[BUF_SIZE]; + + int siblings = 0; + int cpu_cores = 0; + int ratio = 1; + + if (cpuinfo == NULL) { + /* fall back on the sysconf value */ + return numPhysicalCores; + } + + /* assume the cpu cores/siblings values will be constant across all + * present processors */ + while (!feof(cpuinfo)) { + if (fgets(buff, BUF_SIZE, cpuinfo) != NULL) { + if (strncmp(buff, "siblings", 8) == 0) { + const char* const sep = strchr(buff, ':'); + if (*sep == '\0') { + /* formatting was broken? */ + goto failed; + } + + siblings = atoi(sep + 1); + } + if (strncmp(buff, "cpu cores", 9) == 0) { + const char* const sep = strchr(buff, ':'); + if (*sep == '\0') { + /* formatting was broken? */ + goto failed; + } + + cpu_cores = atoi(sep + 1); + } + } else if (ferror(cpuinfo)) { + /* fall back on the sysconf value */ + goto failed; + } + } + if (siblings && cpu_cores) { + ratio = siblings / cpu_cores; + } +failed: + fclose(cpuinfo); + return numPhysicalCores = numPhysicalCores / ratio; + } +} + +#elif defined(__FreeBSD__) + +#include +#include + +/* Use physical core sysctl when available + * see: man 4 smp, man 3 sysctl */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; /* freebsd sysctl is native int sized */ + if (numPhysicalCores != 0) return numPhysicalCores; + +#if __FreeBSD_version >= 1300008 + { size_t size = sizeof(numPhysicalCores); + int ret = sysctlbyname("kern.smp.cores", &numPhysicalCores, &size, NULL, 0); + if (ret == 0) return numPhysicalCores; + if (errno != ENOENT) { + perror("zstd: can't get number of physical cpus"); + exit(1); + } + /* sysctl not present, fall through to older sysconf method */ + } +#endif + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + numPhysicalCores = 1; + } + return numPhysicalCores; +} + +#elif defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) + +/* Use POSIX sysconf + * see: man 3 sysconf */ +int UTIL_countPhysicalCores(void) +{ + static int numPhysicalCores = 0; + + if (numPhysicalCores != 0) return numPhysicalCores; + + numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); + if (numPhysicalCores == -1) { + /* value not queryable, fall back on 1 */ + return numPhysicalCores = 1; + } + return numPhysicalCores; +} + +#else + +int UTIL_countPhysicalCores(void) +{ + /* assume 1 */ + return 1; +} + +#endif + +#if defined (__cplusplus) +} +#endif diff --git a/C/fast-lzma2/util.h b/C/fast-lzma2/util.h index d5688203..f78bcbe1 100644 --- a/C/fast-lzma2/util.h +++ b/C/fast-lzma2/util.h @@ -16,17 +16,15 @@ extern 
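
/* Editor's note: illustrative arithmetic only, not part of the patch. On a
 * typical hyper-threaded CPU /proc/cpuinfo reports e.g. "siblings : 8" and
 * "cpu cores : 4"; the Linux path above divides the online CPU count by that
 * ratio, so 8 logical processors become 8 / (8 / 4) = 4 physical cores. */
static int physical_from_logical(int logical, int siblings, int cpu_cores)
{
    int const ratio = (siblings > 0 && cpu_cores > 0) ? siblings / cpu_cores : 1;
    return logical / ratio;
}
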
"C" { #endif - /*-**************************************** * Dependencies ******************************************/ -#include "platform.h" /* PLATFORM_POSIX_VERSION */ -#include /* malloc */ +#include "platform.h" /* PLATFORM_POSIX_VERSION, ZSTD_NANOSLEEP_SUPPORT, ZSTD_SETPRIORITY_SUPPORT */ +#include /* malloc, realloc, free */ #include /* size_t, ptrdiff_t */ #include /* fprintf */ -#include /* strncmp */ #include /* stat, utime */ -#include /* stat */ +#include /* stat, chmod */ #if defined(_MSC_VER) # include /* utime */ # include /* _chmod */ @@ -34,13 +32,12 @@ extern "C" { # include /* chown, stat */ # include /* utime */ #endif -#include /* time */ -#include +#include /* clock_t, clock, CLOCKS_PER_SEC, nanosleep */ #include "mem.h" /* U32, U64 */ -/* ************************************************************ -* Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW +/*-************************************************************ +* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW ***************************************************************/ #if defined(_MSC_VER) && (_MSC_VER >= 1400) # define UTIL_fseek _fseeki64 @@ -53,37 +50,38 @@ extern "C" { #endif -/*-**************************************** -* Sleep functions: Windows - Posix - others -******************************************/ +/*-************************************************* +* Sleep & priority functions: Windows - Posix - others +***************************************************/ #if defined(_WIN32) # include # define SET_REALTIME_PRIORITY SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS) # define UTIL_sleep(s) Sleep(1000*s) # define UTIL_sleepMilli(milli) Sleep(milli) -#elif PLATFORM_POSIX_VERSION >= 0 /* Unix-like operating system */ -# include -# include /* setpriority */ -# include /* clock_t, nanosleep, clock, CLOCKS_PER_SEC */ -# if defined(PRIO_PROCESS) -# define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20) -# else -# define SET_REALTIME_PRIORITY /* disabled */ -# endif + +#elif PLATFORM_POSIX_VERSION > 0 /* Unix-like operating system */ +# include /* sleep */ # define UTIL_sleep(s) sleep(s) -# if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) || (PLATFORM_POSIX_VERSION >= 200112L) /* nanosleep requires POSIX.1-2001 */ +# if ZSTD_NANOSLEEP_SUPPORT /* necessarily defined in platform.h */ # define UTIL_sleepMilli(milli) { struct timespec t; t.tv_sec=0; t.tv_nsec=milli*1000000ULL; nanosleep(&t, NULL); } # else # define UTIL_sleepMilli(milli) /* disabled */ # endif -#else -# define SET_REALTIME_PRIORITY /* disabled */ +# if ZSTD_SETPRIORITY_SUPPORT +# include /* setpriority */ +# define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20) +# else +# define SET_REALTIME_PRIORITY /* disabled */ +# endif + +#else /* unknown non-unix operating systen */ # define UTIL_sleep(s) /* disabled */ # define UTIL_sleepMilli(milli) /* disabled */ +# define SET_REALTIME_PRIORITY /* disabled */ #endif -/* ************************************* +/*-************************************* * Constants ***************************************/ #define LIST_SIZE_INCREASE (8*1024) @@ -101,8 +99,6 @@ extern "C" { # define UTIL_STATIC static inline #elif defined(_MSC_VER) # define UTIL_STATIC static __inline -# pragma warning(disable : 4996) /* disable: C4996: 'strncpy': This function or variable may be unsafe. 
*/ -# pragma warning(disable : 4389) /* disable: C4389: '==' : signed/unsigned mismatch */ #else # define UTIL_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ #endif @@ -111,7 +107,7 @@ extern "C" { /*-**************************************** * Console log ******************************************/ -static int g_utilDisplayLevel; +extern int g_utilDisplayLevel; #define UTIL_DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define UTIL_DISPLAYLEVEL(l, ...) { if (g_utilDisplayLevel>=l) { UTIL_DISPLAY(__VA_ARGS__); } } @@ -120,119 +116,47 @@ static int g_utilDisplayLevel; * Time functions ******************************************/ #if defined(_WIN32) /* Windows */ + + #define UTIL_TIME_INITIALIZER { { 0, 0 } } typedef LARGE_INTEGER UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { UTIL_time_t x; QueryPerformanceCounter(&x); return x; } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static LARGE_INTEGER ticksPerSecond; - static int init = 0; - if (!init) { - if (!QueryPerformanceFrequency(&ticksPerSecond)) - UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); - init = 1; - } - return 1000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static LARGE_INTEGER ticksPerSecond; - static int init = 0; - if (!init) { - if (!QueryPerformanceFrequency(&ticksPerSecond)) - UTIL_DISPLAYLEVEL(1, "ERROR: QueryPerformanceFrequency() failure\n"); - init = 1; - } - return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; - } + #elif defined(__APPLE__) && defined(__MACH__) + #include + #define UTIL_TIME_INITIALIZER 0 typedef U64 UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { return mach_absolute_time(); } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static mach_timebase_info_data_t rate; - static int init = 0; - if (!init) { - mach_timebase_info(&rate); - init = 1; - } - return (((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom))/1000ULL; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) - { - static mach_timebase_info_data_t rate; - static int init = 0; - if (!init) { - mach_timebase_info(&rate); - init = 1; - } - return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom); - } -#elif (PLATFORM_POSIX_VERSION >= 200112L) - #include + +#elif (PLATFORM_POSIX_VERSION >= 200112L) \ + && (defined(__UCLIBC__) \ + || (defined(__GLIBC__) \ + && ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) \ + || (__GLIBC__ > 2)))) + + #define UTIL_TIME_INITIALIZER { 0, 0 } typedef struct timespec UTIL_freq_t; typedef struct timespec UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) - { - UTIL_time_t time; - if (clock_gettime(CLOCK_MONOTONIC, &time)) - UTIL_DISPLAYLEVEL(1, "ERROR: Failed to get time\n"); /* we could also exit() */ - return time; - } - UTIL_STATIC UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t diff; - if (end.tv_nsec < begin.tv_nsec) { - diff.tv_sec = (end.tv_sec - 1) - begin.tv_sec; - diff.tv_nsec = (end.tv_nsec + 1000000000ULL) - begin.tv_nsec; - } else { - diff.tv_sec = end.tv_sec - begin.tv_sec; - diff.tv_nsec = end.tv_nsec - begin.tv_nsec; - } - return diff; - } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t const diff = 
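
/* Editor's note: illustrative sketch, not part of the patch. The old header
 * defined "static int g_utilDisplayLevel", giving every .c file that includes
 * util.h its own private copy, so a verbosity level set in one file was not
 * seen by the others. The new arrangement shares a single variable: */

/* util.h */
extern int g_utilDisplayLevel;   /* one declaration, visible everywhere */

/* util.c */
int g_utilDisplayLevel;          /* exactly one definition for the program */
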
UTIL_getSpanTime(begin, end); - U64 micro = 0; - micro += 1000000ULL * diff.tv_sec; - micro += diff.tv_nsec / 1000ULL; - return micro; - } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t begin, UTIL_time_t end) - { - UTIL_time_t const diff = UTIL_getSpanTime(begin, end); - U64 nano = 0; - nano += 1000000000ULL * diff.tv_sec; - nano += diff.tv_nsec; - return nano; - } + + UTIL_time_t UTIL_getSpanTime(UTIL_time_t begin, UTIL_time_t end); + #else /* relies on standard C (note : clock_t measurements can be wrong when using multi-threading) */ + typedef clock_t UTIL_time_t; - UTIL_STATIC UTIL_time_t UTIL_getTime(void) { return clock(); } - UTIL_STATIC U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } - UTIL_STATIC U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd) { return 1000000000ULL * (clockEnd - clockStart) / CLOCKS_PER_SEC; } + #define UTIL_TIME_INITIALIZER 0 + #endif +UTIL_time_t UTIL_getTime(void); +U64 UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd); +U64 UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd); + +#define SEC_TO_MICRO 1000000 /* returns time span in microseconds */ -UTIL_STATIC U64 UTIL_clockSpanMicro( UTIL_time_t clockStart ) -{ - UTIL_time_t const clockEnd = UTIL_getTime(); - return UTIL_getSpanTimeMicro(clockStart, clockEnd); -} - - -UTIL_STATIC void UTIL_waitForNextTick(void) -{ - UTIL_time_t const clockStart = UTIL_getTime(); - UTIL_time_t clockEnd; - do { - clockEnd = UTIL_getTime(); - } while (UTIL_getSpanTimeNano(clockStart, clockEnd) == 0); -} - +U64 UTIL_clockSpanMicro(UTIL_time_t clockStart); +/* returns time span in microseconds */ +U64 UTIL_clockSpanNano(UTIL_time_t clockStart); +void UTIL_waitForNextTick(void); /*-**************************************** * File functions @@ -245,118 +169,23 @@ UTIL_STATIC void UTIL_waitForNextTick(void) #endif -UTIL_STATIC int UTIL_setFileStat(const char *filename, stat_t *statbuf) -{ - int res = 0; - struct utimbuf timebuf; - - timebuf.actime = time(NULL); - timebuf.modtime = statbuf->st_mtime; - res += utime(filename, &timebuf); /* set access and modification times */ - -#if !defined(_WIN32) - res += chown(filename, statbuf->st_uid, statbuf->st_gid); /* Copy ownership */ -#endif - - res += chmod(filename, statbuf->st_mode & 07777); /* Copy file permissions */ - - errno = 0; - return -res; /* number of errors is returned */ -} - - -UTIL_STATIC int UTIL_getFileStat(const char* infilename, stat_t *statbuf) -{ - int r; -#if defined(_MSC_VER) - r = _stat64(infilename, statbuf); - if (r || !(statbuf->st_mode & S_IFREG)) return 0; /* No good... */ -#else - r = stat(infilename, statbuf); - if (r || !S_ISREG(statbuf->st_mode)) return 0; /* No good... 
*/ -#endif - return 1; -} - - -UTIL_STATIC int UTIL_isRegularFile(const char* infilename) -{ - stat_t statbuf; - return UTIL_getFileStat(infilename, &statbuf); /* Only need to know whether it is a regular file */ -} - - -UTIL_STATIC U32 UTIL_isDirectory(const char* infilename) -{ - int r; - stat_t statbuf; -#if defined(_MSC_VER) - r = _stat64(infilename, &statbuf); - if (!r && (statbuf.st_mode & _S_IFDIR)) return 1; -#else - r = stat(infilename, &statbuf); - if (!r && S_ISDIR(statbuf.st_mode)) return 1; -#endif - return 0; -} - -UTIL_STATIC U32 UTIL_isLink(const char* infilename) -{ -#if defined(_WIN32) - /* no symlinks on windows */ - (void)infilename; -#else - int r; - stat_t statbuf; - r = lstat(infilename, &statbuf); - if (!r && S_ISLNK(statbuf.st_mode)) return 1; -#endif - return 0; -} - +int UTIL_fileExist(const char* filename); +int UTIL_isRegularFile(const char* infilename); +int UTIL_setFileStat(const char* filename, stat_t* statbuf); +U32 UTIL_isDirectory(const char* infilename); +int UTIL_getFileStat(const char* infilename, stat_t* statbuf); +U32 UTIL_isLink(const char* infilename); #define UTIL_FILESIZE_UNKNOWN ((U64)(-1)) -UTIL_STATIC U64 UTIL_getFileSize(const char* infilename) -{ - if (!UTIL_isRegularFile(infilename)) return UTIL_FILESIZE_UNKNOWN; - { int r; -#if defined(_MSC_VER) - struct __stat64 statbuf; - r = _stat64(infilename, &statbuf); - if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; -#elif defined(__MINGW32__) && defined (__MSVCRT__) - struct _stati64 statbuf; - r = _stati64(infilename, &statbuf); - if (r || !(statbuf.st_mode & S_IFREG)) return UTIL_FILESIZE_UNKNOWN; -#else - struct stat statbuf; - r = stat(infilename, &statbuf); - if (r || !S_ISREG(statbuf.st_mode)) return UTIL_FILESIZE_UNKNOWN; -#endif - return (U64)statbuf.st_size; - } -} - - -UTIL_STATIC U64 UTIL_getTotalFileSize(const char* const * const fileNamesTable, unsigned nbFiles) -{ - U64 total = 0; - int error = 0; - unsigned n; - for (n=0; n= *bufEnd) { - ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; - *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); - *bufEnd = *bufStart + newListSize; - if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; } - } - if (*bufStart + *pos + pathLength < *bufEnd) { - strncpy(*bufStart + *pos, path, *bufEnd - (*bufStart + *pos)); - *pos += pathLength + 1; - nbFiles++; - } - } - free(path); - } while (FindNextFileA(hFile, &cFile)); - - FindClose(hFile); - return nbFiles; -} - #elif defined(__linux__) || (PLATFORM_POSIX_VERSION >= 200112L) /* opendir, readdir require POSIX.1-2001 */ # define UTIL_HAS_CREATEFILELIST # include /* opendir, readdir */ # include /* strerror, memcpy */ - -UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) -{ - DIR *dir; - struct dirent *entry; - char* path; - int dirLength, fnameLength, pathLength, nbFiles = 0; - - if (!(dir = opendir(dirName))) { - UTIL_DISPLAYLEVEL(1, "Cannot open directory '%s': %s\n", dirName, strerror(errno)); - return 0; - } - - dirLength = (int)strlen(dirName); - errno = 0; - while ((entry = readdir(dir)) != NULL) { - if (strcmp (entry->d_name, "..") == 0 || - strcmp (entry->d_name, ".") == 0) continue; - fnameLength = (int)strlen(entry->d_name); - path = (char*) malloc(dirLength + fnameLength + 2); - if (!path) { closedir(dir); return 0; } - memcpy(path, dirName, dirLength); - - path[dirLength] = '/'; - memcpy(path+dirLength+1, entry->d_name, fnameLength); - pathLength = 
dirLength+1+fnameLength; - path[pathLength] = 0; - - if (!followLinks && UTIL_isLink(path)) { - UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path); - continue; - } - - if (UTIL_isDirectory(path)) { - nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks); /* Recursively call "UTIL_prepareFileList" with the new path. */ - if (*bufStart == NULL) { free(path); closedir(dir); return 0; } - } else { - if (*bufStart + *pos + pathLength >= *bufEnd) { - ptrdiff_t newListSize = (*bufEnd - *bufStart) + LIST_SIZE_INCREASE; - *bufStart = (char*)UTIL_realloc(*bufStart, newListSize); - *bufEnd = *bufStart + newListSize; - if (*bufStart == NULL) { free(path); closedir(dir); return 0; } - } - if (*bufStart + *pos + pathLength < *bufEnd) { - strncpy(*bufStart + *pos, path, *bufEnd - (*bufStart + *pos)); - *pos += pathLength + 1; - nbFiles++; - } - } - free(path); - errno = 0; /* clear errno after UTIL_isDirectory, UTIL_prepareFileList */ - } - - if (errno != 0) { - UTIL_DISPLAYLEVEL(1, "readdir(%s) error: %s\n", dirName, strerror(errno)); - free(*bufStart); - *bufStart = NULL; - } - closedir(dir); - return nbFiles; -} - #else - -UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks) -{ - (void)bufStart; (void)bufEnd; (void)pos; - UTIL_DISPLAYLEVEL(1, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName); - return 0; -} - #endif /* #ifdef _WIN32 */ /* @@ -509,53 +209,10 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_ * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer) * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called. 
*/ -UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks) -{ - size_t pos; - unsigned i, nbFiles; - char* buf = (char*)malloc(LIST_SIZE_INCREASE); - char* bufend = buf + LIST_SIZE_INCREASE; - const char** fileTable; - - if (!buf) return NULL; - - for (i=0, pos=0, nbFiles=0; i= bufend) { - ptrdiff_t newListSize = (bufend - buf) + LIST_SIZE_INCREASE; - buf = (char*)UTIL_realloc(buf, newListSize); - bufend = buf + newListSize; - if (!buf) return NULL; - } - if (buf + pos + len < bufend) { - strncpy(buf + pos, inputNames[i], bufend - (buf + pos)); - pos += len + 1; - nbFiles++; - } - } else { - nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks); - if (buf == NULL) return NULL; - } } - - if (nbFiles == 0) { free(buf); return NULL; } - - fileTable = (const char**)malloc((nbFiles+1) * sizeof(const char*)); - if (!fileTable) { free(buf); return NULL; } - - for (i=0, pos=0; i bufend) { free(buf); free((void*)fileTable); return NULL; } - - *allocatedBuffer = buf; - *allocatedNamesNb = nbFiles; - - return fileTable; -} - +const char** +UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, + char** allocatedBuffer, unsigned* allocatedNamesNb, + int followLinks); UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBuffer) { @@ -563,201 +220,7 @@ UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBu if (filenameTable) free((void*)filenameTable); } -/* count the number of physical cores */ -#if defined(_WIN32) || defined(WIN32) - -#include - -typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); - -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - if (numPhysicalCores != 0) return numPhysicalCores; - - { LPFN_GLPI glpi; - BOOL done = FALSE; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL; - DWORD returnLength = 0; - size_t byteOffset = 0; - - glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), - "GetLogicalProcessorInformation"); - - if (glpi == NULL) { - goto failed; - } - - while(!done) { - DWORD rc = glpi(buffer, &returnLength); - if (FALSE == rc) { - if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { - if (buffer) - free(buffer); - buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength); - - if (buffer == NULL) { - perror("zstd"); - exit(1); - } - } else { - /* some other error */ - goto failed; - } - } else { - done = TRUE; - } - } - - ptr = buffer; - - while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { - - if (ptr->Relationship == RelationProcessorCore) { - numPhysicalCores++; - } - - ptr++; - byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - } - - free(buffer); - - return numPhysicalCores; - } - -failed: - /* try to fall back on GetSystemInfo */ - { SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - numPhysicalCores = sysinfo.dwNumberOfProcessors; - if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */ - } - return numPhysicalCores; -} - -#elif defined(__APPLE__) - -#include - -/* Use apple-provided syscall - * see: man 3 sysctl */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static S32 numPhysicalCores = 0; /* apple specifies int32_t */ - if (numPhysicalCores != 0) return numPhysicalCores; - - { size_t size = sizeof(S32); - int const ret = sysctlbyname("hw.physicalcpu", 
&numPhysicalCores, &size, NULL, 0); - if (ret != 0) { - if (errno == ENOENT) { - /* entry not present, fall back on 1 */ - numPhysicalCores = 1; - } else { - perror("zstd: can't get number of physical cpus"); - exit(1); - } - } - - return numPhysicalCores; - } -} - -#elif defined(__linux__) - -/* parse /proc/cpuinfo - * siblings / cpu cores should give hyperthreading ratio - * otherwise fall back on sysconf */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - - if (numPhysicalCores != 0) return numPhysicalCores; - - numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); - if (numPhysicalCores == -1) { - /* value not queryable, fall back on 1 */ - return numPhysicalCores = 1; - } - - /* try to determine if there's hyperthreading */ - { FILE* const cpuinfo = fopen("/proc/cpuinfo", "r"); -#define BUF_SIZE 80 - char buff[BUF_SIZE]; - - int siblings = 0; - int cpu_cores = 0; - int ratio = 1; - - if (cpuinfo == NULL) { - /* fall back on the sysconf value */ - return numPhysicalCores; - } - - /* assume the cpu cores/siblings values will be constant across all - * present processors */ - while (!feof(cpuinfo)) { - if (fgets(buff, BUF_SIZE, cpuinfo) != NULL) { - if (strncmp(buff, "siblings", 8) == 0) { - const char* const sep = strchr(buff, ':'); - if (*sep == '\0') { - /* formatting was broken? */ - goto failed; - } - - siblings = atoi(sep + 1); - } - if (strncmp(buff, "cpu cores", 9) == 0) { - const char* const sep = strchr(buff, ':'); - if (*sep == '\0') { - /* formatting was broken? */ - goto failed; - } - - cpu_cores = atoi(sep + 1); - } - } else if (ferror(cpuinfo)) { - /* fall back on the sysconf value */ - goto failed; - } - } - if (siblings && cpu_cores) { - ratio = siblings / cpu_cores; - } -failed: - fclose(cpuinfo); - return numPhysicalCores = numPhysicalCores / ratio; - } -} - -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) - -/* Use apple-provided syscall - * see: man 3 sysctl */ -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - static int numPhysicalCores = 0; - - if (numPhysicalCores != 0) return numPhysicalCores; - - numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN); - if (numPhysicalCores == -1) { - /* value not queryable, fall back on 1 */ - return numPhysicalCores = 1; - } - return numPhysicalCores; -} - -#else - -UTIL_STATIC int UTIL_countPhysicalCores(void) -{ - /* assume 1 */ - return 1; -} - -#endif +int UTIL_countPhysicalCores(void); #if defined (__cplusplus) } diff --git a/CPP/7zip/7zip.mak b/CPP/7zip/7zip.mak index e759328c..582dfd85 100644 --- a/CPP/7zip/7zip.mak +++ b/CPP/7zip/7zip.mak @@ -212,7 +212,7 @@ $(ZSTDMT_OBJS): ../../../../C/zstdmt/$(*B).c !IFDEF FASTLZMA2_OBJS $(FASTLZMA2_OBJS): ../../../../C/fast-lzma2/$(*B).c - $(COMPL_O2) -DNO_XXHASH + $(COMPL_O2) -DNO_XXHASH -DFL2_7ZIP_BUILD !ENDIF @@ -298,7 +298,7 @@ $(FASTLZMA2_OBJS): ../../../../C/fast-lzma2/$(*B).c -I ../../../../C/lz5 \ -I ../../../../C/zstd {../../../../C/fast-lzma2}.c{$O}.obj:: - $(COMPLB_O2) -DNO_XXHASH + $(COMPLB_O2) -DNO_XXHASH -DFL2_7ZIP_BUILD !ENDIF diff --git a/CPP/7zip/Bundles/Alone/makefile b/CPP/7zip/Bundles/Alone/makefile index 43524082..f68a4296 100644 --- a/CPP/7zip/Bundles/Alone/makefile +++ b/CPP/7zip/Bundles/Alone/makefile @@ -322,16 +322,17 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ 
$O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../UI/Console/Console.mak" diff --git a/CPP/7zip/Bundles/Codec_flzma2/makefile b/CPP/7zip/Bundles/Codec_flzma2/makefile index 30b42cb2..ef34c5d4 100644 --- a/CPP/7zip/Bundles/Codec_flzma2/makefile +++ b/CPP/7zip/Bundles/Codec_flzma2/makefile @@ -36,15 +36,16 @@ COMPRESS_OBJS = $(COMPRESS_OBJS) \ $O\FastLzma2Register.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7z/makefile b/CPP/7zip/Bundles/Format7z/makefile index dafcf4ae..1db7800b 100644 --- a/CPP/7zip/Bundles/Format7z/makefile +++ b/CPP/7zip/Bundles/Format7z/makefile @@ -244,16 +244,17 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ diff --git a/CPP/7zip/Bundles/Format7zF/makefile b/CPP/7zip/Bundles/Format7zF/makefile index ecf51975..f23b4250 100644 --- a/CPP/7zip/Bundles/Format7zF/makefile +++ b/CPP/7zip/Bundles/Format7zF/makefile @@ -119,15 +119,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zFO/makefile b/CPP/7zip/Bundles/Format7zFO/makefile index 5cc2b44d..799356c1 100644 --- a/CPP/7zip/Bundles/Format7zFO/makefile +++ b/CPP/7zip/Bundles/Format7zFO/makefile @@ -119,15 +119,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zUSB/makefile b/CPP/7zip/Bundles/Format7zUSB/makefile index 58c45033..68301d24 100644 --- a/CPP/7zip/Bundles/Format7zUSB/makefile +++ b/CPP/7zip/Bundles/Format7zUSB/makefile @@ -236,15 +236,16 @@ ZSTDMT_OBJS = \ $O\zstd-mt_threading.obj \ FASTLZMA2_OBJS = \ - $O\fl2_error_private.obj \ - $O\fl2pool.obj \ - $O\fl2threading.obj \ + $O\dict_buffer.obj \ $O\fl2_common.obj \ $O\fl2_compress.obj \ + $O\fl2_pool.obj \ + $O\fl2_threading.obj \ $O\lzma2_enc.obj \ $O\radix_bitpack.obj \ $O\radix_mf.obj \ $O\radix_struct.obj \ $O\range_enc.obj \ + $O\util.obj \ !include "../../7zip.mak" diff --git a/CPP/7zip/Compress/Lzma2Encoder.cpp b/CPP/7zip/Compress/Lzma2Encoder.cpp index 5eb88d50..d8c5e0f9 100644 --- a/CPP/7zip/Compress/Lzma2Encoder.cpp +++ b/CPP/7zip/Compress/Lzma2Encoder.cpp @@ -121,23 +121,39 @@ STDMETHODIMP CEncoder::Code(ISequentialInStream *inStream, ISequentialOutStream return 
SResToHRESULT(res); } -CFastEncoder::CFastEncoder() +static HRESULT TranslateError(size_t res) { - _encoder = NULL; - reduceSize = 0; + if (FL2_getErrorCode(res) == FL2_error_memory_allocation) + return E_OUTOFMEMORY; + return S_FALSE; } -CFastEncoder::~CFastEncoder() +#define CHECK_S(f_) do { \ + size_t r_ = f_; \ + if (FL2_isError(r_)) \ + return TranslateError(r_); \ +} while (false) + +#define CHECK_H(f_) do { \ + HRESULT r_ = f_; \ + if (r_ != S_OK) \ + return r_; \ +} while (false) + +#define CHECK_P(f) if (FL2_isError(f)) return E_INVALIDARG; /* check and convert error code */ + +CFastEncoder::FastLzma2::FastLzma2() + : fcs(NULL), + dict_pos(0) { - if (_encoder) - FL2_freeCCtx(_encoder); } +CFastEncoder::FastLzma2::~FastLzma2() +{ + FL2_freeCCtx(fcs); +} -#define CHECK_F(f) if (FL2_isError(f)) return E_INVALIDARG; /* check and convert error code */ - -STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, - const PROPVARIANT *coderProps, UInt32 numProps) +HRESULT CFastEncoder::FastLzma2::SetCoderProperties(const PROPID *propIDs, const PROPVARIANT *coderProps, UInt32 numProps) { CLzma2EncProps lzma2Props; Lzma2EncProps_Init(&lzma2Props); @@ -146,56 +162,165 @@ STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, { RINOK(SetLzma2Prop(propIDs[i], coderProps[i], lzma2Props)); } - if (_encoder == NULL) { - _encoder = FL2_createCCtxMt(lzma2Props.numTotalThreads); - if (_encoder == NULL) + if (fcs == NULL) { + fcs = FL2_createCStreamMt(lzma2Props.numTotalThreads, 1); + if (fcs == NULL) return E_OUTOFMEMORY; } if (lzma2Props.lzmaProps.algo > 2) { if (lzma2Props.lzmaProps.algo > 3) return E_INVALIDARG; lzma2Props.lzmaProps.algo = 2; - FL2_CCtx_setParameter(_encoder, FL2_p_highCompression, 1); - FL2_CCtx_setParameter(_encoder, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); + FL2_CCtx_setParameter(fcs, FL2_p_highCompression, 1); + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); } else { - FL2_CCtx_setParameter(_encoder, FL2_p_7zLevel, lzma2Props.lzmaProps.level); + FL2_CCtx_setParameter(fcs, FL2_p_compressionLevel, lzma2Props.lzmaProps.level); } - dictSize = lzma2Props.lzmaProps.dictSize; + size_t dictSize = lzma2Props.lzmaProps.dictSize; if (!dictSize) { - dictSize = (UInt32)1 << FL2_CCtx_setParameter(_encoder, FL2_p_dictionaryLog, 0); + dictSize = (UInt32)FL2_CCtx_getParameter(fcs, FL2_p_dictionarySize); } - reduceSize = lzma2Props.lzmaProps.reduceSize; + size_t reduceSize = lzma2Props.lzmaProps.reduceSize; reduceSize += (reduceSize < (UInt64)-1); /* prevent extra buffer shift after read */ dictSize = (UInt32)min(dictSize, reduceSize); - unsigned dictLog = FL2_DICTLOG_MIN; - while (((UInt32)1 << dictLog) < dictSize) - ++dictLog; - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_dictionaryLog, dictLog)); + dictSize = max(dictSize, FL2_DICTSIZE_MIN); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_dictionarySize, dictSize)); if (lzma2Props.lzmaProps.algo >= 0) { - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_strategy, (unsigned)lzma2Props.lzmaProps.algo)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_strategy, (unsigned)lzma2Props.lzmaProps.algo)); } if (lzma2Props.lzmaProps.fb > 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_fastLength, lzma2Props.lzmaProps.fb)); - if (lzma2Props.lzmaProps.mc) { - unsigned ml = 0; - while (((UInt32)1 << ml) < lzma2Props.lzmaProps.mc) - ++ml; - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_searchLog, ml)); - } + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_fastLength, lzma2Props.lzmaProps.fb)); + if 
(lzma2Props.lzmaProps.mc > 0) + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_hybridCycles, lzma2Props.lzmaProps.mc)); if (lzma2Props.lzmaProps.lc >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_literalCtxBits, lzma2Props.lzmaProps.lc)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_literalCtxBits, lzma2Props.lzmaProps.lc)); if (lzma2Props.lzmaProps.lp >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_literalPosBits, lzma2Props.lzmaProps.lp)); + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_literalPosBits, lzma2Props.lzmaProps.lp)); if (lzma2Props.lzmaProps.pb >= 0) - CHECK_F(FL2_CCtx_setParameter(_encoder, FL2_p_posBits, lzma2Props.lzmaProps.pb)); - FL2_CCtx_setParameter(_encoder, FL2_p_omitProperties, 1); -#ifndef NO_XXHASH - FL2_CCtx_setParameter(_encoder, FL2_p_doXXHash, 0); -#endif + CHECK_P(FL2_CCtx_setParameter(fcs, FL2_p_posBits, lzma2Props.lzmaProps.pb)); + FL2_CCtx_setParameter(fcs, FL2_p_omitProperties, 1); + FL2_setCStreamTimeout(fcs, 500); return S_OK; } +size_t CFastEncoder::FastLzma2::GetDictSize() const +{ + return FL2_CCtx_getParameter(fcs, FL2_p_dictionarySize); +} + +HRESULT CFastEncoder::FastLzma2::Begin() +{ + CHECK_S(FL2_initCStream(fcs, 0)); + CHECK_S(FL2_getDictionaryBuffer(fcs, &dict)); + dict_pos = 0; + return S_OK; +} + +BYTE* CFastEncoder::FastLzma2::GetAvailableBuffer(unsigned long& size) +{ + size = static_cast(dict.size - dict_pos); + return reinterpret_cast(dict.dst) + dict_pos; +} + +HRESULT CFastEncoder::FastLzma2::WaitAndReport(size_t& res, ICompressProgressInfo *progress) +{ + while (FL2_isTimedOut(res)) { + if (!UpdateProgress(progress)) + return S_FALSE; + res = FL2_waitCStream(fcs); + } + CHECK_S(res); + return S_OK; +} + +HRESULT CFastEncoder::FastLzma2::AddByteCount(size_t count, ISequentialOutStream *outStream, ICompressProgressInfo *progress) +{ + dict_pos += count; + if (dict_pos == dict.size) { + size_t res = FL2_updateDictionary(fcs, dict_pos); + CHECK_H(WaitAndReport(res, progress)); + if (res != 0) + CHECK_H(WriteBuffers(outStream)); + do { + res = FL2_getDictionaryBuffer(fcs, &dict); + } while (FL2_isTimedOut(res)); + CHECK_S(res); + dict_pos = 0; + } + if (!UpdateProgress(progress)) + return S_FALSE; + return S_OK; +} + +bool CFastEncoder::FastLzma2::UpdateProgress(ICompressProgressInfo *progress) +{ + if (progress) { + UInt64 outProcessed; + UInt64 inProcessed = FL2_getCStreamProgress(fcs, &outProcessed); + HRESULT err = progress->SetRatioInfo(&inProcessed, &outProcessed); + if (err != S_OK) { + FL2_cancelCStream(fcs); + return false; + } + } + return true; +} + +HRESULT CFastEncoder::FastLzma2::WriteBuffers(ISequentialOutStream *outStream) +{ + size_t csize; + for (;;) { + FL2_cBuffer cbuf; + // Waits if compression in progress + csize = FL2_getNextCStreamBuffer(fcs, &cbuf); + CHECK_S(csize); + if (csize == 0) + break; + HRESULT err = WriteStream(outStream, cbuf.src, cbuf.size); + if (err != S_OK) + return err; + } + return S_OK; +} + +HRESULT CFastEncoder::FastLzma2::End(ISequentialOutStream *outStream, ICompressProgressInfo *progress) +{ + if (dict_pos) { + size_t res = FL2_updateDictionary(fcs, dict_pos); + CHECK_H(WaitAndReport(res, progress)); + } + + size_t res = FL2_endStream(fcs, nullptr); + CHECK_H(WaitAndReport(res, progress)); + while (res) { + WriteBuffers(outStream); + res = FL2_endStream(fcs, nullptr); + CHECK_H(WaitAndReport(res, progress)); + } + return S_OK; +} + +void CFastEncoder::FastLzma2::Cancel() +{ + FL2_cancelCStream(fcs); +} + +CFastEncoder::CFastEncoder() +{ +} + +CFastEncoder::~CFastEncoder() +{ +} + + 
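Editor's note: the old encoder fed whole blocks to FL2_compressCCtxBlock_toFn() through write/progress callbacks, while the FastLzma2 wrapper above drives the library's streaming, dictionary-buffer interface instead. The following is a minimal sketch of that loop, built only from the FL2 calls visible in this patch (FL2_initCStream, FL2_getDictionaryBuffer, FL2_updateDictionary, FL2_getNextCStreamBuffer, FL2_endStream). It assumes the FL2_CStream was created elsewhere with FL2_createCStreamMt(), omits the 500 ms timeout, progress reporting and cancellation that the wrapper adds via FL2_setCStreamTimeout()/FL2_isTimedOut()/FL2_waitCStream(), and uses hypothetical readBlock/writeBlock callbacks in place of the 7-Zip stream wrappers; treat it as an outline, not the exact implementation.

#include <stddef.h>
#include "fast-lzma2.h"

/* Write out any compressed buffers the worker threads have finished. */
static int DrainOutput(FL2_CStream* fcs,
                       int (*writeBlock)(const void* src, size_t size))
{
    FL2_cBuffer cbuf;
    size_t csize;
    while ((csize = FL2_getNextCStreamBuffer(fcs, &cbuf)) != 0) {
        if (FL2_isError(csize))
            return -1;
        if (writeBlock(cbuf.src, cbuf.size) != 0)
            return -1;
    }
    return 0;
}

/* Minimal streaming loop; fcs comes from FL2_createCStreamMt() with its
   parameters already set. readBlock is assumed to return less than the
   requested capacity only at end of input. Returns 0 on success, -1 on error. */
static int StreamCompress(FL2_CStream* fcs,
                          size_t (*readBlock)(void* dst, size_t capacity),
                          int (*writeBlock)(const void* src, size_t size))
{
    if (FL2_isError(FL2_initCStream(fcs, 0)))       /* 0 = keep the configured parameters */
        return -1;

    for (;;) {
        FL2_dictBuffer dict;
        if (FL2_isError(FL2_getDictionaryBuffer(fcs, &dict)))   /* borrow the input (dictionary) buffer */
            return -1;

        size_t readSize = readBlock(dict.dst, dict.size);
        if (readSize != 0) {
            size_t res = FL2_updateDictionary(fcs, readSize);   /* hand the new bytes to the workers */
            if (FL2_isError(res))
                return -1;
            if (res != 0 && DrainOutput(fcs, writeBlock) != 0)  /* nonzero: compressed output is ready */
                return -1;
        }
        if (readSize < dict.size)
            break;                                   /* short read: end of input */
    }

    /* Flush: FL2_endStream() keeps returning nonzero while output remains. */
    for (;;) {
        size_t res = FL2_endStream(fcs, NULL);
        if (FL2_isError(res))
            return -1;
        if (res == 0)
            break;
        if (DrainOutput(fcs, writeBlock) != 0)
            return -1;
    }
    return 0;
}
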
+STDMETHODIMP CFastEncoder::SetCoderProperties(const PROPID *propIDs, + const PROPVARIANT *coderProps, UInt32 numProps) +{ + return _encoder.SetCoderProperties(propIDs, coderProps, numProps); +} + #define LZMA2_DIC_SIZE_FROM_PROP(p) (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)) @@ -203,6 +328,7 @@ STDMETHODIMP CFastEncoder::WriteCoderProperties(ISequentialOutStream *outStream) { Byte prop; unsigned i; + size_t dictSize = _encoder.GetDictSize(); for (i = 0; i < 40; i++) if (dictSize <= LZMA2_DIC_SIZE_FROM_PROP(i)) break; @@ -211,79 +337,29 @@ STDMETHODIMP CFastEncoder::WriteCoderProperties(ISequentialOutStream *outStream) } -typedef struct -{ - ISequentialOutStream* outStream; - ICompressProgressInfo* progress; - UInt64 in_processed; - UInt64 out_processed; - HRESULT res; -} EncodingObjects; - -static int FL2LIB_CALL Progress(size_t done, void* opaque) -{ - EncodingObjects* p = (EncodingObjects*)opaque; - if (p && p->progress) { - UInt64 in_processed = p->in_processed + done; - p->res = p->progress->SetRatioInfo(&in_processed, &p->out_processed); - return p->res != S_OK; - } - return 0; -} - -static int FL2LIB_CALL Write(const void* src, size_t srcSize, void* opaque) -{ - EncodingObjects* p = (EncodingObjects*)opaque; - p->res = WriteStream(p->outStream, src, srcSize); - return p->res != S_OK; -} - STDMETHODIMP CFastEncoder::Code(ISequentialInStream *inStream, ISequentialOutStream *outStream, const UInt64 * /* inSize */, const UInt64 * /* outSize */, ICompressProgressInfo *progress) { - HRESULT err = S_OK; - inBuffer.AllocAtLeast(dictSize); - EncodingObjects objs = { outStream, progress, 0, 0, S_OK }; - FL2_blockBuffer block = { inBuffer, 0, 0, dictSize }; + CHECK_H(_encoder.Begin()); + size_t inSize; + unsigned long dSize; do { - FL2_shiftBlock(_encoder, &block); - size_t inSize = dictSize - block.start; - err = ReadStream(inStream, inBuffer + block.start, &inSize); - if (err != S_OK) - break; - block.end += inSize; - if (inSize) { - size_t cSize = FL2_compressCCtxBlock_toFn(_encoder, Write, &objs, &block, Progress); - if (FL2_isError(cSize)) { - if (FL2_getErrorCode(cSize) == FL2_error_memory_allocation) - return E_OUTOFMEMORY; - return objs.res != S_OK ? 
objs.res : S_FALSE; - } - if (objs.res != S_OK) - return objs.res; - objs.out_processed += cSize; - objs.in_processed += inSize; - if (progress) { - err = progress->SetRatioInfo(&objs.in_processed, &objs.out_processed); - if (err != S_OK) - break; - } - if (block.end < dictSize) - break; + BYTE* dict = _encoder.GetAvailableBuffer(dSize); + + inSize = dSize; + HRESULT err = ReadStream(inStream, dict, &inSize); + if (err != S_OK) { + _encoder.Cancel(); + return err; } - else break; + CHECK_H(_encoder.AddByteCount(inSize, outStream, progress)); - } while (err == S_OK); + } while (inSize == dSize); - if (err == S_OK) { - size_t cSize = FL2_endFrame_toFn(_encoder, Write, &objs); - if (FL2_isError(cSize)) - return S_FALSE; - objs.out_processed += cSize; - err = objs.res; - } - return err; + CHECK_H(_encoder.End(outStream, progress)); + + return S_OK; } }} diff --git a/CPP/7zip/Compress/Lzma2Encoder.h b/CPP/7zip/Compress/Lzma2Encoder.h index 4279825f..734c697b 100644 --- a/CPP/7zip/Compress/Lzma2Encoder.h +++ b/CPP/7zip/Compress/Lzma2Encoder.h @@ -24,13 +24,13 @@ class CEncoder: CLzma2EncHandle _encoder; public: MY_UNKNOWN_IMP4( - ICompressCoder, - ICompressSetCoderProperties, - ICompressWriteCoderProperties, - ICompressSetCoderPropertiesOpt) + ICompressCoder, + ICompressSetCoderProperties, + ICompressWriteCoderProperties, + ICompressSetCoderPropertiesOpt) STDMETHOD(Code)(ISequentialInStream *inStream, ISequentialOutStream *outStream, - const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); + const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); STDMETHOD(SetCoderProperties)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); STDMETHOD(WriteCoderProperties)(ISequentialOutStream *outStream); STDMETHOD(SetCoderPropertiesOpt)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); @@ -45,10 +45,33 @@ class CFastEncoder : public ICompressWriteCoderProperties, public CMyUnknownImp { - FL2_CCtx* _encoder; - CByteBuffer inBuffer; - UInt64 reduceSize; - UInt32 dictSize; + class FastLzma2 + { + public: + FastLzma2(); + ~FastLzma2(); + HRESULT SetCoderProperties(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); + size_t GetDictSize() const; + HRESULT Begin(); + BYTE* GetAvailableBuffer(unsigned long& size); + HRESULT AddByteCount(size_t count, ISequentialOutStream *outStream, ICompressProgressInfo *progress); + HRESULT End(ISequentialOutStream *outStream, ICompressProgressInfo *progress); + void Cancel(); + + private: + bool UpdateProgress(ICompressProgressInfo *progress); + HRESULT WaitAndReport(size_t& res, ICompressProgressInfo *progress); + HRESULT WriteBuffers(ISequentialOutStream *outStream); + + FL2_CStream* fcs; + FL2_dictBuffer dict; + size_t dict_pos; + + FastLzma2(const FastLzma2&) = delete; + FastLzma2& operator=(const FastLzma2&) = delete; + }; + + FastLzma2 _encoder; public: MY_UNKNOWN_IMP3( @@ -57,7 +80,7 @@ public: ICompressWriteCoderProperties) STDMETHOD(Code)(ISequentialInStream *inStream, ISequentialOutStream *outStream, - const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); + const UInt64 *inSize, const UInt64 *outSize, ICompressProgressInfo *progress); STDMETHOD(SetCoderProperties)(const PROPID *propIDs, const PROPVARIANT *props, UInt32 numProps); STDMETHOD(WriteCoderProperties)(ISequentialOutStream *outStream); diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp b/CPP/7zip/UI/GUI/CompressDialog.cpp index c47228fb..5e0fda2f 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp 
+++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1410,7 +1410,7 @@ typedef enum { } FL2_strategy; typedef struct { - unsigned dictionaryLog; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory, slower */ + UInt32 dictionarySize; /* largest match distance : larger == more compression, more memory needed during decompression; >= 27 == more memory per byte, slower */ unsigned overlapFraction; /* overlap between consecutive blocks in 1/16 units: larger == more compression, slower */ unsigned chainLog; /* fully searched segment : larger == more compression, slower, more memory; hybrid mode only (ultra) */ unsigned searchLog; /* nb of searches : larger == more compression, slower; hybrid mode only (ultra) */ @@ -1424,19 +1424,23 @@ typedef struct { #define FL2_MAX_7Z_CLEVEL 9 +#define MB *(1U<<20) + static const FL2_compressionParameters FL2_7zCParameters[FL2_MAX_7Z_CLEVEL + 1] = { - { 0,0,0,0,0,0,0 }, - { 20, 1, 7, 0, 6, 32, 1, 8, FL2_fast }, /* 1 */ - { 20, 2, 7, 0, 12, 32, 1, 8, FL2_fast }, /* 2 */ - { 21, 2, 7, 0, 16, 32, 1, 8, FL2_fast }, /* 3 */ - { 20, 2, 7, 0, 16, 32, 1, 8, FL2_opt }, /* 4 */ - { 24, 2, 9, 0, 40, 48, 1, 8, FL2_ultra }, /* 5 */ - { 25, 2, 10, 0, 48, 64, 1, 8, FL2_ultra }, /* 6 */ - { 26, 2, 11, 1, 60, 96, 1, 9, FL2_ultra }, /* 7 */ - { 27, 2, 12, 2, 128, 128, 1, 10, FL2_ultra }, /* 8 */ - { 27, 3, 14, 3, 252, 160, 0, 10, FL2_ultra } /* 9 */ + { 0,0,0,0,0,0,0,0,FL2_fast }, + { 1 MB, 1, 7, 0, 6, 32, 1, 4, FL2_fast }, /* 1 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_fast }, /* 2 */ + { 2 MB, 2, 7, 0, 10, 32, 1, 4, FL2_opt }, /* 3 */ + { 4 MB, 2, 7, 0, 14, 32, 1, 4, FL2_opt }, /* 4 */ + { 16 MB, 2, 9, 0, 42, 48, 1, 4, FL2_ultra }, /* 5 */ + { 32 MB, 2, 10, 0, 50, 64, 1, 4, FL2_ultra }, /* 6 */ + { 64 MB, 2, 11, 1, 62, 96, 1, 3, FL2_ultra }, /* 7 */ + { 64 MB, 4, 12, 2, 90, 273, 1, 3, FL2_ultra }, /* 8 */ + { 128 MB, 2, 14, 3, 254, 273, 0, 2, FL2_ultra } /* 9 */ }; +#undef MB + #define RMF_BUILDER_SIZE (8 * 0x40100U) void CCompressDialog::SetDictionary() @@ -1512,7 +1516,7 @@ void CCompressDialog::SetDictionary() if (level > FL2_MAX_7Z_CLEVEL) level = FL2_MAX_7Z_CLEVEL; if (defaultDict == (UInt32)(Int32)-1) - defaultDict = (UInt32)1 << FL2_7zCParameters[level].dictionaryLog; + defaultDict = FL2_7zCParameters[level].dictionarySize; m_Dictionary.SetCurSel(0); @@ -2020,11 +2024,11 @@ UInt64 CCompressDialog::GetMemoryUsage(UInt32 dict, UInt64 &decompressMemory) { if (level > FL2_MAX_7Z_CLEVEL) level = FL2_MAX_7Z_CLEVEL; - size += dict * 5 + (1UL << 18) * numThreads; - unsigned depth = FL2_7zCParameters[level].searchDepth; - UInt32 bufSize = UInt32(1) << (FL2_7zCParameters[level].dictionaryLog - FL2_7zCParameters[level].bufferLog); + /* dual buffer is enabled in Lzma2Encoder.cpp so size is dict * 6 */ + size += dict * 6 + (1UL << 18) * numThreads; + UInt32 bufSize = dict >> (12 - FL2_7zCParameters[level].bufferLog); size += (bufSize * 12 + RMF_BUILDER_SIZE) * numThreads; - if (dict > (UInt32(1) << 26) || depth > 63) + if (dict > (UInt32(1) << 26)) size += dict; if (FL2_7zCParameters[level].strategy == FL2_ultra) size += (UInt32(4) << 14) + (UInt32(4) << FL2_7zCParameters[level].chainLog);
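
Editor's note, as a worked example of the revised estimate (illustrative only: numThreads is taken as 2 here, and whatever GetMemoryUsage() has already accumulated in size before this fragment is ignored): at level 9 the table above gives a 128 MB dictionary, bufferLog 2, chainLog 14 and FL2_ultra, so dict * 6 contributes 768 MB, (1UL << 18) * numThreads adds 0.5 MB, bufSize becomes 128 MB >> 10 = 128 KB so (bufSize * 12 + RMF_BUILDER_SIZE) * numThreads adds roughly 7 MB, the dict > 64 MB branch adds another 128 MB, and the FL2_ultra term adds about 128 KB, for a little over 900 MB on the compression side.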