Files
easy7zip/7zip/Compress/Huffman/HuffmanEncoder.cpp
Igor Pavlov bd9a40b0ed 4.40 beta
2016-05-28 00:15:48 +01:00

326 lines
10 KiB
C++
Executable File

// Compression/HuffmanEncoder.cpp
#include "StdAfx.h"
#include "HuffmanEncoder.h"
#include "Common/Defs.h"
#include "Common/Alloc.h"
namespace NCompression {
namespace NHuffman {
static const char *kIncorrectBitLenCountsMessage = "Incorrect bit len counts";
CEncoder::CEncoder():
m_Items(0),
m_Heap(0),
m_Depth(0)
{}
void CEncoder::Free()
{
MyFree(m_Items);
MyFree(m_Heap);
MyFree(m_Depth);
m_Items = 0;
m_Heap = 0;
m_Depth = 0;
}
bool CEncoder::Create(UInt32 numSymbols,
const Byte *extraBits, UInt32 extraBase, UInt32 maxLength)
{
m_NumSymbols = numSymbols;
m_ExtraBits = extraBits;
m_ExtraBase = extraBase;
m_MaxLength = maxLength;
m_HeapSize = numSymbols * 2 + 1;
Free();
m_Items = (CItem *)MyAlloc(m_HeapSize * sizeof(CItem));
m_Heap = (UInt32 *)MyAlloc(m_HeapSize * sizeof(UInt32));
m_Depth = (Byte *)MyAlloc(m_HeapSize * sizeof(Byte));
if (m_Items == 0 || m_Heap == 0 || m_Depth == 0)
{
Free();
return false;
}
return true;
}
CEncoder::~CEncoder()
{
Free();
}
void CEncoder::StartNewBlock()
{
for (UInt32 i = 0; i < m_NumSymbols; i++)
m_Items[i].Freq = 0;
}
static const int kSmallest = 1;
// ===========================================================================
// Remove the smallest element from the heap and recreate the heap with
// one less element. Updates heap and m_HeapLength.
UInt32 CEncoder::RemoveSmallest()
{
UInt32 top = m_Heap[kSmallest];
m_Heap[kSmallest] = m_Heap[m_HeapLength--];
DownHeap(kSmallest);
return top;
}
// ===========================================================================
// Compares to subtrees, using the tree m_Depth as tie breaker when
// the subtrees have equal frequency. This minimizes the worst case length.
bool CEncoder::Smaller(int n, int m)
{
return (m_Items[n].Freq < m_Items[m].Freq ||
(m_Items[n].Freq == m_Items[m].Freq && m_Depth[n] <= m_Depth[m]));
}
// ===========================================================================
// Restore the m_Heap property by moving down the tree starting at node k,
// exchanging a node with the smallest of its two sons if necessary, stopping
// when the m_Heap property is re-established (each father CompareFreqs than its
// two sons).
void CEncoder::DownHeap(UInt32 k)
{
UInt32 symbol = m_Heap[k];
for (UInt32 j = k << 1; j <= m_HeapLength;) // j: left son of k
{
// Set j to the smallest of the two sons:
if (j < m_HeapLength && Smaller(m_Heap[j+1], m_Heap[j]))
j++;
UInt32 htemp = m_Heap[j]; // htemp required because of bug in SASC compiler
if (Smaller(symbol, htemp)) // Exit if v is smaller than both sons
break;
m_Heap[k] = htemp; // Exchange v with the smallest son
k = j;
j <<= 1; // And continue down the tree, setting j to the left son of k
}
m_Heap[k] = symbol;
}
// ===========================================================================
// Compute the optimal bit lengths for a tree and update the total bit length
// for the current block.
// IN assertion: the fields freq and dad are set, heap[heapMax] and
// above are the tree nodes sorted by increasing frequency.
// OUT assertions: the field len is set to the optimal bit length, the
// array m_BitLenCounters contains the frequencies for each bit length.
// The length m_BlockBitLength is updated; static_len is also updated if stree is
// not null.
void CEncoder::GenerateBitLen(UInt32 maxCode, UInt32 heapMax)
{
int overflow = 0; // number of elements with bit length too large
for (UInt32 i = 0; i <= kNumBitsInLongestCode; i++)
m_BitLenCounters[i] = 0;
/* In a first pass, compute the optimal bit lengths (which may
* overflow in the case of the bit length tree).
*/
m_Items[m_Heap[heapMax]].Len = 0; /* root of the heap */
UInt32 h; /* heap index */
for (h = heapMax+1; h < m_HeapSize; h++)
{
UInt32 symbol = m_Heap[h];
UInt32 len = m_Items[m_Items[symbol].Dad].Len + 1;
if (len > m_MaxLength)
{
len = m_MaxLength;
overflow++;
}
m_Items[symbol].Len = len; // We overwrite m_Items[symbol].Dad which is no longer needed
if (symbol > maxCode)
continue; // not a leaf node
m_BitLenCounters[len]++;
UInt32 extraBits;
if (m_ExtraBits != 0 && symbol >= m_ExtraBase)
extraBits = m_ExtraBits[symbol - m_ExtraBase];
else
extraBits = 0;
m_BlockBitLength += (m_Items[symbol].Freq * (len + extraBits));
}
if (overflow == 0)
return;
// This happens for example on obj2 and pic of the Calgary corpus
// Find the first bit length which could increase:
do
{
UInt32 bits = m_MaxLength-1;
while (m_BitLenCounters[bits] == 0)
bits--;
m_BitLenCounters[bits]--; // move one leaf down the m_Items
m_BitLenCounters[bits + 1] += 2; // move one overflow item as its brother
m_BitLenCounters[m_MaxLength]--;
// The brother of the overflow item also moves one step up,
// but this does not affect m_BitLenCounters[m_MaxLength]
overflow -= 2;
}
while (overflow > 0);
// Now recompute all bit lengths, scanning in increasing frequency.
// h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
// lengths instead of fixing only the wrong ones. This idea is taken
// from 'ar' written by Haruhiko Okumura.)
for (UInt32 bits = m_MaxLength; bits != 0; bits--)
{
UInt32 numNodes = m_BitLenCounters[bits];
while (numNodes != 0)
{
UInt32 m = m_Heap[--h];
if (m > maxCode)
continue;
if (m_Items[m].Len != (unsigned) bits)
{
m_BlockBitLength += ((long)bits - (long)m_Items[m].Len) * (long)m_Items[m].Freq;
m_Items[m].Len = bits;
}
numNodes--;
}
}
}
// ===========================================================================
// Generate the codes for a given tree and bit counts (which need not be
// optimal).
// IN assertion: the array m_BitLenCounters contains the bit length statistics for
// the given tree and the field len is set for all tree elements.
// OUT assertion: the field code is set for all tree elements of non
// zero code length.
// UInt32 maxCode = largest code with non zero frequency
void CEncoder::GenerateCodes(UInt32 maxCode)
{
UInt32 nextCodes[kNumBitsInLongestCode + 1]; // next code value for each bit length
UInt32 code = 0; // running code value
// The distribution counts are first used to generate the code values
// without bit reversal.
for (UInt32 bits = 1; bits <= kNumBitsInLongestCode; bits++)
nextCodes[bits] = code = (code + m_BitLenCounters[bits - 1]) << 1;
// Check that the bit counts in m_BitLenCounters are consistent. The last code
// must be all ones.
if (code + m_BitLenCounters[kNumBitsInLongestCode] - 1 != (1 << kNumBitsInLongestCode) - 1)
throw kIncorrectBitLenCountsMessage;
for (UInt32 n = 0; n <= maxCode; n++)
{
int len = m_Items[n].Len;
if (len == 0)
continue;
m_Items[n].Code = nextCodes[len]++;
}
}
// ===========================================================================
// Construct one Huffman tree and assigns the code bit strings and lengths.
// Update the total bit length for the current block.
// IN assertion: the field freq is set for all tree elements.
// OUT assertions: the fields len and code are set to the optimal bit length
// and corresponding code. The length m_BlockBitLength is updated; static_len is
// also updated if stree is not null. The field max_code is set.
void CEncoder::BuildTree(Byte *levels)
{
m_BlockBitLength = 0;
int maxCode = -1; // WAS = -1; largest code with non zero frequency */
// Construct the initial m_Heap, with least frequent element in
// m_Heap[kSmallest]. The sons of m_Heap[n] are m_Heap[2*n] and m_Heap[2*n+1].
// m_Heap[0] is not used.
//
m_HeapLength = 0;
UInt32 n; // iterate over m_Heap elements
for (n = 0; n < m_NumSymbols; n++)
{
if (m_Items[n].Freq != 0)
{
m_Heap[++m_HeapLength] = maxCode = n;
m_Depth[n] = 0;
}
else
m_Items[n].Len = 0;
}
// The pkzip format requires that at least one distance code exists,
// and that at least one bit should be sent even if there is only one
// possible code. So to avoid special checks later on we force at least
// two codes of non zero frequency.
while (m_HeapLength < 2)
{
int aNewNode = m_Heap[++m_HeapLength] = (maxCode < 2 ? ++maxCode : 0);
m_Items[aNewNode].Freq = 1;
m_Depth[aNewNode] = 0;
m_BlockBitLength--;
// if (stree) static_len -= stree[aNewNode].Len;
// aNewNode is 0 or 1 so it does not have m_ExtraBits bits
}
// The elements m_Heap[m_HeapLength/2+1 .. m_HeapLength] are leaves of the m_Items,
// establish sub-heaps of increasing lengths:
for (n = m_HeapLength / 2; n >= 1; n--)
DownHeap(n);
// Construct the Huffman tree by repeatedly combining the least two
// frequent nodes.
int node = m_NumSymbols; // next internal node of the tree
UInt32 heapMax = m_NumSymbols * 2+ 1;
do
{
n = RemoveSmallest(); /* n = node of least frequency */
UInt32 m = m_Heap[kSmallest]; /* m = node of next least frequency */
m_Heap[--heapMax] = n; /* keep the nodes sorted by frequency */
m_Heap[--heapMax] = m;
// Create a new node father of n and m
m_Items[node].Freq = m_Items[n].Freq + m_Items[m].Freq;
m_Depth[node] = (Byte) (MyMax(m_Depth[n], m_Depth[m]) + 1);
m_Items[n].Dad = m_Items[m].Dad = node;
// and insert the new node in the m_Heap
m_Heap[kSmallest] = node++;
DownHeap(kSmallest);
}
while (m_HeapLength >= 2);
m_Heap[--heapMax] = m_Heap[kSmallest];
// At this point, the fields freq and dad are set. We can now
// generate the bit lengths.
GenerateBitLen(maxCode, heapMax);
// The field len is now set, we can generate the bit codes
GenerateCodes (maxCode);
for (n = 0; n < m_NumSymbols; n++)
levels[n] = Byte(m_Items[n].Len);
}
void CEncoder::ReverseBits()
{
for (UInt32 symbol = 0; symbol < m_NumSymbols; symbol++)
{
CItem &item = m_Items[symbol];
UInt32 value = item.Code;
UInt32 reverseValue = 0;
for(UInt32 i = item.Len; i != 0; i--, value >>= 1)
reverseValue = (reverseValue << 1) | (value & 1);
item.Code = reverseValue;
}
}
}}