// Compression/HuffmanEncoder.cpp #include "StdAfx.h" #include "HuffmanEncoder.h" #include "Common/Defs.h" #include "Common/Alloc.h" namespace NCompression { namespace NHuffman { static const char *kIncorrectBitLenCountsMessage = "Incorrect bit len counts"; CEncoder::CEncoder(): m_Items(0), m_Heap(0), m_Depth(0) {} void CEncoder::Free() { MyFree(m_Items); MyFree(m_Heap); MyFree(m_Depth); m_Items = 0; m_Heap = 0; m_Depth = 0; } bool CEncoder::Create(UInt32 numSymbols, const Byte *extraBits, UInt32 extraBase, UInt32 maxLength) { m_NumSymbols = numSymbols; m_ExtraBits = extraBits; m_ExtraBase = extraBase; m_MaxLength = maxLength; m_HeapSize = numSymbols * 2 + 1; Free(); m_Items = (CItem *)MyAlloc(m_HeapSize * sizeof(CItem)); m_Heap = (UInt32 *)MyAlloc(m_HeapSize * sizeof(UInt32)); m_Depth = (Byte *)MyAlloc(m_HeapSize * sizeof(Byte)); if (m_Items == 0 || m_Heap == 0 || m_Depth == 0) { Free(); return false; } return true; } CEncoder::~CEncoder() { Free(); } void CEncoder::StartNewBlock() { for (UInt32 i = 0; i < m_NumSymbols; i++) m_Items[i].Freq = 0; } void CEncoder::SetFreqs(const UInt32 *freqs) { for (UInt32 i = 0; i < m_NumSymbols; i++) m_Items[i].Freq = freqs[i]; } static const int kSmallest = 1; // =========================================================================== // Remove the smallest element from the heap and recreate the heap with // one less element. Updates heap and m_HeapLength. UInt32 CEncoder::RemoveSmallest() { UInt32 top = m_Heap[kSmallest]; m_Heap[kSmallest] = m_Heap[m_HeapLength--]; DownHeap(kSmallest); return top; } // =========================================================================== // Compares to subtrees, using the tree m_Depth as tie breaker when // the subtrees have equal frequency. This minimizes the worst case length. bool CEncoder::Smaller(int n, int m) { return (m_Items[n].Freq < m_Items[m].Freq || (m_Items[n].Freq == m_Items[m].Freq && m_Depth[n] <= m_Depth[m])); } // =========================================================================== // Restore the m_Heap property by moving down the tree starting at node k, // exchanging a node with the smallest of its two sons if necessary, stopping // when the m_Heap property is re-established (each father CompareFreqs than its // two sons). void CEncoder::DownHeap(UInt32 k) { UInt32 symbol = m_Heap[k]; for (UInt32 j = k << 1; j <= m_HeapLength;) // j: left son of k { // Set j to the smallest of the two sons: if (j < m_HeapLength && Smaller(m_Heap[j+1], m_Heap[j])) j++; UInt32 htemp = m_Heap[j]; // htemp required because of bug in SASC compiler if (Smaller(symbol, htemp)) // Exit if v is smaller than both sons break; m_Heap[k] = htemp; // Exchange v with the smallest son k = j; j <<= 1; // And continue down the tree, setting j to the left son of k } m_Heap[k] = symbol; } // =========================================================================== // Compute the optimal bit lengths for a tree and update the total bit length // for the current block. // IN assertion: the fields freq and dad are set, heap[heapMax] and // above are the tree nodes sorted by increasing frequency. // OUT assertions: the field len is set to the optimal bit length, the // array m_BitLenCounters contains the frequencies for each bit length. // The length m_BlockBitLength is updated; static_len is also updated if stree is // not null. void CEncoder::GenerateBitLen(UInt32 maxCode, UInt32 heapMax) { int overflow = 0; // number of elements with bit length too large for (UInt32 i = 0; i <= kNumBitsInLongestCode; i++) m_BitLenCounters[i] = 0; /* In a first pass, compute the optimal bit lengths (which may * overflow in the case of the bit length tree). */ m_Items[m_Heap[heapMax]].Len = 0; /* root of the heap */ UInt32 h; /* heap index */ for (h = heapMax+1; h < m_HeapSize; h++) { UInt32 symbol = m_Heap[h]; UInt32 len = m_Items[m_Items[symbol].Dad].Len + 1; if (len > m_MaxLength) { len = m_MaxLength; overflow++; } m_Items[symbol].Len = len; // We overwrite m_Items[symbol].Dad which is no longer needed if (symbol > maxCode) continue; // not a leaf node m_BitLenCounters[len]++; UInt32 extraBits; if (m_ExtraBits != 0 && symbol >= m_ExtraBase) extraBits = m_ExtraBits[symbol - m_ExtraBase]; else extraBits = 0; m_BlockBitLength += (m_Items[symbol].Freq * (len + extraBits)); } if (overflow == 0) return; // This happens for example on obj2 and pic of the Calgary corpus // Find the first bit length which could increase: do { UInt32 bits = m_MaxLength-1; while (m_BitLenCounters[bits] == 0) bits--; m_BitLenCounters[bits]--; // move one leaf down the m_Items m_BitLenCounters[bits + 1] += 2; // move one overflow item as its brother m_BitLenCounters[m_MaxLength]--; // The brother of the overflow item also moves one step up, // but this does not affect m_BitLenCounters[m_MaxLength] overflow -= 2; } while (overflow > 0); // Now recompute all bit lengths, scanning in increasing frequency. // h is still equal to HEAP_SIZE. (It is simpler to reconstruct all // lengths instead of fixing only the wrong ones. This idea is taken // from 'ar' written by Haruhiko Okumura.) for (UInt32 bits = m_MaxLength; bits != 0; bits--) { UInt32 numNodes = m_BitLenCounters[bits]; while (numNodes != 0) { UInt32 m = m_Heap[--h]; if (m > maxCode) continue; if (m_Items[m].Len != (unsigned) bits) { m_BlockBitLength += ((long)bits - (long)m_Items[m].Len) * (long)m_Items[m].Freq; m_Items[m].Len = bits; } numNodes--; } } } // =========================================================================== // Generate the codes for a given tree and bit counts (which need not be // optimal). // IN assertion: the array m_BitLenCounters contains the bit length statistics for // the given tree and the field len is set for all tree elements. // OUT assertion: the field code is set for all tree elements of non // zero code length. // UInt32 maxCode = largest code with non zero frequency void CEncoder::GenerateCodes(UInt32 maxCode) { UInt32 nextCodes[kNumBitsInLongestCode + 1]; // next code value for each bit length UInt32 code = 0; // running code value // The distribution counts are first used to generate the code values // without bit reversal. for (UInt32 bits = 1; bits <= kNumBitsInLongestCode; bits++) nextCodes[bits] = code = (code + m_BitLenCounters[bits - 1]) << 1; // Check that the bit counts in m_BitLenCounters are consistent. The last code // must be all ones. if (code + m_BitLenCounters[kNumBitsInLongestCode] - 1 != (1 << kNumBitsInLongestCode) - 1) throw kIncorrectBitLenCountsMessage; for (UInt32 n = 0; n <= maxCode; n++) { int len = m_Items[n].Len; if (len == 0) continue; m_Items[n].Code = nextCodes[len]++; } } // =========================================================================== // Construct one Huffman tree and assigns the code bit strings and lengths. // Update the total bit length for the current block. // IN assertion: the field freq is set for all tree elements. // OUT assertions: the fields len and code are set to the optimal bit length // and corresponding code. The length m_BlockBitLength is updated; static_len is // also updated if stree is not null. The field max_code is set. void CEncoder::BuildTree(Byte *levels) { m_BlockBitLength = 0; int maxCode = -1; // WAS = -1; largest code with non zero frequency */ // Construct the initial m_Heap, with least frequent element in // m_Heap[kSmallest]. The sons of m_Heap[n] are m_Heap[2*n] and m_Heap[2*n+1]. // m_Heap[0] is not used. // m_HeapLength = 0; UInt32 n; // iterate over m_Heap elements for (n = 0; n < m_NumSymbols; n++) { if (m_Items[n].Freq != 0) { m_Heap[++m_HeapLength] = maxCode = n; m_Depth[n] = 0; } else m_Items[n].Len = 0; } // The pkzip format requires that at least one distance code exists, // and that at least one bit should be sent even if there is only one // possible code. So to avoid special checks later on we force at least // two codes of non zero frequency. while (m_HeapLength < 2) { int aNewNode = m_Heap[++m_HeapLength] = (maxCode < 2 ? ++maxCode : 0); m_Items[aNewNode].Freq = 1; m_Depth[aNewNode] = 0; m_BlockBitLength--; // if (stree) static_len -= stree[aNewNode].Len; // aNewNode is 0 or 1 so it does not have m_ExtraBits bits } // The elements m_Heap[m_HeapLength/2+1 .. m_HeapLength] are leaves of the m_Items, // establish sub-heaps of increasing lengths: for (n = m_HeapLength / 2; n >= 1; n--) DownHeap(n); // Construct the Huffman tree by repeatedly combining the least two // frequent nodes. int node = m_NumSymbols; // next internal node of the tree UInt32 heapMax = m_NumSymbols * 2+ 1; do { n = RemoveSmallest(); /* n = node of least frequency */ UInt32 m = m_Heap[kSmallest]; /* m = node of next least frequency */ m_Heap[--heapMax] = n; /* keep the nodes sorted by frequency */ m_Heap[--heapMax] = m; // Create a new node father of n and m m_Items[node].Freq = m_Items[n].Freq + m_Items[m].Freq; m_Depth[node] = (Byte) (MyMax(m_Depth[n], m_Depth[m]) + 1); m_Items[n].Dad = m_Items[m].Dad = node; // and insert the new node in the m_Heap m_Heap[kSmallest] = node++; DownHeap(kSmallest); } while (m_HeapLength >= 2); m_Heap[--heapMax] = m_Heap[kSmallest]; // At this point, the fields freq and dad are set. We can now // generate the bit lengths. GenerateBitLen(maxCode, heapMax); // The field len is now set, we can generate the bit codes GenerateCodes (maxCode); for (n = 0; n < m_NumSymbols; n++) levels[n] = Byte(m_Items[n].Len); } }}