Huffman Compress

Huffman coding is a data compression algorithm. It is based on the idea that frequently-appearing letters should have shorter bit representations and less common letters should have longer representations.

For Huffman Decompress algorithm click here.



									/*****Please include following header files*****/
// stdint.h
/***********************************************/

#define MAX_TREE_NODES 511

typedef struct {
	uint8_t* BytePointer;
	uint32_t BitPosition;
} BitStream;

typedef struct {
	int Symbol;
	uint32_t Count;
	uint32_t Code;
	uint32_t Bits;
} Symbol;

typedef struct encodeNode {
	struct encodeNode* ChildA;
	struct encodeNode* ChildB;
	int Count;
	int Symbol;
} EncodeNode;

void initBitstream(BitStream* stream, uint8_t* buffer)
{
	stream->BytePointer = buffer;
	stream->BitPosition = 0;
}

void writeBits(BitStream* stream, uint32_t x, uint32_t bits)
{
	uint8_t* buffer = stream->BytePointer;
	uint32_t bit = stream->BitPosition;
	uint32_t mask = 1 << (bits - 1);

	for (uint32_t count = 0; count < bits; ++count)
	{
		*buffer = (*buffer & (0xff ^ (1 << (7 - bit)))) + ((x & mask ? 1 : 0) << (7 - bit));
		x <<= 1;
		bit = (bit + 1) & 7;

		if (!bit)
		{
			++buffer;
		}
	}

	stream->BytePointer = buffer;
	stream->BitPosition = bit;
}

void histogram(uint8_t* input, Symbol* sym, uint32_t size)
{
	int i;

	for (i = 0; i < 256; ++i)
	{
		sym[i].Symbol = i;
		sym[i].Count = 0;
		sym[i].Code = 0;
		sym[i].Bits = 0;
	}

	for (i = size; i; --i)
	{
		sym[*input++].Count++;
	}
}

void storeTree(EncodeNode* node, Symbol* sym, BitStream* stream, uint32_t code, uint32_t bits)
{
	uint32_t symbolIndex;

	if (node->Symbol >= 0)
	{
		writeBits(stream, 1, 1);
		writeBits(stream, node->Symbol, 8);

		for (symbolIndex = 0; symbolIndex < 256; ++symbolIndex)
		{
			if (sym[symbolIndex].Symbol == node->Symbol)
				break;
		}

		sym[symbolIndex].Code = code;
		sym[symbolIndex].Bits = bits;
		return;
	}
	else
	{
		writeBits(stream, 0, 1);
	}

	storeTree(node->ChildA, sym, stream, (code << 1) + 0, bits + 1);
	storeTree(node->ChildB, sym, stream, (code << 1) + 1, bits + 1);
}

void makeTree(Symbol* sym, BitStream* stream)
{
	EncodeNode nodes[MAX_TREE_NODES], *node1, *node2, *root;
	uint32_t i, numSymbols = 0, nodesLeft, nextIndex;

	for (i = 0; i < 256; ++i)
	{
		if (sym[i].Count > 0)
		{
			nodes[numSymbols].Symbol = sym[i].Symbol;
			nodes[numSymbols].Count = sym[i].Count;
			nodes[numSymbols].ChildA = (EncodeNode *)0;
			nodes[numSymbols].ChildB = (EncodeNode *)0;
			++numSymbols;
		}
	}

	root = (EncodeNode*)0;
	nodesLeft = numSymbols;
	nextIndex = numSymbols;

	while (nodesLeft > 1)
	{
		node1 = (EncodeNode *)0;
		node2 = (EncodeNode *)0;

		for (i = 0; i < nextIndex; ++i)
		{
			if (nodes[i].Count > 0)
			{
				if (!node1 || (nodes[i].Count <= node1->Count))
				{
					node2 = node1;
					node1 = &nodes[i];
				}
				else if (!node2 || (nodes[i].Count <= node2->Count))
				{
					node2 = &nodes[i];
				}
			}
		}

		root = &nodes[nextIndex];
		root->ChildA = node1;
		root->ChildB = node2;
		root->Count = node1->Count + node2->Count;
		root->Symbol = -1;
		node1->Count = 0;
		node2->Count = 0;
		++nextIndex;
		--nodesLeft;
	}

	if (root)
	{
		storeTree(root, sym, stream, 0, 0);
	}
	else
	{
		root = &nodes[0];
		storeTree(root, sym, stream, 0, 1);
	}
}

int Compress(uint8_t* input, uint8_t* output, uint32_t inputSize)
{
	Symbol sym[256], temp;
	BitStream stream;
	uint32_t i, totalBytes, swaps, symbol;

	if (inputSize < 1)
		return 0;

	initBitstream(&stream, output);
	histogram(input, sym, inputSize);
	makeTree(sym, &stream);

	do
	{
		swaps = 0;

		for (i = 0; i < 255; ++i)
		{
			if (sym[i].Symbol > sym[i + 1].Symbol)
			{
				temp = sym[i];
				sym[i] = sym[i + 1];
				sym[i + 1] = temp;
				swaps = 1;
			}
		}
	} while (swaps);

	for (i = 0; i < inputSize; ++i)
	{
		symbol = input[i];
		writeBits(&stream, sym[symbol].Code, sym[symbol].Bits);
	}

	totalBytes = (int)(stream.BytePointer - output);

	if (stream.BitPosition > 0)
	{
		++totalBytes;
	}

	return totalBytes;
}
								


Example

									/*****Please include following header files*****/
// string.h
// stdlib.h
/***********************************************/

char* str = "This is an example for Huffman coding";
uint8_t* originalData = (uint8_t*)str;
int originalDataSize = strlen(str);
uint8_t* compressedData = (uint8_t*)malloc(originalDataSize * (101 / 100) + 320);

int compressedDataSize = Compress(originalData, compressedData, originalDataSize);