Huffman Compress

Huffman coding is a data compression algorithm. It is based on the idea that frequently-appearing letters should have shorter bit representations and less common letters should have longer representations.

For Huffman Decompress algorithm click here.



									Private Const MAX_TREE_NODES As Integer = 511

Public Class BitStream
	Public BytePointer As Byte()
	Public BitPosition As UInteger
	Public Index As UInteger
End Class

Public Structure Symbol
	Public Sym As Integer
	Public Count As UInteger
	Public Code As UInteger
	Public Bits As UInteger
End Structure

Public Class EncodeNode
	Public ChildA As EncodeNode
	Public ChildB As EncodeNode
	Public Count As Integer
	Public Symbol As Integer
End Class

Private Shared Sub initBitstream(ByRef stream As BitStream, buffer As Byte())
	stream.BytePointer = buffer
	stream.BitPosition = 0
End Sub

Private Shared Sub writeBits(ByRef stream As BitStream, x As UInteger, bits As UInteger)
	Dim buffer As Byte() = stream.BytePointer
	Dim bit As UInteger = stream.BitPosition
	Dim mask As UInteger = CUInt(1 << CInt(bits - 1))

	For count As UInteger = 0 To bits - 1
		buffer(stream.Index) = CByte((buffer(stream.Index) And (&HFF Xor (1 << CInt(7 - bit)))) + ((If(Convert.ToBoolean(x And mask), 1, 0)) << CInt(7 - bit)))
		x <<= 1
		bit = (bit + 1) And 7

		If Not Convert.ToBoolean(bit) Then
			stream.Index += 1
		End If
	Next

	stream.BytePointer = buffer
	stream.BitPosition = bit
End Sub

Private Shared Sub histogram(input As Byte(), sym As Symbol(), size As UInteger)
	Dim i As Integer
	Dim index As Integer = 0

	For i = 0 To 255
		sym(i).Sym = i
		sym(i).Count = 0
		sym(i).Code = 0
		sym(i).Bits = 0
	Next

	i = CInt(size)
	While Convert.ToBoolean(i)
		sym(input(index)).Count += 1
		i -= 1
		index += 1
	End While
End Sub

Private Shared Sub storeTree(ByRef node As EncodeNode, sym As Symbol(), ByRef stream As BitStream, code As UInteger, bits As UInteger)
	Dim symbolIndex As UInteger

	If node.Symbol >= 0 Then
		writeBits(stream, 1, 1)
		writeBits(stream, CUInt(node.Symbol), 8)

		For symbolIndex = 0 To 255
			If sym(symbolIndex).Sym = node.Symbol Then
				Exit For
			End If
		Next

		sym(symbolIndex).Code = code
		sym(symbolIndex).Bits = bits
		Return
	Else
		writeBits(stream, 0, 1)
	End If

	storeTree(node.ChildA, sym, stream, (code << 1) + 0, bits + 1)
	storeTree(node.ChildB, sym, stream, (code << 1) + 1, bits + 1)
End Sub

Private Shared Sub makeTree(sym As Symbol(), ByRef stream As BitStream)
	Dim nodes As EncodeNode() = New EncodeNode(MAX_TREE_NODES - 1) {}

	For counter As Integer = 0 To nodes.Length - 1
		nodes(counter) = New EncodeNode()
	Next

	Dim node1 As EncodeNode, node2 As EncodeNode, root As EncodeNode
	Dim i As UInteger, numSymbols As UInteger = 0, nodesLeft As UInteger, nextIndex As UInteger

	For i = 0 To 255
		If sym(i).Count > 0 Then
			nodes(numSymbols).Symbol = sym(i).Sym
			nodes(numSymbols).Count = CInt(sym(i).Count)
			nodes(numSymbols).ChildA = Nothing
			nodes(numSymbols).ChildB = Nothing
			numSymbols += 1
		End If
	Next

	root = Nothing
	nodesLeft = numSymbols
	nextIndex = numSymbols

	While nodesLeft > 1
		node1 = Nothing
		node2 = Nothing

		For i = 0 To nextIndex - 1
			If nodes(i).Count > 0 Then
				If node1 Is Nothing OrElse (nodes(i).Count <= node1.Count) Then
					node2 = node1
					node1 = nodes(i)
				ElseIf node2 Is Nothing OrElse (nodes(i).Count <= node2.Count) Then
					node2 = nodes(i)
				End If
			End If
		Next

		root = nodes(nextIndex)
		root.ChildA = node1
		root.ChildB = node2
		root.Count = node1.Count + node2.Count
		root.Symbol = -1
		node1.Count = 0
		node2.Count = 0
		nextIndex += 1
		nodesLeft -= 1
	End While

	If root IsNot Nothing Then
		storeTree(root, sym, stream, 0, 0)
	Else
		root = nodes(0)
		storeTree(root, sym, stream, 0, 1)
	End If
End Sub

Public Shared Function Compress(input As Byte(), output As Byte(), inputSize As UInteger) As Integer
	Dim sym As Symbol() = New Symbol(255) {}
	Dim temp As Symbol
	Dim stream As New BitStream()
	Dim i As UInteger, totalBytes As UInteger, swaps As UInteger, symbol As UInteger

	If inputSize < 1 Then
		Return 0
	End If

	initBitstream(stream, output)
	histogram(input, sym, inputSize)
	makeTree(sym, stream)

	Do
		swaps = 0

		For i = 0 To 254
			If sym(i).Sym > sym(i + 1).Sym Then
				temp = sym(i)
				sym(i) = sym(i + 1)
				sym(i + 1) = temp
				swaps = 1
			End If
		Next
	Loop While Convert.ToBoolean(swaps)

	For i = 0 To inputSize - 1
		symbol = input(i)
		writeBits(stream, sym(symbol).Code, sym(symbol).Bits)
	Next

	totalBytes = stream.Index

	If stream.BitPosition > 0 Then
		totalBytes += 1
	End If

	Return CInt(totalBytes)
End Function
								


Example

									Dim str As String = "This is an example for Huffman coding"
Dim originalData As Byte() = Encoding.[Default].GetBytes(str)
Dim originalDataSize As UInteger = CUInt(str.Length)
Dim compressedData As Byte() = New Byte(originalDataSize * (101 \ 100) + 319) {}

Dim compressedDataSize As Integer = Compress(originalData, compressedData, originalDataSize)