Aho–Corasick Algorithm

Aho–Corasick algorithm is a string searching algorithm. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all strings simultaneously.



									$MaxStates = 6 * 50 + 10;
$MaxChars = 26;

$Out = array();
$FF = array();
$GF = array();

function BuildMatchingMachine($words, $lowestChar = 'a', $highestChar = 'z')
{
	global $Out, $FF, $GF, $MaxStates, $MaxChars;
	
	$Out = array_fill(0, $MaxStates, 0);
	$FF = array_fill(0, $MaxStates, -1);

	for ($i = 0; $i < $MaxStates; $i++)
	{
		for ($j = 0; $j < $MaxChars; $j++)
		{
			$GF[$i][$j] = -1;
		}
	}

	$states = 1;

	for ($i = 0; $i < count($words); $i++)
	{
		$keyword = $words[$i];
		$currentState = 0;

		for ($j = 0; $j < strlen($keyword); $j++)
		{
			$c = ord($keyword[$j]) - ord($lowestChar);

			if ($GF[$currentState][$c] == -1)
			{
				$GF[$currentState][$c] = $states++;
			}

			$currentState = $GF[$currentState][$c];
		}

		$Out[$currentState] |= (1 << $i);
	}

	for ($c = 0; $c < $MaxChars; $c++)
	{
		if ($GF[0][$c] == -1)
		{
			$GF[0][$c] = 0;
		}
	}

	$q = array();
	for ($c = 0; $c <= ord($highestChar) - ord($lowestChar); $c++)
	{
		if ($GF[0][$c] != -1 && $GF[0][$c] != 0)
		{
			$FF[$GF[0][$c]] = 0;
			$q[] = $GF[0][$c];
		}
	}

	while (count($q))
	{
		$state = $q[0];
		unset($q[0]);
		$q = array_values($q);

		for ($c = 0; $c <= ord($highestChar) - ord($lowestChar); $c++)
		{
			if ($GF[$state][$c] != -1)
			{
				$failure = $FF[$state];

				while ($GF[$failure][$c] == -1)
				{
					$failure = $FF[$failure];
				}

				$failure = $GF[$failure][$c];
				$FF[$GF[$state][$c]] = $failure;
				$Out[$GF[$state][$c]] |= $Out[$failure];
				$q[] = $GF[$state][$c];
			}
		}
	}

	return $states;
}

function FindNextState($currentState, $nextInput, $lowestChar = 'a')
{
	global $GF, $FF;

	$answer = $currentState;
	$c = ord($nextInput) - ord($lowestChar);

	while ($GF[$answer][$c] == -1)
	{
		$answer = $FF[$answer];
	}

	return $GF[$answer][$c];
}

function FindAllStates($text, $keywords, $lowestChar = 'a', $highestChar = 'z')
{
	global $Out;
	
	BuildMatchingMachine($keywords, $lowestChar, $highestChar);

	$currentState = 0;
	$retVal = array();

	for ($i = 0; $i < strlen($text); $i++)
	{
		$currentState = FindNextState($currentState, $text[$i], $lowestChar);

		if ($Out[$currentState] == 0)
			continue;

		for ($j = 0; $j < count($keywords); $j++)
		{
			if ($Out[$currentState] & (1 << $j))
			{
				if (count($retVal) == 0) {
					$retVal[0] = $i - strlen($keywords[$j]) + 1;
				}
				else {
					array_unshift($retVal, $i - strlen($keywords[$j]) + 1);
				}
			}
		}
	}

	return $retVal;
}
								


Example

									$keywords = array( "he", "she", "hers", "his" );

$text = "ahishers";

$states = FindAllStates($text, $keywords);
								


Output

									4
3
4
1