Aho–Corasick Algorithm

Aho–Corasick algorithm is a string searching algorithm. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the "dictionary") within an input text. It matches all strings simultaneously.



									private const int MaxStates = 6 * 50 + 10;
private const int MaxChars = 26;

private static int[] Out = new int[MaxStates];
private static int[] FF = new int[MaxStates];
private static int[,] GF = new int[MaxStates, MaxChars];

private static int BuildMatchingMachine(string[] words, char lowestChar = 'a', char highestChar = 'z')
{
	Out = Enumerable.Repeat(0, Out.Length).ToArray();
	FF = Enumerable.Repeat(-1, FF.Length).ToArray();

	for (int i = 0; i < MaxStates; ++i)
	{
		for (int j = 0; j < MaxChars; ++j)
		{
			GF[i, j] = -1;
		}
	}

	int states = 1;

	for (int i = 0; i < words.Length; ++i)
	{
		string keyword = words[i];
		int currentState = 0;

		for (int j = 0; j < keyword.Length; ++j)
		{
			int c = keyword[j] - lowestChar;

			if (GF[currentState, c] == -1)
			{
				GF[currentState, c] = states++;
			}

			currentState = GF[currentState, c];
		}

		Out[currentState] |= (1 << i);
	}

	for (int c = 0; c < MaxChars; ++c)
	{
		if (GF[0, c] == -1)
		{
			GF[0, c] = 0;
		}
	}

	List<int> q = new List<int>();
	for (int c = 0; c <= highestChar - lowestChar; ++c)
	{
		if (GF[0, c] != -1 && GF[0, c] != 0)
		{
			FF[GF[0, c]] = 0;
			q.Add(GF[0, c]);
		}
	}

	while (Convert.ToBoolean(q.Count))
	{
		int state = q[0];
		q.RemoveAt(0);

		for (int c = 0; c <= highestChar - lowestChar; ++c)
		{
			if (GF[state, c] != -1)
			{
				int failure = FF[state];

				while (GF[failure, c] == -1)
				{
					failure = FF[failure];
				}

				failure = GF[failure, c];
				FF[GF[state, c]] = failure;
				Out[GF[state, c]] |= Out[failure];
				q.Add(GF[state, c]);
			}
		}
	}

	return states;
}

private static int FindNextState(int currentState, char nextInput, char lowestChar = 'a')
{
	int answer = currentState;
	int c = nextInput - lowestChar;

	while (GF[answer, c] == -1)
	{
		answer = FF[answer];
	}

	return GF[answer, c];
}

public static List<int> FindAllStates(string text, string[] keywords, char lowestChar = 'a', char highestChar = 'z')
{
	BuildMatchingMachine(keywords, lowestChar, highestChar);

	int currentState = 0;
	List<int> retVal = new List<int>();

	for (int i = 0; i < text.Length; ++i)
	{
		currentState = FindNextState(currentState, text[i], lowestChar);

		if (Out[currentState] == 0)
			continue;

		for (int j = 0; j < keywords.Length; ++j)
		{
			if (Convert.ToBoolean(Out[currentState] & (1 << j)))
			{
				retVal.Insert(0, i - keywords[j].Length + 1);
			}
		}
	}

	return retVal;
}
								


Example

									string[] keywords = { "he", "she", "hers", "his" };

string text = "ahishers";

List<int> states = FindAllStates(text, keywords);
								


Output

									4
3
4
1