AC自动机（hdu2222）

2024年2月23日 252次阅读

AC自动机

Aho-Corasick automaton，该算法在1975年产生于贝尔实验室，是著名的多模匹配算法。
要学会AC自动机，我们必须知道什么是Trie，也就是字典树。Trie树，又称单词查找树或键树，是一种树形结构，是一种哈希树的变种。典型应用是用于统计和排序大量的字符串（但不仅限于字符串），所以经常被搜索引擎系统用于文本词频统计。
AC自动机其实是Trie树和KMP算法的结合体，
首先要有这两个算法基础才能更好地理解AC自动机的核心所在。
在字典树上查找匹配串，当失配时，利用如同KMP中next数组一样的失败指针，去找下一个匹配节点，
避免了每次都从根节点重新遍历的尴尬，
大大降低了时间复杂度。
具体的思想可以参见此视频，讲解得比较清楚，比看大部分博客理解起来都快：
http://www.bilibili.com/video/av6295004/#page=1
(夭寿了！！！我竟然在B站看算法视频)
讲的不错。

给出模板题
hdu2222

Keywords Search

Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 131072/131072 K (Java/Others)
Total Submission(s): 61692 Accepted Submission(s): 20399

Problem Description

In the modern time, Search engine came into the life of everybody like Google, Baidu, etc.

Wiskey also wants to bring this feature to his image retrieval system.

Every image have a long description, when users type some keywords to find the image, the system will match the keywords with description of image and show the image which the most keywords be matched.

To simplify the problem, giving you a description of image, and some keywords, you should tell me how many keywords will be match.

Input

First line will contain one integer means how many cases will follow by.

Each case will contain two integers N means the number of keywords and N keywords follow. (N <= 10000)

Each keyword will only contains characters ‘a’-‘z’, and the length will be not longer than 50.

The last line is the description, and the length will be not longer than 1000000.

Output

Print how many keywords are contained in the description.

Sample Input

1
5
she
he
say
shr
her
yasherhs

Sample Output

3

题意：查找匹配串中含有的单词数。

代码给出详细的注释，比较易懂

本题可作为刷题模板

#include<stdlib.h>
#include<string.h>

#include<algorithm>
#include<cstdio>
#include<iostream>
#include<queue>

using namespace std;

#define MAX_N 1000006
#define MAX_Tot 500005

// Aho-Corasick automaton over the lowercase alphabet 'a'..'z', stored in a fixed
// table of MAX_Tot nodes. Node 0 is the root.
//
// Usage: init(), insert() every keyword, build(), then match() a text.
// match() consumes the keyword counters, so each inserted keyword is reported at
// most once until the automaton is re-initialised and rebuilt.
struct ACo{
	struct state{
		// next[k] is the index of the child reached with letter 'a'+k.
		// 0 means "no child": the root is never anybody's child.
		int next[26];
		// Failure link: node to fall back to on a mismatch. -1 only for the root.
		int fail;
		// Number of inserted keywords that end exactly at this node.
		int cnt;

	}stateTable[MAX_Tot];

	// Number of nodes currently in use (the root included).
	// Entries at index >= size may hold stale data from an earlier run; they are
	// cleared by insert() at the moment they are allocated.
	int size;
	// Work queue for the breadth-first traversal in build().
	std::queue<int> que;

	// Resets one node to "no children, fail link 0, no keyword ends here".
	void clearNode(int idx)
	{
		memset(stateTable[idx].next,0,sizeof(stateTable[idx].next));
		stateTable[idx].fail = 0;
		stateTable[idx].cnt = 0;
	}

	// Empties the automaton so that only the root exists.
	// Only the root is cleared here; the other nodes are cleared lazily when
	// insert() allocates them, which avoids wiping the whole table on every call.
	void init()
	{
		while(!que.empty()) que.pop();
		clearNode(0);
		// Give the root its final fail link right away so that match() terminates
		// even if it is called before build().
		stateTable[0].fail = -1;
		size = 1;
	}

	// Adds one keyword to the trie.
	// The keyword is ignored when str is null, when it contains a character
	// outside 'a'..'z' (such a character cannot index next[]), or when the node
	// table is full. Inserting the same keyword twice counts it twice.
	void insert(const char *str)
	{
		if(str == 0) return;
		int n = (int)strlen(str);
		// Validate first so that a rejected keyword allocates no nodes.
		for(int i=0;i<n;i++)
		{
			if(str[i] < 'a' || str[i] > 'z') return;
		}
		int now = 0;
		for(int i=0;i<n;i++)
		{
			int c = str[i]-'a';
			if(!stateTable[now].next[c])
			{
				// Table exhausted: drop the keyword. Nodes created so far stay in
				// the trie but carry no count, so they cannot produce a match.
				if(size >= MAX_Tot) return;
				clearNode(size);
				stateTable[now].next[c] = size++;
			}
			now = stateTable[now].next[c];
		}
		// The node reached after the last letter marks the end of the keyword.
		stateTable[now].cnt++;
	}

	// Computes the failure link of every node by breadth-first traversal.
	// Breadth-first order guarantees that all shallower nodes already have their
	// final fail link when a node is processed. Must be called after the last
	// insert() and before match().
	void build()
	{
		while(!que.empty()) que.pop();
		stateTable[0].fail = -1;
		que.push(0);

		while(!que.empty())
		{
			int u = que.front();
			que.pop();

			for(int i=0;i<26;i++)
			{
				int child = stateTable[u].next[i];
				if(!child) continue;

				// Follow the parent's fail chain until some node has a child on
				// the same letter. For children of the root the chain is empty
				// (v starts at -1), so they fall back to the root.
				int v = stateTable[u].fail;
				while(v!=-1 && !stateTable[v].next[i])
				{
					v = stateTable[v].fail;
				}
				stateTable[child].fail = (v==-1) ? 0 : stateTable[v].next[i];
				que.push(child);
			}
		}
	}

	// Returns the total keyword count found on the fail chain starting at u
	// (u itself included, the root excluded) and zeroes those counters so the same
	// keywords are not counted again.
	int Get(int u)
	{
		int res = 0;
		while(u > 0)
		{
			res += stateTable[u].cnt;
			stateTable[u].cnt = 0;
			u = stateTable[u].fail;
		}
		return res;
	}

	// Scans S and returns how many inserted keywords (counted with multiplicity)
	// occur in it. Returns 0 for a null pointer. A character outside 'a'..'z'
	// cannot be part of any keyword, so it resets the scan to the root.
	// The counters are consumed: a second call reports only keywords not yet found.
	int match(const char *S)
	{
		if(S == 0) return 0;
		int res = 0,now = 0;
		for(const char *p = S; *p; ++p)
		{
			char ch = *p;
			if(ch < 'a' || ch > 'z')
			{
				now = 0;
				continue;
			}
			int c = ch-'a';
			// Starting from the current node, fall back along fail links until a
			// node with a child on this letter is found, or the root is passed.
			int v = now;
			while(v!=-1 && stateTable[v].next[c]==0)
			{
				v = stateTable[v].fail;
			}
			now = (v==-1) ? 0 : stateTable[v].next[c];
			// Always walk the fail chain: a keyword may end at a suffix node even
			// when no keyword ends at the current node itself.
			res += Get(now);
		}
		return res;
	}
}aho;
// Input state used by main():
//   T - number of test cases read from the first line,
//   n - number of keywords in the current test case,
//   S - buffer that receives each keyword in turn and then the description text.
int T = 0, n = 0;
char S[MAX_N];

int main()
{
	scanf("%d",&T);
	while(T--)
	{
		aho.init();
		scanf("%d",&n);
		for(int i=0;i<n;i++)
		{
			scanf("%s",S);
			aho.insert(S);
		}
		aho.build();
		scanf("%s",S);
		printf("%d\n",aho.match(S));
	}
 }