之前写过一篇关于Trie树实现多模匹配算法的文章,参见:基于Trie树的多模匹配算法的实现与优化,其实还可以进一步进行优化——在空间上进行优化。
1.进一步优化方案
上文中进行构造Trie树的过程中,每增加一个节点,就要不断的new一个新节点。这些新增加的节点都是随机的,我们不知到它的分不清况。他们通过指针联系起来。但是如果它们分布的过于分散的话,指针在寻址上面就要花费很大的时间,同时操作系统中有高速缓存机制,其速度要比内存快的多,如果指针总是跳来跳去的话,缓存的命中率就会下降。因此,为了降低寻址的开销,同时提高缓存的命中率,可以考虑用这样的办法:预先分配一个大的数组,数组的内容就是无数个节点,这样一来,这些节点就存在于大片的连续内存里面。不但能够降低寻址开销,还能大幅提高高速缓存的命中率。
将原来的构建Trie树的函数进行改进,改用一次性申请N个节点,而不是一个个的申请节点。
2.实验测试
表1 Trie树多模匹配时间
词典词长 | 建树时间(ms) | 穿线时间(ms) | 穷举匹配 | KMP匹配 | 加速比 | ||
每字符跳转次数 | 匹配用时(ms) | 每字符跳转次数 | 匹配用时(ms) | ||||
5~10 | 241 | 461 | 4.75 | 216 | 1 | 130 | 1.66 |
15~20 | 670 | 1612 | 5.01 | 551 | 1 | 206 | 2.67 |
25~30 | 1068 | 2824 | 5.08 | 923 | 1 | 288 | 3.20 |
45~50 | 1863 | 5292 | 5.13 | 1632 | 1 | 443 | 3.68 |
65~70 | 2481 | 7477 | 5.15 | 2293 | 1 | 585 | 3.92 |
75~80 | 2963 | 9061 | 5.16 | 2771 | 1 | 673 | 4.11 |
表2 Trie树多模匹配(空间优化)
词典词长 | 建树时间(ms) | 穿线时间(ms) | 穷举匹配 | KMP匹配 | 加速比 | ||
每字符跳转次数 | 匹配用时(ms) | 每字符跳转次数 | 匹配用时(ms) | ||||
5~10 | 107 | 464 | 4.75 | 206 | 1 | 104 | 1.98 |
15~20 | 206 | 1595 | 5.01 | 546 | 1 | 191 | 2.86 |
25~30 | 301 | 2783 | 5.08 | 907 | 1 | 274 | 3.31 |
45~50 | 487 | 5284 | 5.13 | 1661 | 1 | 427 | 3.89 |
65~70 | 750 | 7472 | 5.15 | 2333 | 1 | 568 | 4.11 |
75~80 | 1387 | 9178 | 5.16 | 3023 | 1 | 660 | 4.58 |
表3 Trie树多模匹配算法空间优化前后对比
词典词长 | 穷举匹配时间(ms) | KMP匹配时间(ms) | ||||||
优化前 | 优化后 | 效果 | 比率 | 优化前 | 优化后 | 效果 | 比率 | |
5~10 | 216 | 206 | 10 | 4.63% | 130 | 104 | 26 | 20.00% |
15~20 | 551 | 546 | 9 | 1.63% | 206 | 191 | 15 | 7.28% |
25~30 | 923 | 907 | 16 | 1.73% | 288 | 274 | 14 | 4.86% |
45~50 | 1632 | 1661 | -29 | -1.78% | 443 | 427 | 17 | 3.84% |
65~70 | 2293 | 2333 | -40 | -1.74% | 585 | 568 | 17 | 2.91% |
75~80 | 2771 | 3023 | -252 | -9.10% | 673 | 660 | 13 | 1.93% |
其中,表1是原来的测试结果,表2是空间优化后的测试结果,表3是对这两个表进行的对比。
从表三中可以看出:
(1)对于穷举匹配来说,在词典规模比较小的时候,进行空间优化还是有效果的,但是词典规模大了以后,这种优化的效果减弱甚至效果相反。
(2)对于kmp匹配来说,同样是词典规模小的时候,优化效果明显,随着词典的扩大,这种效果减弱,应该是单词长度变长之后,Trie树的节点数目增多,相邻两层间的节点距离增加,Cache的命中率下降,因此寻址时间下降到优化前的水平。
3.源代码
代码如下:
#include <iostream>
#include <fstream>
#include <string>
#include <sys/time.h>
#include <cstring>
#include <queue>
#include <sstream>
#include <cstdlib>
#include <ctime>
#include <algorithm>
using namespace std;
const int MAX_NODE = 100000*30; //Trie树中节点的最大数量
const int MAX = 5000000;
const int Son_Num = 26; //每个孩子的最大分支个数
const int word_max_len = 30; //词典中单词的最大长度
//---------------------Trie树的定义----------------------------
//节点定义
class Node{
public:
int flag; //节点的是否有单词出现的标记
int count; //单词出现的次数
string word; //到达该节点时出现的单词
vector<string> word_set;//到达该节点时能够匹配上的单词
Node* failjump; //匹配失败时指针的跳转目标
Node* son[Son_Num]; //指向分支节点的指针,这里为26个
Node* fail[Son_Num]; //指向各个分支跳转节点的指针,26个
public:
Node() : flag(0),count(0),word(""){
failjump = NULL;
memset(son, NULL, sizeof(Node*) * Son_Num);
memset(fail, NULL, sizeof(Node*) * Son_Num);
}
};
//trie树的操作定义
class Trie{
private:
Node* pArray;
Node* pPos;
Node* pRoot;
private:
void print(Node* pRoot);
public:
Trie();
~Trie();
void insert(string str); //插入字符串
bool search(string str, int &count); //查询字符串
bool remove(string str); //删除字符串
void destory(Node* pRoot); //销毁Trie树
void printAll(); //打印Trie树
void failjump(); //穿线算法
void multi_match_1(string &str, int begin, int end, vector<string> &result); //多模匹配(回溯)
void multi_match_2(string &str, int begin, int end, vector<string> &result); //多模匹配(KMP)
};
//构造函数
Trie::Trie(){
pArray = new Node[MAX_NODE];
pPos = pArray;
pRoot = pPos;
//pRoot = new Node();
}
//析构函数
Trie::~Trie(){
destory(pRoot);
delete [] pArray;
pPos == NULL;
}
//打印以root为根的Trie树
void Trie::print(Node* pRoot){
if(pRoot != NULL){
if(pRoot -> word != ""){
cout << pRoot -> count << " " << pRoot -> word << endl;
}
if((pRoot -> word_set).size()){
for(int i = 0; i < (pRoot -> word_set).size(); i++){
cout << "--" << (pRoot -> word_set)[i];
}
cout << endl;
}
for(int i = 0; i < Son_Num; i++){
print(pRoot -> son[i]);
}
}
}
//打印整棵树
void Trie::printAll(){
print(pRoot);
}
//插入新的单词
void Trie::insert(string str){
int index = 0;
Node* pNode = pRoot;
//不断向下搜索单词的字符是否出现
for(int i = 0; i < str.size(); i++){
index = str[i] - 'a';
//字符在规定的范围内时,才进行检查
if(index >= 0 && index < Son_Num){
//父节点是否有指针指向本字符
if(NULL == pNode -> son[index]){
pPos++;
pNode -> son[index] = pPos;
}
//指针向下传递
pNode = pNode -> son[index];
}else{
return;
}
}
//判断单词是否出现过
if(pNode -> flag == 1){
//单词已经出现过,计数器加1
pNode -> count++;
return;
}else{
//单词没有出现过,标记为出现,计数器加1
pNode -> flag = 1;
pNode -> count++;
pNode -> word = str;
}
}
//搜索指定单词,并返回单词出现次数(如果存在)
bool Trie::search(string str, int &count){
Node* pNode = pRoot;
int index = 0;
int i = 0;
while(pNode != NULL && i < str.size()){
index = str[i] - 'a';
if(index >= 0 && index < Son_Num){
//字符在指定的范围内时
pNode = pNode -> son[index];
i++;
}else{
//字符不再指定的范围内
return false;
}
}
//判断字符串是否出现过
if(pNode != NULL && pNode -> flag == 1){
count = pNode -> count;
return true;
}else{
return false;
}
}
//删除指定的单词
bool Trie::remove(string str){
Node* pNode = pRoot;
int i = 0;
int index = 0;
while(pNode != NULL && i < str.size()){
index = str[i] - 'a';
if(index >= 0 && index < Son_Num){
pNode = pNode -> son[index];
i++;
}else{
return false;
}
}
if(pNode != NULL && pNode -> flag == 1){
pNode -> flag = 0;
pNode -> count = 0;
return true;
}else{
return false;
}
}
//销毁Trie树
void Trie::destory(Node* pRoot){
if(NULL == pRoot){
return;
}
for(int i = 0; i < Son_Num; i++){
destory(pRoot -> son[i]);
}
pRoot == NULL;
}
//穿线算法
void Trie::failjump(){
queue<Node*> q;
//将根节点的孩子的failjump域和fail[]域都设置为根节点
for(int i = 0; i < Son_Num; i++){
if(pRoot -> son[i] != NULL){
pRoot -> son[i] -> failjump = pRoot;
q.push(pRoot -> son[i]);
}else{
pRoot -> fail[i] = pRoot;
}
}
//广度遍历树,为其他节点设置failjump域和fail[]域
while(!q.empty()){
Node *cur = q.front();
q.pop();
if(cur -> failjump != NULL){
for(int j = 0; j < Son_Num; j++){
//循环设置跳转域
Node* fail = cur -> failjump;
Node* cur_child = cur -> son[j];
if(cur_child != NULL){
//当孩子存在时,设置孩子的failjump
while(fail != NULL){
if(fail -> son[j] != NULL){
//设置failjump域
cur_child -> failjump = fail -> son[j];
//设置word_set集合
if((fail -> son[j] -> word_set).size()){
cur_child -> word_set = fail -> son[j] -> word_set;
}
if(cur_child -> flag == 1){
(cur_child -> word_set).push_back(cur_child -> word);
}
break;
}else{
fail = fail -> failjump;
}
}
if(cur_child -> failjump == NULL){
cur_child -> failjump = pRoot;
if(cur_child -> flag == 1){
(cur_child -> word_set).push_back(cur_child -> word);
}
}
q.push(cur_child);
}else{
//当孩子不存在时,设置父节点fail[]域
while(fail != NULL){
if(fail -> son[j] != NULL){
//设置对应的fail[j];
cur -> fail[j] = fail -> son[j];
break;
}else{
if(fail == pRoot){
cur -> fail[j] = pRoot;
break;
}else{
fail = fail -> failjump;
}
}
}
}
}
}
}
}
//多模匹配算法1(穷举匹配)
void Trie::multi_match_1(string &str, int begin, int end, vector<string> &result){
int count = 0;
for(int pos = 0; pos < end; pos++){
Node* pNode = pRoot;
int index = 0;
int i = pos;
while(pNode != NULL && i < pos + word_max_len && i < end){
index = str[i] - 'a';
if(index >= 0 && index < Son_Num){
//字符在指定的范围内时
pNode = pNode -> son[index];
count++;
i++;
//若字符串出现过,输出
if(pNode != NULL && pNode -> flag == 1){
//cout << pNode -> word << endl;
result.push_back(pNode -> word);
}
}
}
}
cout << " 跳转次数:count = " << count << endl;
cout << " 每个字符平均跳转次数:" << double(count)/(end - begin) << endl;
}
//多模匹配算法2(类KMP匹配)
void Trie::multi_match_2(string &str, int begin, int end, vector<string> &result){
int count_1 = 0, count_2 = 0, count_3 = 0;
Node* pNode = pRoot;
int index = 0;
int i = begin;
int word_set_size = 0;
while(pNode != NULL && i < end){
index = str[i] - 'a';
if(index >= 0 && index < Son_Num){
//字符在指定的范围内时
if(pNode -> son[index]){
//该孩子存在,继续向下搜索
pNode = pNode -> son[index];
count_1++;
}else if(pNode != pRoot){
//该孩子不存在,且当前不是根节点,进行跳转
pNode = pNode -> fail[index];
count_2++;
}else{
//该孩子不存在,且当前已经是跟节点,目标字符串前移
count_3++;
}
//若该位置有匹配词语,输出
if(word_set_size = (pNode -> word_set).size()){
for(int i = 0; i < word_set_size; i++){
//cout << (pNode -> word_set)[i] << endl;
result.push_back((pNode -> word_set)[i]);
}
}
//目标串前移
i++;
}
}
cout << " 跳转次数:count_1 = " << count_1 << " count_2 = " << count_2 << " count_3 = " << count_3 << endl;
}
//----------------------辅助代码----------------------------
//生成随机字符串
string rand_string(int min, int max){
char a[MAX+1];
int len = rand()%(max - min) + min;
for(int i = 0; i < len; i++){
a[i] = rand()%26 + 'a';
}
a[len] = '\0';
string str(a);
return str;
}
//-----------------测试代码---------------------------
//获取当前时间(ms)
long getCurrentTime(){
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec*1000 + tv.tv_usec/1000;
}
//树的基本操作,增删查改测试
void Test_1(){
//1.建立对象
Trie trie;
string str;
ifstream fin;
fin.open("dict.txt");
//2.建立Trie树
long time_1 = getCurrentTime();
while(getline(fin, str, '\n')){
trie.insert(str);
}
long time_2 = getCurrentTime();
fin.close();
//3.查找单词
str = "siubetamwm";
int count = -1;
long time_3 = getCurrentTime();
bool isFind = trie.search(str, count);
cout << "要查找的字符串:" << str << endl;
long time_4 = getCurrentTime();
if(isFind){
cout << " 该单词存在,存在次数:" << count << endl;
}else{
cout << " 该单词不存在!" << endl;
}
//4.删除单词
str = "lutgjrxjavgfkij";
cout << "要删除的字符串:" << str << endl;
bool isRemove = trie.remove(str);
if(isRemove){
cout << " 该单词存在,删除成功!" << endl;
}else{
cout << " 该单词不存在,删除失败!" << endl;
}
}
//调用词典,多模匹配测试
void Test_2(){
//1.建立对象
Trie trie;
string str;
ifstream fin;
fin.open("dict.txt");
//2.建立Trie树
long time_1 = getCurrentTime();
while(getline(fin, str, '\n')){
trie.insert(str);
}
fin.close();
long time_2 = getCurrentTime();
//3.将短字符串组合为长字符串
string long_str;
ostringstream string_out;
fin.open("dict.txt");
while(getline(fin, str, '\n')){
string_out << str;
}
fin.close();
long_str = string_out.str();
//long_str = rand_string(MAX - 20, MAX);
vector<string> result_1;
vector<string> result_2;
//4.在长字符串中搜索字典中的字符串(方法一:穷举)
cout << "穷举匹配:" << endl;
long time_3 = getCurrentTime();
trie.multi_match_1(long_str, 0, long_str.size(), result_1);
long time_4 = getCurrentTime();
cout << "穷举匹配完毕!" << endl;
//5.进行穿线处理
cout << "穿线处理" << endl;
long time_5 = getCurrentTime();
trie.failjump();
long time_6 = getCurrentTime();
cout << "穿线完毕" << endl;
//6.在长字符串中搜索字典中的字符串(方法二)
cout << "KMP匹配:" << endl;
long time_7 = getCurrentTime();
trie.multi_match_2(long_str, 0, long_str.size(), result_2);
long time_8 = getCurrentTime();
cout << "KMP匹配完毕" << endl;
//7.输出结果
cout << "目标字符串长度:" << long_str.size() << endl;
cout << "穷举匹配结果数量:" << result_1.size() << endl;
cout << "KMP匹配结果数量:" <<result_2.size() << endl;
sort(result_1.begin(), result_1.end());
sort(result_2.begin(), result_2.end());
if(result_1 == result_2){
cout << "两种多模匹配方式结果相符:True" << endl;
}else{
cout << "两种多模匹配方式结果不相符:False" << endl;
}
cout << endl;
cout << "建立Trie树用时(ms):" << time_2 - time_1 << endl;
cout << "穿线算法用时(ms): " << time_6 - time_5 << endl;
cout << "多模匹配1用时(ms):" << time_4 - time_3 << endl;
cout << "多模匹配2用时(ms):" << time_8 - time_7 << endl;
cout << "加速比:" << double(time_4 - time_3)/(time_8 - time_7) << endl;
}
//小测试
void Test_3(){
//1.建立对象
Trie trie;
//2.建立Trie树
string str_1 = "uuidi";
string str_2 = "ui";
string str_3 = "idi";
string str_4 = "idk";
string str_5 = "di";
trie.insert(str_1);
trie.insert(str_2);
trie.insert(str_3);
trie.insert(str_4);
trie.insert(str_5);
//3.构造待匹配的字符串
string long_str = rand_string(MAX - 20, MAX);
//string long_str = "uuidiuiidiidkdi";
cout << "length: " << long_str.size() << endl;
vector<string> result_1;
vector<string> result_2;
//4.在长字符串中搜索字典中的字符串(方法一:穷举)
cout << "穷举匹配:" << endl;
long time_5 = getCurrentTime();
trie.multi_match_1(long_str, 0, long_str.size(), result_1);
long time_6 = getCurrentTime();
cout << "穷举匹配完毕!" << endl;
//5.进行穿线处理
cout << "穿线处理" << endl;
trie.failjump();
cout << "穿线完毕" << endl;
//6.在长字符串中搜索字典中的字符串(方法二)
cout << "KMP匹配:" << endl;
long time_7 = getCurrentTime();
trie.multi_match_2(long_str, 0, long_str.size(), result_2);
long time_8 = getCurrentTime();
cout << "KMP匹配完毕" << endl;
//7.输出结果
cout << result_1.size() << endl;
cout << result_2.size() << endl;
sort(result_1.begin(), result_1.end());
sort(result_2.begin(), result_2.end());
cout << "目标字符传长度:" << long_str.size() << endl;
if(result_1 == result_2){
cout << "两种多模匹配方式结果相符:True" << endl;
}else{
cout << "两种多模匹配方式结果不相符:False" << endl;
}
cout << "多模匹配1用时(ms):" << time_6 - time_5 << endl;
cout << "多模匹配2用时(ms):" << time_8 - time_7 << endl;
}
int main(){
//Test_1();
Test_2();
//Test_3();
}