



早在1975年贝尔实验室的两位研究人员Alfred V. Aho 和Margaret J. Corasick就提出了以他们的名字命名的高效的匹配算法—AC算法。该算法几乎与KMP算法同时问世。与KMP算法相同,AC算法时至今日仍然在模式匹配领域被广泛应用。
最近本人由于兴趣所致,在学习完KMP和BM单模式匹配算法之后,开始学习多模式匹配算法。看了一些博文和中文文章之后发现学习AC算法的最有效方式仍然是Aho和Corasick的论文《Efficient String Matching: An Aid to Bibliographic Search》。本文算是对这篇文章的一些总结。




goto表本质上是一个有限状态机,这里称作模式匹配机(Pattern Matching Machine,PMM)。下面以论文中的例子来说明goto表的构造过程。对于模式串集合K = {he, she, his, hers}:
第一步:PMM初始状态为0,然后向PMM中加入第一个模式串K[0] = “he”。
第二步:继续向PMM中添加第二个模式串K[1] = “she”,每次添加都是从状态0开始扫描。
第三步:从状态0开始继续添加第三个模式串K[2] = “his”,这里值得注意的是遇到相同字符跳转时要重复利用以前已经生成的跳转。如这里的’h’在第一步中已经存在。
第四步:添加模式串K[3] = “hers”。至此,goto表已经构造完成。


1、若depth(s) = 1,则f(s) = 0;
2、假设对于depth(r) < d的所有状态r,已经计算出了f(r);
(1) 若g(r,a) = fail,对于所有的a,则不动作;(注:a为字符,g为状态转移函数);
(2) 否则,对于a使得g(r,a) = s,则如下步骤:
a、使state = f(r)
b、重复步骤state = f(state),直到g(state, a) != fail。(注意对于任意的a,状态0的g(0,a) != fail)
c、使f(s) = g(state, a)。


         i     1  2  3  4  5  6  7  8  9
       f(i)    0  0  0  1  2  0  3  0  3 



        i           output(i)
        2           {he}
        5           {she}
        7           {his}
        9           {hers}

2、在构造failure表时,若f(s) = s’,则将s和s’对应的output集合求并集。如f(5) = 2,则得到最终的output表为:

        i           output(i)
        2           {he}
        5           {she,he}
        7           {his}
        9           {hers}


           u     s     h     e     r     s
     0     0     3     4     5     8     9



                 输入字符                        下一状态       
state 0:         h                              1
                 s                              3
                 *                              0
state 1:         e                              2
                 i                              6
                 h                              1
                 s                              3
                 *                              0
state9,7,3:      h                              4
                 s                              3
                 *                              0
state5,2:        r                              8
                 h                              1
                 s                              3
                 *                              0 
state 6:         s                              7
                 h                              1
                 *                              0
state 4:         e                              5
                 i                              6
                 h                              1
                 s                              3
                 *                              0
state 8:         s                              9
                 h                              1
                 *                              0



/* ** Copyright (C) 2002 Martin Roesch <[email protected]> ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 2 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */

/* ** ACSMX.H ** ** */
#ifndef ACSMX_H
#define ACSMX_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Constants
 */
#define ALPHABET_SIZE 256       /* number of NextState slots per state (one per byte value) */
#define MAXLEN 256              /* size of the driver's line buffer */

/* Marker stored in NextState[] for "no transition"; parenthesised so it is safe in any expression */
#define ACSM_FAIL_STATE (-1)

/*
 * One search pattern. Patterns are kept in a singly linked list; copies of
 * these records are also used as the per-state match-list entries.
 */
typedef struct _acsm_pattern {

    struct  _acsm_pattern *next;      /* next pattern in the list */
    unsigned char         *patrn;     /* pattern bytes after case translation */
    unsigned char         *casepatrn; /* pattern bytes exactly as supplied */
    int      n;                       /* pattern length in bytes */
    int      nocase;                  /* non-zero: pattern was added as case-insensitive */
    void   * id;                      /* caller-defined tag; not interpreted by the visible code */
    int      nmatch;                  /* match counter, incremented by PrintMatch */

} ACSM_PATTERN;


/*
 * One row of the state table: the transitions, failure link and match list
 * of a single automaton state.
 */
typedef struct  {

    /* Next state - based on input character */
    int      NextState[ ALPHABET_SIZE ];

    /* Failure state - used while building NFA & DFA */
    int      FailState;

    /* List of patterns that end here, if any */
    ACSM_PATTERN *MatchList;

} ACSM_STATETABLE;


/*
 * State machine handle: the pattern list plus the compiled state table.
 */
typedef struct {

    int acsmMaxStates;  /* rows allocated in acsmStateTable (set by acsmCompile) */
    int acsmNumStates;  /* highest state number handed out so far */

    ACSM_PATTERN    * acsmPatterns;    /* head of the pattern list */
    ACSM_STATETABLE * acsmStateTable;  /* state table, allocated by acsmCompile */

} ACSM_STRUCT;


/* * Prototypes */
ACSM_STRUCT * acsmNew ();
int acsmAddPattern( ACSM_STRUCT * p, unsigned char * pat, int n,int nocase);
int acsmCompile ( ACSM_STRUCT * acsm );
//int acsmSearch ( ACSM_STRUCT * acsm,unsigned char * T, int n, int (*Match) (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index));
int acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,void (*PrintMatch) (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index));
void acsmFree ( ACSM_STRUCT * acsm );
void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index) ;
void PrintSummary (ACSM_PATTERN * pattern) ;



/*
** Multi-Pattern Search Engine 
** Aho-Corasick State Machine -  uses a Deterministic Finite Automata - DFA 
** Copyright (C) 2002 Sourcefire,Inc. 
** Marc Norton 
** This program is free software; you can redistribute it and/or modify 
** it under the terms of the GNU General Public License as published by 
** the Free Software Foundation; either version 2 of the License, or 
** (at your option) any later version. 
** This program is distributed in the hope that it will be useful, 
** but WITHOUT ANY WARRANTY; without even the implied warranty of 
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
** GNU General Public License for more details. 
** You should have received a copy of the GNU General Public License 
** along with this program; if not, write to the Free Software 
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
**   Reference - Efficient String matching: An Aid to Bibliographic Search 
**               Alfred V Aho and Margaret J Corasick 
**               Bell Laboratories  
**               Copyright(C) 1975 Association for Computing Machinery,Inc 
**   Implemented from the 4 algorithms in the paper by Aho & Corasick 
**   and some implementation ideas from 'Practical Algorithms in C' 
**   Notes: 
**     1) This version uses about 1024 bytes per pattern character - heavy  on the memory.  
**     2) This algorithm finds all occurrences of all patterns within a   
**        body of text. 
**     3) Support is included to handle upper and lower case matching.      
**     4) Some compilers optimize the search routine well, others don't, this makes all the difference. 
**     5) Aho inspects all bytes of the search text, but only once so it's very efficient, 
**        if the patterns are all larger than the Modified Wu-Manbar method is often faster. 
**     6) I don't subscribe to any one method is best for all searching needs, 
**        the data decides which method is best, 
**        and we don't know until after the search method has been tested on the specific data sets. 
**  May 2002  : Marc Norton 1st Version   
**  June 2002 : Modified interface for SNORT, added case support 
**  Aug 2002  : Cleaned up comments, and removed dead code. 
**  Nov 2,2002: Fixed queue_init() , added count=0 
**  Wangyao : [email protected] 
**  Apr 24,2007: WangYao Combined Build_NFA() and Convert_NFA_To_DFA() into Build_DFA(); 
**               And Delete Some redundancy Code  
*/

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <ctype.h> 
#include "acsmx.h" 

/*
 * Terminate the program with a diagnostic naming the call site `s` when the
 * allocation result `p` is NULL.  Wrapped in do/while(0) so the macro is a
 * single statement (safe inside if/else), with its arguments parenthesised,
 * and it exits with a failure status instead of 0.
 */
#define MEMASSERT(p,s) \
    do { \
        if (!(p)) { \
            fprintf (stderr, "ACSM-No Memory: %s!\n", (s)); \
            exit (EXIT_FAILURE); \
        } \
    } while (0)

/*
 * Line number that acsmSearch hands to the match callback; starts at 1.
 * This is the definition (other files declare it with `extern`), so it must
 * not itself carry `extern` together with an initializer.
 */
int nline = 1;

/*
 * Allocate n bytes for the state machine.
 * Returns NULL when n is negative or the allocation fails; callers check the
 * result with MEMASSERT.
 */
static void *AC_MALLOC (int n)
{
    if (n < 0)
        return NULL;    /* a negative int would otherwise wrap to a huge size_t */
    return malloc ((size_t) n);
}

/*
 * Release memory obtained from AC_MALLOC.  NULL is accepted (free(NULL) is a
 * no-op, so no explicit guard is needed).
 */
static void AC_FREE (void *p)
{
    free (p);
}

/*
 * Simple queue node: one pending state number plus the link to the next node.
 */
typedef struct _qnode
{
    int state;
    struct _qnode *next;
} QNODE;

*    Simple QUEUE Structure 
typedef struct _queue  
    QNODE * head, *tail;  
    int count;  

*Init the Queue 
static void queue_init (QUEUE * s)   
    s->head = s->tail = 0;  
    s->count = 0;  

*  Add Tail Item to queue 
static void queue_add (QUEUE * s, int state)   
    QNODE * q;  
    /*Queue is empty*/  
    if (!s->head)  
        q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));  
        /*if malloc failed,exit the problom*/  
        MEMASSERT (q, "queue_add");  
        q->state = state;  
        q->next = 0; /*Set the New Node's Next Null*/  
        q = (QNODE *) AC_MALLOC (sizeof (QNODE));  
        MEMASSERT (q, "queue_add");  
        q->state = state;  
        q->next = 0;  
        /*Add the new Node into the queue*/  
        s->tail->next = q;  
        /*set the new node is the Queue's Tail*/  
        s->tail = q;  

*  Remove Head Item from queue 
static int queue_remove (QUEUE * s)   
    int state = 0;  
    QNODE * q;  
    /*Remove A QueueNode From the head of the Queue*/  
    if (s->head)  
        q = s->head;  
        state = q->state;  
        s->head = s->head->next;  

        /*If Queue is Empty,After Remove A QueueNode*/  
        if (!s->head)  
            s->tail = 0;  
            s->count = 0;  
        /*Free the QueNode Memory*/  
        AC_FREE (q);  
 return state;  

*Return The count of the Node in the Queue 
static int queue_count (QUEUE * s)   
 return s->count;  

*Free the Queue Memory 
static void queue_free (QUEUE * s)   
    while (queue_count (s))  
        queue_remove (s);  

/*
 * Case translation table: one entry per byte value, filled by
 * init_xlatcase() and read by ConvertCaseEx().
 */
static unsigned char xlatcase[256];

* Init the xlatcase Table,Trans alpha to UpperMode 
* Just for the NoCase State 
static void init_xlatcase ()   
    int i;  
    for (i = 0; i < 256; i++)  
        xlatcase[i] = toupper (i);  

*Convert the pattern string into upper 
static void ConvertCaseEx (unsigned char *d, unsigned char *s, int m)   
    int i;  
    for (i = 0; i < m; i++)  
        d[i] = xlatcase[s[i]];  

*  Add a pattern to the list of patterns terminated at this state. 
*  Insert at front of list. 
static void AddMatchListEntry (ACSM_STRUCT * acsm, int state, ACSM_PATTERN * px)   
    ACSM_PATTERN * p;  
    p = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));  
    MEMASSERT (p, "AddMatchListEntry");  
    memcpy (p, px, sizeof (ACSM_PATTERN));  

    /*Add the new pattern to the pattern  list*/  
    p->next = acsm->acsmStateTable[state].MatchList;  
    acsm->acsmStateTable[state].MatchList = p;  

* Add Pattern States 
static void AddPatternStates (ACSM_STRUCT * acsm, ACSM_PATTERN * p)   
    unsigned char *pattern;  
    int state=0, next, n;  
    n = p->n; /*The number of alpha in the pattern string*/  
    pattern = p->patrn;  

    *  Match up pattern with existing states 
    for (; n > 0; pattern++, n--) 
        next = acsm->acsmStateTable[state].NextState[*pattern];  
        if (next == ACSM_FAIL_STATE)  
        state = next;  

    *   Add new states for the rest of the pattern bytes, 1 state per byte 
    for (; n > 0; pattern++, n--) 
        acsm->acsmStateTable[state].NextState[*pattern] = acsm->acsmNumStates;  
        state = acsm->acsmNumStates;  
    /*Here,An accept state,just add into the MatchListof the state*/  
    AddMatchListEntry (acsm, state, p);  

*   Build Non-Deterministic Finite Automata 
static void Build_DFA (ACSM_STRUCT * acsm)   
    int r, s;  
    int i;  
    QUEUE q, *queue = &q;  
    ACSM_PATTERN * mlist=0;  
    ACSM_PATTERN * px=0;  

    /* Init a Queue */   
    queue_init (queue);  

    /* Add the state 0 transitions 1st */  
    /*1st depth Node's FailState is 0, fail(x)=0 */  
    for (i = 0; i < ALPHABET_SIZE; i++)  
        s = acsm->acsmStateTable[0].NextState[i];  
        if (s)  
            queue_add (queue, s);  
            acsm->acsmStateTable[s].FailState = 0;  

    /* Build the fail state transitions for each valid state */   
    while (queue_count (queue) > 0)  
        r = queue_remove (queue);  

        /* Find Final States for any Failure */   
        for (i = 0; i < ALPHABET_SIZE; i++)  
            int fs, next;  
            /*** Note NextState[i] is a const variable in this block ***/  
            if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)  
                queue_add (queue, s);  
                fs = acsm->acsmStateTable[r].FailState;  

                *  Locate the next valid state for 'i' starting at s  
                /**** Note the  variable "next" ****/  
                /*** Note "NextState[i]" is a const variable in this block ***/  
                while ((next=acsm->acsmStateTable[fs].NextState[i]) ==  
                    fs = acsm->acsmStateTable[fs].FailState;  

                *  Update 's' state failure state to point to the next valid state 
                acsm->acsmStateTable[s].FailState = next;  
                acsm->acsmStateTable[r].NextState[i] =  

    /* Clean up the queue */   
    queue_free (queue);  

* Init the acsm DataStruct 
ACSM_STRUCT * acsmNew ()   
    ACSM_STRUCT * p;  
    init_xlatcase ();  
    p = (ACSM_STRUCT *) AC_MALLOC (sizeof (ACSM_STRUCT));  
    MEMASSERT (p, "acsmNew");  
    if (p)  
        memset (p, 0, sizeof (ACSM_STRUCT));  
 return p;  

*   Add a pattern to the list of patterns for this state machine 
int acsmAddPattern (ACSM_STRUCT * p, unsigned char *pat, int n, int nocase)   
    ACSM_PATTERN * plist;  
    plist = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));  
    MEMASSERT (plist, "acsmAddPattern");  
    plist->patrn = (unsigned char *) AC_MALLOC (n+1);  
    ConvertCaseEx (plist->patrn, pat, n);  
    plist->casepatrn = (unsigned char *) AC_MALLOC (n+1);  
    memcpy (plist->casepatrn, pat, n);  
    plist->n = n;  
    plist->nocase = nocase;  

    /*Add the pattern into the pattern list*/  
    plist->next = p->acsmPatterns;  
    p->acsmPatterns = plist;  
 return 0;  

*   Compile State Machine 
int acsmCompile (ACSM_STRUCT * acsm)   
    int i, k;  
    ACSM_PATTERN * plist;  

    /* Count number of states */   
    acsm->acsmMaxStates = 1; /*State 0*/  
    for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)  
        acsm->acsmMaxStates += plist->n;  

    acsm->acsmStateTable = (ACSM_STATETABLE *) AC_MALLOC (sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);  
    MEMASSERT (acsm->acsmStateTable, "acsmCompile");  
    memset (acsm->acsmStateTable, 0,sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);  

    /* Initialize state zero as a branch */   
    acsm->acsmNumStates = 0;  

    /* Initialize all States NextStates to FAILED */   
    for (k = 0; k < acsm->acsmMaxStates; k++)  
        for (i = 0; i < ALPHABET_SIZE; i++)  
            acsm->acsmStateTable[k].NextState[i] = ACSM_FAIL_STATE;  

    /* This is very import */  
    /* Add each Pattern to the State Table */   
    for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)  
        AddPatternStates (acsm, plist);  

    /* Set all failed state transitions which from state 0 to return to the 0'th state */   
    for (i = 0; i < ALPHABET_SIZE; i++)  
        if (acsm->acsmStateTable[0].NextState[i] == ACSM_FAIL_STATE)  
            acsm->acsmStateTable[0].NextState[i] = 0;  

    /* Build the NFA  */   
    Build_DFA (acsm);  
 return 0;  

/* 64 KiB scratch buffer that receives the case-translated copy of the text being searched */
static unsigned char Tc[64 * 1024];

*   Search Text or Binary Data for Pattern matches 
int acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,void (*PrintMatch)(ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index)) { int state; ACSM_PATTERN * mlist; unsigned char *Tend; ACSM_STATETABLE * StateTable = acsm->acsmStateTable; int nfound = 0; /*Number of the found(matched) patten string*/ unsigned char *T; int index; /* Case conversion */ ConvertCaseEx (Tc, Tx, n); T = Tc; Tend = T + n; for (state = 0; T < Tend; T++) { state = StateTable[state].NextState[*T]; /* State is a accept state? */ if( StateTable[state].MatchList != NULL ) { for( mlist=StateTable[state].MatchList; mlist!=NULL; mlist=mlist->next ) { /*Get the index of the Match Pattern String in the Text*/ index = T - mlist->n + 1 - Tc; //mlist->nmatch++; nfound++; PrintMatch (acsm->acsmPatterns,mlist, nline,index); } } } return nfound; } /* * Free all memory */ void acsmFree (ACSM_STRUCT * acsm) { int i; ACSM_PATTERN * mlist, *ilist; for (i = 0; i < acsm->acsmMaxStates; i++) { if (acsm->acsmStateTable[i].MatchList != NULL) { mlist = acsm->acsmStateTable[i].MatchList; while (mlist) { ilist = mlist; mlist = mlist->next; AC_FREE (ilist); } } } AC_FREE (acsm->acsmStateTable); } /* * Print A Match String's Information */ void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index) { /* Count the Each Match Pattern */ ACSM_PATTERN *temp = pattern; for (;temp!=NULL;temp=temp->next) { if (!strcmp(temp->patrn,mlist->patrn)) //strcmp succeed return 0,So here use "!" 
operation { temp->nmatch++; } } if(mlist->nocase) fprintf (stdout, "Match KeyWord %s at %d line %d char\n", mlist->patrn,nline,index); else fprintf (stdout, "Match KeyWord %s at %d line %d char\n", mlist->casepatrn,nline,index); } /* * Print Summary Information of the AC Match */ void PrintSummary (ACSM_PATTERN * pattern) { ACSM_PATTERN * mlist = pattern; printf("\n### Summary ###\n"); for (;mlist!=NULL;mlist=mlist->next) { if(mlist->nocase) printf("%12s : %5d\n",mlist->patrn,mlist->nmatch); else printf("%12s : %5d\n",mlist->casepatrn,mlist->nmatch); } } 


/* Author: wangyao Email: [email protected] */  
#include "acsmx.h" 

/*
 * Text data buffer: holds one fgets() chunk (at most MAXLEN - 1 characters)
 * of the input file at a time.
 */
unsigned char text[MAXLEN];  
/* Line counter passed to the match callback by acsmSearch; declared here, defined in another translation unit. */
extern int nline;  

int main (int argc, char **argv)   
    int i, nocase = 0;  
    FILE *fd;  
    char filename[20];  
    ACSM_STRUCT * acsm;  

    if (argc < 3)  
        fprintf (stderr,"Usage: acsmx filename pattern1 pattern2 ... -nocase\n");  
        exit (0);  

    acsm = acsmNew ();  

    strcpy (filename, argv[1]);  
    fd = fopen(filename,"r");  
    if(fd == NULL)  
        fprintf(stderr,"Open file error!\n");  

    for (i = 1; i < argc; i++)  
        if (strcmp (argv[i], "-nocase") == 0)  
            nocase = 1;  
    for (i = 2; i < argc; i++)  
        if (argv[i][0] == '-')  
        acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase);  

    /* Generate GtoTo Table and Fail Table */  
    acsmCompile (acsm);  

    /*Search Pattern*/  
    while ( fgets(text,MAXLEN,fd) )  
        acsmSearch (acsm, text, strlen (text), PrintMatch);  


    acsmFree (acsm);  

    printf ("\n### AC Match Finished ###\n");  
// system("pause"); 

    return (0);  