字符串查找算法:bm算法

今天有空,认真地对比了一下经典的字符串查找算法 BM 算法和 C 库查找函数 strstr 的区别,两者各有优缺点,总结如下:

BM 算法的应用场合:适合海量数据搜索,比如数据库、磁盘文件等;总之,数据量越大,其性能优势越明显。

strstr:数据量较少时比较适合,尤其是在一个几千字节的字符串中查找不同的字符串,这时候 BM 被 strstr 甩出几条街,原因是每次查找前 BM 都要先为模式串建立跳转表(预处理),而 strstr 可以立即开始匹配。其次,当待查找的字符串在全文中出现的概率较小、字符相同率很低时,strstr 的优势再次显露,比如 HTTP 协议头部的关键字,像 Host、GET、POST、User-Agent、Accept 等,这些关键字在一个请求中重复出现的概率非常小,此时用 strstr 较快。

废话这么多,上代码证明一下吧,同时附加一个改进过的 bm 算法代码:

#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <stdlib.h>
 
// Number of slots in the bad-character table bmBc[]. Printable keyboard
// characters are codes 32..126 (95 in total), all below this bound.
#define MAX_CHAR 128
// Capacity of the fixed-size suff[] and bmGs[] work arrays.
#define SIZE     128
// Larger of two values. The whole expansion is parenthesized so the macro can
// be used inside a bigger expression; each argument may be evaluated twice, so
// do not pass expressions with side effects.
#define MAX(x, y) ((x) > (y) ? (x) : (y))
 
 
/*
 * Build the Boyer-Moore bad-character shift table for a pattern.
 *
 * pattern  pattern bytes; exactly m bytes are read, no terminator is needed
 * m        pattern length
 * bmBc     output table with MAX_CHAR entries
 *
 * Every slot starts at m. Then, for each of the first m-1 pattern bytes, the
 * byte's slot is set to its distance from the last pattern position; later
 * (right-most) occurrences overwrite earlier ones.
 *
 * A byte is mapped to a slot with (unsigned char)c % MAX_CHAR, so a negative
 * plain char or a byte value >= MAX_CHAR can never index outside the table.
 * Two bytes that share a slot end up with the smaller of their shifts, which
 * keeps the table usable: a shorter bad-character shift cannot skip a match.
 */
void PreBmBc(const char *pattern, int m, int bmBc[])
{
    int i;

    for (i = 0; i < MAX_CHAR; i++)
    {
        bmBc[i] = m;
    }
    for (i = 0; i < m - 1; i++)
    {
        bmBc[(unsigned char)pattern[i] % MAX_CHAR] = m - 1 - i;
    }
}
 
/*
 * Brute-force computation of the Boyer-Moore suffix table.
 *
 * For every position p, suff[p] receives the length of the longest run of
 * characters ending at p that equals the run of the same length ending at
 * the last character of the pattern. suff[m - 1] is always m.
 *
 * pattern  pattern bytes (m of them are read)
 * m        pattern length, expected to be at least 1
 * suff     output array with m entries
 */
void suffix_old(char *pattern, int m, int suff[])
{
    int pos, len;

    /* The whole pattern matches itself. */
    suff[m - 1] = m;

    for (pos = m - 2; pos >= 0; pos--)
    {
        /* Walk left from pos and from the pattern end in lock-step. */
        len = 0;
        while (len <= pos && pattern[pos - len] == pattern[m - 1 - len])
        {
            len++;
        }

        suff[pos] = len;
    }
}
 
/*
 * Compute the Boyer-Moore suffix table, reusing earlier results.
 *
 * suff[i] is the length of the longest run of characters ending at i that
 * equals the run of the same length ending at the last pattern character;
 * suff[m - 1] is m.
 *
 * pattern  pattern bytes (m of them are read)
 * m        pattern length; nothing is written when m <= 0
 * suff     output array with m entries
 *
 * g is the left edge (exclusive) of the most recent explicit comparison run
 * and f is the position that run ended at. While i is still inside that run,
 * the value already stored for the mirrored position i + m - 1 - f can be
 * copied when it is shorter than the distance i - g; otherwise characters are
 * compared explicitly starting from g.
 */
void suffix(const char *pattern, int m, int suff[])
{
    int f, g, i;

    /* An empty pattern has no table; without this guard suff[-1] is written. */
    if (m <= 0)
    {
        return;
    }

    suff[m - 1] = m;
    g = m - 1;
    /* f is not read before the first explicit run assigns it; it is
       initialized only so that it never holds an indeterminate value. */
    f = m - 1;
    for (i = m - 2; i >= 0; --i)
    {
        if (i > g && suff[i + m - 1 - f] < i - g)
        {
            suff[i] = suff[i + m - 1 - f];
        }
        else
        {
            if (i < g)
            {
                g = i;
            }
            f = i;
            while (g >= 0 && pattern[g] == pattern[g + m - 1 - f])
            {
                --g;
            }
            suff[i] = f - g;
        }
    }
}
 
/*
 * Build the Boyer-Moore good-suffix shift table.
 *
 * pattern  pattern bytes (m of them are read)
 * m        pattern length; nothing is written when m <= 0
 * bmGs     output array; the caller must provide at least m entries
 *
 * bmGs[i] is the shift to apply after a mismatch at pattern position i.
 *
 * The suffix table normally lives in a SIZE-entry stack array. Patterns longer
 * than SIZE use a heap buffer instead of overrunning that array. If the
 * allocation fails, every shift is set to 1, which is always a valid (if
 * slow) shift.
 */
void PreBmGs(char *pattern, int m, int bmGs[])
{
    int i, j;
    int stackSuff[SIZE];
    int *suff = stackSuff;

    if (m <= 0)
    {
        return;
    }

    if (m > SIZE)
    {
        suff = malloc((size_t)m * sizeof *suff);
        if (suff == NULL)
        {
            for (i = 0; i < m; i++)
            {
                bmGs[i] = 1;
            }
            return;
        }
    }

    // Compute the suffix table.
    suffix(pattern, m, suff);

    // Start with a full-length shift everywhere; this also covers Case 3
    // (the matched suffix occurs nowhere else and no prefix matches).
    for (i = 0; i < m; i++)
    {
        bmGs[i] = m;
    }

    // Case 2: pattern[0..i] is also a suffix of the pattern (suff[i] == i + 1).
    // Positions left of the matching suffix may shift by m - 1 - i. j only
    // moves forward, so each slot keeps the first (smallest) such shift.
    j = 0;
    for (i = m - 1; i >= 0; i--)
    {
        if (suff[i] == i + 1)
        {
            for (; j < m - 1 - i; j++)
            {
                if (bmGs[j] == m)
                {
                    bmGs[j] = m - 1 - i;
                }
            }
        }
    }

    // Case 1: the suffix of length suff[i] occurs again ending at position i.
    for (i = 0; i <= m - 2; i++)
    {
        bmGs[m - 1 - suff[i]] = m - 1 - i;
    }

    if (suff != stackSuff)
    {
        free(suff);
    }
}
 
/*
 * Find the first occurrence of pattern in text with the Boyer-Moore algorithm.
 *
 * pattern     pattern bytes
 * patternLen  number of pattern bytes
 * text        bytes to search; no terminator is required
 * textLen     number of text bytes
 *
 * Returns a pointer to the first match inside text, or NULL when there is
 * none. An empty pattern (patternLen <= 0) matches at the start of text.
 *
 * The shift tables are rebuilt on every call. Patterns longer than SIZE do
 * not fit the fixed-size good-suffix table, so they are searched with a plain
 * memcmp scan instead of overflowing it.
 */
const char* BM_strstr(char *pattern, int patternLen, const char *text, int textLen)
{
    int i, j, bcShift, gsShift;
    int bmBc[MAX_CHAR], bmGs[SIZE];

    if (patternLen <= 0)
    {
        return text;
    }
    if (patternLen > textLen)
    {
        return NULL;
    }
    if (patternLen > SIZE)
    {
        for (j = 0; j <= textLen - patternLen; j++)
        {
            if (memcmp(text + j, pattern, (size_t)patternLen) == 0)
            {
                return text + j;
            }
        }
        return NULL;
    }

    // Preprocessing
    PreBmBc(pattern, patternLen, bmBc);
    PreBmGs(pattern, patternLen, bmGs);

    // Searching: compare right to left at each alignment j.
    j = 0;
    while (j <= textLen - patternLen)
    {
        for (i = patternLen - 1; i >= 0 && pattern[i] == text[i + j]; i--)
        {
        }
        if (i < 0)
        {
            return text + j;
        }

        // Reduce the mismatching text byte to a table slot so that negative
        // plain chars and bytes >= MAX_CHAR cannot read outside bmBc[]. A
        // slot shared with another byte can only hold a shift that is no
        // larger than the exact one, so no match is skipped.
        bcShift = bmBc[(unsigned char)text[i + j] % MAX_CHAR] - patternLen + 1 + i;
        gsShift = bmGs[i];
        // The good-suffix shift is at least 1, so j always advances.
        j += (bcShift > gsShift) ? bcShift : gsShift;
    }

    return NULL;
}

/*
 * Find dst at the beginning of a line inside src.
 *
 * src     bytes to search; no terminator is required
 * srcLen  number of bytes in src
 * dst     bytes to look for
 * dstLen  number of bytes in dst
 *
 * After each '\n' or '\r', any further '\n', '\r', ' ' and '\t' bytes are
 * skipped and the bytes that follow are compared with dst. Returns a pointer
 * to the first such position that matches, or NULL when none does.
 *
 * Only positions that follow a line break are tested, so a match at offset 0
 * of src is not reported.
 *
 * Neither the whitespace skip nor the comparison reads at or beyond
 * src + srcLen.
 */
const char* line_strstr(const char* src, int srcLen, const char* dst, int dstLen)
{
    const char* pline = src;
    const char* pEnd;

    if (srcLen <= 0 || dstLen < 0)
    {
        return NULL;
    }
    pEnd = src + srcLen;

    while (pline < pEnd)
    {
        if (*pline != '\n' && *pline != '\r')
        {
            ++pline;
            continue;
        }

        // Step over the line break and any blank characters that follow it.
        ++pline;
        while (pline < pEnd &&
               (*pline == '\n' || *pline == '\r' || *pline == ' ' || *pline == '\t'))
        {
            ++pline;
        }

        // Compare only when dst fits in what is left of src.
        if (pEnd - pline >= dstLen && memcmp(pline, dst, (size_t)dstLen) == 0)
        {
            return pline;
        }
    }
    return NULL;
}

/*
 * Find the first occurrence of patt in text with the Sunday algorithm.
 *
 * text     bytes to search; no terminator is required
 * textLen  number of text bytes
 * patt     pattern bytes
 * pattLen  number of pattern bytes
 *
 * Returns a pointer to the first match inside text, or NULL when there is
 * none. An empty pattern (pattLen <= 0) matches at the start of text.
 *
 * After a failed comparison at offset i, the byte just past the window,
 * text[i + pattLen], decides the shift: pattLen + 1 if that byte does not
 * occur in the pattern, otherwise the distance that lines it up with its
 * right-most occurrence. For the pattern "Host" this gives shift['H'] = 4,
 * shift['o'] = 3, shift['s'] = 2, shift['t'] = 1 and 5 for every other byte.
 */
const char* sun_strstr(const char *text, int textLen, const char *patt, int pattLen)
{
    int shift[256];
    int i, last;

    if (pattLen <= 0)
    {
        return text;
    }
    if (pattLen > textLen)
    {
        return NULL;
    }

    for (i = 0; i < 256; i++)
    {
        shift[i] = pattLen + 1;
    }
    for (i = 0; i < pattLen; i++)
    {
        // Index as unsigned char: a negative plain char must not reach the table.
        shift[(unsigned char)patt[i]] = pattLen - i;
    }

    last = textLen - pattLen;   // right-most offset where the pattern still fits
    i = 0;
    while (i <= last)
    {
        if (memcmp(text + i, patt, (size_t)pattLen) == 0)
        {
            return text + i;
        }
        if (i == last)
        {
            // No byte follows the window; reading text[textLen] would be
            // outside the buffer.
            break;
        }
        i += shift[(unsigned char)text[i + pattLen]];
    }
    return NULL;
}


// Sample HTTP request header that the benchmarks search through.
// Written as adjacent string literals, one per source line; the compiler joins
// them into a single string. Note that every header line ends with "\n\r"
// (LF followed by CR), and that the User-Agent and Referer values are each
// split across two literals.
const char *text =
    "GET /qq HTTP/1.1\n\r"
    "Connection: keep-alive\n\r"
    "Accept: image/webp,*/*;q=0.8\n\r"
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.89 Safari/537.36\n\r"
    "Referer: "
    "http://www.jd.com/?cu=true&utm_source=p.yiqifa.com&utm_medium=tuiguang&utm_campaign=t_1_792977&utm_term=2d74c473e39447508534846d9f847e2f\n\r"
    "Accept-Encoding: gzip, deflate, sdch \n\r"
    "Host: jcm.jd.com \n\r"
    "Accept-Language: zh-CN,zh;q=0.8\n\r";



/* Current time of day in microseconds, as reported by gettimeofday(). */
static unsigned long long now_usec(void)
{
    struct timeval t;

    gettimeofday(&t, 0);
    return t.tv_sec * 1000000ULL + t.tv_usec;
}

/*
 * Benchmark driver: searches the sample header `text` for "Host" TEST_CNT
 * times with each of BM_strstr, strstr, line_strstr and sun_strstr, and
 * prints the elapsed microseconds for each. Exits with status 1 as soon as
 * any search fails to find the pattern.
 */
int main(void)
{
    enum { TEST_CNT = 1000000 };
    char pattern[256] = {"Host"};
    int patLen = (int)strlen(pattern);
    int textLen = (int)strlen(text);
    unsigned long long a, b;
    int i;

    printf("patLen=%d, textLen=%d\n", patLen, textLen);

    /* Boyer-Moore: the shift tables are rebuilt on every call. */
    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i)
    {
        const char* pstr = BM_strstr(pattern, patLen, text, textLen);
        if (!pstr)
        {
            printf("error\n");
            exit(1);
        }
    }
    b = now_usec();
    /* The elapsed time is unsigned long long, so it is printed with %llu. */
    printf("BM_strstr: time=%llu\n", b - a);

    /* C library strstr. */
    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i)
    {
        const char* pstr = strstr(text, pattern);
        if (!pstr)
        {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("strstr: time=%llu\n", b - a);

    /* Line-anchored search. */
    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i)
    {
        const char* pstr = line_strstr(text, textLen, pattern, patLen);
        if (!pstr)
        {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("linestrstr: time=%llu\n", b - a);

    /* Sunday algorithm. */
    a = now_usec();
    for (i = 0; i < TEST_CNT; ++i)
    {
        const char* pstr = sun_strstr(text, textLen, pattern, patLen);
        if (!pstr)
        {
            printf("error, i=%d\n", i);
            exit(1);
        }
    }
    b = now_usec();
    printf("SUNDAY: time=%llu\n", b - a);

    return 0;
}

    原文作者:查找算法
    原文地址: https://blog.csdn.net/u011034150/article/details/46726121
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞