KMP 字符串搜索算法的 C++ 实现

2023年8月4日 344次阅读来源: KMP算法

说明：

KMP 字符串搜索算法是基于自动状态机的高效搜索算法
假如被搜索的字符串(长串)长度为 n, 要搜索的字符串(子串)长度为 m，则其空间复杂度为 O(m)，时间复杂度为 O(n+m)
由于该算法是基于自动状态机的，对于字符串分布在不同位置时的处理较简单(某些公司面试题不是要在不连续内存中搜索字符串嘛，可以用 KMP 算法简单实现哦，不用考虑字符串拼接、中间复杂状态处理之类的问题)

算法实现：

/**
 * @file -
 * @author jingqi
 * @date 2012-10-18
 * @last-edit 2012-11-13 21:36:50 jingqi
 */

#ifndef ___HEADFILE_A04A7FB7_1516_4EF0_A8B9_44C5AABBF7EC_
#define ___HEADFILE_A04A7FB7_1516_4EF0_A8B9_44C5AABBF7EC_

#include <assert.h>
#include <string>

namespace nut
{

/**
 * Builds the KMP failure table ("next" array) for a pattern.
 *
 * After the call, next[i] is the length of the longest proper prefix of
 * target[0..i] that is also a suffix of target[0..i].
 *
 * @param target the pattern to search for; at least len characters are read
 * @param next   output buffer with room for len ints
 * @param len    length of the pattern; when 0 the call does nothing
 */
inline void kmp_build_next(const char *target, int *next, size_t len)
{
    // An empty pattern has no table to fill. Returning here (instead of
    // asserting len > 0) keeps debug and release builds consistent.
    if (0 == len)
        return;
    assert(NULL != target && NULL != next);

    next[0] = 0;
    int state = 0; // length of the prefix currently matched against the suffix
    size_t i = 1;
    while (i < len)
    {
        if (target[i] == target[state])
        {
            // The matched prefix grows by one character.
            ++state;
            next[i++] = state;
        }
        else if (0 == state)
        {
            // No shorter prefix left to try.
            next[i++] = 0;
        }
        else
        {
            // Fall back to the next shorter candidate prefix and retry target[i].
            state = next[state - 1];
        }
    }
}

/**
 * Advances the KMP matcher by one input character.
 *
 * The function reads target[state], so state must be a readable index of
 * target (for a NUL-terminated pattern this includes the terminator index).
 *
 * @param c      the next input character
 * @param state  current state, i.e. the number of pattern characters matched so far
 * @param target the pattern being searched for
 * @param next   failure table produced by kmp_build_next() for target
 * @return the new state after consuming c
 */
inline int kmp_update(char c, int state, const char *target, const int *next)
{
    assert(state >= 0 && NULL != target && NULL != next);

    // Drop to shorter and shorter matched prefixes until c extends one of
    // them or no prefix is left.
    while (state > 0 && c != target[state])
        state = next[state - 1];

    return (c == target[state]) ? state + 1 : 0;
}

/**
 * KMP search over a contiguous buffer.
 *
 * @param src        the text to search in
 * @param len_src    length of src
 * @param start      index in src at which the search starts
 * @param target     the pattern to search for
 * @param next       failure table for target (same length as target)
 * @param len_target length of target
 * @return index in src of the first match found at or after start, or -1 when
 *         there is none. An empty pattern matches at start, provided that
 *         start <= len_src.
 */
inline int kmp_search(const char *src, size_t len_src, size_t start, const char *target, const int *next, size_t len_target)
{
    assert(NULL != src && NULL != target && NULL != next);

    // A start position beyond the end of the text can never produce a match,
    // not even for an empty pattern (otherwise an out-of-range index would be
    // reported as a hit).
    if (start > len_src)
        return -1;

    size_t i = start;
    int state = 0; // number of pattern characters matched so far
    while (i < len_src && static_cast<size_t>(state) < len_target)
        state = kmp_update(src[i++], state, target, next);

    if (static_cast<size_t>(state) == len_target)
        return static_cast<int>(i - len_target); // i is one past the end of the match
    return -1;
}

/**
 * KMP search over std::string.
 *
 * The failure table for target is built and released inside this call. When
 * the table should be reused, or the text is stored in several separate
 * pieces, combine kmp_build_next(), kmp_update() and the buffer-based
 * kmp_search() directly, as this function does.
 *
 * @param src    the text to search in
 * @param start  index in src at which the search starts
 * @param target the pattern to search for
 * @return index in src of the first match found at or after start, or -1 when
 *         there is none. An empty target matches at start, provided that
 *         start <= src.length().
 */
inline int kmp_search(const std::string& src, size_t start, const std::string& target)
{
    const size_t len_target = target.length();

    // Handle the empty pattern up front: there is no table to build for it.
    if (0 == len_target)
        return (start <= src.length()) ? static_cast<int>(start) : -1;

    // Neither helper called between new[] and delete[] throws, so the buffer
    // is always released.
    int *next = new int[len_target];
    kmp_build_next(target.c_str(), next, len_target);
    const int ret = kmp_search(src.c_str(), src.length(), start, target.c_str(), next, len_target);
    delete[] next;
    return ret;
}

}

#endif

算法的两种用法：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "kmp.hpp"

using namespace nut;

/**
 * Demonstrates the two ways of using the KMP helpers: searching one
 * contiguous string, and searching text that is split across two buffers.
 *
 * @return always 0
 */
int main(int argc, const char *argv[])
{
    // Usage 1: the text is one contiguous buffer.
    const char *s1 = "abcdcdefgh", *s2 = "cde";
    printf("%d\n", kmp_search(s1, 0, s2)); // expected output: 4

    // Usage 2: the text is split across p1 and p2; the matcher state is simply
    // carried over from one piece to the next, so no concatenation is needed.
    const char *p1 = "abcdc", *p2 = "defgh";
    const int len = static_cast<int>(::strlen(s2));
    int *next = new int[len];
    kmp_build_next(s2, next, len); // build the KMP failure table

    int i = 0, state = 0;
    while (p1[i] != '\0' && state < len)
        state = kmp_update(p1[i++], state, s2, next);

    // Characters consumed so far, counted over the logical concatenation p1 + p2.
    int consumed = i;
    if (state < len)
    {
        // Not found inside p1 alone: keep feeding characters from p2.
        i = 0;
        while (p2[i] != '\0' && state < len)
            state = kmp_update(p2[i++], state, s2, next);
        consumed += i;
    }

    // Release the table on every path before reporting the result.
    delete[] next;

    if (state == len) // match found; the position is relative to p1 + p2, expected output: 4
        printf("%d\n", consumed - len);

    return 0;
}

    原文作者：KMP算法
    原文地址: https://blog.csdn.net/jingqi_se/article/details/14055709
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。