Posix Regex Search中捕获组不正确

在C项目中,我编写了一个函数来返回正则表达式搜索中的第一个捕获组.

我希望实现的目标最好由output of this online parser描述(注意右侧面板上的捕获组输出).

我写的函数和测试代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <string.h>
#include <assert.h>

typedef int bool;
#define true 1
#define false 0

/*
 * Obtains the first group that matches the regex pattern in input string
 * The output pointer is made to point to:
 *  - in case the regexp compilation succeeded
 *      - the result in case there was a match found
 *      - or NULL in case there was no match
 *  - in case the regexp compilation failed
 *      - the error from the compilation process
 *
 * If there was an error while compiling the input reg_exp, then this function
 * returns false, if not, it returns true.
 *
 * NOTE: The user is responsible for free-ing the memory for *output
 */
bool get_first_match(const char* search_str, const char* reg_exp, char** output)
{
    int res, len;
    regex_t preg;
    regmatch_t pmatch;

    // Compile the input regexp
    if( (res = regcomp(&preg, reg_exp, REG_EXTENDED)) != 0)
    {
        char* error = (char*)malloc(1024*sizeof(char));
        regerror(res, &preg, error, 1024);
        output = &error;
        return false;
    }

    res = regexec(&preg, search_str, 1, &pmatch, REG_NOTBOL);
    if(res == REG_NOMATCH)
    {
        return true;
    }

    len = pmatch.rm_eo - pmatch.rm_so;
    char* result = (char*)malloc( (len + 1) * sizeof(char) );
    memcpy(result, search_str + pmatch.rm_so, len);
    result[len] = 0; // null-terminate the result
    *output = result;
    regfree(&preg);
    return true;
}

int main()
{
    const char* search_str = "param1=blah&param2=blahblah&param3=blahetc&map=/usr/bin/blah.map";
    const char* regexp = "map=([^\\&]*)(&|$)";
    char* output;
    bool status = get_first_match(search_str, regexp, &output);
    if(status){
       if(output) 
           printf("Found match: %s\n", output);
       else
           printf("No match found.");
    }
    else{
       printf("Regex error: %s\n", output);
    }
    free(output);

    return 0;
}

但是,output I get from the C code中包含map =字符串的一部分,即使我已经在第一个捕获组中明确排除了它.

如果没有map = part,我该怎么做才能获得捕获组?为什么与我的C程序相比,我从在线解析器获得不同的结果?

最佳答案 这里发生的是,你有这样的模式:

const char* regexp = "map=([^\\&]*)(&|$)";

其中,结果(让我们称之为数组结果)将根据以下内容填充:

result = {
    "map=/usr/bin/blah.map",
    "/usr/bin/blah.map",
    ""
}

现在,因为您正在调用regexc,如下所示:

res = regexec(&preg, search_str, 1, &pmatch, REG_NOTBOL);
// Notice the argument 1 here ---^

参数1表示最多只有一个结果存储在pmatch数组中.因此,您从上面得到结果[0].由于您需要第一个匹配的组(而不是整个匹配的字符串),您必须:

>将pmatch定义为大小至少为2的数组.
>传递2作为上面regexc调用的参数.

完成上述操作后:

bool get_first_match(const char* search_str, const char* reg_exp, char** output)
{
    int res, len;
    regex_t preg;
    regmatch_t pmatch[3];
    // SNIP
    // SNIP
    res = regexec(&preg, search_str, 2, &pmatch, REG_NOTBOL);
    if(res == REG_NOMATCH)
    {
        return true;
    }
    // Notice changes in the lines below
    // I am using pmatch[1] since that is equivalent to our
    // result[1] from above
    len = pmatch[1].rm_eo - pmatch[1].rm_so;
    char* result = (char*) malloc( (len + 1) * sizeof(char) );
    memcpy(result, search_str + pmatch[1].rm_so, len);
    result[len] = 0; // null-terminate the result
    *output = result;
    regfree(&preg);
    return true;
}

和程序works as expected.

点赞