统计字符串中单词个数的算法优化

2019年1月26日 204次阅读来源: 玉米疯收

  
   
   
要求：输入一个字符串，统计每个单词的个数。单词间用空格隔开，可多个空格，写出自己认为高效的算法。

例如：输入：I love love China
输出为：
I： 1
love： 2
China： 1

首先想到的还是模拟的方法：用 struct 把出现过的单词缓存起来，然后在输入文本中遍历到新单词的时候，遍历一次 struct，看这个单词是不是已经存在，再做相应处理。
如果输入文本中有 n 个单词，其中不重复的单词为 m 个，则算法复杂度为 O(nm^2)，最好情况是 m = 1，最差情况是 m = n。其实现代码如下：

《统计字符串中单词个数的算法优化》

   
    
     /*
      * Listing 1: straightforward word counter.
      *
      * Reads one line from stdin, splits it on spaces (runs of spaces count as
      * a single separator), stores every token, then folds duplicate tokens
      * into their first occurrence and prints "word: count" for each distinct
      * word in first-seen order.
      */
     #include <stdio.h>
     #include <string.h>

     enum {
         INPUT_CAP = 100,            /* size of the line buffer, NUL included     */
         MAX_WORDS = INPUT_CAP / 2   /* a line of INPUT_CAP - 1 characters cannot */
                                     /* contain more single-space-separated words */
     };

     /* One token taken from the input and the number of times it was seen. */
     struct struct_words{
         char word[INPUT_CAP];       /* large enough for the longest possible token */
         int count;                  /* 0 once the entry has been merged away       */
     };

     int main(void){
         char string[INPUT_CAP];
         struct struct_words words[MAX_WORDS];
         int nwords = 0;             /* number of tokens stored in words[]    */
         int ws = 0;                 /* write position inside the current token */
         int i, k;
         char c;

         puts("please input words.");
         /* fgets bounds the read; gets() cannot and was removed in C11. */
         if (fgets(string, sizeof string, stdin) == NULL) {
             string[0] = '\0';
         }
         string[strcspn(string, "\r\n")] = '\0';   /* drop the line terminator */

         puts("=============开始取词================");

         i = 0;
         do {
             c = string[i++];
             if (c != ' ' && c != '\0') {
                 /* Guards are defensive: with the sizes above they never trip. */
                 if (nwords < MAX_WORDS && ws < INPUT_CAP - 1) {
                     words[nwords].word[ws++] = c;
                 }
             } else if (ws > 0) {
                 /* A separator ends a token only if one is in progress, so
                    leading, repeated and trailing spaces create no entries. */
                 words[nwords].word[ws] = '\0';
                 words[nwords].count = 1;
                 nwords++;
                 ws = 0;
             }
         } while (c != '\0');

         puts("=========== 合并相同的单词 ==============");
         for (i = 0; i < nwords; i++) {
             puts(words[i].word);
             if (words[i].count == 0) {
                 continue;           /* already folded into an earlier entry */
             }
             for (k = i + 1; k < nwords; k++) {
                 if (words[k].count == 1
                     && strcmp(words[i].word, words[k].word) == 0) {
                     words[k].count = 0;
                     words[i].count++;
                 }
             }
         }

         puts("=============== End ==============");
         for (i = 0; i < nwords; i++) {
             if (words[i].count != 0) {
                 printf("%s:\t\t%d\n", words[i].word, words[i].count);
             }
         }
         return 0;
     }

然后呢，做一下优化。思路是：遍历用户的输入文本是必须的，但是单词的缓存和出现次数的统计可以使用 hash 算法来优化。借用 hash 算法的特性，复杂度立刻就降低到了 O(n)，实现代码如下：

《统计字符串中单词个数的算法优化》

   
    
    
#include <stdio.h>
#include <string.h>

/* Number of hash buckets: hash() returns a value in the range [0, N). */
#define N 100

/* One table slot: a stored word and how many times it occurred.
   A count of 0 marks the slot as unused. */
struct struct_words{
    char word[100];
    int count;
};

/*
 * ELF-style string hash, reduced to a bucket index.
 *
 * key: NUL-terminated string to hash (not modified).
 * Returns an index in [0, N). The empty string hashes to 0.
 *
 * Different strings can map to the same index, so the caller has to
 * compare the stored word and resolve collisions itself.
 */
int hash(char* key)
{
    unsigned long h = 0;

    while (*key) {
        /* Go through unsigned char so bytes >= 0x80 are never added as
           negative values on platforms where plain char is signed. */
        h = (h << 4) + (unsigned char)*key++;

        /* Fold the top nibble of the 32-bit value back into the low bits
           and clear it, which keeps h below 2^28 between iterations. */
        unsigned long g = h & 0xF0000000UL;
        if (g) {
            h ^= g >> 24;
        }
        h &= ~g;
    }

    /* Reduce with modulo. A bit mask is only a valid reduction when the
       mask is 2^k - 1; masking with 100 leaves just 8 reachable buckets. */
    return (int)(h % N);
}

    int
     main(){
 
    char
     
    string
    [
    1000
    ];
 
    char
     current_word[
    100
    ];
 
    char
     c;
 
    struct
     struct_words words[
    200
    ]; 
 
    int
     i 
    =
     
    0
    , k 
    =
     
    0
     , ws 
    =
    0
     , key;
 
    int
     keys[
    100
    ];

 
    for
    (; i 
    <
     
    200
    ; i
    ++
    ){
 words[i].word[
    0
    ] 
    =
     
    '
    \0
    '
    ;
 words[i].count 
    =
     
    0
    ;
 } 
 puts(
    "
    =============输入一些单词，用空格隔开================
    "
    );
 gets(
    string
    );

 i 
    =
     
    0
    ;
 
    do
    { 
 c 
    =
     
    string
    [i];
 
    //
    如果第一个单词前有空格，跳过去
    

     
    if
    ( ws 
    ==
     
    0
     
    &&
     c 
    ==
     
    '
     
    '
    ) {i
    ++
     ; 
    continue
    ;}
 
    if
    (c 
    !=
     
    '
     
    '
     
    &&
     c 
    !=
    '
    \0
    '
    ){
 current_word[ws] 
    =
     c;
 ws 
    ++
    ; 
 }
    else
    {
 current_word[ws] 
    =
     
    '
    \0
    '
    ;
 key 
    =
     hash(current_word);
 
    if
    (words[key].count 
    ==
     
    0
    ){ 
 strcpy(words[key].word, current_word);
 keys[k] 
    =
     key;
 k
    ++
    ;
 } 
 words[key].count 
    ++
    ; 
 ws 
    =
     
    0
    ;
 }
 i 
    ++
    ;
 }
    while
    (c 
    !=
     
    '
    \0
    '
    );

 printf(
    "
    %d
    "
     ,k);
 puts(
    "
    ===============打印結果 ==============
    "
    );
 
    for
    (i 
    =
     
    0
     ; i 
    <
     k ;i
    ++
    ){
 printf(
    "
    %s:\t\t%d\n
    "
    ,words[keys[i]].word, words[keys[i]].count);
 }
 puts(
    "
    =============== End ==============
    "
    );
 
    return
     
    0
    ;
}

呵呵，弄了近三个小时，发现Linux下gdb不熟太痛苦了，加油！

    原文作者：玉米疯收
    原文地址: https://www.cnblogs.com/amboyna/archive/2009/12/05/1617387.html
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。