c# – Soundex算法实现输出错误的情况 – “Tymczak”和“Pfister”

当我根据
Wikipedia article on Soundex测试算法Soundex时,我发现Tymczak返回T520,而不是T522,Pfister返回P123,而不是P236.

我不知道为什么输出不正确.

我的代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace ConsoleApplication4
{
   class Program
   {
     static void Main(string[] args)
     {
        string s = "Tymczak";
        string result = SoundexByWord(s);
        Console.WriteLine(result);
    }

    private static string Soundex(string data)
    {
        string first = "pv";
        string second = "xz";
        string third = "dt";
        string forth = "mn";
        string fifth = "bf";
        string sixth = "cgj";
        string seventh = "kqs";
        //ana ast5dtmt string builder 34an 3aml zy al list fy apend 
        // 34an a apend mn al a5r al string 

        StringBuilder result = new StringBuilder();
        if (data != null && data.Length > 0)
        {
            string previousCode = "", currentCode = "", currentLetter = "";

            // append on the string from the last 
            // get the first characheter of the string data 
            // append it on the result
            //according to algorithm first charachter stays the same 
            result.Append(data.Substring(0, 1));
            RemoveUnwantedChar(data);
            for (int i = 1; i < data.Length; i++)
            {
                // nb2d al algorithm first take the second characheter in data 
                //n7wlo la lower  
                currentLetter = data.Substring(i, 1).ToLower();
                currentCode = "";
                // No string for zero because we will remove it 
                if (first.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "1";

                else if (fifth.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "1";

                else if (sixth.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "2";
                else if (seventh.IndexOf(currentLetter) > -1)
                    //search for bfpv in the current letter return number 
                    // -1 is out of string index 
                    currentCode = "2";
                else if (second.IndexOf(currentLetter) > -1)
                    currentCode = "2";
                else if (third.IndexOf(currentLetter) > -1)
                    currentCode = "3";
                else if (currentLetter == "l")
                    currentCode = "4";
                else if (forth.IndexOf(currentLetter) > -1)
                    currentCode = "5";
                else if (currentLetter == "r")
                    currentCode = "6";
                if (currentCode != previousCode)
                    result.Append(currentCode);

                if (result.Length == 4) break;

                if (currentCode != "")
                    previousCode = currentCode;
            }
        }

        if (result.Length < 4)
            result.Append(new String('0', 4 - result.Length));

        return result.ToString().ToUpper();
    }

    public static string RemoveUnwantedChar(string input)
    {
        return Regex.Replace(input, "aeiouyhw", "");
    }
    private static string SoundexByWord(string data)
    {
        var soundexes = new List<string>();
        foreach (var str in data.Split(' '))
        {
            soundexes.Add(Soundex(str));
        }

        #if Net35OrLower
       // string.Join in .Net 3.5 and 
        //before require the second parameter to be an array.
        return string.Join(" ", soundexes.ToArray());
        #endif
        // string.Join in .Net 4 has an overload 
        //that takes IEnumerable<string>
        return string.Join(" ", soundexes);
    }
  }
}

最佳答案 这并没有告诉你代码出错的地方,甚至可能不是最快的解决方案,但它似乎是正确的例子并且只有几行代码..

它实现了second version of the algorithm的六个步骤.

string Soundex(string input)
{
    // character groups: the 1st one are vowels to remove
    // the other groups are characters to replace by the group index
    List<string> groups = new List<string>() 
                         { "aeiouy", "bfpv", "cgjkqsxz", "dt", "l", "mn", "r" };

    // save the 1st character (1)
    string first = input.Substring(0, 1);
    string s = input.ToLower();

    // remove unconditionally (1)
    s = s.Replace("h", "").Replace("w", "");

    // replace characters in all replacement groups (2)
    for (int g = 1; g < groups.Count; g++)
        for (int i = 0; i < groups[g].Length; i++)
            s = s.Replace(groups[g][i], ((char)(g + (byte)'0')));

    // replace repeating digits (3)
    // NOTE: this step actually should be repeated until the length no longer changes!!!
    for (int i = 1; i < 10; i++) s = s.Replace(i + "" + i, i + "");

    // now remove characters from group 0: (4)
    for (int i = 0; i < groups[0].Length; i++)  s = s.Replace(groups[0][i].ToString(), "");

    // remove the first if it is a digit (5)
    if ( (s[0] >= '0') && (s[0] <= '9') ) s = s.Substring(1);

    // add saved first to max 3 digits and pad if needed (6)
    return (first + s.Substring(0, Math.Min(3, s.Length))).PadRight(4, '0');
}
点赞