c# – 快速替换文件中的行首

2023年9月17日 243次阅读

我有一个包含如下行的初始文件：

34    964:0.049759 1123:0.0031 2507:0.015979 
32,48 524:0.061167 833:0.030133 1123:0.002549
34,52 534:0.07349 698:0.141667 1123:0.004403 
106   389:0.013396 417:0.016276 534:0.023859

每行的第一部分是类号(多个类号之间用逗号分隔),因此同一行可以属于多个类。

对于每个类,我创建一个新文件.

例如,对于第34类,生成的文件将是：

+1 964:0.049759 1123:0.0031 2507:0.015979 
-1 524:0.061167 833:0.030133 1123:0.002549
+1 534:0.07349 698:0.141667 1123:0.004403 
-1 389:0.013396 417:0.016276 534:0.023859

对于类106,结果文件将是：

-1 964:0.049759 1123:0.0031 2507:0.015979 
-1 524:0.061167 833:0.030133 1123:0.002549
-1 534:0.07349 698:0.141667 1123:0.004403 
+1 389:0.013396 417:0.016276 534:0.023859

问题是我有13个这样的文件,每个文件都要为200个类各生成一份输出,总共需要写2600个文件。
我已经运行了一个不太优化的代码版本,花了几个小时.
使用下面的代码,生成2600个文件需要1个小时.

有没有办法以更快的方式执行这样的替换？正则表达式是可行的选择吗？

下面是我的实现(可以在LINQPad中配合这个数据文件运行)

/// <summary>
/// Reads the input file once and, for every class value returned by
/// <c>GetClassValues</c>, writes a copy of it in which the leading class list of
/// each line is replaced by "+1" (the line lists that class) or "-1" (it does not).
/// Each copy is written to a folder named after the class value.
/// </summary>
static void Main()
{
    const string filePath = @"C:\data.txt";
    const string generatedFilesFolderPath = @"C:\";
    const string fileName = "data";

    using (new TimeIt("Whole process"))
    {
        // Parse every line exactly once, up front. Key is the comma-separated
        // class list already split into its class numbers, Value is the text
        // that is copied through unchanged. Doing the comma split here rather
        // than inside the per-class loop removes one Split call and one HashSet
        // allocation per line per class.
        var fileLines = File.ReadLines(filePath)
            .Select(l =>
            {
                var parts = l.Split(new[] { ' ' }, 2);
                return new KeyValuePair<string[], string>(
                    parts[0].Split(','),
                    parts[parts.Length - 1]);
            })
            .ToList();

        var classValues = GetClassValues();
        foreach (var classValue in classValues)
        {
            var directoryPath = Path.Combine(generatedFilesFolderPath, classValue);

            // CreateDirectory does nothing when the directory already exists,
            // so no separate existence check is needed.
            Directory.CreateDirectory(directoryPath);

            var classFilePath = Path.Combine(directoryPath, fileName);

            using (var file = new StreamWriter(classFilePath))
            {
                foreach (var line in fileLines)
                {
                    // Ordinal string equality, the same comparison the default
                    // HashSet<string> comparer performs.
                    var newFirstPart = Array.IndexOf(line.Key, classValue) >= 0 ? "+1" : "-1";

                    file.WriteLine("{0} {1}", newFirstPart, line.Value);
                }
            }
        }
    }

    // Keep the console window open until a key is pressed.
    Console.Read();
}

/// <summary>
/// Builds the list of class values to generate files for, as strings.
/// </summary>
/// <returns>The class values "0" and "1", in ascending order.</returns>
public static List<string> GetClassValues()
{
    // In real life there is 200 class values.
    const int classCount = 2;

    var values = new List<string>();
    for (var classId = 0; classId < classCount; classId++)
    {
        values.Add(classId.ToString());
    }

    return values;
}

/// <summary>
/// Measures how long a <c>using</c> scope takes: timing starts in the constructor
/// and the elapsed time is written to the console when the instance is disposed.
/// </summary>
public class TimeIt : IDisposable
{
    private readonly Stopwatch _stopwatch = new Stopwatch();
    private readonly string _label;

    /// <summary>Starts timing.</summary>
    /// <param name="name">Label printed in front of the elapsed time.</param>
    public TimeIt(string name)
    {
        _label = name;
        _stopwatch.Start();
    }

    /// <summary>Stops timing and prints "<c>name</c> took <c>elapsed</c>".</summary>
    public void Dispose()
    {
        _stopwatch.Stop();
        TimeSpan elapsed = _stopwatch.Elapsed;
        Console.WriteLine("{0} took {1}", _label, elapsed);
    }
}

输出：

Whole process took 00:00:00.1175102

编辑：我也运行了性能分析器,结果显示Split方法是最耗时的热点.

编辑2：简单的例子：

2,1 1:0.8 2:0.2
3   1:0.4 3:0.6
12  1:0.02 4:0.88 5:0.1

第2类的预期输出：

+1 1:0.8 2:0.2
-1 1:0.4 3:0.6
-1 1:0.02 4:0.88 5:0.1

第3类的预期产出：

-1 1:0.8 2:0.2
+1 1:0.4 3:0.6
-1 1:0.02 4:0.88 5:0.1

第4类的预期产出：

-1 1:0.8 2:0.2
-1 1:0.4 3:0.6
-1 1:0.02 4:0.88 5:0.1

最佳答案：通过去掉Split调用,并为FileStream使用更大的缓冲区,我消除了代码中最热的路径.

我不再调用Split,而是调用ToCharArray,然后逐个字符解析到第一个空格为止,同时把这些字符与classValue逐字符匹配.布尔变量found表示第一个空格之前(以逗号分隔)的某个类号与classValue完全匹配.其余的处理保持不变.

// Writes the relabelled copy of fileLines for classValue to classFilePath.
// The comma-separated class list in front of the first space of each line is
// replaced by "+1" when it contains classValue and by "-1" otherwise; the rest
// of the line (from the first space on) is copied unchanged.
var fsw = new FileStream(classFilePath,
    FileMode.Create,
    FileAccess.Write,
    FileShare.None,
    64*1024*1024); // use a large buffer
using (var file = new StreamWriter(fsw)) // use the filestream
{
    foreach (var line in fileLines)
    {
        char[] chars = line.ToCharArray();
        int matched = 0;       // leading chars of the current token that equal classValue so far
        bool takeClass = true; // false once the current token can no longer equal classValue
        bool found = false;    // true once any token equals classValue exactly
        int parsePos = 0;

        // Scan the class list up to the first space. The bounds check comes
        // before every read, so an empty line or a line without any space is
        // scanned to its end instead of indexing past the array.
        while (parsePos < chars.Length && chars[parsePos] != ' ')
        {
            if (chars[parsePos] == ',')
            {
                // End of a token: it is a match only if it was never rejected
                // and consumed every char of classValue.
                if (takeClass && matched == classValue.Length)
                {
                    found = true;
                }

                // reset matching for the next token
                takeClass = true;
                matched = 0;
            }
            else if (takeClass
                && matched < classValue.Length
                && chars[parsePos] == classValue[matched])
            {
                matched++; // on the next iteration, match next
            }
            else
            {
                takeClass = false; // no match!
            }

            parsePos++;
        }

        // The last token is terminated by the space or by the end of the line.
        if (takeClass && matched == classValue.Length)
        {
            found = true;
        }

        // Write the label on its own instead of patching it into the two array
        // slots in front of the space: those slots do not exist when the class
        // list is shorter than two chars (including a line that starts with a
        // space), which previously needed a special case or failed outright.
        file.Write(found ? "+1" : "-1");
        file.WriteLine(chars, parsePos, chars.Length - parsePos);
    }
}