如何优化我的C# WinForms代码以高效地将大型XML文件拆分成较小的文件?

huangapple go评论79阅读模式
英文:

How can I optimize my C# WinForms code for splitting large XML files into smaller ones efficiently?

问题

以下是您的代码部分的翻译:

// Path of the source XML file; assigned in TasksAsync and read by create_file.
public string XML_file_path;

// Worker used to run TasksAsync; a new instance is created on every button1_Click.
BackgroundWorker bw;

/// <summary>Creates the form and runs InitializeComponent() to set it up.</summary>
public Form1() => InitializeComponent();

/// <summary>
/// Appends <paramref name="text"/> to Progress_output_text. When InvokeRequired is true
/// the call is re-dispatched to itself through Progress_output_text.Invoke.
/// </summary>
/// <param name="text">Text to append to the output box.</param>
private void AppendText(string text)
{
    // Fast path: the control can be touched directly.
    if (!Progress_output_text.InvokeRequired)
    {
        Progress_output_text.AppendText(text);
        return;
    }

    Action<string> reenter = AppendText;
    Progress_output_text.Invoke(reenter, text);
}

/// <summary>
/// Click handler: disables Generate_Button, then runs TasksAsync on a new BackgroundWorker
/// and re-enables the button from the worker's RunWorkerCompleted event.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
// The handler contains no await, so it is a plain void method rather than async void.
private void button1_Click(object sender, EventArgs e)
{
    Generate_Button.Enabled = false;

    // Start the background worker.
    // NOTE(review): RunWorkerCompleted fires as soon as the DoWork delegate returns, so
    // TasksAsync has to finish all of its work before returning for the button state to be right.
    bw = new BackgroundWorker();
    bw.DoWork += (obj, ea) => TasksAsync(1);
    bw.RunWorkerCompleted += (obj, ea) => Generate_Button.Enabled = true; // re-enable once the operation completes
    bw.RunWorkerAsync();
}

/// <summary>
/// Validates the form inputs, counts the "Item" elements in the source XML file and then
/// calls create_file once per chunk so that every output file receives the configured
/// number of items (the last file receives the remainder).
/// </summary>
/// <param name="times">Not used by this method; kept so existing callers keep compiling.</param>
/// <remarks>
/// The method is synchronous: the earlier <c>await Task.Delay(2000)</c> made it return to
/// the caller at the first await, long before the files were written, and added two seconds
/// of idle time for every ten files.
/// </remarks>
private void TasksAsync(int times)
{
    // Read each control once up front and work with plain locals afterwards.
    string sourcePath = File_Path_Textbox.Text;
    string destinationFolder = File_Destination_Textbox.Text;
    decimal itemsPerFileSetting = Number_Of_Elements_Per_file.Value;
    string Error_code_save = "";

    XML_file_path = sourcePath;

    if (itemsPerFileSetting < 1)
    {
        Error_code_save += "错误:XML文件数不能为零\r";
    }
    else
    {
        // This branch used to be guarded by a condition that could never be true,
        // so the confirmation line was never shown.
        Progress_output_text.Invoke((MethodInvoker)delegate
        {
            Progress_output_text.Text += "您设置了每个文件 " + itemsPerFileSetting + " 个项目\r";
        });
    }

    if (string.IsNullOrEmpty(sourcePath) || string.IsNullOrEmpty(destinationFolder))
    {
        Error_code_save += "错误:未设置路径和/或目的地,请设置后再生成\r";
    }

    if (!string.IsNullOrEmpty(Error_code_save))
    {
        MessageBox.Show(Error_code_save);
        return;
    }

    int itemsPerFile = (int)itemsPerFileSetting;

    // Path handles extensions of any length; the old "Length - 4" assumed exactly ".xml".
    string file_name = System.IO.Path.GetFileNameWithoutExtension(sourcePath);

    // First pass: count the items so the number of output files is known.
    int total_items_in_XML = 0;
    try
    {
        using (XmlReader reader = XmlReader.Create(sourcePath))
        {
            while (reader.Read())
            {
                if (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                {
                    total_items_in_XML++;
                }
            }
        }
    }
    catch (Exception ex) when (ex is XmlException || ex is System.IO.IOException || ex is UnauthorizedAccessException)
    {
        // A missing, unreadable or malformed input file is reported instead of escaping the worker.
        MessageBox.Show(ex.Message);
        return;
    }

    // Ceiling division in integer arithmetic (long avoids overflow in the intermediate sum).
    int total_files_at_end = (int)((total_items_in_XML + (long)itemsPerFile - 1) / itemsPerFile);
    int filesCreated = 0;
    int Current_item_line = 0;

    for (int i = 1; i <= total_files_at_end; i++)
    {
        string full_file_path = System.IO.Path.Combine(destinationFolder, file_name + i + ".xml");
        try
        {
            create_file(full_file_path, Current_item_line);
            filesCreated++;
        }
        catch (Exception ex)
        {
            string failure = "创建文件时出现错误: " + ex.Message;
            Console.WriteLine(failure);

            // Also surface the failure in the form's output box.
            Progress_output_text.Invoke((MethodInvoker)delegate
            {
                Progress_output_text.Text += failure + "\r\n";
            });
        }

        Current_item_line += itemsPerFile;

        // Report progress after the file has been handled so 100 means "all files processed".
        int progressValue = (int)(i * 100.0 / total_files_at_end);
        progressBar1.Invoke((MethodInvoker)delegate
        {
            progressBar1.Value = progressValue;
        });
    }

    Progress_output_text.Invoke((MethodInvoker)delegate
    {
        Progress_output_text.Text += filesCreated + " 个文件已经生成在 " + destinationFolder + "\r\n";
    });
}

/// <summary>
/// Writes one output file whose root element is "CNJExport" and whose children are the
/// "Item" elements of XML_file_path at zero-based positions
/// [Current_item_line, Current_item_line + Number_Of_Elements_Per_file.Value).
/// </summary>
/// <param name="full_file_path">Path handed to XmlWriter.Create for the output file.</param>
/// <param name="Current_item_line">Zero-based index of the first Item to copy.</param>
public void create_file(string full_file_path, int Current_item_line)
{
    // Read the control once instead of on every loop iteration.
    int endExclusive = Current_item_line + (int)Number_Of_Elements_Per_file.Value;

    using (XmlWriter writer = XmlWriter.Create(full_file_path))
    {
        writer.WriteStartDocument();
        writer.WriteStartElement("CNJExport");

        using (XmlReader reader = XmlReader.Create(XML_file_path))
        {
            int itemCounter = 0;
            bool hasNode = reader.Read();

            // Stop as soon as the last wanted item has been copied; the rest of the
            // input cannot contribute to this file.
            while (hasNode && itemCounter < endExclusive)
            {
                if (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                {
                    bool selected = itemCounter >= Current_item_line;
                    itemCounter++;

                    if (selected)
                    {
                        // WriteNode leaves the reader ON the node that follows </Item>.
                        // Calling Read() here would skip that node, which drops every
                        // second item when items are not separated by whitespace.
                        writer.WriteNode(reader, true);
                        hasNode = !reader.EOF;
                        continue;
                    }
                }

                hasNode = reader.Read();
            }
        }

        writer.WriteEndElement();
        writer.WriteEndDocument();
    }
}

希望这可以帮助您理解代码的中文翻译。如果您需要更多帮助,请随时提问。

英文:

I have a large XML file with 8 million nodes (around 1 GB). I am currently using XmlReader to read the file and XmlWriter to write the nodes into new XML files. I have noticed that the speed is very inconsistent: with 800,000 nodes per file it takes about 10 minutes, but with 80,000 nodes per file it takes 15-20 minutes, so for some reason it takes longer the more files it makes.

XML file:

<CNJExport>
<Item>
<ID>1</ID>
<name>Logitech MX Master 3 mouse</name>
<price>423.36</price>
</Item>
</CNJExport>

Current code :
This part is the code without file making and writing

// Path of the source XML file; assigned in TasksAsync and read by create_file.
public string XML_file_path;
        
// Worker used to run TasksAsync; a new instance is created on every button1_Click.
BackgroundWorker bw;

/// <summary>Creates the form and runs InitializeComponent() to set it up.</summary>
public Form1() => InitializeComponent();


/// <summary>
/// Appends <paramref name="text"/> to Progress_output_text. When InvokeRequired is true
/// the call is re-dispatched to itself through Progress_output_text.Invoke.
/// </summary>
/// <param name="text">Text to append to the output box.</param>
private void AppendText(string text)
{
    if (Progress_output_text.InvokeRequired)
    {
        // The generic argument is written with real angle brackets; the HTML-escaped
        // form (&lt;string&gt;) is not valid C#.
        Progress_output_text.Invoke(new Action<string>(AppendText), text);
    }
    else
    {
        Progress_output_text.AppendText(text);
    }
}


/// <summary>
/// Click handler: disables Generate_Button, then runs TasksAsync on a new BackgroundWorker
/// and re-enables the button from the worker's RunWorkerCompleted event.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
// The handler contains no await, so it is a plain void method rather than async void.
private void button1_Click(object sender, EventArgs e)
{
    Generate_Button.Enabled = false;

    // Start the background worker.
    // NOTE(review): RunWorkerCompleted fires as soon as the DoWork delegate returns, so
    // TasksAsync has to finish all of its work before returning for the button state to be right.
    bw = new BackgroundWorker();
    bw.DoWork += (obj, ea) => TasksAsync(1);
    bw.RunWorkerCompleted += (obj, ea) => Generate_Button.Enabled = true; // Enable the button after the operation completes
    bw.RunWorkerAsync();
}

/// <summary>
/// Validates the form inputs, counts the "Item" elements in the source XML file and then
/// calls create_file once per chunk so that every output file receives the configured
/// number of items (the last file receives the remainder).
/// </summary>
/// <param name="times">Not used by this method; kept so existing callers keep compiling.</param>
/// <remarks>
/// The method is synchronous: the earlier <c>await Task.Delay(2000)</c> made it return to
/// the caller at the first await, long before the files were written, and added two seconds
/// of idle time for every ten files.
/// </remarks>
private void TasksAsync(int times)
{
    // Read each control once up front and work with plain locals afterwards.
    string sourcePath = File_Path_Textbox.Text;
    string destinationFolder = File_Destination_Textbox.Text;
    decimal itemsPerFileSetting = Number_Of_Elements_Per_file.Value;
    string Error_code_save = "";

    XML_file_path = sourcePath;

    if (itemsPerFileSetting < 1)
    {
        Error_code_save += "ERROR: Number of XML files cannot be zero\r";
    }
    else
    {
        // This branch used to be guarded by a condition that could never be true,
        // so the confirmation line was never shown.
        Progress_output_text.Invoke((MethodInvoker)delegate
        {
            Progress_output_text.Text += "You set " + itemsPerFileSetting + " items per file\r";
        });
    }

    if (string.IsNullOrEmpty(sourcePath) || string.IsNullOrEmpty(destinationFolder))
    {
        Error_code_save += "ERROR: Path and/or Destination have not been set, please set them and generate again";
    }

    if (!string.IsNullOrEmpty(Error_code_save))
    {
        MessageBox.Show(Error_code_save);
        return;
    }

    int itemsPerFile = (int)itemsPerFileSetting;

    // Path handles extensions of any length; the old "Length - 4" assumed exactly ".xml".
    string file_name = System.IO.Path.GetFileNameWithoutExtension(sourcePath);

    // First pass: count the items so the number of output files is known.
    int total_items_in_XML = 0;
    try
    {
        using (XmlReader reader = XmlReader.Create(sourcePath))
        {
            while (reader.Read())
            {
                if (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                {
                    total_items_in_XML++;
                }
            }
        }
    }
    catch (Exception ex) when (ex is XmlException || ex is System.IO.IOException || ex is UnauthorizedAccessException)
    {
        // A missing, unreadable or malformed input file is reported instead of escaping the worker.
        MessageBox.Show(ex.Message);
        return;
    }

    // Ceiling division in integer arithmetic (long avoids overflow in the intermediate sum).
    int total_files_at_end = (int)((total_items_in_XML + (long)itemsPerFile - 1) / itemsPerFile);
    int filesCreated = 0;
    int Current_item_line = 0;

    for (int i = 1; i <= total_files_at_end; i++)
    {
        string full_file_path = System.IO.Path.Combine(destinationFolder, file_name + i + ".xml");
        try
        {
            create_file(full_file_path, Current_item_line);
            filesCreated++;
        }
        catch (Exception ex)
        {
            string failure = "An error occurred while creating the file: " + ex.Message;
            Console.WriteLine(failure);

            // Also surface the failure in the form's output box.
            Progress_output_text.Invoke((MethodInvoker)delegate
            {
                Progress_output_text.Text += failure + "\r\n";
            });
        }

        Current_item_line += itemsPerFile;

        // Report progress after the file has been handled so 100 means "all files processed".
        int progressValue = (int)(i * 100.0 / total_files_at_end);
        progressBar1.Invoke((MethodInvoker)delegate
        {
            progressBar1.Value = progressValue;
        });
    }

    Progress_output_text.Invoke((MethodInvoker)delegate
    {
        Progress_output_text.Text += filesCreated + " files have been created in " + destinationFolder + "\r\n";
    });
}

This part is the actual create file function

/// <summary>
/// Writes one output file whose root element is "CNJExport" and whose children are the
/// "Item" elements of XML_file_path at zero-based positions
/// [Current_item_line, Current_item_line + Number_Of_Elements_Per_file.Value).
/// </summary>
/// <param name="full_file_path">Path handed to XmlWriter.Create for the output file.</param>
/// <param name="Current_item_line">Zero-based index of the first Item to copy.</param>
public void create_file(string full_file_path, int Current_item_line)
{
    // Read the control once instead of on every loop iteration.
    int endExclusive = Current_item_line + (int)Number_Of_Elements_Per_file.Value;

    using (XmlWriter writer = XmlWriter.Create(full_file_path))
    {
        writer.WriteStartDocument();
        writer.WriteStartElement("CNJExport");

        using (XmlReader reader = XmlReader.Create(XML_file_path))
        {
            int itemCounter = 0;
            bool hasNode = reader.Read();

            // Stop as soon as the last wanted item has been copied; the rest of the
            // input cannot contribute to this file.
            while (hasNode && itemCounter < endExclusive)
            {
                if (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                {
                    bool selected = itemCounter >= Current_item_line;
                    itemCounter++;

                    if (selected)
                    {
                        // WriteNode leaves the reader ON the node that follows </Item>.
                        // Calling Read() here would skip that node, which drops every
                        // second item when items are not separated by whitespace.
                        writer.WriteNode(reader, true);
                        hasNode = !reader.EOF;
                        continue;
                    }
                }

                hasNode = reader.Read();
            }
        }

        writer.WriteEndElement();
        writer.WriteEndDocument();
    }
}

答案1

得分: 0

这段代码稍微更高效

/// <summary>
/// Writes one output file whose root element is "CNJExport" and whose children are the
/// "Item" elements of XML_file_path at zero-based positions
/// [Current_item_line, Current_item_line + Number_Of_Elements_Per_file.Value).
/// Items outside that range are skipped without being copied.
/// </summary>
/// <param name="full_file_path">Path handed to XmlWriter.Create for the output file.</param>
/// <param name="Current_item_line">Zero-based index of the first Item to copy.</param>
public void create_file(string full_file_path, int Current_item_line)
{
    int endExclusive = Current_item_line + (int)Number_Of_Elements_Per_file.Value;

    using (XmlWriter writer = XmlWriter.Create(full_file_path))
    {
        writer.WriteStartDocument();
        writer.WriteStartElement("CNJExport");

        using (XmlReader reader = XmlReader.Create(XML_file_path))
        {
            int itemCounter = 0;
            bool onItem = reader.ReadToFollowing("Item");

            // Walk the items in order and stop once the requested range is exhausted.
            while (onItem && itemCounter < endExclusive)
            {
                if (itemCounter >= Current_item_line)
                {
                    writer.WriteNode(reader, true);
                }
                else
                {
                    // The reader MUST move past an item that is not copied; otherwise the
                    // counter runs ahead of the reader and the wrong items get written.
                    reader.Skip();
                }
                itemCounter++;

                // WriteNode and Skip both leave the reader on the node after </Item>.
                // That node may already be the next <Item>; only search when it is not.
                onItem = (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                    || reader.ReadToFollowing("Item");
            }
        }
        writer.WriteEndElement();
        writer.WriteEndDocument();
    }
}
英文:

This code is a little more efficient

<!-- begin snippet: js hide: false console: true babel: false -->

       /// <summary>
       /// Writes one output file whose root element is "CNJExport" and whose children are the
       /// "Item" elements of XML_file_path at zero-based positions
       /// [Current_item_line, Current_item_line + Number_Of_Elements_Per_file.Value).
       /// Items outside that range are skipped without being copied.
       /// </summary>
       /// <param name="full_file_path">Path handed to XmlWriter.Create for the output file.</param>
       /// <param name="Current_item_line">Zero-based index of the first Item to copy.</param>
       public void create_file(string full_file_path, int Current_item_line)
       {
           int endExclusive = Current_item_line + (int)Number_Of_Elements_Per_file.Value;

           using (XmlWriter writer = XmlWriter.Create(full_file_path))
           {
               writer.WriteStartDocument();
               writer.WriteStartElement("CNJExport");

               using (XmlReader reader = XmlReader.Create(XML_file_path))
               {
                   int itemCounter = 0;
                   bool onItem = reader.ReadToFollowing("Item");

                   // Walk the items in order and stop once the requested range is exhausted.
                   while (onItem && itemCounter < endExclusive)
                   {
                       if (itemCounter >= Current_item_line)
                       {
                           writer.WriteNode(reader, true);
                       }
                       else
                       {
                           // The reader MUST move past an item that is not copied; otherwise the
                           // counter runs ahead of the reader and the wrong items get written.
                           reader.Skip();
                       }
                       itemCounter++;

                       // WriteNode and Skip both leave the reader on the node after </Item>.
                       // That node may already be the next <Item>; only search when it is not.
                       onItem = (reader.NodeType == XmlNodeType.Element && reader.Name == "Item")
                           || reader.ReadToFollowing("Item");
                   }
               }
               writer.WriteEndElement();
               writer.WriteEndDocument();
           }
       }

<!-- end snippet -->

答案2

得分: 0

正如Palle Due在评论中指出的,你的代码存在两个基本问题:

  1. 每生成一个输出文件片段,你都会把输入文件完整读取一遍。

    相反,你应该只对输入文件进行一次流式处理,并在遇到<Item>节点时动态创建输出文件。

  2. 你的进度跟踪器包括2秒的延迟。

    你应该消除这一延迟,而只在必要时更新进度跟踪器,例如自上次更新以来已经过去了2秒以上,或者已经完成了超过10%的进度。

对于问题#1,我重写了你的create_file()如下,使用了帕斯卡命名法而不是蛇形命名法,符合C#的命名约定

// 代码部分不翻译

然后修改你的button1_Click()如下:

// 代码部分不翻译

请注意:

  • 为了简化进度跟踪,我跟踪了已读取的输入流的百分比。由于XmlReader的缓冲,这可能会多出4K字节,但由于你的文件大小为1GB,这应该足够准确了。

  • 我的版本的SplitXmlFile()不需要你预先计算输出文件的总数,也不要求每个文件的最大项目数能整除项目总数。

  • 在输入文件格式错误或无法写入输出文件时,你可能希望改进异常处理。

  • 我传递了FileMode.CreateNew以避免覆盖任何预先存在的分割片段。如果你想要覆盖它们,请传递FileMode.Create

  • 我在.NET 6中尚未进行过异步WinForms编程,所以我的异步和线程使用可能存在一些错误。

  • 与其手动提取和组合文件名,最好使用Path类的工具。

  • 考虑重写你的代码以使用C#的标准命名约定

演示示例在这里

英文:

As noted in comments by Palle Due, you have two basic problems here:

  1. You are reading your input file once for each output file fragment.

    Instead, you should stream through your input file only once, and create output files dynamically as <Item> nodes are encountered.

  2. Your progress tracker includes a 2 second delay.

    You should eliminate that, and instead only update the progress tracker when necessary, e.g. when more than 2 seconds have elapsed since the previous update, or if more than 10% progress has been made.

For issue #1, I rewrote your create_file() as follows, using Pascal Casing instead of snake casing as per C# naming conventions:

/// <summary>Helper for splitting a large XML file into smaller files in a single streaming pass.</summary>
public static class XmlExtensions
{
    /// <summary>
    /// Reads <paramref name="inputFilePath"/> once and copies every element named
    /// <paramref name="itemName"/> into output files that each hold at most
    /// <paramref name="maxItemsPerFile"/> items, wrapped in a <paramref name="rootName"/> element.
    /// </summary>
    /// <param name="inputFilePath">Path of the XML file to split.</param>
    /// <param name="maxItemsPerFile">Maximum number of items per output file; must be at least 1.</param>
    /// <param name="rootName">Name of the root element written to every output file.</param>
    /// <param name="itemName">Name of the elements that are counted and copied.</param>
    /// <param name="makeOutputFileFullPath">Receives the 1-based output file index and returns that file's path.</param>
    /// <param name="progressTracker">
    /// Optional callback invoked each time an output file is closed with
    /// (input stream position, input stream length, index of the file just closed).
    /// It is also invoked once from the final cleanup, with position equal to length.
    /// </param>
    /// <param name="fileMode">File mode used to open each output file.</param>
    /// <param name="inputSettings">Optional settings passed to XmlReader.Create.</param>
    /// <param name="outputSettings">Optional settings passed to XmlWriter.Create.</param>
    /// <exception cref="ArgumentException">
    /// An argument is null, empty or out of range. The more specific ArgumentNullException and
    /// ArgumentOutOfRangeException are thrown where applicable; both derive from ArgumentException.
    /// </exception>
    public static void SplitXmlFile(string inputFilePath, int maxItemsPerFile, XName rootName, XName itemName,
        Func<long, string> makeOutputFileFullPath, Action<long, long, long>? progressTracker,
        FileMode fileMode = FileMode.CreateNew,
        XmlReaderSettings? inputSettings = default, XmlWriterSettings? outputSettings = default)
    {
        if (string.IsNullOrEmpty(inputFilePath))
            throw new ArgumentException("The input file path must not be null or empty.", nameof(inputFilePath));
        if (maxItemsPerFile < 1)
            throw new ArgumentOutOfRangeException(nameof(maxItemsPerFile), maxItemsPerFile, "At least one item per output file is required.");
        if (rootName is null)
            throw new ArgumentNullException(nameof(rootName));
        if (itemName is null)
            throw new ArgumentNullException(nameof(itemName));
        // Without this check a null factory only failed later, at the first item, with a NullReferenceException.
        if (makeOutputFileFullPath is null)
            throw new ArgumentNullException(nameof(makeOutputFileFullPath));

        void OpenOutput(out Stream outStream, out XmlWriter writer, ref long fileIndex)
        {
            var path = makeOutputFileFullPath(++fileIndex);
            outStream = new FileStream(path, fileMode);
            writer = XmlWriter.Create(outStream, outputSettings);
            writer.WriteStartElement(rootName.LocalName, rootName.NamespaceName);
        }

        void CloseOutput(ref Stream? outStream, ref XmlWriter? writer, long fileIndex, long streamPosition, long streamLength)
        {
            try
            {
                writer?.WriteEndElement();
                writer?.Dispose();
            }
            finally
            {
                // Release the file even when finishing the writer throws.
                outStream?.Dispose();
                (writer, outStream) = (null, null);
            }
            // Inform the caller of the approximate progress by passing in the input stream length and position.
            // Due to buffering, the stream position may be ahead of the actual reader position,
            // but for UI progress tracking purposes this is probably fine.
            progressTracker?.Invoke(streamPosition, streamLength, fileIndex);
        }

        Stream? outStream = null;
        XmlWriter? writer = null;
        long fileIndex = 0;

        using (var inStream = File.OpenRead(inputFilePath))
        using (var reader = XmlReader.Create(inStream, inputSettings))
        {
            var streamLength = inStream.Length;
            try
            {
                uint currentCount = 0;
                // Loop through the XML file and, for each maxItemsPerFile chunk of items, create a new file and copy them into it.
                while (reader.Read())
                {
                    if (reader.NodeType == XmlNodeType.Element && reader.LocalName == itemName.LocalName && reader.NamespaceURI == itemName.NamespaceName)
                    {
                        if (currentCount >= maxItemsPerFile)
                        {
                            CloseOutput(ref outStream, ref writer, fileIndex, inStream.Position, streamLength);
                            Debug.Assert(writer == null);
                        }
                        if (writer == null)
                        {
                            OpenOutput(out outStream, out writer, ref fileIndex);
                            currentCount = 0;
                        }
                        // ReadSubtree() ensures the reader is positioned at the EndElement node, not the next node,
                        // so the Read() at the top of the loop does not skip an adjacent item.
                        using (var subReader = reader.ReadSubtree())
                            writer.WriteNode(subReader, true);
                        currentCount++;
                    }
                }
            }
            finally
            {
                CloseOutput(ref outStream, ref writer, fileIndex, streamLength, streamLength);
            }
        }
    }
}

And then modify your button1_Click() to look something like:

/// <summary>Click handler that awaits SplitSelectedFile().</summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void button1_Click(object sender, EventArgs e) => await SplitSelectedFile();
/// <summary>
/// Reads the split parameters from the form, runs XmlExtensions.SplitXmlFile through Task.Run,
/// and afterwards re-enables Generate_Button and reports the outcome in Progress_output_text.
/// </summary>
/// <returns>A task that completes once the split has finished and the form has been updated.</returns>
private async Task SplitSelectedFile()
{
    // Collect information from the GUI on the main thread.
    string inputFilePath = File_Path_Textbox.Text;
    string outputFilePrefix = Path.GetFileNameWithoutExtension(File_Path_Textbox.Text);
    string outputFileDestination = File_Destination_Textbox.Text;
    int maxItemsPerFile = (int)Number_Of_Elements_Per_file.Value;

    // Disable the Generate_Button while processing
    Generate_Button.Enabled = false;

    List<string> outputFiles = new();

    // Split on the background thread
    Action doSplit = () =>
    {
        Console.WriteLine("Started");
        Func<long, string> makeOutputFileFullPath = (i) =>
        {
            var path = Path.Combine(outputFileDestination, string.Concat(outputFilePrefix, i, ".xml"));
            outputFiles.Add(path);
            return path;
        };

        int lastPercentDone = -1;
        DateTime lastDateTime = default;
        Action<long, long, long>? progressTracker = (position, length, fileNumber) =>
        {
            // A zero-length input would make the ratio NaN; treat it as finished instead.
            var percentDone = length > 0 ? (int)((double)position / (double)length * 100) : 100;
            if (percentDone != lastPercentDone)
            {
                var dateTime = DateTime.UtcNow;
                // Update the progress bar if two seconds have passed or the percentage has grown by more than 10.
                if ((dateTime - lastDateTime).TotalSeconds > 2.0 || percentDone > lastPercentDone + 10)
                {
                    progressBar1.InvokeIfRequired(() => progressBar1.Value = percentDone);
                    lastDateTime = dateTime;
                    lastPercentDone = percentDone;
                }
            }
        };

        // Force the output to be indented, or not, as preferred.
        XmlReaderSettings inputSettings = new() { IgnoreWhitespace = true };
        XmlWriterSettings outputSettings = new() { Indent = false };
        XmlExtensions.SplitXmlFile(inputFilePath, maxItemsPerFile, "CNJExport", "Item",
            makeOutputFileFullPath, progressTracker,
            inputSettings: inputSettings, outputSettings: outputSettings);
    };

    // Update the UI after the split on the main thread.
    Action<Task> onCompleted = (splitTask) =>
    {
        // Re-enable the Generate_Button when processing is complete (this used to set it to false,
        // leaving the button disabled forever).
        Generate_Button.InvokeIfRequired(() => Generate_Button.Enabled = true);
        Progress_output_text.InvokeIfRequired(
            () =>
            {
                // Surface a failed split (missing or malformed input, unwritable output, ...)
                // instead of silently dropping the exception held by the task.
                if (splitTask.IsFaulted)
                {
                    Progress_output_text.Text += "ERROR: " + splitTask.Exception?.GetBaseException().Message + "\r\n";
                }
                Progress_output_text.Text += outputFiles.Count + " files have been created in " + outputFileDestination + "\r\n";
            });
        // If required, loop through the created files and do something
        foreach (var file in outputFiles)
        {
            // Add the file to some master list of files, show it in the UI, etc etc.
        }
    };

    await Task
        .Run(doSplit)
        .ContinueWith(onCompleted, TaskScheduler.FromCurrentSynchronizationContext());
}

Using the following extension method to simplify cross-thread Control invocation:

/// <summary>Extension helper for running a delegate against a WinForms control.</summary>
public static class ControlExtensions
{
    /// <summary>
    /// Runs <paramref name="invoker"/> directly when <paramref name="control"/> reports
    /// InvokeRequired == false, and through control.Invoke otherwise.
    /// </summary>
    /// <param name="control">Control whose InvokeRequired flag decides how the delegate is run.</param>
    /// <param name="invoker">Delegate to execute.</param>
    public static void InvokeIfRequired(this Control control, MethodInvoker invoker)
    {
        if (!control.InvokeRequired)
        {
            invoker();
            return;
        }

        control.Invoke(invoker);
    }
}

Notes:

  • To simplify progress tracking, I track the percentage of the input stream read. Due to XmlReader buffering this may be wrong by up to 4K bytes, but since your files are 1GB in size this should be more than accurate enough.

  • My version of SplitXmlFile() does not require you to precompute the total number of output files beforehand, and does not require that the maximum number of items per output file evenly divide the number of items.

  • You may want to improve exception handling in the event that the input file is malformed, or the output files cannot be written.

  • I passed FileMode.CreateNew to avoid overwriting any preexisting split fragments. Pass FileMode.Create if you want them to be overwritten.

  • I haven't done asynchronous WinForms programming in .NET 6 so there might be some mistake with my use of asynchrony and threading.

  • Rather than manually extracting and combining file names using string methods, use the utilities from the Path class.

  • Consider rewriting your code to use C#'s standard naming conventions.

Demo fiddle here.

huangapple
  • 本文由 发表于 2023年6月6日 15:36:53
  • 转载请务必保留本文链接:https://go.coder-hub.com/76412374.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定