PDF数据提取处理的API

huangapple go评论96阅读模式
英文:

API for handling extraction of PDF data

问题

这是你提供的C#代码的翻译部分:

使用 Aspose.Pdf;
使用 Aspose.Pdf.Text;
使用 Microsoft.AspNetCore.Http;
使用 Microsoft.AspNetCore.Mvc;
使用 System;
使用 System.Data;
使用 System.IO;
使用 System.Linq;

namespace TryingOutAPI.Controllers
{
    [Route("api/[controller]")]
    [ApiController]
    public class ValuesController : ControllerBase
    {
        [HttpPost]
        public IActionResult ProcessPdfTables(IFormFile pdfFile)
        {
            try
            {
                if (pdfFile == null || pdfFile.Length == 0)
                {
                    return BadRequest("未上传 PDF 文件。");
                }

                // 从上传的文件加载 PDF 文档
                Aspose.Pdf.Document pdfDocument;
                使用 (var stream = pdfFile.OpenReadStream())
                {
                    pdfDocument = new Aspose.Pdf.Document(stream);
                }

                // 从 PDF 中提取带有表格的页面
                DataTable[] tables = ExtractTablesFromPdf(pdfDocument, new int[] { 2, 3 });

                // 访问从提取的表格列表中的第一个表格
                DataTable table1 = tables[0];
                // 访问从提取的表格列表中的第二个表格
                DataTable table2 = tables[1];

                // 指定正确的列名
                string[] columnsToExtract = { "Peak Name", "RT", "Area", "% Area", "RT Ratio", "Height" };

                // 从表格1中选择所需的列
                DataTable table1Subset = SelectColumnsFromTable(table1, columnsToExtract);
                table1Subset = RemoveRowsWithNullValues(table1Subset);

                // 从表格2中选择所需的列
                DataTable table2Subset = SelectColumnsFromTable(table2, columnsToExtract);
                table2Subset = RemoveRowsWithNullValues(table2Subset);

                // 以 JSON 形式返回不带空行的表格子集
                return Ok(new { Table1 = table1Subset, Table2 = table2Subset });
            }
            catch (Exception ex)
            {
                // 处理任何异常并返回错误响应
                return StatusCode(StatusCodes.Status500InternalServerError, ex.Message);
            }
        }

        private DataTable[] ExtractTablesFromPdf(Aspose.Pdf.Document pdfDocument, int[] pages)
        {
            DataTable[] tables = new DataTable[pages.Length];

            for (int i = 0; i < pages.Length; i++)
            {
                int pageNumber = pages[i];
                Page pdfPage = pdfDocument.Pages[pageNumber];

                // 从页面提取文本
                TextAbsorber textAbsorber = new TextAbsorber();
                pdfPage.Accept(textAbsorber);
                string pageContent = textAbsorber.Text;

                tables[i] = ConvertTextToDataTable(pageContent);
            }

            return tables;
        }

        private DataTable SelectColumnsFromTable(DataTable table, string[] columnsToExtract)
        {
            DataTable subset = new DataTable();

            foreach (string column in columnsToExtract)
            {
                DataColumn existingColumn = table.Columns.Cast<DataColumn>()
                    .FirstOrDefault(c => c.ColumnName == column);
                if (existingColumn != null)
                {
                    subset.Columns.Add(existingColumn.ColumnName);
                }
            }

            foreach (DataRow row in table.Rows)
            {
                DataRow newRow = subset.NewRow();
                foreach (DataColumn column in subset.Columns)
                {
                    newRow[column.ColumnName] = row[column.ColumnName];
                }
                subset.Rows.Add(newRow);
            }

            return subset;
        }

        private DataTable RemoveRowsWithNullValues(DataTable table)
        {
            DataTable filteredTable = table.Clone();

            foreach (DataRow row in table.Rows)
            {
                bool hasNullValues = row.ItemArray.Any(x => x is DBNull || string.IsNullOrWhiteSpace(x.ToString()));
                if (!hasNullValues)
                {
                    filteredTable.ImportRow(row);
                }
            }

            return filteredTable;
        }

        private DataTable ConvertTextToDataTable(string text)
        {
            DataTable dataTable = new DataTable();

            // 将文本拆分为行
            string[] lines = text.Split('\n');

            // 从第一行提取列名
            string[] columnNames = lines[0].Split('\t');

            // 将列添加到 DataTable
            foreach (string columnName in columnNames)
            {
                dataTable.Columns.Add(columnName.Trim());
            }

            // 从后续行提取数据行
            for (int i = 1; i < lines.Length; i++)
            {
                string[] rowValues = lines[i].Split('\t');

                // 创建新的 DataRow
                DataRow dataRow = dataTable.NewRow();

                // 为每列设置值
                for (int j = 0; j < columnNames.Length; j++)
                {
                    dataRow[j] = rowValues[j].Trim();
                }

                // 将行添加到 DataTable
                dataTable.Rows.Add(dataRow);
            }

            return dataTable;
        }
    }
}

请注意,翻译中尽量保持了代码的结构和逻辑,只对注释和字符串进行了翻译。如果您有任何其他问题或需要进一步的帮助,请随时提出。

英文:

So I created a Web API in .NET Core Web API. I added the logic to the API created all the code, on swaggerUI I was testing out the API it shows the error:
Cannot access a disposed object.
Object name: 'ReferenceReadStream'.
This is the code(C#)

using Aspose.Pdf;
using Aspose.Pdf.Text;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using System;
using System.Data;
using System.IO;
using System.Linq;
namespace TryingOutAPI.Controllers
{
[Route(&quot;api/[controller]&quot;)]
[ApiController]
public class ValuesController : ControllerBase
{
[HttpPost]
public IActionResult ProcessPdfTables(IFormFile pdfFile)
{
try
{
if (pdfFile == null || pdfFile.Length == 0)
{
return BadRequest(&quot;No PDF file uploaded.&quot;);
}
// Load the PDF document from the uploaded file
Aspose.Pdf.Document pdfDocument;
using (var stream = pdfFile.OpenReadStream())
{
pdfDocument = new Aspose.Pdf.Document(stream);
}
// Extract the pages with the tables
DataTable[] tables = ExtractTablesFromPdf(pdfDocument, new int[] { 2, 3 });
// Access the first table from the list of extracted tables
DataTable table1 = tables[0];
// Access the second table from the list of extracted tables
DataTable table2 = tables[1];
// Specify the correct column names
string[] columnsToExtract = { &quot;Peak Name&quot;, &quot;RT&quot;, &quot;Area&quot;, &quot;% Area&quot;, &quot;RT Ratio&quot;, &quot;Height&quot; };
// Select the desired columns from table 1
DataTable table1Subset = SelectColumnsFromTable(table1, columnsToExtract);
table1Subset = RemoveRowsWithNullValues(table1Subset);
// Select the desired columns from table 2
DataTable table2Subset = SelectColumnsFromTable(table2, columnsToExtract);
table2Subset = RemoveRowsWithNullValues(table2Subset);
// Return the subsets of tables without null rows as JSON
return Ok(new { Table1 = table1Subset, Table2 = table2Subset });
}
catch (Exception ex)
{
// Handle any exceptions and return an error response
return StatusCode(StatusCodes.Status500InternalServerError, ex.Message);
}
}
private DataTable[] ExtractTablesFromPdf(Aspose.Pdf.Document pdfDocument, int[] pages)
{
DataTable[] tables = new DataTable[pages.Length];
for (int i = 0; i &lt; pages.Length; i++)
{
int pageNumber = pages[i];
Page pdfPage = pdfDocument.Pages[pageNumber];
// Extract text from the page
TextAbsorber textAbsorber = new TextAbsorber();
pdfPage.Accept(textAbsorber);
string pageContent = textAbsorber.Text;
tables[i] = ConvertTextToDataTable(pageContent);
}
return tables;
}
private DataTable SelectColumnsFromTable(DataTable table, string[] columnsToExtract)
{
DataTable subset = new DataTable();
foreach (string column in columnsToExtract)
{
DataColumn existingColumn = table.Columns.Cast&lt;DataColumn&gt;()
.FirstOrDefault(c =&gt; c.ColumnName == column);
if (existingColumn != null)
{
subset.Columns.Add(existingColumn.ColumnName);
}
}
foreach (DataRow row in table.Rows)
{
DataRow newRow = subset.NewRow();
foreach (DataColumn column in subset.Columns)
{
newRow[column.ColumnName] = row[column.ColumnName];
}
subset.Rows.Add(newRow);
}
return subset;
}
private DataTable RemoveRowsWithNullValues(DataTable table)
{
DataTable filteredTable = table.Clone();
foreach (DataRow row in table.Rows)
{
bool hasNullValues = row.ItemArray.Any(x =&gt; x is DBNull || string.IsNullOrWhiteSpace(x.ToString()));
if (!hasNullValues)
{
filteredTable.ImportRow(row);
}
}
return filteredTable;
}
private DataTable ConvertTextToDataTable(string text)
{
DataTable dataTable = new DataTable();
// Split the text into lines
string[] lines = text.Split(&#39;\n&#39;);
// Extract column names from the first line
string[] columnNames = lines[0].Split(&#39;\t&#39;);
// Add columns to the DataTable
foreach (string columnName in columnNames)
{
dataTable.Columns.Add(columnName.Trim());
}
// Extract data rows from subsequent lines
for (int i = 1; i &lt; lines.Length; i++)
{
string[] rowValues = lines[i].Split(&#39;\t&#39;);
// Create a new DataRow
DataRow dataRow = dataTable.NewRow();
// Set values for each column in the row
for (int j = 0; j &lt; columnNames.Length; j++)
{
dataRow[j] = rowValues[j].Trim();
}
// Add the row to the DataTable
dataTable.Rows.Add(dataRow);
}
return dataTable;
}
}
}

I tried uploading the file that worked but it did not show the extracted data

答案1

得分: 1

ProcessPdfTables中的using语句将在stream被使用之前将其释放。如果您使用的是C# 8或更高版本,您可以像这样声明stream

using var stream = pdfFile.OpenReadStream();

通过这种声明方式,stream将在超出作用域时才会被释放。

英文:

Your using statement in ProcessPdfTables will dispose of stream before it is used. If you are using C# version 8 or above you can declare stream like this:

using var stream = pdfFile.OpenReadStream();

With this declaration, stream will not be disposed until it goes out of scope.

答案2

得分: 0

可能是您的using语句,在其中您有一个对流的对象引用,在using语句关闭时进行了处理并进行了释放。

尝试删除using语句,并在完成处理PDF文档时释放流。

英文:

<p>It is probably your using statement where you have an object reference to the stream, that you are disposing at the close of your using statement and .</p>
<p>Try to remove the using statement and dispose the stream when you are done handling the pdf document.</p>

huangapple
  • 本文由 发表于 2023年7月6日 13:51:20
  • 转载请务必保留本文链接:https://go.coder-hub.com/76625864.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定