英文:
API for handling extraction of PDF data
问题
这是你提供的C#代码的翻译部分:
使用 Aspose.Pdf;
使用 Aspose.Pdf.Text;
使用 Microsoft.AspNetCore.Http;
使用 Microsoft.AspNetCore.Mvc;
使用 System;
使用 System.Data;
使用 System.IO;
使用 System.Linq;
namespace TryingOutAPI.Controllers
{
[Route("api/[controller]")]
[ApiController]
public class ValuesController : ControllerBase
{
[HttpPost]
public IActionResult ProcessPdfTables(IFormFile pdfFile)
{
try
{
if (pdfFile == null || pdfFile.Length == 0)
{
return BadRequest("未上传 PDF 文件。");
}
// 从上传的文件加载 PDF 文档
Aspose.Pdf.Document pdfDocument;
使用 (var stream = pdfFile.OpenReadStream())
{
pdfDocument = new Aspose.Pdf.Document(stream);
}
// 从 PDF 中提取带有表格的页面
DataTable[] tables = ExtractTablesFromPdf(pdfDocument, new int[] { 2, 3 });
// 访问从提取的表格列表中的第一个表格
DataTable table1 = tables[0];
// 访问从提取的表格列表中的第二个表格
DataTable table2 = tables[1];
// 指定正确的列名
string[] columnsToExtract = { "Peak Name", "RT", "Area", "% Area", "RT Ratio", "Height" };
// 从表格1中选择所需的列
DataTable table1Subset = SelectColumnsFromTable(table1, columnsToExtract);
table1Subset = RemoveRowsWithNullValues(table1Subset);
// 从表格2中选择所需的列
DataTable table2Subset = SelectColumnsFromTable(table2, columnsToExtract);
table2Subset = RemoveRowsWithNullValues(table2Subset);
// 以 JSON 形式返回不带空行的表格子集
return Ok(new { Table1 = table1Subset, Table2 = table2Subset });
}
catch (Exception ex)
{
// 处理任何异常并返回错误响应
return StatusCode(StatusCodes.Status500InternalServerError, ex.Message);
}
}
private DataTable[] ExtractTablesFromPdf(Aspose.Pdf.Document pdfDocument, int[] pages)
{
DataTable[] tables = new DataTable[pages.Length];
for (int i = 0; i < pages.Length; i++)
{
int pageNumber = pages[i];
Page pdfPage = pdfDocument.Pages[pageNumber];
// 从页面提取文本
TextAbsorber textAbsorber = new TextAbsorber();
pdfPage.Accept(textAbsorber);
string pageContent = textAbsorber.Text;
tables[i] = ConvertTextToDataTable(pageContent);
}
return tables;
}
private DataTable SelectColumnsFromTable(DataTable table, string[] columnsToExtract)
{
DataTable subset = new DataTable();
foreach (string column in columnsToExtract)
{
DataColumn existingColumn = table.Columns.Cast<DataColumn>()
.FirstOrDefault(c => c.ColumnName == column);
if (existingColumn != null)
{
subset.Columns.Add(existingColumn.ColumnName);
}
}
foreach (DataRow row in table.Rows)
{
DataRow newRow = subset.NewRow();
foreach (DataColumn column in subset.Columns)
{
newRow[column.ColumnName] = row[column.ColumnName];
}
subset.Rows.Add(newRow);
}
return subset;
}
private DataTable RemoveRowsWithNullValues(DataTable table)
{
DataTable filteredTable = table.Clone();
foreach (DataRow row in table.Rows)
{
bool hasNullValues = row.ItemArray.Any(x => x is DBNull || string.IsNullOrWhiteSpace(x.ToString()));
if (!hasNullValues)
{
filteredTable.ImportRow(row);
}
}
return filteredTable;
}
private DataTable ConvertTextToDataTable(string text)
{
DataTable dataTable = new DataTable();
// 将文本拆分为行
string[] lines = text.Split('\n');
// 从第一行提取列名
string[] columnNames = lines[0].Split('\t');
// 将列添加到 DataTable
foreach (string columnName in columnNames)
{
dataTable.Columns.Add(columnName.Trim());
}
// 从后续行提取数据行
for (int i = 1; i < lines.Length; i++)
{
string[] rowValues = lines[i].Split('\t');
// 创建新的 DataRow
DataRow dataRow = dataTable.NewRow();
// 为每列设置值
for (int j = 0; j < columnNames.Length; j++)
{
dataRow[j] = rowValues[j].Trim();
}
// 将行添加到 DataTable
dataTable.Rows.Add(dataRow);
}
return dataTable;
}
}
}
请注意,翻译中尽量保持了代码的结构和逻辑,只对注释和字符串进行了翻译。如果您有任何其他问题或需要进一步的帮助,请随时提出。
英文:
So I created a Web API in .NET Core Web API. I added the logic to the API created all the code, on swaggerUI I was testing out the API it shows the error:
Cannot access a disposed object.
Object name: 'ReferenceReadStream'.
This is the code(C#)
using Aspose.Pdf;
using Aspose.Pdf.Text;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using System;
using System.Data;
using System.IO;
using System.Linq;
namespace TryingOutAPI.Controllers
{
[Route("api/[controller]")]
[ApiController]
public class ValuesController : ControllerBase
{
[HttpPost]
public IActionResult ProcessPdfTables(IFormFile pdfFile)
{
try
{
if (pdfFile == null || pdfFile.Length == 0)
{
return BadRequest("No PDF file uploaded.");
}
// Load the PDF document from the uploaded file
Aspose.Pdf.Document pdfDocument;
using (var stream = pdfFile.OpenReadStream())
{
pdfDocument = new Aspose.Pdf.Document(stream);
}
// Extract the pages with the tables
DataTable[] tables = ExtractTablesFromPdf(pdfDocument, new int[] { 2, 3 });
// Access the first table from the list of extracted tables
DataTable table1 = tables[0];
// Access the second table from the list of extracted tables
DataTable table2 = tables[1];
// Specify the correct column names
string[] columnsToExtract = { "Peak Name", "RT", "Area", "% Area", "RT Ratio", "Height" };
// Select the desired columns from table 1
DataTable table1Subset = SelectColumnsFromTable(table1, columnsToExtract);
table1Subset = RemoveRowsWithNullValues(table1Subset);
// Select the desired columns from table 2
DataTable table2Subset = SelectColumnsFromTable(table2, columnsToExtract);
table2Subset = RemoveRowsWithNullValues(table2Subset);
// Return the subsets of tables without null rows as JSON
return Ok(new { Table1 = table1Subset, Table2 = table2Subset });
}
catch (Exception ex)
{
// Handle any exceptions and return an error response
return StatusCode(StatusCodes.Status500InternalServerError, ex.Message);
}
}
private DataTable[] ExtractTablesFromPdf(Aspose.Pdf.Document pdfDocument, int[] pages)
{
DataTable[] tables = new DataTable[pages.Length];
for (int i = 0; i < pages.Length; i++)
{
int pageNumber = pages[i];
Page pdfPage = pdfDocument.Pages[pageNumber];
// Extract text from the page
TextAbsorber textAbsorber = new TextAbsorber();
pdfPage.Accept(textAbsorber);
string pageContent = textAbsorber.Text;
tables[i] = ConvertTextToDataTable(pageContent);
}
return tables;
}
private DataTable SelectColumnsFromTable(DataTable table, string[] columnsToExtract)
{
DataTable subset = new DataTable();
foreach (string column in columnsToExtract)
{
DataColumn existingColumn = table.Columns.Cast<DataColumn>()
.FirstOrDefault(c => c.ColumnName == column);
if (existingColumn != null)
{
subset.Columns.Add(existingColumn.ColumnName);
}
}
foreach (DataRow row in table.Rows)
{
DataRow newRow = subset.NewRow();
foreach (DataColumn column in subset.Columns)
{
newRow[column.ColumnName] = row[column.ColumnName];
}
subset.Rows.Add(newRow);
}
return subset;
}
private DataTable RemoveRowsWithNullValues(DataTable table)
{
DataTable filteredTable = table.Clone();
foreach (DataRow row in table.Rows)
{
bool hasNullValues = row.ItemArray.Any(x => x is DBNull || string.IsNullOrWhiteSpace(x.ToString()));
if (!hasNullValues)
{
filteredTable.ImportRow(row);
}
}
return filteredTable;
}
private DataTable ConvertTextToDataTable(string text)
{
DataTable dataTable = new DataTable();
// Split the text into lines
string[] lines = text.Split('\n');
// Extract column names from the first line
string[] columnNames = lines[0].Split('\t');
// Add columns to the DataTable
foreach (string columnName in columnNames)
{
dataTable.Columns.Add(columnName.Trim());
}
// Extract data rows from subsequent lines
for (int i = 1; i < lines.Length; i++)
{
string[] rowValues = lines[i].Split('\t');
// Create a new DataRow
DataRow dataRow = dataTable.NewRow();
// Set values for each column in the row
for (int j = 0; j < columnNames.Length; j++)
{
dataRow[j] = rowValues[j].Trim();
}
// Add the row to the DataTable
dataTable.Rows.Add(dataRow);
}
return dataTable;
}
}
}
I tried uploading the file that worked but it did not show the extracted data
答案1
得分: 1
在ProcessPdfTables
中的using
语句将在stream
被使用之前将其释放。如果您使用的是C# 8或更高版本,您可以像这样声明stream
:
using var stream = pdfFile.OpenReadStream();
通过这种声明方式,stream
将在超出作用域时才会被释放。
英文:
Your using statement in ProcessPdfTables
will dispose of stream
before it is used. If you are using C# version 8 or above you can declare stream
like this:
using var stream = pdfFile.OpenReadStream();
With this declaration, stream
will not be disposed until it goes out of scope.
答案2
得分: 0
可能是您的using语句,在其中您有一个对流的对象引用,在using语句关闭时进行了处理并进行了释放。
尝试删除using语句,并在完成处理PDF文档时释放流。
英文:
<p>It is probably your using statement where you have an object reference to the stream, that you are disposing at the close of your using statement and .</p>
<p>Try to remove the using statement and dispose the stream when you are done handling the pdf document.</p>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论