2020年8月19日 06:17:49go评论92阅读模式

英文:

Getting errors while using pdfbox to parse pdf files in Java

问题

请在使用Java中的pdfbox解析PDF文件时给我建议，因为我遇到了错误。已经导入了所有必要的库，但仍然出现错误。

请告诉我我做错了什么以及可能的解决方案。

以下是我的Java代码 / 查看下面的错误和堆栈跟踪

错误：（32，10）java：类型的非法开始
错误：（32，13）java：类型的非法开始
错误：（32，14）java：应输入')'
错误：（32，18）java：应输入';'
错误：（32，19）java：无效的方法声明；需要返回类型
错误：（32，27）java：应输入';'

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.beans.XMLDecoder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * The asker's attempt at reading text from {@code data/javaPDF.pdf} with PDFBox.
 *
 * <p>Kept exactly as posted: this listing does not compile, and the compiler errors
 * quoted with the question refer to it. The comments below point out the problems.
 */
public class PdfReader {
    // Declared as XMLDecoder, yet getParsedText() declares locals with the same names
    // using the PDFBox types (COSDocument / PDDocument); those locals shadow these fields.
    private XMLDecoder cosDoc = null;
    private XMLDecoder pdDoc = null;
    static File file = new File("data/javaPDF.pdf");
    private static PDFParser parser = null;
    public static void main(String args[]) {
        PDFTextParser pdf = new PDFTextParser();
        //print out results
        // getParsedText() is declared on PdfReader, not on PDFTextParser, so this call
        // cannot resolve against the nested class.
        System.out.println(pdf.getParsedText());
        //
    }
    private static class PDFTextParser {
        // This `if` statement sits directly in the class body, outside any method,
        // constructor or initializer block. Java does not allow statements here;
        // presumably this is what produces the "illegal start of type" errors.
        if(!file.isFile())
        {
            // fileName is always null, so the message reads "File null does not exist."
            String fileName = null;
            System.err.println("File " + fileName + " does not exist.");
        }
        //Set up instance of PDF parser
        // Instance initializer block: executed each time a PDFTextParser is constructed.
        {
            try {
                // NOTE(review): java.io.FileInputStream does not implement PDFBox's
                // RandomAccessRead, so this cast is expected to fail at runtime with a
                // ClassCastException even once the class compiles.
                parser = new PDFParser((RandomAccessRead) new FileInputStream(file));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //-------------------------------
    /**
     * Runs the shared static {@code parser} and wraps its COSDocument in a PDDocument.
     *
     * @return always {@code null}: {@code parsedText} is never assigned after its
     *         declaration, and no text is extracted from {@code pdDoc}
     */
    public static String getParsedText() {
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        String parsedText = null;
        try {
            parser.parse();
        } catch (IOException e1) {
            // The failure is only printed; execution continues with the parser as-is.
            e1.printStackTrace();
        }
        try {
            cosDoc = parser.getDocument();
        } catch (IOException e1) {
            // On failure cosDoc stays null and is still passed to PDDocument below.
            e1.printStackTrace();
        }
        // The local pdDoc is created but never used or closed in this method.
        pdDoc = new PDDocument(cosDoc);
        return parsedText;
    }
//            System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    // Instance initializer of PdfReader. It refers to the XMLDecoder fields declared at
    // the top of the class (both initialized to null), not to the locals in
    // getParsedText(), so it does not close the PDF documents.
    {
        if (cosDoc != null) cosDoc.close();
        if (pdDoc != null) pdDoc.close();
    }
}

英文:

Please advise me, as I am getting errors while using pdfbox to parse pdf files in Java. All the necessary libraries are imported and I am still getting errors.

Please tell me what I am doing wrong and the possible solution

See my Java code below / see the errors and stack trace below

Error:(32, 10) java: illegal start of type
Error:(32, 13) java: illegal start of type
Error:(32, 14) java: ')' expected
Error:(32, 18) java: ';' expected
Error:(32, 19) java: invalid method declaration; return type required
Error:(32, 27) java: &#39;;&#39; expected

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.beans.XMLDecoder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * The asker's attempt at reading text from {@code data/javaPDF.pdf} with PDFBox.
 *
 * <p>The listing is reproduced as posted (only HTML escaping and lost indentation are
 * repaired): it does not compile, and the compiler errors quoted with the question
 * refer to it. The comments below point out the problems.
 */
public class PdfReader {
    // Declared as XMLDecoder, yet getParsedText() declares locals with the same names
    // using the PDFBox types (COSDocument / PDDocument); those locals shadow these fields.
    private XMLDecoder cosDoc = null;
    private XMLDecoder pdDoc = null;
    static File file = new File("data/javaPDF.pdf");
    private static PDFParser parser = null;
    public static void main(String args[]) {
        PDFTextParser pdf = new PDFTextParser();
        //print out results
        // getParsedText() is declared on PdfReader, not on PDFTextParser, so this call
        // cannot resolve against the nested class.
        System.out.println(pdf.getParsedText());
        //
    }
    private static class PDFTextParser {
        // This `if` statement sits directly in the class body, outside any method,
        // constructor or initializer block. Java does not allow statements here;
        // presumably this is what produces the "illegal start of type" errors.
        if(!file.isFile())
        {
            // fileName is always null, so the message reads "File null does not exist."
            String fileName = null;
            System.err.println("File " + fileName + " does not exist.");
        }
        //Set up instance of PDF parser
        // Instance initializer block: executed each time a PDFTextParser is constructed.
        {
            try {
                // NOTE(review): java.io.FileInputStream does not implement PDFBox's
                // RandomAccessRead, so this cast is expected to fail at runtime with a
                // ClassCastException even once the class compiles.
                parser = new PDFParser((RandomAccessRead) new FileInputStream(file));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //-------------------------------
    /**
     * Runs the shared static {@code parser} and wraps its COSDocument in a PDDocument.
     *
     * @return always {@code null}: {@code parsedText} is never assigned after its
     *         declaration, and no text is extracted from {@code pdDoc}
     */
    public static String getParsedText() {
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        String parsedText = null;
        try {
            parser.parse();
        } catch (IOException e1) {
            // The failure is only printed; execution continues with the parser as-is.
            e1.printStackTrace();
        }
        try {
            cosDoc = parser.getDocument();
        } catch (IOException e1) {
            // On failure cosDoc stays null and is still passed to PDDocument below.
            e1.printStackTrace();
        }
        // The local pdDoc is created but never used or closed in this method.
        pdDoc = new PDDocument(cosDoc);
        return parsedText;
    }
//            System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
    // Instance initializer of PdfReader. It refers to the XMLDecoder fields declared at
    // the top of the class (both initialized to null), not to the locals in
    // getParsedText(), so it does not close the PDF documents.
    {
        if (cosDoc != null) cosDoc.close();
        if (pdDoc != null) pdDoc.close();
    }
}

答案1

得分: 0

感谢大家，问题已解决。

以下是解决方案：

package com.mypackage.util;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Command-line example that prints the text of {@code data/javaPDF.pdf} to standard
 * output, one page at a time, using PDFBox's {@code PDFTextStripper}.
 */
public class ExtractTextSimple
{
    private ExtractTextSimple()
    {
        // example class should not be instantiated
    }

    /**
     * Loads the PDF, checks that text extraction is permitted, and prints each page.
     *
     * @param args ignored
     * @throws IOException if the document cannot be loaded or read, or if its access
     *                     permissions do not allow content extraction
     */
    public static void main(String[] args) throws IOException
    {
        // try-with-resources closes the document on every exit path, including the
        // permission failure below and any exception thrown while stripping text.
        try (PDDocument document = PDDocument.load(new File("data/javaPDF.pdf")))
        {
            int pageCount = document.getNumberOfPages();
            System.out.println("页数 >>" + pageCount);
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent())
            {
                throw new IOException("您无权提取文本");
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            for (int p = 1; p <= pageCount; ++p)
            {
                // Restrict extraction to the single page p; without a start/end page
                // the stripper would return the text of all pages.
                stripper.setStartPage(p);
                stripper.setEndPage(p);
                // let the magic happen
                String text = stripper.getText(document);
                System.out.println("文本 >>" + text);
                // do some nice output with a header, underlined with dashes
                String pageStr = String.format("第 %d 页：", p);
                System.out.println(pageStr);
                for (int i = 0; i < pageStr.length(); ++i)
                {
                    System.out.print("-");
                }
                System.out.println();
                System.out.println(text.trim());
                System.out.println();
            }
        }
    }
}

英文:

Thanks so much guys, it's been resolved.

Here's the solution:

package com.mypackage.util;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
public class ExtractTextSimple
{
private ExtractTextSimple()
{
// example class should not be instantiated
}
public static void main(String[] args) throws IOException
{
PDDocument document = PDDocument.load(new File(&quot;data/javaPDF.pdf&quot;));
System.out.println(&quot;No of pages &gt;&gt;&quot;+ document.getNumberOfPages());
AccessPermission ap = document.getCurrentAccessPermission();
if (!ap.canExtractContent())
{
throw new IOException(&quot;You do not have permission to extract text&quot;);
}
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(true);
for (int p = 1; p &lt;= document.getNumberOfPages(); ++p)
{
// Set the page interval to extract. If you don&#39;t, then all pages would be extracted.
stripper.setStartPage(p);
stripper.setEndPage(p);
// let the magic happen
String text = stripper.getText(document);
System.out.println(&quot;Text &gt;&gt;&quot; + text);
// do some nice output with a header
String pageStr = String.format(&quot;page %d:&quot;, p);
System.out.println(pageStr);
for (int i = 0; i &lt; pageStr.length(); ++i)
{
System.out.print(&quot;-&quot;);
}
System.out.println();
System.out.println(text.trim());
System.out.println();
}
}
}

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

在使用Java中的pdfbox解析PDF文件时出现错误。

问题

答案1

Hibernate能够将子选择中1:M连接的结果集映射到具有子集合的父项吗？

如何创建特定次数的随机数？

Android：应用程序不会加载我的启动画面，直接转到引导屏幕。

JDA 提及成员 – 命令无效

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。