在使用Java中的pdfbox解析PDF文件时出现错误。

huangapple go评论92阅读模式
英文:

Getting errors while using pdfbox to parse pdf files in Java

问题

请在使用Java中的pdfbox解析PDF文件时给我建议,因为我遇到了错误。已经导入了所有必要的库,但仍然出现错误。

请告诉我我做错了什么以及可能的解决方案。

以下是我的Java代码 / 查看下面的错误和堆栈跟踪

  1. 错误:(32, 10) java: 非法的类型开始
  2. 错误:(32, 13) java: 非法的类型开始
  3. 错误:(32, 14) java: 需要 ')'
  4. 错误:(32, 18) java: 需要 ';'
  5. 错误:(32, 19) java: 方法声明无效;需要返回类型
  6. 错误:(32, 27) java: 需要 ';'
  1. import org.apache.pdfbox.cos.COSDocument;
  2. import org.apache.pdfbox.io.RandomAccessRead;
  3. import org.apache.pdfbox.pdfparser.PDFParser;
  4. import org.apache.pdfbox.pdmodel.PDDocument;
  5. import java.beans.XMLDecoder;
  6. import java.io.File;
  7. import java.io.FileInputStream;
  8. import java.io.IOException;
  9. public class PdfReader {
  10. private XMLDecoder cosDoc = null;
  11. private XMLDecoder pdDoc = null;
  12. static File file = new File("data/javaPDF.pdf");
  13. private static PDFParser parser = null;
  14. public static void main(String args[]) {
  15. PDFTextParser pdf = new PDFTextParser();
  16. //print out results
  17. System.out.println(pdf.getParsedText());
  18. //
  19. }
  20. private static class PDFTextParser {
  21. if(!file.isFile())
  22. {
  23. String fileName = null;
  24. System.err.println("File " + fileName + " does not exist.");
  25. }
  26. //Set up instance of PDF parser
  27. {
  28. try {
  29. parser = new PDFParser((RandomAccessRead) new FileInputStream(file));
  30. } catch (IOException e) {
  31. e.printStackTrace();
  32. }
  33. }
  34. }
  35. //-------------------------------
  36. public static String getParsedText() {
  37. PDDocument pdDoc = null;
  38. COSDocument cosDoc = null;
  39. String parsedText = null;
  40. try {
  41. parser.parse();
  42. } catch (IOException e1) {
  43. e1.printStackTrace();
  44. }
  45. try {
  46. cosDoc = parser.getDocument();
  47. } catch (IOException e1) {
  48. e1.printStackTrace();
  49. }
  50. pdDoc = new PDDocument(cosDoc);
  51. return parsedText;
  52. }
  53. // System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
  54. {
  55. if (cosDoc != null) cosDoc.close();
  56. if (pdDoc != null) pdDoc.close();
  57. }
  58. }
英文:

Please advise me, as I am getting errors while using pdfbox to parse PDF files in Java. All the necessary libraries are imported, but I am still getting errors.

Please tell me what I am doing wrong and suggest a possible solution.

See my Java code below / see the errors and stack trace below

  1. Error:(32, 10) java: illegal start of type
  2. Error:(32, 13) java: illegal start of type
  3. Error:(32, 14) java: ')' expected
  4. Error:(32, 18) java: ';' expected
  5. Error:(32, 19) java: invalid method declaration; return type required
  6. Error:(32, 27) java: ';' expected
  1. import org.apache.pdfbox.cos.COSDocument;
  2. import org.apache.pdfbox.io.RandomAccessRead;
  3. import org.apache.pdfbox.pdfparser.PDFParser;
  4. import org.apache.pdfbox.pdmodel.PDDocument;
  5. import java.beans.XMLDecoder;
  6. import java.io.File;
  7. import java.io.FileInputStream;
  8. import java.io.IOException;
  9. public class PdfReader {
  10. private XMLDecoder cosDoc = null;
  11. private XMLDecoder pdDoc = null;
  12. static File file = new File("data/javaPDF.pdf");
  13. private static PDFParser parser = null;
  14. public static void main(String args[]) {
  15. PDFTextParser pdf = new PDFTextParser();
  16. //print out results
  17. System.out.println(pdf.getParsedText());
  18. //
  19. }
  20. private static class PDFTextParser {
  21. if(!file.isFile())
  22. {
  23. String fileName = null;
  24. System.err.println("File " + fileName + " does not exist.");
  25. }
  26. //Set up instance of PDF parser
  27. {
  28. try {
  29. parser = new PDFParser((RandomAccessRead) new FileInputStream(file));
  30. } catch (IOException e) {
  31. e.printStackTrace();
  32. }
  33. }
  34. }
  35. //-------------------------------
  36. public static String getParsedText() {
  37. PDDocument pdDoc = null;
  38. COSDocument cosDoc = null;
  39. String parsedText = null;
  40. try {
  41. parser.parse();
  42. } catch (IOException e1) {
  43. e1.printStackTrace();
  44. }
  45. try {
  46. cosDoc = parser.getDocument();
  47. } catch (IOException e1) {
  48. e1.printStackTrace();
  49. }
  50. pdDoc = new PDDocument(cosDoc);
  51. return parsedText;
  52. }
  53. // System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
  54. {
  55. if (cosDoc != null) cosDoc.close();
  56. if (pdDoc != null) pdDoc.close();
  57. }
  58. }

答案1

得分: 0

感谢大家,问题已解决。

以下是解决方案:

  1. package com.mypackage.util;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import org.apache.pdfbox.pdmodel.PDDocument;
  5. import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
  6. import org.apache.pdfbox.text.PDFTextStripper;
  7. public class ExtractTextSimple
  8. {
  9. private ExtractTextSimple()
  10. {
  11. // 示例类不应该被实例化
  12. }
  13. public static void main(String[] args) throws IOException
  14. {
  15. PDDocument document = PDDocument.load(new File("data/javaPDF.pdf"));
  16. System.out.println("页数 >>" + document.getNumberOfPages());
  17. AccessPermission ap = document.getCurrentAccessPermission();
  18. if (!ap.canExtractContent())
  19. {
  20. throw new IOException("您无权提取文本");
  21. }
  22. PDFTextStripper stripper = new PDFTextStripper();
  23. stripper.setSortByPosition(true);
  24. for (int p = 1; p <= document.getNumberOfPages(); ++p)
  25. {
  26. // 设置要提取的页面间隔。如果不设置,将提取所有页面。
  27. stripper.setStartPage(p);
  28. stripper.setEndPage(p);
  29. // 让魔法发生
  30. String text = stripper.getText(document);
  31. System.out.println("文本 >>" + text);
  32. // 以标题形式输出一些漂亮的内容
  33. String pageStr = String.format("第 %d 页:", p);
  34. System.out.println(pageStr);
  35. for (int i = 0; i < pageStr.length(); ++i)
  36. {
  37. System.out.print("-");
  38. }
  39. System.out.println();
  40. System.out.println(text.trim());
  41. System.out.println();
  42. }
  43. }
  44. }
英文:

Thanks so much guys, it's been resolved.

Here's the solution:

  1. package com.mypackage.util;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import org.apache.pdfbox.pdmodel.PDDocument;
  5. import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
  6. import org.apache.pdfbox.text.PDFTextStripper;
  7. public class ExtractTextSimple
  8. {
  9. private ExtractTextSimple()
  10. {
  11. // example class should not be instantiated
  12. }
  13. public static void main(String[] args) throws IOException
  14. {
  15. PDDocument document = PDDocument.load(new File("data/javaPDF.pdf"));
  16. System.out.println("No of pages >>"+ document.getNumberOfPages());
  17. AccessPermission ap = document.getCurrentAccessPermission();
  18. if (!ap.canExtractContent())
  19. {
  20. throw new IOException("You do not have permission to extract text");
  21. }
  22. PDFTextStripper stripper = new PDFTextStripper();
  23. stripper.setSortByPosition(true);
  24. for (int p = 1; p <= document.getNumberOfPages(); ++p)
  25. {
  26. // Set the page interval to extract. If you don't, then all pages would be extracted.
  27. stripper.setStartPage(p);
  28. stripper.setEndPage(p);
  29. // let the magic happen
  30. String text = stripper.getText(document);
  31. System.out.println("Text >>" + text);
  32. // do some nice output with a header
  33. String pageStr = String.format("page %d:", p);
  34. System.out.println(pageStr);
  35. for (int i = 0; i < pageStr.length(); ++i)
  36. {
  37. System.out.print("-");
  38. }
  39. System.out.println();
  40. System.out.println(text.trim());
  41. System.out.println();
  42. }
  43. }
  44. }

huangapple
  • 本文由 发表于 2020年8月19日 06:17:49
  • 转载请务必保留本文链接:https://go.coder-hub.com/63477381.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定