如何使用 HTMLDocument 查找属性?

huangapple go评论87阅读模式
英文:

How to find an attribute using HTMLDocument?

问题

  import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.Reader;
  import javax.swing.text.AttributeSet;
  import javax.swing.text.Element;
  import javax.swing.text.ElementIterator;
  import javax.swing.text.StyleConstants;
  import javax.swing.text.html.HTML;
  import javax.swing.text.html.HTMLDocument;
  import javax.swing.text.html.HTMLEditorKit;
  import javax.swing.text.html.parser.ParserDelegator;

  /**
   * Question listing: feeds an HTML file through {@link ParserDelegator} into an
   * {@link HTMLDocument}, then walks the element tree and reports, for every child
   * of an HTML tag element, its attribute count and whether "a1" is defined.
   */
  public class HTMLParserTest {

    public static void main(String[] args) throws Exception {
      HTMLEditorKit htmlKit = new HTMLEditorKit();
      HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
      HTMLEditorKit.Parser parser = new ParserDelegator();
      HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);

      // try-with-resources closes the file even when parsing throws.
      try (Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
           BufferedReader br = new BufferedReader(reader)) {
        // NOTE(review): the behaviour of this listing is kept as posted, since the
        // output listing that follows was produced by it. HTMLEditorKit.read()
        // follows parse() with callback.flush(); that call is absent here, which is
        // the likely reason every element is reported at offsets 0,1 and the body
        // carries no a1/a2/a3 attributes -- confirm by adding callback.flush().
        parser.parse(br, callback, true);
      }

      // Walk every element of the document.
      ElementIterator iterator = new ElementIterator(htmlDoc);
      Element element;
      while ((element = iterator.next()) != null) {
        System.out.println("Element : " + element);
        AttributeSet attributes = element.getAttributes();
        Object name = attributes.getAttribute(StyleConstants.NameAttribute);
        if (name instanceof HTML.Tag) {
          // Build up content text as it may be within multiple elements
          StringBuilder text = new StringBuilder();
          int count = element.getElementCount();
          for (int i = 0; i < count; i++) {
            Element child = element.getElement(i);
            AttributeSet childAttributes = child.getAttributes();
            System.out.println("Element : " + child);
            System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
            System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
            int startOffset = child.getStartOffset();
            int endOffset = child.getEndOffset();
            int length = endOffset - startOffset;
            text.append(htmlDoc.getText(startOffset, length));
          }
        }
      }
      System.exit(0);
    }
  }

输出:

  1. Element : BranchElement(html) 0,1
  2. Element : BranchElement(body) 0,1
  3. Attribute count : 1
  4. a1 exists : false
  5. Element : BranchElement(body) 0,1
  6. Element : BranchElement(p) 0,1
  7. Attribute count : 3
  8. a1 exists : false
  9. Element : BranchElement(p) 0,1
  10. Element : LeafElement(content) 0,1
  11. Attribute count : 1
  12. a1 exists : false
  13. Element : LeafElement(content) 0,1

期望的输出如下:

  1. Element : BranchElement(html) 0,1
  2. Element : BranchElement(body) 0,1
  3. Attribute count : 1
  4. a1 exists : true <-----预期结果为true
  5. Element : BranchElement(body) 0,1
  6. Element : BranchElement(p) 0,1
  7. Attribute count : 3
  8. a1 exists : false
  9. Element : BranchElement(p) 0,1
  10. Element : LeafElement(content) 0,1
  11. Attribute count : 1
  12. a1 exists : false
  13. Element : LeafElement(content) 0,1

以上是代码的翻译部分。如果有其他问题或需要进一步帮助,请随时提问。

英文:

Possibly the terminology is different for HTML than for XML, but here is an HTML document from which attributes are being retrieved. The attributes a1, a2 and a3 are part of the body tag.

  1. <html>
  2. <head>
  3. Hello World
  4. </head>
  5. <body a1="ABC" a2="3974" a3="A1B2"> <------These attributes
  6. <H1>Start Here<H1>
  7. <p>This is the body</p>
  8. </body>
  9. </html>

Using the following file to parse the above HTML file.

  import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.Reader;
  import javax.swing.text.AttributeSet;
  import javax.swing.text.Element;
  import javax.swing.text.ElementIterator;
  import javax.swing.text.StyleConstants;
  import javax.swing.text.html.HTML;
  import javax.swing.text.html.HTMLDocument;
  import javax.swing.text.html.HTMLEditorKit;
  import javax.swing.text.html.parser.ParserDelegator;

  /**
   * Question listing: feeds an HTML file through {@link ParserDelegator} into an
   * {@link HTMLDocument}, then walks the element tree and reports, for every child
   * of an HTML tag element, its attribute count and whether "a1" is defined.
   */
  public class HTMLParserTest {

    public static void main(String[] args) throws Exception {
      HTMLEditorKit htmlKit = new HTMLEditorKit();
      HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
      HTMLEditorKit.Parser parser = new ParserDelegator();
      HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);

      // try-with-resources closes the file even when parsing throws.
      try (Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
           BufferedReader br = new BufferedReader(reader)) {
        // NOTE(review): the behaviour of this listing is kept as posted, since the
        // output listing that follows was produced by it. HTMLEditorKit.read()
        // follows parse() with callback.flush(); that call is absent here, which is
        // the likely reason every element is reported at offsets 0,1 and the body
        // carries no a1/a2/a3 attributes -- confirm by adding callback.flush().
        parser.parse(br, callback, true);
      }

      // Walk every element of the document.
      ElementIterator iterator = new ElementIterator(htmlDoc);
      Element element;
      while ((element = iterator.next()) != null) {
        System.out.println("Element : " + element);
        AttributeSet attributes = element.getAttributes();
        Object name = attributes.getAttribute(StyleConstants.NameAttribute);
        if (name instanceof HTML.Tag) {
          //&& ((name == HTML.Tag.H1) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3)))
          // Build up content text as it may be within multiple elements
          StringBuilder text = new StringBuilder();
          int count = element.getElementCount();
          for (int i = 0; i < count; i++) {
            Element child = element.getElement(i);
            AttributeSet childAttributes = child.getAttributes();
            System.out.println("Element : " + child);
            System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
            System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
            int startOffset = child.getStartOffset();
            int endOffset = child.getEndOffset();
            int length = endOffset - startOffset;
            text.append(htmlDoc.getText(startOffset, length));
          }
        }
      }
      System.exit(0);
    }
  }

The output is here.

  1. Element : BranchElement(html) 0,1
  2. Element : BranchElement(body) 0,1
  3. Attribute count : 1
  4. a1 exists : false <-----expected true here.
  5. Element : BranchElement(body) 0,1
  6. Element : BranchElement(p) 0,1
  7. Attribute count : 3
  8. a1 exists : false
  9. Element : BranchElement(p) 0,1
  10. Element : LeafElement(content) 0,1
  11. Attribute count : 1
  12. a1 exists : false
  13. Element : LeafElement(content) 0,1

The expectation is that the "a1 exists" check should have returned true once, but it did not.
Eventually all 3 (a1, a2, a3) will be searched.

Is the above code the proper implementation or is this not feasible with the HTML parser?

答案1

得分: 1

也许这会有所帮助:

  import java.io.*;
  import java.net.*;
  import java.util.*;
  import javax.swing.*;
  import javax.swing.text.*;
  import javax.swing.text.html.*;

  /**
   * Prints every attribute (name and value) of the body element of an HTML
   * document. The document location is given as the first command-line argument.
   */
  class AttributeHTML {

    public static void main(String[] args) {
      if (args.length == 0) {
        System.err.println("Usage: java AttributeHTML <file-or-url>");
        System.exit(1);
      }

      EditorKit kit = new HTMLEditorKit();
      Document doc = kit.createDefaultDocument();
      // The Document class does not yet handle charset's properly.
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

      int exitCode = 0;
      // try-with-resources closes the reader whether or not parsing succeeds.
      try (Reader rd = getReader(args[0])) {
        // kit.read() parses the stream and flushes the parsed content into the
        // document, so the element tree is complete before it is iterated.
        kit.read(rd, doc, 0);

        // Iterate through the elements of the HTML document.
        ElementIterator it = new ElementIterator(doc);
        Element elem;
        while ((elem = it.next()) != null) {
          if (elem.getName().equals("body")) {
            AttributeSet as = elem.getAttributes();
            Enumeration<?> names = as.getAttributeNames();
            while (names.hasMoreElements()) {
              Object name = names.nextElement();
              Object value = as.getAttribute(name);
              System.out.println("\t" + name + " : " + value);
            }
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
        exitCode = 1;
      }
      // Report success with 0; a non-zero status is reserved for failures.
      System.exit(exitCode);
    }

    /**
     * Returns a reader on the HTML data.
     *
     * @param uri a URL if it begins with "http:" or "https:"; otherwise a local filename
     * @return a reader positioned at the start of the content; the caller closes it
     * @throws IOException if the URL or file cannot be opened
     */
    static Reader getReader(String uri) throws IOException {
      // Retrieve from Internet.
      if (uri.startsWith("http:") || uri.startsWith("https:")) {
        URLConnection conn = new URL(uri).openConnection();
        return new InputStreamReader(conn.getInputStream());
      }
      // Retrieve from file.
      return new FileReader(uri);
    }
  }

使用以下方式进行测试:

  1. java AttributeHTML yourFile.html
英文:

Maybe this will help:

  import java.io.*;
  import java.net.*;
  import java.util.*;
  import javax.swing.*;
  import javax.swing.text.*;
  import javax.swing.text.html.*;

  /**
   * Prints every attribute (name and value) of the body element of an HTML
   * document. The document location is given as the first command-line argument.
   */
  class AttributeHTML {

    public static void main(String[] args) {
      if (args.length == 0) {
        System.err.println("Usage: java AttributeHTML <file-or-url>");
        System.exit(1);
      }

      EditorKit kit = new HTMLEditorKit();
      Document doc = kit.createDefaultDocument();
      // The Document class does not yet handle charset's properly.
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

      int exitCode = 0;
      // try-with-resources closes the reader whether or not parsing succeeds.
      try (Reader rd = getReader(args[0])) {
        // kit.read() parses the stream and flushes the parsed content into the
        // document, so the element tree is complete before it is iterated.
        kit.read(rd, doc, 0);

        // Iterate through the elements of the HTML document.
        ElementIterator it = new ElementIterator(doc);
        Element elem;
        while ((elem = it.next()) != null) {
          if (elem.getName().equals("body")) {
            AttributeSet as = elem.getAttributes();
            Enumeration<?> names = as.getAttributeNames();
            while (names.hasMoreElements()) {
              Object name = names.nextElement();
              Object value = as.getAttribute(name);
              System.out.println("\t" + name + " : " + value);
            }
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
        exitCode = 1;
      }
      // Report success with 0; a non-zero status is reserved for failures.
      System.exit(exitCode);
    }

    /**
     * Returns a reader on the HTML data.
     *
     * @param uri a URL if it begins with "http:" or "https:"; otherwise a local filename
     * @return a reader positioned at the start of the content; the caller closes it
     * @throws IOException if the URL or file cannot be opened
     */
    static Reader getReader(String uri) throws IOException {
      // Retrieve from Internet.
      if (uri.startsWith("http:") || uri.startsWith("https:")) {
        URLConnection conn = new URL(uri).openConnection();
        return new InputStreamReader(conn.getInputStream());
      }
      // Retrieve from file.
      return new FileReader(uri);
    }
  }

Test using:

  1. java AttributeHTML yourFile.html

答案2

得分: 0

我对 `HtmlKit` 并不了解,但你可以使用正则表达式来达到类似的结果:

  2. public static void main(String[] args) throws UnirestException {
  3. String html = "<html>\r\n" +
  4. " <head>\r\n" +
  5. " Hello World\r\n" +
  6. " </head>\r\n" +
  7. " <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
  8. " <H1>Start Here<H1>\r\n" +
  9. " <p>This is the body</p>\r\n" +
  10. " </body>\r\n" +
  11. "</html>";
  12. Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
  13. Matcher matcher = regexBodyPattern.matcher(html);
  14. while(matcher.find()) {
  15. String bodyTag = matcher.group();
  16. Pattern regexBodyAttrPattern = Pattern.compile("(\\S*)=(\"\\w*\")", Pattern.MULTILINE);
  17. Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
  18. while(attrMatcher.find()) {
  19. System.out.println("Key :: " + attrMatcher.group(1) + " , Value " + attrMatcher.group(2));
  20. }
  21. }
  22. }

output

  1. Key :: a1 , Value "ABC"
  2. Key :: a2 , Value "3974"
  3. Key :: a3 , Value "A1B2"
英文:

I am not familiar with `HtmlKit`, but you can achieve a similar result using a regex:

  4. public static void main(String[] args) throws UnirestException {
  5. String html = &quot;&lt;html&gt;\r\n&quot; +
  6. &quot; &lt;head&gt;\r\n&quot; +
  7. &quot; Hello World\r\n&quot; +
  8. &quot; &lt;/head&gt;\r\n&quot; +
  9. &quot; &lt;body a1=\&quot;ABC\&quot; a2=\&quot;3974\&quot; a3=\&quot;A1B2\&quot;&gt;\r\n&quot; +
  10. &quot; &lt;H1&gt;Start Here&lt;H1&gt;\r\n&quot; +
  11. &quot; &lt;p&gt;This is the body&lt;/p&gt;\r\n&quot; +
  12. &quot; &lt;/body&gt;\r\n&quot; +
  13. &quot;&lt;/html&gt;&quot;;
  14. Pattern regexBodyPattern = Pattern.compile(&quot;&lt;body[^&gt;]*&gt;&quot;, Pattern.MULTILINE);
  15. Matcher matcher = regexBodyPattern.matcher(html);
  16. while(matcher.find()) {
  17. String bodyTag = matcher.group();
  18. Pattern regexBodyAttrPattern = Pattern.compile(&quot;(\\S*)=(\\\&quot;\\w*\\\&quot;)&quot;, Pattern.MULTILINE);
  19. Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
  20. while(attrMatcher.find()) {
  21. System.out.println(&quot;Key :: &quot;+attrMatcher.group(1)+&quot; , Value &quot;+attrMatcher.group(2));
  22. }
  23. }
  24. }

output

  1. Key :: a1 , Value "ABC"
  2. Key :: a2 , Value "3974"
  3. Key :: a3 , Value "A1B2"

答案3

得分: 0

要检索属性,您可以提供自己的 ParserCallback:

  import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.ArrayList;
  import java.util.Enumeration;
  import java.util.List;
  import javax.swing.text.MutableAttributeSet;
  import javax.swing.text.html.HTML.Tag;
  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
  import javax.swing.text.html.parser.ParserDelegator;

  /**
   * Collects the attributes of every tag in an HTML document by supplying a
   * custom {@link ParserCallback} to {@link ParserDelegator}.
   */
  public class HTMLParserTest2 {

    public static void main(String[] args) throws Exception {
      // try-with-resources closes the file even when parsing throws.
      try (Reader br = new BufferedReader(new FileReader("d:/temp/Example.html"))) {
        System.out.println(HTMLParserTest2.extractTagsAttributes(br));
        // output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
      }
      System.exit(0);
    }

    /**
     * Parses the HTML read from {@code r} and lists every tag attribute found.
     *
     * @param r source of the HTML text; not closed by this method
     * @return one {@code "tag-name=value"} entry per attribute, in the order
     *     the parser reports them
     * @throws IOException if reading from {@code r} fails
     */
    public static List<String> extractTagsAttributes(Reader r) throws IOException {
      final List<String> list = new ArrayList<String>();
      // ParserCallback already supplies empty handlers; only the tag callbacks
      // that carry attributes are overridden.
      ParserCallback parserCallback = new ParserCallback() {
        @Override
        public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
          collect(tag, attribute);
        }

        @Override
        public void handleSimpleTag(Tag tag, MutableAttributeSet attribute, int pos) {
          // Unknown elements are reported as simple tags, and their closing tag
          // arrives as a second simple tag marked with "endtag"; skip that one.
          if (attribute.isDefined(javax.swing.text.html.HTML.Attribute.ENDTAG)) {
            return;
          }
          // Empty elements such as img, br and meta arrive here, not in handleStartTag.
          collect(tag, attribute);
        }

        private void collect(Tag tag, MutableAttributeSet attribute) {
          Enumeration<?> e = attribute.getAttributeNames();
          while (e.hasMoreElements()) {
            Object name = e.nextElement();
            Object value = attribute.getAttribute(name);
            list.add(tag.toString() + "-" + name + "=" + value);
          }
        }
      };
      new ParserDelegator().parse(r, parserCallback, true);
      return list;
    }
  }
英文:

To retrieve the attributes, you can provide your own ParserCallback

  import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.ArrayList;
  import java.util.Enumeration;
  import java.util.List;
  import javax.swing.text.MutableAttributeSet;
  import javax.swing.text.html.HTML.Tag;
  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
  import javax.swing.text.html.parser.ParserDelegator;

  /**
   * Collects the attributes of every tag in an HTML document by supplying a
   * custom {@link ParserCallback} to {@link ParserDelegator}.
   */
  public class HTMLParserTest2 {

    public static void main(String[] args) throws Exception {
      // try-with-resources closes the file even when parsing throws.
      try (Reader br = new BufferedReader(new FileReader("d:/temp/Example.html"))) {
        System.out.println(HTMLParserTest2.extractTagsAttributes(br));
        // output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
      }
      System.exit(0);
    }

    /**
     * Parses the HTML read from {@code r} and lists every tag attribute found.
     *
     * @param r source of the HTML text; not closed by this method
     * @return one {@code "tag-name=value"} entry per attribute, in the order
     *     the parser reports them
     * @throws IOException if reading from {@code r} fails
     */
    public static List<String> extractTagsAttributes(Reader r) throws IOException {
      final List<String> list = new ArrayList<String>();
      // ParserCallback already supplies empty handlers; only the tag callbacks
      // that carry attributes are overridden.
      ParserCallback parserCallback = new ParserCallback() {
        @Override
        public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
          collect(tag, attribute);
        }

        @Override
        public void handleSimpleTag(Tag tag, MutableAttributeSet attribute, int pos) {
          // Unknown elements are reported as simple tags, and their closing tag
          // arrives as a second simple tag marked with "endtag"; skip that one.
          if (attribute.isDefined(javax.swing.text.html.HTML.Attribute.ENDTAG)) {
            return;
          }
          // Empty elements such as img, br and meta arrive here, not in handleStartTag.
          collect(tag, attribute);
        }

        private void collect(Tag tag, MutableAttributeSet attribute) {
          Enumeration<?> e = attribute.getAttributeNames();
          while (e.hasMoreElements()) {
            Object name = e.nextElement();
            Object value = attribute.getAttribute(name);
            list.add(tag.toString() + "-" + name + "=" + value);
          }
        }
      };
      new ParserDelegator().parse(r, parserCallback, true);
      return list;
    }
  }

huangapple
  • 本文由 发表于 2020年9月19日 05:14:57
  • 转载请务必保留本文链接:https://go.coder-hub.com/63962908.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定