如何使用 HTMLDocument 查找属性?

huangapple go评论59阅读模式
英文:

How to find an attribute using HTMLDocument?

问题

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;

import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Parses an HTML file into an {@link HTMLDocument} and reports, for each child of every
 * tag element, its attribute count and whether the custom attribute {@code a1} is defined.
 */
public class HTMLParserTest {

  /**
   * Loads the HTML file, walks the resulting element tree and prints the attribute report.
   *
   * @param args unused
   * @throws Exception if the file cannot be read or the parsed content cannot be inserted
   *     into the document
   */
  public static void main(String args[]) throws Exception {

    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

    // try-with-resources closes the file even when parsing fails.
    try (Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
        BufferedReader br = new BufferedReader(reader)) {
      HTMLEditorKit.Parser parser = new ParserDelegator();
      HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
      parser.parse(br, callback, true);
      // flush() must be the last call on the callback: it pushes the pending parsed content
      // into the document. Without it the document keeps its default empty structure
      // (html/body/p/content, all at offsets 0,1) and none of the file's attributes exist.
      callback.flush();
    }

    // Walk every element of the populated document.
    ElementIterator iterator = new ElementIterator(htmlDoc);
    Element element;
    while ((element = iterator.next()) != null) {
      System.out.println("Element : " + element);
      Object name = element.getAttributes().getAttribute(StyleConstants.NameAttribute);
      if (!(name instanceof HTML.Tag)) {
        continue;
      }

      // Report on the direct children of each tag element.
      int count = element.getElementCount();
      for (int i = 0; i < count; i++) {
        Element child = element.getElement(i);
        AttributeSet childAttributes = child.getAttributes();
        System.out.println("Element : " + child);
        System.out.println("     Attribute count : " + childAttributes.getAttributeCount());
        // NOTE(review): non-standard attributes appear to be keyed by their plain String name
        // rather than an HTML.Attribute constant, hence the String lookup -- confirm.
        System.out.println("     a1 exists : " + childAttributes.isDefined("a1"));
      }
    }
    System.exit(0);
  }
}

输出:

Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1

期望的输出如下:

Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : true                    <-----预期结果为true。
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1

以上是代码的翻译部分。如果有其他问题或需要进一步帮助,请随时提问。

英文:

Possibly the terminology is different with HTML than with XML, but here is an HTML document from which attributes are being retrieved. Here the attributes a1, a2, a3 are part of the body tag.

<html>
<head>
Hello World
</head>
<body a1="ABC" a2="3974" a3="A1B2">     <------These attributes
<H1>Start Here<H1>
<p>This is the body</p>
</body>
</html>

Using the following file to parse the above HTML file.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Parses an HTML file into an {@link HTMLDocument} and reports, for each child of every
 * tag element, its attribute count and whether the custom attribute {@code a1} is defined.
 */
public class HTMLParserTest
{
  /**
   * Loads the HTML file, walks the resulting element tree and prints the attribute report.
   *
   * @param args unused
   * @throws Exception if the file cannot be read or the parsed content cannot be inserted
   *     into the document
   */
  public static void main(String args[]) throws Exception {
    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

    // try-with-resources closes the file even when parsing fails.
    try (Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
        BufferedReader br = new BufferedReader(reader)) {
      HTMLEditorKit.Parser parser = new ParserDelegator();
      HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
      parser.parse(br, callback, true);
      // flush() must be the last call on the callback: it pushes the pending parsed content
      // into the document. Without it the document keeps its default empty structure
      // (html/body/p/content, all at offsets 0,1) and none of the file's attributes exist.
      callback.flush();
    }

    // Walk every element of the populated document.
    ElementIterator iterator = new ElementIterator(htmlDoc);
    Element element;
    while ((element = iterator.next()) != null)
    {
      System.out.println("Element : " + element);
      Object name = element.getAttributes().getAttribute(StyleConstants.NameAttribute);
      // To restrict the report to headings, additionally require that name is one of
      // HTML.Tag.H1, HTML.Tag.H2 or HTML.Tag.H3.
      if (!(name instanceof HTML.Tag))
      {
        continue;
      }

      // Report on the direct children of each tag element.
      int count = element.getElementCount();
      for (int i = 0; i < count; i++) {
        Element child = element.getElement(i);
        AttributeSet childAttributes = child.getAttributes();
        System.out.println("Element : " + child);
        System.out.println("     Attribute count : " + childAttributes.getAttributeCount());
        // NOTE(review): non-standard attributes appear to be keyed by their plain String name
        // rather than an HTML.Attribute constant, hence the String lookup -- confirm.
        System.out.println("     a1 exists : " + childAttributes.isDefined("a1"));
      }
    }
    System.exit(0);
  }
}

The output is here.

Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false                    <-----expected true here.
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1

The expectation is that the "a1 exists" check should have returned true once, but it did not.
Eventually all 3 (a1, a2, a3) will be searched.

Is the above code the proper implementation or is this not feasible with the HTML parser?

答案1

得分: 1

也许这会有所帮助:

import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;

/**
 * Prints every attribute (name and value) of the {@code body} element of an HTML document.
 *
 * <p>Usage: {@code java AttributeHTML <file-or-url>}
 */
class AttributeHTML
{
    /**
     * Reads the document named by {@code args[0]}, then prints the attributes of each
     * element whose name is {@code body}. Exits with status 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is a local file name or an http(s) URL
     */
    public static void main(String[] args)
    {
        if (args.length < 1)
        {
            System.err.println("Usage: java AttributeHTML <file-or-url>");
            System.exit(1);
        }

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;

        // try-with-resources closes the reader whether or not parsing succeeds.
        try (Reader rd = getReader(args[0]))
        {
            // Parse the HTML into the document.
            kit.read(rd, doc, 0);

            // Iterate through the elements of the HTML document.
            ElementIterator it = new ElementIterator(doc);
            Element elem;

            while ((elem = it.next()) != null)
            {
                if (!elem.getName().equals("body"))
                {
                    continue;
                }

                AttributeSet as = elem.getAttributes();
                Enumeration<?> names = as.getAttributeNames();

                while (names.hasMoreElements())
                {
                    Object name = names.nextElement();
                    Object value = as.getAttribute(name);

                    System.out.println("\t" + name + " : " + value);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            status = 1;
        }

        // Exit outside the try block so the reader is closed first.
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data.
     *
     * @param uri if it begins with "http:" or "https:" it is treated as a URL;
     *     otherwise it is assumed to be a local file name
     * @return a reader over the content; the caller is responsible for closing it
     * @throws IOException if the URL or file cannot be opened
     */
    static Reader getReader(String uri)
        throws IOException
    {
        // Retrieve from Internet.
        if (uri.startsWith("http:") || uri.startsWith("https:"))
        {
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }

        // Retrieve from file.
        return new FileReader(uri);
    }
}

使用以下方式进行测试:

java AttributeHTML yourFile.html
英文:

Maybe this will help:

import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * Prints every attribute (name and value) of the {@code body} element of an HTML document.
 *
 * <p>Usage: {@code java AttributeHTML <file-or-url>}
 */
class AttributeHTML
{
    /**
     * Reads the document named by {@code args[0]}, then prints the attributes of each
     * element whose name is {@code body}. Exits with status 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is a local file name or an http(s) URL
     */
    public static void main(String[] args)
    {
        if (args.length < 1)
        {
            System.err.println("Usage: java AttributeHTML <file-or-url>");
            System.exit(1);
        }

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;

        // try-with-resources closes the reader whether or not parsing succeeds.
        try (Reader rd = getReader(args[0]))
        {
            // Parse the HTML into the document.
            kit.read(rd, doc, 0);

            // Iterate through the elements of the HTML document.
            ElementIterator it = new ElementIterator(doc);
            Element elem;

            while ((elem = it.next()) != null)
            {
                if (!elem.getName().equals("body"))
                {
                    continue;
                }

                AttributeSet as = elem.getAttributes();
                Enumeration<?> names = as.getAttributeNames();

                while (names.hasMoreElements())
                {
                    Object name = names.nextElement();
                    Object value = as.getAttribute(name);

                    System.out.println("\t" + name + " : " + value);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            status = 1;
        }

        // Exit outside the try block so the reader is closed first.
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data.
     *
     * @param uri if it begins with "http:" or "https:" it is treated as a URL;
     *     otherwise it is assumed to be a local file name
     * @return a reader over the content; the caller is responsible for closing it
     * @throws IOException if the URL or file cannot be opened
     */
    static Reader getReader(String uri)
        throws IOException
    {
        // Retrieve from Internet.
        if (uri.startsWith("http:") || uri.startsWith("https:"))
        {
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }

        // Retrieve from file.
        return new FileReader(uri);
    }
}

Test using:

java AttributeHTML yourFile.html

答案2

得分: 0

我对 `HtmlKit` 并不了解,但你可以使用正则表达式来达到类似的结果:

/**
 * Prints each {@code name="value"} attribute found in the {@code <body>} start tag(s) of a
 * sample HTML string, one "Key :: name , Value "value"" line per attribute.
 *
 * <p>This is a plain text scan, not an HTML parse: only double-quoted values are recognised.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String html = "<html>\r\n" +
            "  <head>\r\n" +
            "      Hello World\r\n" +
            "  </head>\r\n" +
            "  <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
            "    <H1>Start Here<H1>\r\n" +
            "    <p>This is the body</p>\r\n" +
            "  </body>\r\n" +
            "</html>";

    // Both patterns are loop-invariant, so compile them once up front.
    final Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
    // The value group accepts any characters up to the closing quote, so values such as
    // "A1-B2" or "#fff" are reported as well as purely alphanumeric ones.
    final Pattern regexBodyAttrPattern = Pattern.compile("(\\S*)=(\"[^\"]*\")", Pattern.MULTILINE);

    Matcher matcher = regexBodyPattern.matcher(html);
    while (matcher.find()) {
        String bodyTag = matcher.group();
        Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
        while (attrMatcher.find()) {
            System.out.println("Key :: " + attrMatcher.group(1) + " , Value " + attrMatcher.group(2));
        }
    }
}

output

Key :: a1 , Value "ABC"
Key :: a2 , Value "3974"
Key :: a3 , Value "A1B2"

<details>
<summary>英文:</summary>
I am not familiar with `HtmlKit`, but you can achieve a similar result using a regex:
public static void main(String[] args) throws UnirestException {
String html = &quot;&lt;html&gt;\r\n&quot; + 
&quot;  &lt;head&gt;\r\n&quot; + 
&quot;      Hello World\r\n&quot; + 
&quot;  &lt;/head&gt;\r\n&quot; + 
&quot;  &lt;body a1=\&quot;ABC\&quot; a2=\&quot;3974\&quot; a3=\&quot;A1B2\&quot;&gt;\r\n&quot; + 
&quot;    &lt;H1&gt;Start Here&lt;H1&gt;\r\n&quot; + 
&quot;    &lt;p&gt;This is the body&lt;/p&gt;\r\n&quot; + 
&quot;  &lt;/body&gt;\r\n&quot; + 
&quot;&lt;/html&gt;&quot;;
Pattern regexBodyPattern = Pattern.compile(&quot;&lt;body[^&gt;]*&gt;&quot;, Pattern.MULTILINE);
Matcher matcher = regexBodyPattern.matcher(html);
while(matcher.find()) {
String bodyTag = matcher.group();
Pattern regexBodyAttrPattern = Pattern.compile(&quot;(\\S*)=(\\\&quot;\\w*\\\&quot;)&quot;, Pattern.MULTILINE);
Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
while(attrMatcher.find()) {
System.out.println(&quot;Key :: &quot;+attrMatcher.group(1)+&quot; , Value &quot;+attrMatcher.group(2));
}
}		
}
**output**
Key :: a1 , Value &quot;ABC&quot;
Key :: a2 , Value &quot;3974&quot;
Key :: a3 , Value &quot;A1B2&quot;
</details>
# 答案3
**得分**: 0
要检索属性,您可以提供自己的ParserCallback
```java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Lists the attributes of the start tags in an HTML document by feeding a custom
 * {@link ParserCallback} to {@link ParserDelegator}, without building a document model.
 */
public class HTMLParserTest2 {

  /**
   * Prints the attribute list extracted from a sample file.
   *
   * @param args unused
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String args[]) throws Exception {
    // try-with-resources closes the file even when parsing fails.
    try (Reader reader = new FileReader("d:/temp/Example.html");
        BufferedReader br = new BufferedReader(reader)) {
      System.out.println(HTMLParserTest2.extractTagsAttributes(br));
      // output :  [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
    }
    System.exit(0);
  }

  /**
   * Collects one {@code tag-name=value} entry per attribute of every start tag the parser
   * reports. Tags delivered through {@code handleSimpleTag} are not recorded.
   *
   * @param r source of the HTML text; not closed by this method
   * @return the entries in the order the parser reported them
   * @throws IOException if reading from {@code r} fails
   */
  public static List<String> extractTagsAttributes(Reader r) throws IOException {
    final List<String> list = new ArrayList<String>();

    // Only start tags are of interest; every other callback keeps the base class's
    // empty implementation.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        Enumeration<?> names = attribute.getAttributeNames();
        while (names.hasMoreElements()) {
          Object name = names.nextElement();
          Object value = attribute.getAttribute(name);
          list.add(tag.toString() + "-" + name + "=" + value);
        }
      }
    };

    new ParserDelegator().parse(r, parserCallback, true);
    return list;
  }
}
```

英文:

To retrieve the attributes, you can provide your own ParserCallback

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Lists the attributes of the start tags in an HTML document by feeding a custom
 * {@link ParserCallback} to {@link ParserDelegator}, without building a document model.
 */
public class HTMLParserTest2
{
  /**
   * Prints the attribute list extracted from a sample file.
   *
   * @param args unused
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String args[]) throws Exception {
    // try-with-resources closes the file even when parsing fails.
    try (Reader reader = new FileReader("d:/temp/Example.html");
        BufferedReader br = new BufferedReader(reader)) {
      System.out.println(HTMLParserTest2.extractTagsAttributes(br));
      // output :  [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
    }
    System.exit(0);
  }

  /**
   * Collects one {@code tag-name=value} entry per attribute of every start tag the parser
   * reports. Tags delivered through {@code handleSimpleTag} are not recorded.
   *
   * @param r source of the HTML text; not closed by this method
   * @return the entries in the order the parser reported them
   * @throws IOException if reading from {@code r} fails
   */
  public static List<String> extractTagsAttributes(Reader r) throws IOException {
    final List<String> list = new ArrayList<String>();

    // Only start tags are of interest; every other callback keeps the base class's
    // empty implementation.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        Enumeration<?> names = attribute.getAttributeNames();
        while (names.hasMoreElements()) {
          Object name = names.nextElement();
          Object value = attribute.getAttribute(name);
          list.add(tag.toString() + "-" + name + "=" + value);
        }
      }
    };

    new ParserDelegator().parse(r, parserCallback, true);
    return list;
  }
}

huangapple
  • 本文由 发表于 2020年9月19日 05:14:57
  • 转载请务必保留本文链接:https://go.coder-hub.com/63962908.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定