How to get both the chardata and the value of the attributes of an XML tag when decoding it in Golang

huangapple go评论81阅读模式
英文:

How to get both the chardata and the value of the attributes of an XML tag when decoding it in Golang

问题

我的XML文件类似于这样:

<page>
    <title>Antoine Meillet</title>
    <ns>0</ns>
    <id>3</id>
    <revision>
      <id>178204512</id>
      <parentid>178097574</parentid>
      <timestamp>2020-12-30T10:12:14Z</timestamp>
      <contributor>
        <username>Rovo</username>
        <id>34820</id>
      </contributor>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="11274" xml:space="preserve">
        大量的文本
      </text>
      <sha1>ikqy1f9ppwo8eo38a0hh817eynr40vg</sha1>
    </revision>
  </page>

我的目标是过滤掉大量的标签,只保留 page 标签以及其中的 title、id、text 这几个内部标签。

到目前为止,我已经成功提取了 page 标签,并且其中的 title 和 id 标签都具有正确的值。这是我得到的结果:

<page>
 <title>Antoine Meillet</title>
 <id>3</id>
 <text bytes="0" xml:space=""></text>
</page>
<page>
 <title>Algèbre linéaire</title>
 <id>7</id>
 <text bytes="0" xml:space=""></text>
</page>

所以问题在于,正如你所看到的,text标签的属性值不正确,而且其中没有文本。

我使用了以下代码来实现这一点:

package main

import (
	"encoding/xml"
	"fmt"
	"io"
	"os"
)

// Page is the value a <page> element is decoded into and the value that is
// re-encoded to the output file.
type Page struct {
	XMLName xml.Name `xml:"page"`
	Title   string   `xml:"title"`
	Id      int64    `xml:"id"`
	// Text is looked up as a <text> element that is a direct child of <page>.
	// Only two attributes are declared on it; no field receives the element's
	// character data.
	// NOTE(review): in the sample document <text> sits inside <revision>, so
	// this path does not reach it — confirm against the real input.
	Text    struct {
		Key   float32 `xml:"bytes,attr"`
		Space string  `xml:"xml:space,attr"`
	} `xml:"text"`
}

// main filters frwiki10000.xml into cleaned_fr_wiki.xml and exits with a
// non-zero status if any step fails.
func main() {
	if err := cleanWiki("frwiki10000.xml", "cleaned_fr_wiki.xml"); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// cleanWiki streams the XML file at srcPath token by token, decodes every
// <page> element into a Page and writes it, indented, to dstPath.
//
// It returns the first error encountered while opening, decoding, encoding or
// closing; a nil result means the whole input was processed.
func cleanWiki(srcPath, dstPath string) (err error) {
	src, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	// Registered right after the successful open so the handle is released on
	// every return path, not only when the loop finishes normally.
	defer src.Close()

	dst, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a written file can mean lost data, so report it
		// unless an earlier error is already being returned.
		if closeErr := dst.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("closing %s: %w", dstPath, closeErr)
		}
	}()

	enc := xml.NewEncoder(dst)
	enc.Indent("", " ")

	dec := xml.NewDecoder(src)
	for {
		tok, tokErr := dec.Token()
		if tokErr == io.EOF {
			return nil
		}
		if tokErr != nil {
			// The decoder keeps returning the same error once it has failed,
			// so continuing the loop would never terminate.
			return fmt.Errorf("decoding token: %w", tokErr)
		}

		start, ok := tok.(xml.StartElement)
		if !ok || start.Name.Local != "page" {
			continue
		}

		var page Page
		if err := dec.DecodeElement(&page, &start); err != nil {
			return fmt.Errorf("decoding element %q: %w", start.Name.Local, err)
		}
		fmt.Println("Element was decoded successfully.")
		fmt.Printf("Page title: %v\n Page id: %d\n", page.Title, page.Id)
		fmt.Printf("Text: %v", page.Text)
		if err := enc.Encode(page); err != nil {
			return fmt.Errorf("encoding page %q: %w", page.Title, err)
		}
	}
}

请问我该如何解决这个问题呢?

谢谢。

英文:

My XML file looks something like this:

<page>
    <title>Antoine Meillet</title>
    <ns>0</ns>
    <id>3</id>
    <revision>
      <id>178204512</id>
      <parentid>178097574</parentid>
      <timestamp>2020-12-30T10:12:14Z</timestamp>
      <contributor>
        <username>Rovo</username>
        <id>34820</id>
      </contributor>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="11274" xml:space="preserve">
        a lot of text
      </text>
      <sha1>ikqy1f9ppwo8eo38a0hh817eynr40vg</sha1>
    </revision>
  </page>

My goal is to filter out a lot of those tags and only keep the page tag and those inner tags: title, id, text.

So far, I have been able to successfully extract the page tag with title and id having the right value.
This is what I get:

<page>
 <title>Antoine Meillet</title>
 <id>3</id>
 <text bytes="0" xml:space=""></text>
</page>
<page>
 <title>Algèbre linéaire</title>
 <id>7</id>
 <text bytes="0" xml:space=""></text>
</page>

As you can see, the problem is that the attributes of the text tag do not have the right values, and the tag contains no text.

I have achieved this using this piece of code:

package main

import (
	"encoding/xml"
	"fmt"
	"io"
	"os"
)

// Page is the value a <page> element is decoded into and the value that is
// re-encoded to the output file.
type Page struct {
	XMLName xml.Name `xml:"page"`
	Title   string   `xml:"title"`
	Id      int64    `xml:"id"`
	// Text is looked up as a <text> element that is a direct child of <page>.
	// Only two attributes are declared on it; no field receives the element's
	// character data.
	// NOTE(review): in the sample document <text> sits inside <revision>, so
	// this path does not reach it — confirm against the real input.
	Text    struct {
		Key   float32 `xml:"bytes,attr"`
		Space string  `xml:"xml:space,attr"`
	} `xml:"text"`
}

// main filters frwiki10000.xml into cleaned_fr_wiki.xml and exits with a
// non-zero status if any step fails.
func main() {
	if err := cleanWiki("frwiki10000.xml", "cleaned_fr_wiki.xml"); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// cleanWiki streams the XML file at srcPath token by token, decodes every
// <page> element into a Page and writes it, indented, to dstPath.
//
// It returns the first error encountered while opening, decoding, encoding or
// closing; a nil result means the whole input was processed.
func cleanWiki(srcPath, dstPath string) (err error) {
	src, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	// Registered right after the successful open so the handle is released on
	// every return path, not only when the loop finishes normally.
	defer src.Close()

	dst, err := os.Create(dstPath)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a written file can mean lost data, so report it
		// unless an earlier error is already being returned.
		if closeErr := dst.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("closing %s: %w", dstPath, closeErr)
		}
	}()

	enc := xml.NewEncoder(dst)
	enc.Indent("", " ")

	dec := xml.NewDecoder(src)
	for {
		tok, tokErr := dec.Token()
		if tokErr == io.EOF {
			return nil
		}
		if tokErr != nil {
			// The decoder keeps returning the same error once it has failed,
			// so continuing the loop would never terminate.
			return fmt.Errorf("decoding token: %w", tokErr)
		}

		start, ok := tok.(xml.StartElement)
		if !ok || start.Name.Local != "page" {
			continue
		}

		var page Page
		if err := dec.DecodeElement(&page, &start); err != nil {
			return fmt.Errorf("decoding element %q: %w", start.Name.Local, err)
		}
		fmt.Println("Element was decoded successfully.")
		fmt.Printf("Page title: %v\n Page id: %d\n", page.Title, page.Id)
		fmt.Printf("Text: %v", page.Text)
		if err := enc.Encode(page); err != nil {
			return fmt.Errorf("encoding page %q: %w", page.Title, err)
		}
	}
}

How would I be able to solve this problem, please?

Thanks.

答案1

得分: 1

要解析大型的xml文件,可以使用标准的xml Decoder

调用Token逐个读取标记。当找到一个具有所需名称的起始元素("page"),调用DecodeElement来解码该元素并准备下一步操作的结果。

// PageText models a revision's <text> element: its attributes together with
// its character data. Page and PageTarget share this type so a decoded value
// can be copied from one to the other unchanged.
type PageText struct {
	// Key is the value of the "bytes" attribute.
	Key float32 `xml:"bytes,attr"`

	// Space is the xml:space attribute. The decoder resolves the reserved
	// "xml" prefix to its namespace URL and keeps only "space" as the local
	// name, so the tag has to name that URL; a tag spelled "xml:space" never
	// matches while decoding. The encoder writes this namespace back out with
	// the "xml" prefix.
	Space string `xml:"http://www.w3.org/XML/1998/namespace space,attr"`

	// Content is the character data between <text> and </text>.
	Content string `xml:",chardata"`
}

// Page mirrors the input layout, where <text> is nested inside <revision>.
type Page struct {
	XMLName  xml.Name `xml:"page"`
	Title    string   `xml:"title"`
	Id       int64    `xml:"id"`
	Revision struct {
		Text PageText `xml:"text"`
	} `xml:"revision"`
}

// PageTarget is the flattened output layout: <text> becomes a direct child of
// <page> and every other revision field is dropped.
type PageTarget struct {
	XMLName xml.Name `xml:"page"`
	Title   string   `xml:"title"`
	Id      int64    `xml:"id"`
	Text    PageText `xml:"text"`
}
// Stream the document one token at a time so the whole file never has to be
// held in memory; only a single <page> element is materialized at once.
dec := xml.NewDecoder(strings.NewReader(sample))

for {
	tok, err := dec.Token()
	if err == io.EOF {
		// End of input: Token reports it as (nil, io.EOF).
		break
	}
	if err != nil {
		panic(err)
	}

	// Everything except the opening tag of a <page> element is skipped.
	se, ok := tok.(xml.StartElement)
	if !ok || se.Name.Local != "page" {
		continue
	}

	// DecodeElement consumes the input up to the matching </page>.
	var page Page
	if err := dec.DecodeElement(&page, &se); err != nil {
		panic(err)
	}

	// Flatten the decoded page into the output layout.
	target := PageTarget{
		XMLName: page.XMLName,
		Id:      page.Id,
		Title:   page.Title,
		Text:    page.Revision.Text,
	}

	out, err := xml.MarshalIndent(target, " ", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}

<kbd>PLAYGROUND</kbd>

英文:

To parse a huge XML file, use the standard library's xml.Decoder.

Call Token to read the tokens one by one. When a start element with the required name ("page") is found, call DecodeElement to decode the whole element, then build the result you need from the decoded value.

// PageText models a revision's <text> element: its attributes together with
// its character data. Page and PageTarget share this type so a decoded value
// can be copied from one to the other unchanged.
type PageText struct {
	// Key is the value of the "bytes" attribute.
	Key float32 `xml:"bytes,attr"`

	// Space is the xml:space attribute. The decoder resolves the reserved
	// "xml" prefix to its namespace URL and keeps only "space" as the local
	// name, so the tag has to name that URL; a tag spelled "xml:space" never
	// matches while decoding. The encoder writes this namespace back out with
	// the "xml" prefix.
	Space string `xml:"http://www.w3.org/XML/1998/namespace space,attr"`

	// Content is the character data between <text> and </text>.
	Content string `xml:",chardata"`
}

// Page mirrors the input layout, where <text> is nested inside <revision>.
type Page struct {
	XMLName  xml.Name `xml:"page"`
	Title    string   `xml:"title"`
	Id       int64    `xml:"id"`
	Revision struct {
		Text PageText `xml:"text"`
	} `xml:"revision"`
}

// PageTarget is the flattened output layout: <text> becomes a direct child of
// <page> and every other revision field is dropped.
type PageTarget struct {
	XMLName xml.Name `xml:"page"`
	Title   string   `xml:"title"`
	Id      int64    `xml:"id"`
	Text    PageText `xml:"text"`
}
	// Stream the document one token at a time so the whole file never has to
	// be held in memory; only a single <page> element is materialized at once.
	dec := xml.NewDecoder(strings.NewReader(sample))

	for {
		tok, err := dec.Token()
		if err == io.EOF {
			// End of input: Token reports it as (nil, io.EOF).
			break
		}
		if err != nil {
			panic(err)
		}

		// Everything except the opening tag of a <page> element is skipped.
		se, ok := tok.(xml.StartElement)
		if !ok || se.Name.Local != "page" {
			continue
		}

		// DecodeElement consumes the input up to the matching </page>.
		var page Page
		if err := dec.DecodeElement(&page, &se); err != nil {
			panic(err)
		}

		// Flatten the decoded page into the output layout.
		target := PageTarget{
			XMLName: page.XMLName,
			Id:      page.Id,
			Title:   page.Title,
			Text:    page.Revision.Text,
		}

		out, err := xml.MarshalIndent(target, " ", "  ")
		if err != nil {
			panic(err)
		}
		fmt.Println(string(out))
	}

<kbd>PLAYGROUND</kbd>

答案2

得分: 0

只需将其解码为结构体,然后再进行编码即可满足您的目标。

请查看此链接:https://go.dev/play/p/69vjlve4P6p

英文:

Simply decoding to the struct and encoding again will satisfy your goal.

Please check this: https://go.dev/play/p/69vjlve4P6p

huangapple
  • 本文由 发表于 2022年1月20日 07:13:40
  • 转载请务必保留本文链接:https://go.coder-hub.com/70778945.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定