英文:
out of memory in golang when parsing Freebase RDF
问题
我正在使用Golang中的XML包,以流式方式解析压缩的Freebase RDF三元组。然而,我遇到了内存溢出错误。
我需要进行垃圾回收吗?我该如何做到这一点?在将三元组写入XML文件后,我该如何清除内存?
以下是我的代码:http://play.golang.org/p/dWvbtcs7wy
package main
import (
"bufio"
"flag"
"fmt"
"io"
"net/url"
"os"
"regexp"
"strings"
)
var (
	// inputFile is the -infile command-line flag: the path of the triple
	// file to read. It defaults to "freebase-rdf".
	inputFile = flag.String("infile", "freebase-rdf", "Input file path")

	// filter matches strings that start with one of the namespace prefixes
	// file:, talk:, special:, wikipedia:, wiktionary:, user: or user_talk:.
	// MustCompile is used so that a malformed pattern fails loudly at start-up
	// instead of leaving filter nil with the compile error discarded.
	filter = regexp.MustCompile("^file:.*|^talk:.*|^special:.*|^wikipedia:.*|^wiktionary:.*|^user:.*|^user_talk:.*")
)
// Redirect holds a single title whose struct tag names an XML attribute
// called "title".
// NOTE(review): no other code visible in this file references Redirect, and
// no XML package is imported here — confirm whether it is still needed.
type Redirect struct {
// Title is tagged as the "title" attribute (xml:"title,attr").
Title string `xml:"title,attr"`
}
// Page pairs a title with an abstract.
// NOTE(review): no other code visible in this file references Page, and no
// XML package is imported here — confirm whether it is still needed.
type Page struct {
// Title is tagged as the "title" element (xml:"title").
Title string `xml:"title"`
// Abstract carries an empty xml tag; presumably the field name is then
// used as the element name — TODO confirm against encoding/xml.
Abstract string `xml:""`
}
// CanonicaliseTitle normalises a title for use as a key: it lower-cases the
// title, turns every space into an underscore and finally query-escapes the
// result.
func CanonicaliseTitle(title string) string {
	lowered := strings.ToLower(title)
	underscored := strings.Replace(lowered, " ", "_", -1)
	return url.QueryEscape(underscored)
}
// convertFreebaseId rewrites a term of the form "<...>" into a slash-separated
// id: the angle brackets are stripped, every occurrence of
// "http://rdf.freebase.com/ns" is removed and every "." becomes "/".
// For example "<http://rdf.freebase.com/ns/m.01>" becomes "/m/01".
// Terms that are not wrapped in angle brackets are returned unchanged.
func convertFreebaseId(uri string) string {
	if !strings.HasPrefix(uri, "<") || !strings.HasSuffix(uri, ">") {
		return uri
	}
	id := uri[1 : len(uri)-1]
	id = strings.Replace(id, "http://rdf.freebase.com/ns", "", -1)
	return strings.Replace(id, ".", "/", -1)
}

// parseTriple splits one tab-separated line into subject, predicate and
// object, passing each through convertFreebaseId.
//
// Fields beyond the third (such as a trailing "." terminator) are ignored.
// A line with fewer than three fields yields "" for the missing ones instead
// of panicking with an index-out-of-range error. The trailing line terminator
// is removed first so that it cannot end up inside the last field.
func parseTriple(line string) (string, string, string) {
	line = strings.TrimRight(line, "\r\n")
	parts := strings.SplitN(line, "\t", 4)

	var fields [3]string
	for i := 0; i < len(parts) && i < len(fields); i++ {
		fields[i] = convertFreebaseId(parts[i])
	}
	return fields[0], fields[1], fields[2]
}
// validRegexp accepts non-empty strings that start with an ASCII letter or
// digit and otherwise contain only ASCII letters, digits, '_' and '-'.
var validRegexp = regexp.MustCompile("^[A-Za-z0-9][A-Za-z0-9_-]*$")

// englishRegexp matches any string that contains "@en" anywhere.
var englishRegexp = regexp.MustCompile("@en")
// validTitle reports whether every entry of content is acceptable as a card
// title. An entry is rejected when it is longer than one byte, contains the
// literal "[]" and carries no "@en" marker.
//
// The rejecting branch returns false in the same way validText does; with an
// empty branch the loop has no effect and every title is accepted.
func validTitle(content []string) bool {
	for _, v := range content {
		// A substring test for "@en" is equivalent to matching the
		// unanchored literal pattern "@en".
		if len(v) > 1 && strings.Contains(v, "[]") && !strings.Contains(v, "@en") {
			return false
		}
	}
	return true
}
// validText reports whether every entry of content is acceptable as card
// text. An entry is rejected when it is longer than one byte, contains the
// literal "[]" and does not match validRegexp.
func validText(content []string) bool {
	for i := range content {
		entry := content[i]
		suspicious := len(entry) > 1 && strings.Contains(entry, "[]")
		if suspicious && !validRegexp.MatchString(entry) {
			return false
		}
	}
	return true
}
// processTopic writes one <card> element for the topic id to file.
//
// Nothing is written unless the "/type/object/name" values pass validTitle
// and the "/common/document/text" values pass validText. The card contains
// the name and text values, an image URL built from id, and one <fact>
// element per property value.
//
// NOTE(review): values are written verbatim (no XML escaping), and the name
// and text slices are formatted with %s as whole slices — confirm that this
// is the intended output format.
func processTopic(id string, properties map[string][]string, file io.Writer) {
	names := properties["/type/object/name"]
	texts := properties["/common/document/text"]
	if !validTitle(names) || !validText(texts) {
		return
	}

	io.WriteString(file, "<card>\n")
	fmt.Fprintf(file, "<title>\"%s\"</title>\n", names)
	fmt.Fprintf(file, "<image>\"%s/%s\"</image>\n", "https://usercontent.googleapis.com/freebase/v1/image", id)
	fmt.Fprintf(file, "<text>\"%s\"</text>\n", texts)

	io.WriteString(file, "<facts>\n")
	for property, values := range properties {
		for i := range values {
			fmt.Fprintf(file, "<fact property=\"%s\">%s</fact>\n", property, values[i])
		}
	}
	io.WriteString(file, "</facts>\n")

	io.WriteString(file, "</card>\n")
}
// main streams the triple file named by -infile line by line, groups
// consecutive triples that share a subject into one topic, and writes each
// topic to freebase.xml through processTopic. Only the topic currently being
// assembled is held in memory.
func main() {
	// Without Parse the -infile flag is never read and the default is always used.
	flag.Parse()

	in, err := os.Open(*inputFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer in.Close()

	out, err := os.Create("freebase.xml")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer out.Close()

	// Buffer the output: processTopic issues many small writes per card.
	w := bufio.NewWriter(out)
	r := bufio.NewReader(in)

	currentMid := ""
	currentTopic := make(map[string][]string)

	var readErr error
	for readErr == nil {
		var line string
		// ReadString may return data together with io.EOF when the last line
		// has no trailing newline, so the line is handled before the error.
		line, readErr = r.ReadString('\n')

		// parseTriple needs at least three tab-separated fields; skip blank
		// or malformed lines instead of passing them on.
		if strings.Count(line, "\t") < 2 {
			continue
		}

		subject, predicate, object := parseTriple(line)
		if subject != currentMid {
			if currentMid != "" {
				processTopic(currentMid, currentTopic, w)
			}
			currentMid = subject
			currentTopic = make(map[string][]string)
		}
		// Record the triple after any topic switch so that the first triple
		// of every subject is kept.
		currentTopic[predicate] = append(currentTopic[predicate], object)
	}

	// Emit the topic that was still being assembled when the input ended.
	if currentMid != "" {
		processTopic(currentMid, currentTopic, w)
	}
	if err := w.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	if readErr != io.EOF {
		fmt.Fprintln(os.Stderr, readErr)
	}
}
希望对你有所帮助!
英文:
I'm parsing the triples of the compressed Freebase RDF dump as a stream, using the XML package in Golang. However, I'm getting an out-of-memory error.
Do I have to garbage-collect? How can I do that? How can I clear the memory after I'm done writing that triple to the XML file?
Here's my code: http://play.golang.org/p/dWvbtcs7wy
package main
import(
"bufio"
"flag"
"fmt"
"io"
"net/url"
"os"
"regexp"
"strings"
)
var (
	// inputFile is the -infile command-line flag: the path of the triple
	// file to read. It defaults to "freebase-rdf".
	inputFile = flag.String("infile", "freebase-rdf", "Input file path")

	// filter matches strings that start with one of the namespace prefixes
	// file:, talk:, special:, wikipedia:, wiktionary:, user: or user_talk:.
	// MustCompile is used so that a malformed pattern fails loudly at start-up
	// instead of leaving filter nil with the compile error discarded.
	filter = regexp.MustCompile("^file:.*|^talk:.*|^special:.*|^wikipedia:.*|^wiktionary:.*|^user:.*|^user_talk:.*")
)
// Redirect holds a single title whose struct tag names an XML attribute
// called "title".
// NOTE(review): no other code visible in this file references Redirect, and
// no XML package is imported here — confirm whether it is still needed.
type Redirect struct {
// Title is tagged as the "title" attribute (xml:"title,attr").
Title string `xml:"title,attr"`
}
// Page pairs a title with an abstract.
// NOTE(review): no other code visible in this file references Page, and no
// XML package is imported here — confirm whether it is still needed.
type Page struct {
// Title is tagged as the "title" element (xml:"title").
Title string `xml:"title"`
// Abstract carries an empty xml tag; presumably the field name is then
// used as the element name — TODO confirm against encoding/xml.
Abstract string `xml:""`
}
// CanonicaliseTitle normalises a title for use as a key: it lower-cases the
// title, turns every space into an underscore and finally query-escapes the
// result.
func CanonicaliseTitle(title string) string {
	lowered := strings.ToLower(title)
	underscored := strings.Replace(lowered, " ", "_", -1)
	return url.QueryEscape(underscored)
}
// convertFreebaseId rewrites a term of the form "<...>" into a slash-separated
// id: the angle brackets are stripped, every occurrence of
// "http://rdf.freebase.com/ns" is removed and every "." becomes "/".
// For example "<http://rdf.freebase.com/ns/m.01>" becomes "/m/01".
// Terms that are not wrapped in angle brackets are returned unchanged.
func convertFreebaseId(uri string) string {
	if !strings.HasPrefix(uri, "<") || !strings.HasSuffix(uri, ">") {
		return uri
	}
	id := uri[1 : len(uri)-1]
	id = strings.Replace(id, "http://rdf.freebase.com/ns", "", -1)
	return strings.Replace(id, ".", "/", -1)
}

// parseTriple splits one tab-separated line into subject, predicate and
// object, passing each through convertFreebaseId.
//
// Fields beyond the third (such as a trailing "." terminator) are ignored.
// A line with fewer than three fields yields "" for the missing ones instead
// of panicking with an index-out-of-range error. The trailing line terminator
// is removed first so that it cannot end up inside the last field.
func parseTriple(line string) (string, string, string) {
	line = strings.TrimRight(line, "\r\n")
	parts := strings.SplitN(line, "\t", 4)

	var fields [3]string
	for i := 0; i < len(parts) && i < len(fields); i++ {
		fields[i] = convertFreebaseId(parts[i])
	}
	return fields[0], fields[1], fields[2]
}
// validRegexp accepts non-empty strings that start with an ASCII letter or
// digit and otherwise contain only ASCII letters, digits, '_' and '-'.
var validRegexp = regexp.MustCompile("^[A-Za-z0-9][A-Za-z0-9_-]*$")

// englishRegexp matches any string that contains "@en" anywhere.
var englishRegexp = regexp.MustCompile("@en")
// validTitle reports whether every entry of content is acceptable as a card
// title. An entry is rejected when it is longer than one byte, contains the
// literal "[]" and carries no "@en" marker.
//
// The rejecting branch returns false in the same way validText does; with an
// empty branch the loop has no effect and every title is accepted.
func validTitle(content []string) bool {
	for _, v := range content {
		// A substring test for "@en" is equivalent to matching the
		// unanchored literal pattern "@en".
		if len(v) > 1 && strings.Contains(v, "[]") && !strings.Contains(v, "@en") {
			return false
		}
	}
	return true
}
// validText reports whether every entry of content is acceptable as card
// text. An entry is rejected when it is longer than one byte, contains the
// literal "[]" and does not match validRegexp.
func validText(content []string) bool {
	for i := range content {
		entry := content[i]
		suspicious := len(entry) > 1 && strings.Contains(entry, "[]")
		if suspicious && !validRegexp.MatchString(entry) {
			return false
		}
	}
	return true
}
// processTopic writes one <card> element for the topic id to file.
//
// Nothing is written unless the "/type/object/name" values pass validTitle
// and the "/common/document/text" values pass validText. The card contains
// the name and text values, an image URL built from id, and one <fact>
// element per property value.
//
// NOTE(review): values are written verbatim (no XML escaping), and the name
// and text slices are formatted with %s as whole slices — confirm that this
// is the intended output format.
func processTopic(id string, properties map[string][]string, file io.Writer) {
	names := properties["/type/object/name"]
	texts := properties["/common/document/text"]
	if !validTitle(names) || !validText(texts) {
		return
	}

	io.WriteString(file, "<card>\n")
	fmt.Fprintf(file, "<title>\"%s\"</title>\n", names)
	fmt.Fprintf(file, "<image>\"%s/%s\"</image>\n", "https://usercontent.googleapis.com/freebase/v1/image", id)
	fmt.Fprintf(file, "<text>\"%s\"</text>\n", texts)

	io.WriteString(file, "<facts>\n")
	for property, values := range properties {
		for i := range values {
			fmt.Fprintf(file, "<fact property=\"%s\">%s</fact>\n", property, values[i])
		}
	}
	io.WriteString(file, "</facts>\n")

	io.WriteString(file, "</card>\n")
}
// main streams the triple file named by -infile line by line, groups
// consecutive triples that share a subject into one topic, and writes each
// topic to freebase.xml through processTopic. Only the topic currently being
// assembled is held in memory.
func main() {
	// Without Parse the -infile flag is never read and the default is always used.
	flag.Parse()

	in, err := os.Open(*inputFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer in.Close()

	out, err := os.Create("freebase.xml")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer out.Close()

	// Buffer the output: processTopic issues many small writes per card.
	w := bufio.NewWriter(out)
	r := bufio.NewReader(in)

	currentMid := ""
	currentTopic := make(map[string][]string)

	var readErr error
	for readErr == nil {
		var line string
		// ReadString may return data together with io.EOF when the last line
		// has no trailing newline, so the line is handled before the error.
		line, readErr = r.ReadString('\n')

		// parseTriple needs at least three tab-separated fields; skip blank
		// or malformed lines instead of passing them on.
		if strings.Count(line, "\t") < 2 {
			continue
		}

		subject, predicate, object := parseTriple(line)
		if subject != currentMid {
			if currentMid != "" {
				processTopic(currentMid, currentTopic, w)
			}
			currentMid = subject
			currentTopic = make(map[string][]string)
		}
		// Record the triple after any topic switch so that the first triple
		// of every subject is kept.
		currentTopic[predicate] = append(currentTopic[predicate], object)
	}

	// Emit the topic that was still being assembled when the input ended.
	if currentMid != "" {
		processTopic(currentMid, currentTopic, w)
	}
	if err := w.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	if readErr != io.EOF {
		fmt.Fprintln(os.Stderr, readErr)
	}
}
答案1
得分: 1
我不确定这是否是你的问题,尽管阅读你的代码,似乎没有泄漏任何东西 - 但是你可以通过SetGCPercent()
来调整GC的行为。根据文档,当新分配的数据与上次收集后剩余的活动数据之比达到这个百分比时,会触发一次收集。默认比例是100%,这意味着对于进行大量小内存分配并持有大量内存的程序来说,开销可能会很大。我曾经遇到过一个HTTP缓存占用了超过缓存大小200%的情况。尝试将百分比调整到大约10%左右,看看是否有帮助。
英文:
I'm not sure that this is your problem, although reading your code it seems you're not leaking anything - but you can tune GC behavior a bit with SetGCPercent()
http://golang.org/pkg/runtime/debug/#SetGCPercent
According to the documentation, a collection is triggered when the ratio of freshly allocated data to live data remaining after the previous collection reaches this percentage.
The default rate is 100%, meaning that for programs that make lots of small allocations and hold lots of RAM, the overhead can be huge. I once had an HTTP cache take up over 200% of the cache size. Try tuning the percentage to somewhere around 10% and see if it helps.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论