使用Golang从文件中提取数据

huangapple go评论79阅读模式
英文:

extracting data from file using golang

问题

我正在尝试从文件中提取行,如果满足条件的话。

文件中的数据如下:

Sat 08 Aug 2015
Norwich City
A
League
	W 3-1
	Zaha 38; Delaney 48; Cabaye 90
	27,036

如果日期的模式匹配成功,我想打印接下来的五行。

我的代码如下:

// main opens test.txt, scans it line by line, and prints every line that
// matches the date pattern (e.g. "Sat 08 Aug 2015"). Capturing the lines that
// follow a match is not implemented; see the placeholder comment in the loop.
func main() {

	// Pattern: three letters, two digits, three letters, four digits, each
	// group separated by a single whitespace character.
	// NOTE(review): the compile error is discarded, and the class [aA-zZ]
	// contains the range A-z, which also admits the ASCII punctuation that
	// sits between 'Z' and 'a'; [A-Za-z] is presumably what was intended.
	r, _ := regexp.Compile("[aA-zZ]{3}\\s[0-9]{2}\\s[aA-zZ]{3}\\s[0-9]{4}")

	file, err := os.Open("test.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {

		// MatchString is unanchored, so the date may appear anywhere in the line.
		if r.MatchString(scanner.Text()) {

			fmt.Println(scanner.Text())

			// here, how do I capture the following five lines?

		}

		// NOTE(review): this check runs inside the loop body, which is only
		// entered after Scan returned true; a read error makes Scan return
		// false, so the check likely belongs after the loop instead.
		if err := scanner.Err(); err != nil {
			log.Fatal(err)
		}
	}
}
英文:

I am trying to extract lines from a file if a condition is met.

The data in the file looks like this:

Sat 08 Aug 2015
Norwich City
A
League
	W 3-1
	Zaha 38; Delaney 48; Cabaye 90
	27,036

If the pattern of the date is matched, I want to print the following five lines.

My code is,

// main opens test.txt, scans it line by line, and prints every line that
// matches the date pattern (e.g. "Sat 08 Aug 2015"). Capturing the lines that
// follow a match is not implemented; see the placeholder comment in the loop.
func main() {

	// Pattern: three letters, two digits, three letters, four digits, each
	// group separated by a single whitespace character.
	// NOTE(review): the compile error is discarded, and the class [aA-zZ]
	// contains the range A-z, which also admits the ASCII punctuation that
	// sits between 'Z' and 'a'; [A-Za-z] is presumably what was intended.
	r, _ := regexp.Compile("[aA-zZ]{3}\\s[0-9]{2}\\s[aA-zZ]{3}\\s[0-9]{4}")

	file, err := os.Open("test.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {

		// MatchString is unanchored, so the date may appear anywhere in the line.
		if r.MatchString(scanner.Text()) {

			fmt.Println(scanner.Text())

			// here, how do I capture the following five lines?

		}

		// NOTE(review): this check runs inside the loop body, which is only
		// entered after Scan returned true; a read error makes Scan return
		// false, so the check likely belongs after the loop instead.
		if err := scanner.Err(); err != nil {
			log.Fatal(err)
		}
	}
}

答案1

得分: 2

不确定是否有遗漏,但是像这样的代码是否足够:

package main

import (
"regexp"
"os"
"log"
"bufio"
"fmt"
)

// main scans /tmp/test.txt line by line. Whenever a line contains a date such
// as "Sat 08 Aug 2015", it prints that line followed by up to five more lines.
func main() {
	// MustCompile panics on an invalid pattern instead of silently returning
	// a nil *Regexp. The letter class is [A-Za-z]: the previous [aA-zZ]
	// contained the range A-z, which also matches the ASCII punctuation
	// between 'Z' and 'a'.
	r := regexp.MustCompile(`[A-Za-z]{3}\s[0-9]{2}\s[A-Za-z]{3}\s[0-9]{4}`)

	file, err := os.Open("/tmp/test.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Number of lines printed after each matching date line.
	const following = 5

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if !r.MatchString(scanner.Text()) {
			continue
		}
		fmt.Println(scanner.Text())

		// Stop as soon as the input runs out, rather than printing empty
		// strings for lines that do not exist.
		for i := 0; i < following && scanner.Scan(); i++ {
			fmt.Println(scanner.Text())
		}
	}

	// Scan returns false on EOF and on read errors alike, so a failure can
	// only be observed after the loop has ended.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
英文:

Not sure if I have missed something but would something like this suffice:

package main

import (
&quot;regexp&quot;
&quot;os&quot;
&quot;log&quot;
&quot;bufio&quot;
&quot;fmt&quot;
)

// main scans /tmp/test.txt line by line. Whenever a line contains a date such
// as "Sat 08 Aug 2015", it prints that line followed by up to five more lines.
func main() {
	// MustCompile panics on an invalid pattern instead of silently returning
	// a nil *Regexp. The letter class is [A-Za-z]: the previous [aA-zZ]
	// contained the range A-z, which also matches the ASCII punctuation
	// between 'Z' and 'a'.
	r := regexp.MustCompile(`[A-Za-z]{3}\s[0-9]{2}\s[A-Za-z]{3}\s[0-9]{4}`)

	file, err := os.Open("/tmp/test.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Number of lines printed after each matching date line.
	const following = 5

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		if !r.MatchString(scanner.Text()) {
			continue
		}
		fmt.Println(scanner.Text())

		// Stop as soon as the input runs out, rather than printing empty
		// strings for lines that do not exist.
		for i := 0; i < following && scanner.Scan(); i++ {
			fmt.Println(scanner.Text())
		}
	}

	// Scan returns false on EOF and on read errors alike, so a failure can
	// only be observed after the loop has ended.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}

答案2

得分: 1

也许是这样的吗?

package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"time"
)

// Match is one fixture record as it appears in the input file.
type Match struct {
	Date       time.Time // parsed from the record's first line
	Opponents  string
	Venue      string
	Type       string
	Result     string
	Scorers    string // optional; empty when the record has no such line
	Attendance string // optional; empty when the record has no such line
}

// fmtMatchDate is the time layout of a record's date line,
// e.g. "Sat 08 Aug 2015".
var fmtMatchDate = "Mon 02 Jan 2006"

// String renders m in the same line-oriented layout as the input: date,
// opponents, venue, type and result are always written, one per line;
// scorers and attendance are written only when non-empty.
func (m Match) String() string {
	var b strings.Builder
	required := [...]string{
		m.Date.Format(fmtMatchDate),
		m.Opponents,
		m.Venue,
		m.Type,
		m.Result,
	}
	for _, field := range required {
		b.WriteString(field)
		b.WriteByte('\n')
	}
	for _, field := range [...]string{m.Scorers, m.Attendance} {
		if field != "" {
			b.WriteString(field)
			b.WriteByte('\n')
		}
	}
	return b.String()
}

// ParseMatch builds a Match from the lines of a single record. The first
// line must be a date in fmtMatchDate layout; the following lines fill
// Opponents, Venue, Type, Result, Scorers and Attendance in that order.
// Surrounding whitespace is trimmed from every line, missing trailing lines
// leave their fields empty, and lines beyond the seventh are ignored.
//
// It returns an error when lines is empty or the date line does not parse.
func ParseMatch(lines []string) (Match, error) {
	// TODO: Implement a better parser.
	if len(lines) == 0 {
		// Previously this returned a zero Match with a nil error.
		return Match{}, fmt.Errorf("parse match: no lines")
	}
	date, err := time.Parse(fmtMatchDate, strings.TrimSpace(lines[0]))
	if err != nil {
		return Match{}, err
	}

	m := Match{Date: date}
	// Destination fields in the order their lines follow the date line.
	fields := [...]*string{
		&m.Opponents,
		&m.Venue,
		&m.Type,
		&m.Result,
		&m.Scorers,
		&m.Attendance,
	}
	for i, line := range lines[1:] {
		if i >= len(fields) {
			break
		}
		*fields[i] = strings.TrimSpace(line)
	}
	return m, nil
}

func main() {
	f, err := os.Open("match.txt")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	var lines []string
	snr := bufio.NewScanner(f)
	for snr.Scan() {
		line := snr.Text()
		if _, err = time.Parse(fmtMatchDate, strings.TrimSpace(line)); err == nil {
			if len(lines) > 0 {
				m, err := ParseMatch(lines)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					fmt.Print(m)
				}
			}
			lines = lines[:0]
		}
		lines = append(lines, line)
	}
	if len(lines) > 0 {
		m, err := ParseMatch(lines)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
		} else {
			fmt.Print(m)
		}
	}
	if err := snr.Err(); err != nil {
		if err != io.EOF {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
}

输入:

$ cat match.txt
Sat 08 Aug 2015
Norwich City
A
League
W 3-1
Zaha 38; Delaney 48; Cabaye 90
27,036
Sun 16 Aug 2015
Arsenal
H
League
L 1-2
Sat 29 Aug 2015
Chelsea
A
League
W 2-1
Sako 64; Ward 80
41,581

输出:

$ go run match.go
Sat 08 Aug 2015
Norwich City
A
League
W 3-1
Zaha 38; Delaney 48; Cabaye 90
27,036
Sun 16 Aug 2015
Arsenal
H
League
L 1-2
Sat 29 Aug 2015
Chelsea
A
League
W 2-1
Sako 64; Ward 80
41,581
$
英文:

Perhaps, something like this?

package main
import (
&quot;bufio&quot;
&quot;fmt&quot;
&quot;io&quot;
&quot;os&quot;
&quot;strings&quot;
&quot;time&quot;
)
// Match is one fixture record as it appears in the input file.
type Match struct {
	Date       time.Time // parsed from the record's first line
	Opponents  string
	Venue      string
	Type       string
	Result     string
	Scorers    string // optional; empty when the record has no such line
	Attendance string // optional; empty when the record has no such line
}

// fmtMatchDate is the time layout of a record's date line,
// e.g. "Sat 08 Aug 2015".
var fmtMatchDate = "Mon 02 Jan 2006"

// String renders m in the same line-oriented layout as the input: date,
// opponents, venue, type and result are always written, one per line;
// scorers and attendance are written only when non-empty.
func (m Match) String() string {
	var b strings.Builder
	required := [...]string{
		m.Date.Format(fmtMatchDate),
		m.Opponents,
		m.Venue,
		m.Type,
		m.Result,
	}
	for _, field := range required {
		b.WriteString(field)
		b.WriteByte('\n')
	}
	for _, field := range [...]string{m.Scorers, m.Attendance} {
		if field != "" {
			b.WriteString(field)
			b.WriteByte('\n')
		}
	}
	return b.String()
}

// ParseMatch builds a Match from the lines of a single record. The first
// line must be a date in fmtMatchDate layout; the following lines fill
// Opponents, Venue, Type, Result, Scorers and Attendance in that order.
// Surrounding whitespace is trimmed from every line, missing trailing lines
// leave their fields empty, and lines beyond the seventh are ignored.
//
// It returns an error when lines is empty or the date line does not parse.
func ParseMatch(lines []string) (Match, error) {
	// TODO: Implement a better parser.
	if len(lines) == 0 {
		// Previously this returned a zero Match with a nil error.
		return Match{}, fmt.Errorf("parse match: no lines")
	}
	date, err := time.Parse(fmtMatchDate, strings.TrimSpace(lines[0]))
	if err != nil {
		return Match{}, err
	}

	m := Match{Date: date}
	// Destination fields in the order their lines follow the date line.
	fields := [...]*string{
		&m.Opponents,
		&m.Venue,
		&m.Type,
		&m.Result,
		&m.Scorers,
		&m.Attendance,
	}
	for i, line := range lines[1:] {
		if i >= len(fields) {
			break
		}
		*fields[i] = strings.TrimSpace(line)
	}
	return m, nil
}
func main() {
f, err := os.Open(&quot;match.txt&quot;)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
var lines []string
snr := bufio.NewScanner(f)
for snr.Scan() {
line := snr.Text()
if _, err = time.Parse(fmtMatchDate, strings.TrimSpace(line)); err == nil {
if len(lines) &gt; 0 {
m, err := ParseMatch(lines)
if err != nil {
fmt.Fprintln(os.Stderr, err)
} else {
fmt.Print(m)
}
}
lines = lines[:0]
}
lines = append(lines, line)
}
if len(lines) &gt; 0 {
m, err := ParseMatch(lines)
if err != nil {
fmt.Fprintln(os.Stderr, err)
} else {
fmt.Print(m)
}
}
if err := snr.Err(); err != nil {
if err != io.EOF {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
}

Input:

$ cat match.txt
Sat 08 Aug 2015
Norwich City
A
League
W 3-1
Zaha 38; Delaney 48; Cabaye 90
27,036
Sun 16 Aug 2015
Arsenal
H
League
L 1-2
Sat 29 Aug 2015
Chelsea
A
League
W 2-1
Sako 64; Ward 80
41,581

Output:

$ go run match.go
Sat 08 Aug 2015
Norwich City
A
League
W 3-1
Zaha 38; Delaney 48; Cabaye 90
27,036
Sun 16 Aug 2015
Arsenal
H
League
L 1-2
Sat 29 Aug 2015
Chelsea
A
League
W 2-1
Sako 64; Ward 80
41,581
$

答案3

得分: 0

我不是正则表达式的忠实粉丝,因为当你或其他人在6个月后回头看它时,它往往会使事情变得复杂。我会将文件读入行的切片,并使用偏移量来获取要测试的行。

// main reads the file named by fileName (expected to be declared elsewhere
// in the package), splits it into lines, and walks them looking for a date
// line of the form "Sat 08 Aug 2015". The six lines after each date line are
// taken as that record's name, venue, league, score, scorers and attendance.
func main() {
	var (
		dayName    string
		month      string
		name       string
		A          string
		league     string
		score      string
		scorers    string
		attendance string
		day        int
		year       int
		err        error
	)
	data, errRead := ioutil.ReadFile(fileName)
	if errRead != nil {
		// NOTE(review): the read error is dropped silently; report it once
		// this snippet has a logging or fmt import available.
		return
	}

	// Normalise "\r\n" and "\n" line endings to "\r", then split on "\r".
	theText := string(data)
	theText = strings.Replace(theText, "\r\n", "\r", -1)
	theText = strings.Replace(theText, "\n", "\r", -1)
	lines := strings.Split(theText, "\r")
	numLines := len(lines)

	// Number of data lines that follow each date line.
	const recordLines = 6

	i := 0
	for i < numLines {
		// Candidate date line; i now points at the line after it.
		theLine := lines[i]
		i++

		// Fields splits on any run of whitespace, so tabs, repeated spaces
		// and leading/trailing blanks no longer defeat the four-part check.
		parts := strings.Fields(theLine)
		if len(parts) != 4 {
			continue
		}
		// The second and fourth parts must be numeric (day and year).
		if day, err = strconv.Atoi(parts[1]); err != nil {
			continue
		}
		if year, err = strconv.Atoi(parts[3]); err != nil {
			continue
		}
		// A record cut short by end of input would index past the slice.
		if i+recordLines > numLines {
			break
		}

		// Plain assignment: the previous ":=" declared new, unused locals
		// that shadowed the variables declared at the top of main.
		dayName = parts[0]
		month = parts[2]
		name = lines[i]
		A = lines[i+1]
		league = lines[i+2]
		score = lines[i+3]
		scorers = lines[i+4]
		attendance = lines[i+5]
		i += recordLines

		// The parsed values are not consumed yet; these blank assignments
		// satisfy the compiler's unused-variable check until real handling
		// (printing, storing, ...) is added here.
		_, _, _, _ = dayName, day, month, year
		_, _, _, _, _, _ = name, A, league, score, scorers, attendance
	}
}

对于得分等数据,你需要自己解析,但这将相当简单。你还需要记住,当从他人那里获取数据时,他们的一致性可能不如你所希望的那样。

英文:

I am not a great fan of regex as it tends to complicate things when you, or someone else, goes back to it in 6 months. I would read the file into a slice of lines, and use an offset as the way of getting the lines to test.

func main() {
var (
dayName    string
month      string
name       string
A          string
league     string
score      string
scorers    string
attendance string
day        int
year       int
err        error
)
data, errRead := ioutil.ReadFile(fileName)
if errRead != nil {
return
}
//  get the files as a block of text
theText := string(data)
//  make the line endings consistent
theText = strings.Replace(theText, &quot;\r\n&quot;, &quot;\r&quot;, -1)
theText = strings.Replace(theText, &quot;\n&quot;, &quot;\r&quot;, -1)
//  split it into a set of lines
lines := strings.Split(theText, &quot;\r&quot;)
numLines := len(lines)
i := 0
for i &lt; numLines {
//      at this point we should have your test line
theLine := lines[i]
i++
//      give each line a consistent spacing, you never know what state it is in
theLine = strings.Replace(theLine, &quot;  &quot;, &quot; &quot;, -1)
parts := strings.Split(theLine, &quot; &quot;)
if len(parts) == 4 {
//         At least the line has the four date parts
dayName := parts[0]
day, err = strconv.Atoi(parts[1])
if err == nil {
//             We have a number for the day
month := parts[2]
year, err = strconv.Atoi(parts[3])
if err == nil {
//                 We have a number for the year
//                 the next five lines are your data
name = lines[i]
A = lines[i+1]
league = lines[i+2]
score = lines[i+3]
scorers = lines[i+4]
attendance = lines[i+5]
i += 6
}
}
}
}
}

For the score etc you will have to parse it yourself, but this will be fairly trivial. You also need to remember that when getting data from someone else they may not always be as consistent as you would wish.

huangapple
  • 本文由 发表于 2016年3月27日 17:23:04
  • 转载请务必保留本文链接:https://go.coder-hub.com/36245781.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定