Bad performance with CSV and maps in Go
Question
I need to write a Go script that will open a big CSV file, and create new, separate CSVs based on the value of the first element of each line.
The CSV file looks like this:
"country", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"
So, what I am trying to do is create a file named after the first field's value (e.g. AT.csv), containing all the lines that start with that value.
The following is the script that I have written so far:
package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {
    // contentCreated := make(chan map[string]string)
    createContent("union_exp.csv")
}

func createContent(csvfilename string) {
    keys := ""
    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)
    if err != nil {
        fmt.Println(err)
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {
        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {
            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
            if i%10000 == 0 {
                fmt.Println(i)
            }
            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }
            content[each[0]] += stringtoadd
            createFile(each[0], content[each[0]])
        }
    }
}

func createFile(name, content string) {
    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}
The problem I am having at the moment is that the performance is quite slow. I even have a similar script written in PHP that performs the same operation much faster, which makes me think there must be something wrong with my Go script.
Can someone help me understand what is wrong with it?
Thank you!
Answer 1 (Score: 2)
You are (unnecessarily) loading the complete CSV file at once and rewriting each output file from scratch every time its contents change.
Try the following:
package main

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "sync"
)

func main() {
    input, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println("Error while opening CSV file.")
        return
    }
    defer input.Close()

    reader := csv.NewReader(input)
    reader.FieldsPerRecord = -1

    // One channel (and one writer goroutine) per distinct country code.
    files := make(map[string]chan []string)

    // The header row, replayed as the first line of every output file.
    keys, err := reader.Read()
    if err != nil {
        fmt.Println("Error while reading CSV file.")
        return
    }

    wg := &sync.WaitGroup{}
    var line []string
    for line, err = reader.Read(); err == nil; line, err = reader.Read() {
        ch, ok := files[line[0]]
        if ok {
            ch <- line
        } else {
            ch = make(chan []string, 8)
            wg.Add(1)
            go fileWriter(line[0], ch, wg)
            ch <- keys
            files[line[0]] = ch
        }
    }
    if err != io.EOF {
        fmt.Println("Error while reading CSV file.")
        return
    }

    // Closing the channels lets the writer goroutines drain and exit.
    for _, ch := range files {
        close(ch)
    }
    wg.Wait()
    fmt.Println("Done!")
}

func fileWriter(fileName string, ch chan []string, wg *sync.WaitGroup) {
    defer wg.Done()
    file, err := os.Create("x" + fileName + ".csv")
    if err != nil {
        fmt.Println("Error while creating output file.")
        os.Exit(1) // Kill the whole app
    }
    defer file.Close()
    writer := csv.NewWriter(file)
    defer writer.Flush()
    for line := range ch {
        writer.Write(line)
    }
}

This reads the CSV one line at a time and routes each row to the file matching its first column; each output file is written by its own goroutine.
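
If the goroutine-per-country fan-out feels heavier than needed, the same streaming behaviour can be had on a single goroutine by keeping one *csv.Writer per country in a map. The following is a minimal sketch under the same assumptions as the answer's code (input named union_exp.csv, country code in the first column), not part of the original answer:

package main

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
)

func main() {
    input, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println("Error while opening CSV file.")
        return
    }
    defer input.Close()

    reader := csv.NewReader(input)
    reader.FieldsPerRecord = -1

    // Header row, replayed at the top of every output file.
    keys, err := reader.Read()
    if err != nil {
        fmt.Println("Error while reading CSV file.")
        return
    }

    // One open csv.Writer per country code. The set of countries is small,
    // so deferring each file's Flush/Close until main returns is acceptable.
    writers := make(map[string]*csv.Writer)
    for {
        line, err := reader.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            fmt.Println("Error while reading CSV file.")
            return
        }
        w, ok := writers[line[0]]
        if !ok {
            // First time this country appears: create its file, write the header.
            f, err := os.Create(line[0] + ".csv")
            if err != nil {
                fmt.Println("Error while creating output file.")
                return
            }
            defer f.Close()
            w = csv.NewWriter(f)
            defer w.Flush()
            w.Write(keys)
            writers[line[0]] = w
        }
        w.Write(line)
    }
}

Because csv.Writer buffers its output, each row costs an in-memory append rather than a file rewrite, which is the property that matters for the speedup.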
Answer 2 (Score: 1)
I second @plusmid's answer: the vast majority of the time, your program is busy opening, (over)writing, and closing files.
So, first of all, fix this bug and write the content only once per key:
package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {
    // contentCreated := make(chan map[string]string)
    createContent("union_exp.csv")
}

func createContent(csvfilename string) {
    keys := ""
    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)
    if err != nil {
        fmt.Println(err)
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {
        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {
            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
            if i%10000 == 0 {
                fmt.Println(i)
            }
            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }
            content[each[0]] += stringtoadd
        }
    }

    // The fix: each file is now created and written exactly once, after all
    // rows have been grouped, instead of once per input row.
    for key, content := range content {
        createFile(key, content)
    }
}

func createFile(name, content string) {
    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}
On a 25k-line CSV, this change alone takes my run from 50 seconds down to 5.
Next, think about using goroutines to process the file in parallel; right now you are using only a single core.
There are further issues too, such as building each per-key string with repeated + concatenation, which copies the whole accumulated string on every append; an in-memory buffer avoids that (see the sketch below). You have a lot of room to optimize this code.
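
As a hedged illustration (not from the original answer), here is what the accumulation loop could look like with one bytes.Buffer per country, so each row is an amortized append rather than a full re-copy. The sample rows are made up to keep the sketch self-contained:

package main

import (
    "bytes"
    "fmt"
)

// writeQuoted renders one record in the same hand-quoted style the original
// code uses, but appends it to a buffer instead of building a string.
func writeQuoted(buf *bytes.Buffer, rec []string) {
    for i, field := range rec {
        if i > 0 {
            buf.WriteByte(',')
        }
        buf.WriteString("\"" + field + "\"")
    }
    buf.WriteByte('\n')
}

func main() {
    // Hypothetical rows standing in for rawCSVdata from the answer's code.
    rawCSVdata := [][]string{
        {"country", "otherfield", "otherfield1"},
        {"AT", "a", "b"},
        {"DE", "c", "d"},
        {"AT", "e", "f"},
    }

    var header bytes.Buffer
    writeQuoted(&header, rawCSVdata[0])

    // One buffer per country. Appending to a bytes.Buffer amortizes
    // allocations; string += copies the whole accumulated value each time.
    buffers := make(map[string]*bytes.Buffer)
    for _, each := range rawCSVdata[1:] {
        buf, ok := buffers[each[0]]
        if !ok {
            buf = &bytes.Buffer{}
            buf.Write(header.Bytes())
            buffers[each[0]] = buf
        }
        writeQuoted(buf, each)
    }

    // Each buffer would then be written out once, e.g. via createFile above.
    for key, buf := range buffers {
        fmt.Printf("%s.csv:\n%s", key, buf.String())
    }
}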