
huangapple go评论108阅读模式

Efficient read and write CSV in Go




  1. package main
  2. import (
  3. "encoding/csv"
  4. "log"
  5. "os"
  6. "strconv"
  7. )
  // ReadCSV opens the CSV file at filepath, parses it completely and returns
  // every record as a slice of string fields.
  //
  // It returns a non-nil error when the file cannot be opened or when the CSV
  // data cannot be parsed; in both cases the returned records are nil.
  func ReadCSV(filepath string) ([][]string, error) {
  	csvfile, err := os.Open(filepath)
  	if err != nil {
  		return nil, err
  	}
  	defer csvfile.Close()

  	fields, err := csv.NewReader(csvfile).ReadAll()
  	if err != nil {
  		// Report parse failures to the caller instead of returning a nil
  		// error alongside missing records.
  		return nil, err
  	}
  	return fields, nil
  }
  18. func main() {
  19. // 加载数据CSV
  20. records, err := ReadCSV("./path/to/datafile.csv")
  21. if err != nil {
  22. log.Fatal(err)
  23. }
  24. // 将结果写入新的CSV文件
  25. outfile, err := os.Create("./where/to/write/resultsfile.csv")
  26. if err != nil {
  27. log.Fatal("无法打开输出文件")
  28. }
  29. defer outfile.Close()
  30. writer := csv.NewWriter(outfile)
  31. for i, record := range records {
  32. time := record[0]
  33. value := record[1]
  34. // 跳过标题行
  35. if i == 0 {
  36. writer.Write([]string{time, value, "score"})
  37. continue
  38. }
  39. // 获取浮点数值
  40. floatValue, err := strconv.ParseFloat(value, 64)
  41. if err != nil {
  42. log.Fatal("记录: %v, 错误: %v", floatValue, err)
  43. }
  44. // 计算分数;无法更改此外部方法
  45. score := calculateStuff(floatValue)
  46. valueString := strconv.FormatFloat(floatValue, 'f', 8, 64)
  47. scoreString := strconv.FormatFloat(prob, 'f', 8, 64)
  48. //fmt.Printf("结果: %v\n", []string{time, valueString, scoreString})
  49. writer.Write([]string{time, valueString, scoreString})
  50. }
  51. writer.Flush()
  52. }



The Go code below reads in a 10,000-record CSV (of timestamps and float values), runs some operations on the data, and then writes the original values to another CSV along with an additional column for the score. However, it is terribly slow (i.e. hours, though most of that is calculateStuff()), and I'm curious whether there are any inefficiencies in the CSV reading/writing that I can take care of.

  1. package main
  2. import (
  3. "encoding/csv"
  4. "log"
  5. "os"
  6. "strconv"
  7. )
  // ReadCSV opens the CSV file at filepath, parses it completely and returns
  // every record as a slice of string fields.
  //
  // It returns a non-nil error when the file cannot be opened or when the CSV
  // data cannot be parsed; in both cases the returned records are nil.
  func ReadCSV(filepath string) ([][]string, error) {
  	csvfile, err := os.Open(filepath)
  	if err != nil {
  		return nil, err
  	}
  	defer csvfile.Close()

  	fields, err := csv.NewReader(csvfile).ReadAll()
  	if err != nil {
  		// Report parse failures to the caller instead of returning a nil
  		// error alongside missing records.
  		return nil, err
  	}
  	return fields, nil
  }
  18. func main() {
  19. // load data csv
  20. records, err := ReadCSV("./path/to/datafile.csv")
  21. if err != nil {
  22. log.Fatal(err)
  23. }
  24. // write results to a new csv
  25. outfile, err := os.Create("./where/to/write/resultsfile.csv"))
  26. if err != nil {
  27. log.Fatal("Unable to open output")
  28. }
  29. defer outfile.Close()
  30. writer := csv.NewWriter(outfile)
  31. for i, record := range records {
  32. time := record[0]
  33. value := record[1]
  34. // skip header row
  35. if i == 0 {
  36. writer.Write([]string{time, value, "score"})
  37. continue
  38. }
  39. // get float values
  40. floatValue, err := strconv.ParseFloat(value, 64)
  41. if err != nil {
  42. log.Fatal("Record: %v, Error: %v", floatValue, err)
  43. }
  45. score := calculateStuff(floatValue)
  46. valueString := strconv.FormatFloat(floatValue, 'f', 8, 64)
  47. scoreString := strconv.FormatFloat(prob, 'f', 8, 64)
  48. //fmt.Printf("Result: %v\n", []string{time, valueString, scoreString})
  49. writer.Write([]string{time, valueString, scoreString})
  50. }
  51. writer.Flush()
  52. }

I'm looking for help making this CSV read/write template code as fast as possible. For the scope of this question we need not worry about the calculateStuff method.


得分: 22



  // processCSV streams the CSV data read from rc over the returned channel,
  // one record per message, so the caller can process rows while the rest of
  // the input is still being read. The first row is treated as a header and is
  // not sent.
  //
  // The channel is buffered (10 records) and is closed once the input is
  // exhausted. Empty input produces a channel that is closed without any
  // records. Any other read error terminates the program via log.Fatal.
  func processCSV(rc io.Reader) (ch chan []string) {
  	ch = make(chan []string, 10)
  	go func() {
  		// Register the close first so that every return path releases a
  		// caller ranging over the channel.
  		defer close(ch)

  		r := csv.NewReader(rc)
  		// Read and discard the header row.
  		if _, err := r.Read(); err != nil {
  			if err == io.EOF {
  				// Empty input: there is nothing to send.
  				return
  			}
  			log.Fatal(err)
  		}
  		for {
  			rec, err := r.Read()
  			if err == io.EOF {
  				return
  			}
  			if err != nil {
  				log.Fatal(err)
  			}
  			ch <- rec
  		}
  	}()
  	return ch
  }




You're loading the whole file into memory first and then processing it, which can be slow with a big file.

You need to loop and call .Read and process one line at a time.

  // processCSV streams the CSV data read from rc over the returned channel,
  // one record per message, so the caller can process rows while the rest of
  // the input is still being read. The first row is treated as a header and is
  // not sent.
  //
  // The channel is buffered (10 records) and is closed once the input is
  // exhausted. Empty input produces a channel that is closed without any
  // records. Any other read error terminates the program via log.Fatal.
  func processCSV(rc io.Reader) (ch chan []string) {
  	ch = make(chan []string, 10)
  	go func() {
  		// Register the close first so that every return path releases a
  		// caller ranging over the channel.
  		defer close(ch)

  		r := csv.NewReader(rc)
  		// Read and discard the header row.
  		if _, err := r.Read(); err != nil {
  			if err == io.EOF {
  				// Empty input: there is nothing to send.
  				return
  			}
  			log.Fatal(err)
  		}
  		for {
  			rec, err := r.Read()
  			if err == io.EOF {
  				return
  			}
  			if err != nil {
  				log.Fatal(err)
  			}
  			ch <- rec
  		}
  	}()
  	return ch
  }


//note it's roughly based on DaveC's comment.


得分: 7

这基本上是来自评论部分的Dave C的答案:

  1. package main
  2. import (
  3. "encoding/csv"
  4. "log"
  5. "os"
  6. "strconv"
  7. )
  8. func main() {
  9. // 设置读取器
  10. csvIn, err := os.Open("./path/to/datafile.csv")
  11. if err != nil {
  12. log.Fatal(err)
  13. }
  14. r := csv.NewReader(csvIn)
  15. // 设置写入器
  16. csvOut, err := os.Create("./where/to/write/resultsfile.csv")
  17. if err != nil {
  18. log.Fatal("无法打开输出文件")
  19. }
  20. w := csv.NewWriter(csvOut)
  21. defer csvOut.Close()
  22. // 处理标题
  23. rec, err := r.Read()
  24. if err != nil {
  25. log.Fatal(err)
  26. }
  27. rec = append(rec, "score")
  28. if err = w.Write(rec); err != nil {
  29. log.Fatal(err)
  30. }
  31. for {
  32. rec, err = r.Read()
  33. if err != nil {
  34. if err == io.EOF {
  35. break
  36. }
  37. log.Fatal(err)
  38. }
  39. // 获取浮点数值
  40. value := rec[1]
  41. floatValue, err := strconv.ParseFloat(value, 64)
  42. if err != nil {
  43. log.Fatal("记录错误: %v, %v", value, err)
  44. }
  45. // 计算分数;此外部方法不能更改
  46. score := calculateStuff(floatValue)
  47. scoreString := strconv.FormatFloat(score, 'f', 8, 64)
  48. rec = append(rec, scoreString)
  49. if err = w.Write(rec); err != nil {
  50. log.Fatal(err)
  51. }
  52. w.Flush()
  53. }
  54. }



This is essentially Dave C's answer from the comments section:

  1. package main
  2. import (
  3. "encoding/csv"
  4. "log"
  5. "os"
  6. "strconv"
  7. )
  8. func main() {
  9. // setup reader
  10. csvIn, err := os.Open(&quot;./path/to/datafile.csv&quot;)
  11. if err != nil {
  12. log.Fatal(err)
  13. }
  14. r := csv.NewReader(csvIn)
  15. // setup writer
  16. csvOut, err := os.Create(&quot;./where/to/write/resultsfile.csv&quot;))
  17. if err != nil {
  18. log.Fatal(&quot;Unable to open output&quot;)
  19. }
  20. w := csv.NewWriter(csvOut)
  21. defer csvOut.Close()
  22. // handle header
  23. rec, err := r.Read()
  24. if err != nil {
  25. log.Fatal(err)
  26. }
  27. rec = append(rec, &quot;score&quot;)
  28. if err = w.Write(rec); err != nil {
  29. log.Fatal(err)
  30. }
  31. for {
  32. rec, err = r.Read()
  33. if err != nil {
  34. if err == io.EOF {
  35. break
  36. }
  37. log.Fatal(err)
  38. }
  39. // get float value
  40. value := rec[1]
  41. floatValue, err := strconv.ParseFloat(value, 64)
  42. if err != nil {
  43. log.Fatal(&quot;Record, error: %v, %v&quot;, value, err)
  44. }
  46. score := calculateStuff(floatValue)
  47. scoreString := strconv.FormatFloat(score, &#39;f&#39;, 8, 64)
  48. rec = append(rec, scoreString)
  49. if err = w.Write(rec); err != nil {
  50. log.Fatal(err)
  51. }
  52. w.Flush()
  53. }
  54. }

Note that, of course, the logic is all jammed into main(); it would be better to split it into several functions, but that's beyond the scope of this question.


得分: 1





encoding/csv is indeed very slow on big files, as it performs a lot of allocations. Since your format is so simple, I recommend using strings.Split instead, which is much faster.

If even that is not fast enough you can consider implementing the parsing yourself using strings.IndexByte which is implemented in assembly: http://golang.org/src/strings/strings_decl.go?s=274:310#L1

Having said that, you should also reconsider using ReadAll if the file is larger than your memory.

  • 本文由 发表于 2015年8月16日 01:53:53
  • 转载请务必保留本文链接:https://go.coder-hub.com/32027590.html



:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:
