英文:
Most efficient way to check if a byte slice is a number
问题
我正在寻找判断字节切片是否为浮点数的最高效方法。
由于要处理大型数据集,因此性能至关重要。
尝试过的方法有:
- strconv.ParseFloat
- regexp.Match
- CheckNumber - 使用 IsNumber 函数并检查字节切片是否包含 "." 的自定义函数。
// CheckNumber reports whether p is a plain decimal number: one or more ASCII
// digits with at most one '.' anywhere among them, so "15.3", ".5" and "5."
// all qualify. Signs, exponents, whitespace and non-ASCII digits are rejected,
// as are the empty slice and a lone ".".
func CheckNumber(p []byte) bool {
    sawDigit, sawPoint := false, false
    for _, c := range p {
        switch {
        // unicode.IsNumber would also accept runes such as '½' or '٣' that
        // strconv.ParseFloat cannot parse, so only ASCII digits count.
        case c <= unicode.MaxASCII && unicode.IsDigit(rune(c)):
            sawDigit = true
        case c == '.' && !sawPoint:
            sawPoint = true
        default:
            return false
        }
    }
    // Requiring a digit keeps "" and "." from being reported as numbers.
    return sawDigit
}
基准测试代码:
// BenchmarkFloatStrconv measures validating a fixed decimal input by parsing
// it with strconv.ParseFloat; the []byte-to-string conversion is part of the
// measured work. The process is aborted if parsing fails.
func BenchmarkFloatStrconv(b *testing.B) {
    input := []byte("15.34234234234")
    for n := b.N; n > 0; n-- {
        if _, err := strconv.ParseFloat(string(input), 64); err != nil {
            log.Fatalf("NaN")
        }
    }
}
// BenchmarkFloatRegex measures validating a fixed decimal input with a
// precompiled regular expression. The process is aborted if the input is
// not matched.
func BenchmarkFloatRegex(b *testing.B) {
    p := []byte("15.34234234234")
    // The pattern is anchored so the whole input must be a number; without
    // ^...$ it matches any input that merely contains a digit. The trailing
    // [0-9]+ requires at least one digit. MustCompile surfaces a bad pattern
    // instead of leaving a nil *Regexp behind.
    c := regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+$`)
    b.ResetTimer() // keep pattern compilation out of the measurement
    for i := 0; i < b.N; i++ {
        if !c.Match(p) {
            log.Fatalf("NaN")
        }
    }
}
// BenchmarkCheckNumber measures CheckNumber on a fixed decimal input and
// aborts the process if the input is rejected.
func BenchmarkCheckNumber(b *testing.B) {
    input := []byte("15.34234234234")
    for n := 0; n < b.N; n++ {
        if CheckNumber(input) {
            continue
        }
        log.Fatalf("NaN")
    }
}
基准测试结果:
BenchmarkFloatStrconv-8   	20000000	        85.8 ns/op	      16 B/op	       1 allocs/op
BenchmarkFloatRegex-8     	 5000000	       252 ns/op	       0 B/op	       0 allocs/op
BenchmarkCheckNumber-8    	20000000	        64.3 ns/op	       0 B/op	       0 allocs/op
- 我对这些不同的解决方案进行了公平的比较吗?
 - 是否有更好的解决方案?
 
编辑: 在Adrian和icza的指导下,这个新方法避免了转换为strings/runes的过程。
// CheckNumberNoStringConvert reports whether r is a plain decimal number:
// one or more ASCII digits with at most one '.' anywhere among them. It works
// directly on the bytes, without converting to a string or runes. Signs and
// exponents are rejected, as are the empty slice and a lone ".".
func CheckNumberNoStringConvert(r []byte) bool {
    sawDigit, sawPoint := false, false
    for _, c := range r {
        switch {
        case '0' <= c && c <= '9':
            sawDigit = true
        case c == '.' && !sawPoint:
            sawPoint = true
        default:
            return false
        }
    }
    // Requiring a digit keeps "" and "." from being reported as numbers.
    return sawDigit
}
并且性能表现非常好:
BenchmarkCheckNumberNoStringConvert-8   	200000000	         8.55 ns/op	       0 B/op	       0 allocs/op
英文:
I'm looking for the most efficient way to tell whether a byte slice is a float.
This is to be done on huge datasets, so performance is key.
Tried approaches:
- strconv.ParseFloat
- regexp.Match
- CheckNumber - home-rolled function using IsNumber + looking at whether the byte slice contains a "."
func CheckNumber(p []byte) bool {
	r := string(p)
	sep := 0
	for _, b := range r {
		if unicode.IsNumber(b) {
			continue
		}
		if b == rune('.') {
			if sep > 0 {
				return false
			}
			sep++
			continue
		}
		return false
	}
	return true
}
The benchmark code:
// BenchmarkFloatStrconv measures validating a fixed decimal input by parsing
// it with strconv.ParseFloat; the []byte-to-string conversion is part of the
// measured work. The process is aborted if parsing fails.
func BenchmarkFloatStrconv(b *testing.B) {
	input := []byte("15.34234234234")
	for n := b.N; n > 0; n-- {
		if _, err := strconv.ParseFloat(string(input), 64); err != nil {
			log.Fatalf("NaN")
		}
	}
}
// BenchmarkFloatRegex measures validating a fixed decimal input with a
// precompiled regular expression. The process is aborted if the input is
// not matched.
func BenchmarkFloatRegex(b *testing.B) {
	p := []byte("15.34234234234")
	// The pattern is anchored so the whole input must be a number; without
	// ^...$ it matches any input that merely contains a digit. The trailing
	// [0-9]+ requires at least one digit. MustCompile surfaces a bad pattern
	// instead of leaving a nil *Regexp behind.
	c := regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+$`)
	b.ResetTimer() // keep pattern compilation out of the measurement
	for i := 0; i < b.N; i++ {
		if !c.Match(p) {
			log.Fatalf("NaN")
		}
	}
}
// BenchmarkCheckNumber measures CheckNumber on a fixed decimal input and
// aborts the process if the input is rejected.
func BenchmarkCheckNumber(b *testing.B) {
	input := []byte("15.34234234234")
	for n := 0; n < b.N; n++ {
		if CheckNumber(input) {
			continue
		}
		log.Fatalf("NaN")
	}
}
Benchmark results:
BenchmarkFloatStrconv-8   	20000000	        85.8 ns/op	      16 B/op	       1 allocs/op
BenchmarkFloatRegex-8     	 5000000	       252 ns/op	       0 B/op	       0 allocs/op
BenchmarkCheckNumber-8    	20000000	        64.3 ns/op	       0 B/op	       0 allocs/op
- Am I giving the different solutions a fair comparison?
 - Are there better solutions?
 
Edit: thanks to pointers from Adrian and icza, this avoids converting to strings/runes
// CheckNumberNoStringConvert reports whether r is a plain decimal number:
// one or more ASCII digits with at most one '.' anywhere among them. It works
// directly on the bytes, without converting to a string or runes. Signs and
// exponents are rejected, as are the empty slice and a lone ".".
func CheckNumberNoStringConvert(r []byte) bool {
	sawDigit, sawPoint := false, false
	for _, c := range r {
		switch {
		case '0' <= c && c <= '9':
			sawDigit = true
		case c == '.' && !sawPoint:
			sawPoint = true
		default:
			return false
		}
	}
	// Requiring a digit keeps "" and "." from being reported as numbers.
	return sawDigit
}
and performs quite well:
BenchmarkCheckNumberNoStringConvert-8   	200000000	         8.55 ns/op	       0 B/op	       0 allocs/op
答案1
得分: 3
对于一个简单的实数(浮点数)(没有科学或工程浮点格式,没有分组分隔符),可以使用以下代码进行判断:
// IsReal reports whether n holds a plain decimal number: an optional leading
// '-', followed by ASCII digits containing at most one '.'. At least one
// digit is required; exponents, a leading '+' and separators are rejected.
func IsReal(n []byte) bool {
    body := n
    if len(body) != 0 && body[0] == '-' {
        body = body[1:]
    }
    switch len(body) {
    case 0:
        return false
    case 1:
        // A single byte must be a digit; a lone '.' does not count.
        return body[0] >= '0' && body[0] <= '9'
    }
    seenPoint := false
    for i := 0; i < len(body); i++ {
        c := body[i]
        if c >= '0' && c <= '9' {
            continue
        }
        if c != '.' || seenPoint {
            return false
        }
        seenPoint = true
    }
    return true
}
基准测试结果如下:
$ go test -run=! -bench=. -benchmem -cpu=1 real_test.go
goos: linux
goarch: amd64
BenchmarkIsReal       	100000000	    20.8 ns/op	       0 B/op	       0 allocs/op
BenchmarkFloatStrconv 	20000000	   101 ns/op	      16 B/op	       1 allocs/op
BenchmarkFloatRegex   	 5000000	   284 ns/op	       0 B/op	       0 allocs/op
BenchmarkCheckNumber  	20000000	    73.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	command-line-arguments	7.380s
real_test.go 文件内容如下:
package main
import (
    "log"
    "regexp"
    "strconv"
    "testing"
    "unicode"
)
// IsReal reports whether n holds a plain decimal number: an optional leading
// '-', followed by ASCII digits containing at most one '.'. At least one
// digit is required; exponents, a leading '+' and separators are rejected.
func IsReal(n []byte) bool {
    body := n
    if len(body) != 0 && body[0] == '-' {
        body = body[1:]
    }
    switch len(body) {
    case 0:
        return false
    case 1:
        // A single byte must be a digit; a lone '.' does not count.
        return body[0] >= '0' && body[0] <= '9'
    }
    seenPoint := false
    for i := 0; i < len(body); i++ {
        c := body[i]
        if c >= '0' && c <= '9' {
            continue
        }
        if c != '.' || seenPoint {
            return false
        }
        seenPoint = true
    }
    return true
}
// BenchmarkIsReal measures IsReal on a fixed decimal input and aborts the
// process if the input is rejected.
func BenchmarkIsReal(b *testing.B) {
    input := []byte("15.34234234234")
    for n := 0; n < b.N; n++ {
        if IsReal(input) {
            continue
        }
        log.Fatalf("NaN")
    }
}
// CheckNumber reports whether p is a plain decimal number: one or more ASCII
// digits with at most one '.' anywhere among them, so "15.3", ".5" and "5."
// all qualify. Signs, exponents, whitespace and non-ASCII digits are rejected,
// as are the empty slice and a lone ".".
func CheckNumber(p []byte) bool {
    sawDigit, sawPoint := false, false
    for _, c := range p {
        switch {
        // unicode.IsNumber would also accept runes such as '½' or '٣' that
        // strconv.ParseFloat cannot parse, so only ASCII digits count.
        case c <= unicode.MaxASCII && unicode.IsDigit(rune(c)):
            sawDigit = true
        case c == '.' && !sawPoint:
            sawPoint = true
        default:
            return false
        }
    }
    // Requiring a digit keeps "" and "." from being reported as numbers.
    return sawDigit
}
// BenchmarkFloatStrconv measures validating a fixed decimal input by parsing
// it with strconv.ParseFloat; the []byte-to-string conversion is part of the
// measured work. The process is aborted if parsing fails.
func BenchmarkFloatStrconv(b *testing.B) {
    input := []byte("15.34234234234")
    for n := b.N; n > 0; n-- {
        if _, err := strconv.ParseFloat(string(input), 64); err != nil {
            log.Fatalf("NaN")
        }
    }
}
// BenchmarkFloatRegex measures validating a fixed decimal input with a
// precompiled regular expression. The process is aborted if the input is
// not matched.
func BenchmarkFloatRegex(b *testing.B) {
    p := []byte("15.34234234234")
    // The pattern is anchored so the whole input must be a number; without
    // ^...$ it matches any input that merely contains a digit. The trailing
    // [0-9]+ requires at least one digit. MustCompile surfaces a bad pattern
    // instead of leaving a nil *Regexp behind.
    c := regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+$`)
    b.ResetTimer() // keep pattern compilation out of the measurement
    for i := 0; i < b.N; i++ {
        if !c.Match(p) {
            log.Fatalf("NaN")
        }
    }
}
// BenchmarkCheckNumber measures CheckNumber on a fixed decimal input and
// aborts the process if the input is rejected.
func BenchmarkCheckNumber(b *testing.B) {
    input := []byte("15.34234234234")
    for n := 0; n < b.N; n++ {
        if CheckNumber(input) {
            continue
        }
        log.Fatalf("NaN")
    }
}
希望对你有帮助!
英文:
For a simple real (floating-point) number (no scientific or engineering floating-point format, no group separators),
// IsReal reports whether n holds a plain decimal number: an optional leading
// '-', followed by ASCII digits containing at most one '.'. At least one
// digit is required; exponents, a leading '+' and separators are rejected.
func IsReal(n []byte) bool {
	body := n
	if len(body) != 0 && body[0] == '-' {
		body = body[1:]
	}
	switch len(body) {
	case 0:
		return false
	case 1:
		// A single byte must be a digit; a lone '.' does not count.
		return body[0] >= '0' && body[0] <= '9'
	}
	seenPoint := false
	for i := 0; i < len(body); i++ {
		c := body[i]
		if c >= '0' && c <= '9' {
			continue
		}
		if c != '.' || seenPoint {
			return false
		}
		seenPoint = true
	}
	return true
}
Benchmark:
$ go test -run=! -bench=. -benchmem -cpu=1 real_test.go
goos: linux
goarch: amd64
BenchmarkIsReal       	100000000	    20.8 ns/op	       0 B/op	       0 allocs/op
BenchmarkFloatStrconv 	20000000	   101 ns/op	      16 B/op	       1 allocs/op
BenchmarkFloatRegex   	 5000000	   284 ns/op	       0 B/op	       0 allocs/op
BenchmarkCheckNumber  	20000000	    73.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	command-line-arguments	7.380s
real_test.go:
package main
import (
"log"
"regexp"
"strconv"
"testing"
"unicode"
)
// IsReal reports whether n holds a plain decimal number: an optional leading
// '-', followed by ASCII digits containing at most one '.'. At least one
// digit is required; exponents, a leading '+' and separators are rejected.
func IsReal(n []byte) bool {
	body := n
	if len(body) != 0 && body[0] == '-' {
		body = body[1:]
	}
	switch len(body) {
	case 0:
		return false
	case 1:
		// A single byte must be a digit; a lone '.' does not count.
		return body[0] >= '0' && body[0] <= '9'
	}
	seenPoint := false
	for i := 0; i < len(body); i++ {
		c := body[i]
		if c >= '0' && c <= '9' {
			continue
		}
		if c != '.' || seenPoint {
			return false
		}
		seenPoint = true
	}
	return true
}
// BenchmarkIsReal measures IsReal on a fixed decimal input and aborts the
// process if the input is rejected.
func BenchmarkIsReal(b *testing.B) {
	input := []byte("15.34234234234")
	for n := 0; n < b.N; n++ {
		if IsReal(input) {
			continue
		}
		log.Fatalf("NaN")
	}
}
// CheckNumber reports whether p is a plain decimal number: one or more ASCII
// digits with at most one '.' anywhere among them, so "15.3", ".5" and "5."
// all qualify. Signs, exponents, whitespace and non-ASCII digits are rejected,
// as are the empty slice and a lone ".".
func CheckNumber(p []byte) bool {
	sawDigit, sawPoint := false, false
	for _, c := range p {
		switch {
		// unicode.IsNumber would also accept runes such as '½' or '٣' that
		// strconv.ParseFloat cannot parse, so only ASCII digits count.
		case c <= unicode.MaxASCII && unicode.IsDigit(rune(c)):
			sawDigit = true
		case c == '.' && !sawPoint:
			sawPoint = true
		default:
			return false
		}
	}
	// Requiring a digit keeps "" and "." from being reported as numbers.
	return sawDigit
}
// BenchmarkFloatStrconv measures validating a fixed decimal input by parsing
// it with strconv.ParseFloat; the []byte-to-string conversion is part of the
// measured work. The process is aborted if parsing fails.
func BenchmarkFloatStrconv(b *testing.B) {
	input := []byte("15.34234234234")
	for n := b.N; n > 0; n-- {
		if _, err := strconv.ParseFloat(string(input), 64); err != nil {
			log.Fatalf("NaN")
		}
	}
}
// BenchmarkFloatRegex measures validating a fixed decimal input with a
// precompiled regular expression. The process is aborted if the input is
// not matched.
func BenchmarkFloatRegex(b *testing.B) {
	p := []byte("15.34234234234")
	// The pattern is anchored so the whole input must be a number; without
	// ^...$ it matches any input that merely contains a digit. The trailing
	// [0-9]+ requires at least one digit. MustCompile surfaces a bad pattern
	// instead of leaving a nil *Regexp behind.
	c := regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+$`)
	b.ResetTimer() // keep pattern compilation out of the measurement
	for i := 0; i < b.N; i++ {
		if !c.Match(p) {
			log.Fatalf("NaN")
		}
	}
}
// BenchmarkCheckNumber measures CheckNumber on a fixed decimal input and
// aborts the process if the input is rejected.
func BenchmarkCheckNumber(b *testing.B) {
	input := []byte("15.34234234234")
	for n := 0; n < b.N; n++ {
		if CheckNumber(input) {
			continue
		}
		log.Fatalf("NaN")
	}
}
答案2
得分: 1
我把它当作对自己的一个挑战,尝试综合这里每个人的意见,将其重写为某种状态机:
// Validate reports whether b is a decimal floating-point literal of the form
//
//    [sign] mantissa [exponent]
//
// where sign is '-' or '+', mantissa is ASCII digits with at most one '.' and
// at least one digit on either side of it ("123", "123.", ".123"), and
// exponent is 'e' or 'E', an optional sign, and one or more digits.
// The empty slice, a bare sign and a bare "." are rejected.
func Validate(b []byte) bool {
    // start is the index of the first mantissa byte, after any sign.
    start := 0
    if len(b) > 0 && (b[0] == '-' || b[0] == '+') {
        start = 1
    }
    for i := start; i < len(b); i++ {
        c := b[i]
        switch {
        case c >= '0' && c <= '9':
            continue
        case c == '.':
            rest := b[i+1:]
            // With no integer digits, the fraction must supply one; this
            // rejects ".", "-." and ".e5".
            if i == start && (len(rest) == 0 || rest[0] < '0' || rest[0] > '9') {
                return false
            }
            return fractional(rest)
        case c == 'e' || c == 'E':
            // An exponent needs mantissa digits in front of it.
            if i == start {
                return false
            }
            return scientific(b[i+1:])
        default:
            return false
        }
    }
    // Only digits were seen; make sure there was at least one.
    return len(b) > start
}

// fractional validates what follows the decimal point: zero or more digits,
// optionally followed by an exponent. The caller guarantees that the
// mantissa as a whole contains a digit.
func fractional(b []byte) bool {
    for i, c := range b {
        switch {
        case c >= '0' && c <= '9':
        case c == 'e' || c == 'E':
            return scientific(b[i+1:])
        default:
            return false
        }
    }
    return true
}

// scientific validates what follows the exponent marker: an optional sign
// and then one or more digits.
func scientific(b []byte) bool {
    if len(b) > 0 && (b[0] == '-' || b[0] == '+') {
        b = b[1:]
    }
    if len(b) == 0 {
        return false
    }
    for _, c := range b {
        if c < '0' || c > '9' {
            return false
        }
    }
    return true
}
它似乎适用于几种不同的数字格式:
// v is one table-driven test case: an input and the result expected for it.
type v struct {
    Input    []byte // bytes handed to the function under test
    Expected bool   // result the test expects for Input
}
// TestPermutations runs Validate over a table of well-formed and malformed
// inputs and reports every case whose result differs from Expected.
func TestPermutations(t *testing.T) {
    cases := []v{
        {[]byte("123.456"), true},
        {[]byte("123"), true},
        {[]byte("123."), true},
        {[]byte(".123"), true},
        {[]byte("12.1e12"), true},
        {[]byte("12.1e-12"), true},
        {[]byte("-123.456"), true},
        {[]byte("-123"), true},
        {[]byte("-123."), true},
        {[]byte("-.123"), true},
        {[]byte("-12.1e12"), true},
        {[]byte("-12.1e-12"), true},
        {[]byte(".1e-12"), true},
        {[]byte(".e-12"), false},
        {[]byte(".e"), false},
        {[]byte("e"), false},
        {[]byte("abcdef"), false},
        {[]byte("-"), false},
        {[]byte("."), false},
    }
    for i := range cases {
        tc := cases[i]
        if Validate(tc.Input) == tc.Expected {
            continue
        }
        t.Errorf("无法处理案例 %s", tc.Input)
    }
}
并且在原始基准测试中表现良好:
BenchmarkValidate-8    100000000    13.0 ns/op    0 B/op    0 allocs/op
基准测试代码:
// BenchmarkValidate measures Validate on a fixed decimal input and aborts the
// process if the input is rejected.
func BenchmarkValidate(b *testing.B) {
    input := []byte("15.1234567890")
    for n := 0; n < b.N; n++ {
        if Validate(input) {
            continue
        }
        log.Fatalf("问题")
    }
}
英文:
I took it upon myself, as a challenge, to rewrite this as a kind of state machine, synthesizing the collective input from everyone here:
// Validate reports whether b is a decimal floating-point literal of the form
//
//	[sign] mantissa [exponent]
//
// where sign is '-' or '+', mantissa is ASCII digits with at most one '.' and
// at least one digit on either side of it ("123", "123.", ".123"), and
// exponent is 'e' or 'E', an optional sign, and one or more digits.
// The empty slice, a bare sign and a bare "." are rejected.
func Validate(b []byte) bool {
	// start is the index of the first mantissa byte, after any sign.
	start := 0
	if len(b) > 0 && (b[0] == '-' || b[0] == '+') {
		start = 1
	}
	for i := start; i < len(b); i++ {
		c := b[i]
		switch {
		case c >= '0' && c <= '9':
			continue
		case c == '.':
			rest := b[i+1:]
			// With no integer digits, the fraction must supply one; this
			// rejects ".", "-." and ".e5".
			if i == start && (len(rest) == 0 || rest[0] < '0' || rest[0] > '9') {
				return false
			}
			return fractional(rest)
		case c == 'e' || c == 'E':
			// An exponent needs mantissa digits in front of it.
			if i == start {
				return false
			}
			return scientific(b[i+1:])
		default:
			return false
		}
	}
	// Only digits were seen; make sure there was at least one.
	return len(b) > start
}

// fractional validates what follows the decimal point: zero or more digits,
// optionally followed by an exponent. The caller guarantees that the
// mantissa as a whole contains a digit.
func fractional(b []byte) bool {
	for i, c := range b {
		switch {
		case c >= '0' && c <= '9':
		case c == 'e' || c == 'E':
			return scientific(b[i+1:])
		default:
			return false
		}
	}
	return true
}

// scientific validates what follows the exponent marker: an optional sign
// and then one or more digits.
func scientific(b []byte) bool {
	if len(b) > 0 && (b[0] == '-' || b[0] == '+') {
		b = b[1:]
	}
	if len(b) == 0 {
		return false
	}
	for _, c := range b {
		if c < '0' || c > '9' {
			return false
		}
	}
	return true
}
It seems to work on a few different number formats:
// v is one table-driven test case: an input and the result expected for it.
type v struct {
Input    []byte // bytes handed to the function under test
Expected bool // result the test expects for Input
}
// TestPermutations runs Validate over a table of well-formed and malformed
// inputs and reports every case whose result differs from Expected.
func TestPermutations(t *testing.T) {
	cases := []v{
		{[]byte("123.456"), true},
		{[]byte("123"), true},
		{[]byte("123."), true},
		{[]byte(".123"), true},
		{[]byte("12.1e12"), true},
		{[]byte("12.1e-12"), true},
		{[]byte("-123.456"), true},
		{[]byte("-123"), true},
		{[]byte("-123."), true},
		{[]byte("-.123"), true},
		{[]byte("-12.1e12"), true},
		{[]byte("-12.1e-12"), true},
		{[]byte(".1e-12"), true},
		{[]byte(".e-12"), false},
		{[]byte(".e"), false},
		{[]byte("e"), false},
		{[]byte("abcdef"), false},
		{[]byte("-"), false},
		{[]byte("."), false},
	}
	for i := range cases {
		tc := cases[i]
		if Validate(tc.Input) == tc.Expected {
			continue
		}
		t.Errorf("could not handle case %s", tc.Input)
	}
}
and it performs quite well on the original benchmark:
BenchmarkValidate-8   	100000000	        13.0 ns/op	       0 B/op	       0 allocs/op
Benchmark code:
// BenchmarkValidate measures Validate on a fixed decimal input and aborts the
// process if the input is rejected.
func BenchmarkValidate(b *testing.B) {
	input := []byte("15.1234567890")
	for n := 0; n < b.N; n++ {
		if Validate(input) {
			continue
		}
		log.Fatalf("problem")
	}
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。


评论