英文:
Most efficient way to check if a byte slice is a number
问题
我正在寻找判断字节切片是否为浮点数的最高效方法。
由于要处理大型数据集,因此性能至关重要。
尝试过的方法有:
strconv.ParseFloat
regexp.Match
CheckNumber
- 使用IsNumber
函数和检查字节切片是否包含.
的自定义函数。
func CheckNumber(p []byte) bool {
r := string(p)
sep := 0
for _, b := range r {
if unicode.IsNumber(b) {
continue
}
if b == rune('.') {
if sep > 0 {
return false
}
sep++
continue
}
return false
}
return true
}
基准测试代码:
func BenchmarkFloatStrconv(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
_, err := strconv.ParseFloat(string(p), 64)
if err != nil {
log.Fatalf("NaN")
}
}
}
func BenchmarkFloatRegex(b *testing.B) {
p := []byte("15.34234234234")
r := `[-+]?[0-9]*\.?[0-9]`
c, _ := regexp.Compile(r)
for i := 0; i < b.N; i++ {
ok := c.Match(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func BenchmarkCheckNumber(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := CheckNumber(p)
if !ok {
log.Fatalf("NaN")
}
}
}
基准测试结果:
BenchmarkFloatStrconv-8 20000000 85.8 ns/op 16 B/op 1 allocs/op
BenchmarkFloatRegex-8 5000000 252 ns/op 0 B/op 0 allocs/op
BenchmarkCheckNumber-8 20000000 64.3 ns/op 0 B/op 0 allocs/op
- 我对这些不同的解决方案进行了公平的比较吗?
- 是否有更好的解决方案?
编辑: 在Adrian和icza的指导下,这个新方法避免了转换为strings
/runes
的过程。
func CheckNumberNoStringConvert(r []byte) bool {
sep := 0
for i := range r {
if r[i] >= 48 && r[i] <= 57 {
continue
}
if r[i] == 46 {
if sep > 0 {
return false
}
sep++
continue
}
return false
}
return true
}
并且性能表现非常好
BenchmarkCheckNumberNoStringConvert-8 200000000 8.55 ns/op 0 B/op 0 allocs/op
英文:
I'm looking for the most efficient way to tell whether a byte slice is a float.
This is to be done on huge datasets, so performance is key.
Tried approaches:
-
strconv.ParseFloat
-
regexp.Match
-
CheckNumber
- home rolled function usingIsNumber
+ looking at whether the byte slice contains a.
.func CheckNumber(p []byte) bool { r := string(p) sep := 0 for _, b := range r { if unicode.IsNumber(b) { continue } if b == rune('.') { if sep > 0 { return false } sep++ continue } return false } return true }
The benchmark code:
func BenchmarkFloatStrconv(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
_, err := strconv.ParseFloat(string(p), 64)
if err != nil {
log.Fatalf("NaN")
}
}
}
func BenchmarkFloatRegex(b *testing.B) {
p := []byte("15.34234234234")
r := `[-+]?[0-9]*\.?[0-9]`
c, _ := regexp.Compile(r)
for i := 0; i < b.N; i++ {
ok := c.Match(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func BenchmarkCheckNumber(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := CheckNumber(p)
if !ok {
log.Fatalf("NaN")
}
}
}
Benchmark results:
BenchmarkFloatStrconv-8 20000000 85.8 ns/op 16 B/op 1 allocs/op
BenchmarkFloatRegex-8 5000000 252 ns/op 0 B/op 0 allocs/op
BenchmarkCheckNumber-8 20000000 64.3 ns/op 0 B/op 0 allocs/op
- Am I doing the different solutions fairness?
- Are there better solutions?
Edit: thanks to pointers from Adrian and icza, this avoids converting to strings
/rune
s
func CheckNumberNoStringConvert(r []byte) bool {
sep := 0
for i := range r {
if r[i] >= 48 && r[i] <= 57 {
continue
}
if r[i] == 46 {
if sep > 0 {
return false
}
sep++
continue
}
return false
}
return true
}
and performs quite well
BenchmarkCheckNumberNoStringConvert-8 200000000 8.55 ns/op 0 B/op 0 allocs/op
答案1
得分: 3
对于一个简单的实数(浮点数)(没有科学或工程浮点格式,没有分组分隔符),可以使用以下代码进行判断:
func IsReal(n []byte) bool {
if len(n) > 0 && n[0] == '-' {
n = n[1:]
}
if len(n) == 0 {
return false
}
var point bool
for _, c := range n {
if '0' <= c && c <= '9' {
continue
}
if c == '.' && len(n) > 1 && !point {
point = true
continue
}
return false
}
return true
}
基准测试结果如下:
$ go test -run=! -bench=. -benchmem -cpu=1 real_test.go
goos: linux
goarch: amd64
BenchmarkIsReal 100000000 20.8 ns/op 0 B/op 0 allocs/op
BenchmarkFloatStrconv 20000000 101 ns/op 16 B/op 1 allocs/op
BenchmarkFloatRegex 5000000 284 ns/op 0 B/op 0 allocs/op
BenchmarkCheckNumber 20000000 73.0 ns/op 0 B/op 0 allocs/op
PASS
ok command-line-arguments 7.380s
real_test.go
文件内容如下:
package main
import (
"log"
"regexp"
"strconv"
"testing"
"unicode"
)
func IsReal(n []byte) bool {
if len(n) > 0 && n[0] == '-' {
n = n[1:]
}
if len(n) == 0 {
return false
}
var point bool
for _, c := range n {
if '0' <= c && c <= '9' {
continue
}
if c == '.' && len(n) > 1 && !point {
point = true
continue
}
return false
}
return true
}
func BenchmarkIsReal(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := IsReal(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func CheckNumber(p []byte) bool {
r := string(p)
sep := 0
for _, b := range r {
if unicode.IsNumber(b) {
continue
}
if b == rune('.') {
if sep > 0 {
return false
}
sep++
continue
}
return false
}
return true
}
func BenchmarkFloatStrconv(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
_, err := strconv.ParseFloat(string(p), 64)
if err != nil {
log.Fatalf("NaN")
}
}
}
func BenchmarkFloatRegex(b *testing.B) {
p := []byte("15.34234234234")
r := `[-+]?[0-9]*\.?[0-9]`
c, _ := regexp.Compile(r)
for i := 0; i < b.N; i++ {
ok := c.Match(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func BenchmarkCheckNumber(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := CheckNumber(p)
if !ok {
log.Fatalf("NaN")
}
}
}
希望对你有帮助!
英文:
For a simple real (floating-point) number (no scientific or engineering floating-point format, no group separators),
func IsReal(n []byte) bool {
if len(n) > 0 && n[0] == '-' {
n = n[1:]
}
if len(n) == 0 {
return false
}
var point bool
for _, c := range n {
if '0' <= c && c <= '9' {
continue
}
if c == '.' && len(n) > 1 && !point {
point = true
continue
}
return false
}
return true
}
Benchmark:
$ go test -run=! -bench=. -benchmem -cpu=1 real_test.go
goos: linux
goarch: amd64
BenchmarkIsReal 100000000 20.8 ns/op 0 B/op 0 allocs/op
BenchmarkFloatStrconv 20000000 101 ns/op 16 B/op 1 allocs/op
BenchmarkFloatRegex 5000000 284 ns/op 0 B/op 0 allocs/op
BenchmarkCheckNumber 20000000 73.0 ns/op 0 B/op 0 allocs/op
PASS
ok command-line-arguments 7.380s
real_test.go
:
package main
import (
"log"
"regexp"
"strconv"
"testing"
"unicode"
)
func IsReal(n []byte) bool {
if len(n) > 0 && n[0] == '-' {
n = n[1:]
}
if len(n) == 0 {
return false
}
var point bool
for _, c := range n {
if '0' <= c && c <= '9' {
continue
}
if c == '.' && len(n) > 1 && !point {
point = true
continue
}
return false
}
return true
}
func BenchmarkIsReal(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := IsReal(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func CheckNumber(p []byte) bool {
r := string(p)
sep := 0
for _, b := range r {
if unicode.IsNumber(b) {
continue
}
if b == rune('.') {
if sep > 0 {
return false
}
sep++
continue
}
return false
}
return true
}
func BenchmarkFloatStrconv(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
_, err := strconv.ParseFloat(string(p), 64)
if err != nil {
log.Fatalf("NaN")
}
}
}
func BenchmarkFloatRegex(b *testing.B) {
p := []byte("15.34234234234")
r := `[-+]?[0-9]*\.?[0-9]`
c, _ := regexp.Compile(r)
for i := 0; i < b.N; i++ {
ok := c.Match(p)
if !ok {
log.Fatalf("NaN")
}
}
}
func BenchmarkCheckNumber(b *testing.B) {
p := []byte("15.34234234234")
for i := 0; i < b.N; i++ {
ok := CheckNumber(p)
if !ok {
log.Fatalf("NaN")
}
}
}
答案2
得分: 1
我将其作为一种挑战,尝试将其重写为一种状态机,从这里的每个人的集体输入中合成
func Validate(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case b[i] == '.':
if len(b) == 1 {
return false
}
if len(b) > i {
return fractional(b[i+1:])
}
return true
case i == 0 && b[i] == '-':
if len(b) == 1 {
return false
}
continue
default:
return false
}
}
return true
}
func fractional(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case b[i] == 'e' || b[i] == 'E':
if len(b[:i]) == 0 {
return false
}
if len(b) > i+1 {
return scientific(b[i+1:])
}
return false
default:
return false
}
}
return true
}
func scientific(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case i == 0 && b[i] == '-':
if len(b) == 1 {
return false
}
continue
default:
return false
}
}
return true
}
它似乎适用于几种不同的数字格式:
type v struct {
Input []byte
Expected bool
}
func TestPermutations(t *testing.T) {
b := []v{
v{[]byte("123.456"), true},
v{[]byte("123"), true},
v{[]byte("123."), true},
v{[]byte(".123"), true},
v{[]byte("12.1e12"), true},
v{[]byte("12.1e-12"), true},
v{[]byte("-123.456"), true},
v{[]byte("-123"), true},
v{[]byte("-123."), true},
v{[]byte("-.123"), true},
v{[]byte("-12.1e12"), true},
v{[]byte("-12.1e-12"), true},
v{[]byte(".1e-12"), true},
v{[]byte(".e-12"), false},
v{[]byte(".e"), false},
v{[]byte("e"), false},
v{[]byte("abcdef"), false},
v{[]byte("-"), false},
v{[]byte("."), false},
}
for _, test := range b {
ok := Validate(test.Input)
if ok != test.Expected {
t.Errorf("无法处理案例 %s", test.Input)
}
}
}
并且在原始基准测试中表现良好:
BenchmarkValidate-8 100000000 13.0 ns/op 0 B/op 0 allocs/op
基准测试代码:
func BenchmarkValidate(b *testing.B) {
p := []byte("15.1234567890")
for i := 0; i < b.N; i++ {
ok := Validate(p)
if !ok {
log.Fatalf("问题")
}
}
}
英文:
I took upon it as a challenge for myself to rewrite this as some kind of state machine synthesizing the collective input from everyone here
func Validate(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case b[i] == '.':
if len(b) == 1 {
return false
}
if len(b) > i {
return fractional(b[i+1:])
}
return true
case i == 0 && b[i] == '-':
if len(b) == 1 {
return false
}
continue
default:
return false
}
}
return true
}
func fractional(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case b[i] == 'e' || b[i] == 'E':
if len(b[:i]) == 0 {
return false
}
if len(b) > i+1 {
return scientific(b[i+1:])
}
return false
default:
return false
}
}
return true
}
func scientific(b []byte) bool {
for i := range b {
switch {
case b[i] >= '0' && b[i] <= '9':
continue
case i == 0 && b[i] == '-':
if len(b) == 1 {
return false
}
continue
default:
return false
}
}
return true
}
It seems to work on a few different number formats:
type v struct {
Input []byte
Expected bool
}
func TestPermutations(t *testing.T) {
b := []v{
v{[]byte("123.456"), true},
v{[]byte("123"), true},
v{[]byte("123."), true},
v{[]byte(".123"), true},
v{[]byte("12.1e12"), true},
v{[]byte("12.1e-12"), true},
v{[]byte("-123.456"), true},
v{[]byte("-123"), true},
v{[]byte("-123."), true},
v{[]byte("-.123"), true},
v{[]byte("-12.1e12"), true},
v{[]byte("-12.1e-12"), true},
v{[]byte(".1e-12"), true},
v{[]byte(".e-12"), false},
v{[]byte(".e"), false},
v{[]byte("e"), false},
v{[]byte("abcdef"), false},
v{[]byte("-"), false},
v{[]byte("."), false},
}
for _, test := range b {
ok := Validate(test.Input)
if ok != test.Expected {
t.Errorf("could not handle case %s", test.Input)
}
}
}
and perform quite well on the original benchmark:
BenchmarkValidate-8 100000000 13.0 ns/op 0 B/op 0 allocs/op
Benchmark code:
func BenchmarkValidate(b *testing.B) {
p := []byte("15.1234567890")
for i := 0; i < b.N; i++ {
ok := Validate(p)
if !ok {
log.Fatalf("problem")
}
}
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论