英文:
Performance discrepancy in compiled vs. hand-written assembly
问题
我一直在尝试在Go语言中使用汇编语言,并且我已经编写了一个汉明重量函数作为练习。
我基于这个Stack Overflow答案编写了一个原生的Go版本,而汇编版本则基于AMD的这个文档(第180页)。在对这两个函数进行基准测试后,我发现原生的Go版本比汇编版本快大约1.5倍到2倍,尽管手写的汇编版本几乎与go tool 6g -S popcount.go
的输出完全相同。
通过go test -bench=.
得到的输出
PASS
BenchmarkPopCount 100000000 19.4 ns/op
BenchmarkPopCount_g 200000000 8.97 ns/op
ok popcount 4.777s
popcount.go
package popcount
func popCount(i uint32) uint32 // Defined in popcount_amd64.s
func popCount_g(i uint32) uint32 {
i = i - ((i >> 1) & 0x55555555)
i = (i & 0x33333333) + ((i >> 2) & 0x33333333)
return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24
}
popcount_test.go
package popcount
import "testing"
func TestPopcount(t *testing.T) {
for i := uint32(0); i < uint32(100); i++ {
if popCount(i) != popCount_g(i) {
t.Fatalf("failed on input = %v", i)
}
}
}
func BenchmarkPopCount(b *testing.B) {
for i := 0; i < b.N; i++ {
popCount(uint32(i))
}
}
func BenchmarkPopCount_g(b *testing.B) {
for i := 0; i < b.N; i++ {
popCount_g(uint32(i))
}
}
popcount_amd64.s
// func popCount(i uint32) uint32
TEXT ·popCount(SB),$0
MOVL i+0(FP), BP // i
MOVL BP, BX // i
SHRL $1, BX // i >> 1
ANDL $0x055555555, BX // (i >> 1) & 0x55555555
SUBL BX, BP // w = i - ((i >> 1) & 0x55555555)
MOVL BP, AX // w
SHRL $2, BP // w >> 2
ANDL $0x033333333, AX // w & 0x33333333
ANDL $0x033333333, BP // (w >> 2) & 0x33333333
ADDL BP, AX // x = (w & 0x33333333) + ((w >> 2) & 0x33333333)
MOVL AX, BX // x
SHRL $4, BX // x >> 4
ADDL AX, BX // x + (x >> 4)
ANDL $0x00F0F0F0F, BX // y = (x + (x >> 4) & 0x0F0F0F0F)
IMULL $0x001010101, BX // y * 0x01010101
SHRL $24, BX // population count = (y * 0x01010101) >> 24
MOVL BX, toReturn+8(FP) // Store result.
RET // return
通过go tool 6g -S popcount.go
得到的输出
"".popCount_g t=1 size=64 value=0 args=0x10 locals=0
000000 00000 (popcount.go:5) TEXT "".popCount_g+0(SB),4,$0-16
000000 00000 (popcount.go:5) NOP ,
000000 00000 (popcount.go:5) NOP ,
000000 00000 (popcount.go:5) MOVL "".i+8(FP),BP
0x0004 00004 (popcount.go:5) FUNCDATA $2,gclocals·9308e7ef08d2cc2f72ae1228688dacf9+0(SB)
0x0004 00004 (popcount.go:5) FUNCDATA $3,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:6) MOVL BP,BX
0x0006 00006 (popcount.go:6) SHRL $1,BX
0x0008 00008 (popcount.go:6) ANDL $1431655765,BX
0x000e 00014 (popcount.go:6) SUBL BX,BP
0x0010 00016 (popcount.go:7) MOVL BP,AX
0x0012 00018 (popcount.go:7) ANDL $858993459,AX
0x0017 00023 (popcount.go:7) SHRL $2,BP
0x001a 00026 (popcount.go:7) ANDL $858993459,BP
0x0020 00032 (popcount.go:7) ADDL BP,AX
0x0022 00034 (popcount.go:8) MOVL AX,BX
0x0024 00036 (popcount.go:8) SHRL $4,BX
0x0027 00039 (popcount.go:8) ADDL AX,BX
0x0029 00041 (popcount.go:8) ANDL $252645135,BX
0x002f 00047 (popcount.go:8) IMULL $16843009,BX
0x0035 00053 (popcount.go:8) SHRL $24,BX
0x0038 00056 (popcount.go:8) MOVL BX,"".~r1+16(FP)
0x003c 00060 (popcount.go:8) RET ,
我从这里得知FUNCDATA
行包含垃圾收集器的信息,但除此之外,我没有看到任何明显的差异。
是什么导致这两个函数之间的速度差异如此之大呢?
英文:
I've been playing around with using assembly language in Go and I've written a Hamming Weight function as an excercise.
I've based a native Go version on this SO answer and the assembly version is based on this doc from AMD (page 180). Upon benchmarking the two functions, I find that the native Go version is around 1.5x - 2x faster than the assembly version, despite the hand-written assembly version being almost identical to the output from go tool 6g -S popcount.go
.
output from go test -bench=.
PASS
BenchmarkPopCount 100000000 19.4 ns/op
BenchmarkPopCount_g 200000000 8.97 ns/op
ok popcount 4.777s
popcount.go
package popcount
func popCount(i uint32) uint32 // Defined in popcount_amd64.s
func popCount_g(i uint32) uint32 {
i = i - ((i >> 1) & 0x55555555)
i = (i & 0x33333333) + ((i >> 2) & 0x33333333)
return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24
}
popcount_test.go
package popcount
import "testing"
func TestPopcount(t *testing.T) {
for i := uint32(0); i < uint32(100); i++ {
if popCount(i) != popCount_g(i) {
t.Fatalf("failed on input = %v", i)
}
}
}
func BenchmarkPopCount(b *testing.B) {
for i := 0; i < b.N; i++ {
popCount(uint32(i))
}
}
func BenchmarkPopCount_g(b *testing.B) {
for i := 0; i < b.N; i++ {
popCount_g(uint32(i))
}
}
popcount_amd64.s
// func popCount(i uint32) uint32
TEXT ·popCount(SB),$0
MOVL i+0(FP), BP // i
MOVL BP, BX // i
SHRL $1, BX // i >> 1
ANDL $0x055555555, BX // (i >> 1) & 0x55555555
SUBL BX, BP // w = i - ((i >> 1) & 0x55555555)
MOVL BP, AX // w
SHRL $2, BP // w >> 2
ANDL $0x033333333, AX // w & 0x33333333
ANDL $0x033333333, BP // (w >> 2) & 0x33333333
ADDL BP, AX // x = (w & 0x33333333) + ((w >> 2) & 0x33333333)
MOVL AX, BX // x
SHRL $4, BX // x >> 4
ADDL AX, BX // x + (x >> 4)
ANDL $0x00F0F0F0F, BX // y = (x + (x >> 4) & 0x0F0F0F0F)
IMULL $0x001010101, BX // y * 0x01010101
SHRL $24, BX // population count = (y * 0x01010101) >> 24
MOVL BX, toReturn+8(FP) // Store result.
RET // return
output from go tool 6g -S popcount.go
"".popCount_g t=1 size=64 value=0 args=0x10 locals=0
000000 00000 (popcount.go:5) TEXT "".popCount_g+0(SB),4,$0-16
000000 00000 (popcount.go:5) NOP ,
000000 00000 (popcount.go:5) NOP ,
000000 00000 (popcount.go:5) MOVL "".i+8(FP),BP
0x0004 00004 (popcount.go:5) FUNCDATA $2,gclocals┬À9308e7ef08d2cc2f72ae1228688dacf9+0(SB)
0x0004 00004 (popcount.go:5) FUNCDATA $3,gclocals┬À3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:6) MOVL BP,BX
0x0006 00006 (popcount.go:6) SHRL $1,BX
0x0008 00008 (popcount.go:6) ANDL $1431655765,BX
0x000e 00014 (popcount.go:6) SUBL BX,BP
0x0010 00016 (popcount.go:7) MOVL BP,AX
0x0012 00018 (popcount.go:7) ANDL $858993459,AX
0x0017 00023 (popcount.go:7) SHRL $2,BP
0x001a 00026 (popcount.go:7) ANDL $858993459,BP
0x0020 00032 (popcount.go:7) ADDL BP,AX
0x0022 00034 (popcount.go:8) MOVL AX,BX
0x0024 00036 (popcount.go:8) SHRL $4,BX
0x0027 00039 (popcount.go:8) ADDL AX,BX
0x0029 00041 (popcount.go:8) ANDL $252645135,BX
0x002f 00047 (popcount.go:8) IMULL $16843009,BX
0x0035 00053 (popcount.go:8) SHRL $24,BX
0x0038 00056 (popcount.go:8) MOVL BX,"".~r1+16(FP)
0x003c 00060 (popcount.go:8) RET ,
I know from here that the FUNCDATA
lines contain information for the garbage collector, but other than that I don't see any glaring differences.
What could be causing this large difference in speed between the 2 functions?
答案1
得分: 8
如果你查看函数调用的伪汇编代码,你会发现.s
(汇编器)版本被调用,带有调用开销,而.go
版本被内联。
func S() {
pc := popCount(uint32(0))
_ = pc
}
"".S t=1 size=48 value=0 args=0x0 locals=0x10
0x0000 00000 (popcount.go:11) TEXT "".S+0(SB),$16-0
0x0000 00000 (popcount.go:11) MOVQ (TLS),CX
0x0009 00009 (popcount.go:11) CMPQ SP,(CX)
0x000c 00012 (popcount.go:11) JHI ,21
0x000e 00014 (popcount.go:11) CALL ,runtime.morestack00_noctxt(SB)
0x0013 00019 (popcount.go:11) JMP ,0
0x0015 00021 (popcount.go:11) SUBQ $16,SP
0x0019 00025 (popcount.go:11) FUNCDATA $2,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0019 00025 (popcount.go:11) FUNCDATA $3,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0019 00025 (popcount.go:12) MOVL $0,(SP)
0x0020 00032 (popcount.go:12) PCDATA $1,$0
0x0020 00032 (popcount.go:12) CALL ,"".popCount(SB)
0x0025 00037 (popcount.go:12) MOVL 8(SP),BX
0x0029 00041 (popcount.go:12) NOP ,
0x0029 00041 (popcount.go:14) ADDQ $16,SP
0x002d 00045 (popcount.go:14) RET ,
func S_G() {
pc := popCount_g(uint32(0))
_ = pc
}
"".S_G t=1 size=64 value=0 args=0x0 locals=0x8
0x0000 00000 (popcount.go:16) TEXT "".S_G+0(SB),4,$8-0
0x0000 00000 (popcount.go:16) SUBQ $8,SP
0x0004 00004 (popcount.go:16) FUNCDATA $2,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:16) FUNCDATA $3,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:17) MOVL $0,BP
0x0006 00006 (popcount.go:17) MOVL BP,BX
0x0008 00008 (popcount.go:17) SHRL $1,BX
0x000a 00010 (popcount.go:17) ANDL $1431655765,BX
0x0010 00016 (popcount.go:17) SUBL BX,BP
0x0012 00018 (popcount.go:17) MOVL BP,AX
0x0014 00020 (popcount.go:17) ANDL $858993459,AX
0x0019 00025 (popcount.go:17) SHRL $2,BP
0x001c 00028 (popcount.go:17) ANDL $858993459,BP
0x0022 00034 (popcount.go:17) ADDL BP,AX
0x0024 00036 (popcount.go:17) MOVL AX,BX
0x0026 00038 (popcount.go:17) SHRL $4,BX
0x0029 00041 (popcount.go:17) ADDL AX,BX
0x002b 00043 (popcount.go:17) ANDL $252645135,BX
0x0031 00049 (popcount.go:17) IMULL $16843009,BX
0x0037 00055 (popcount.go:17) SHRL $24,BX
0x003a 00058 (popcount.go:19) ADDQ $8,SP
0x003e 00062 (popcount.go:19) RET ,
请注意,这只是伪汇编代码,具体的指令和操作可能因编译器和平台而异。
英文:
If you look at the pseudo-assembler for the funnction calls you will see that the .s
(assembler) version is called, with call overhead, and the .go
version is inlined.
func S() {
pc := popCount(uint32(0))
_ = pc
}
"".S t=1 size=48 value=0 args=0x0 locals=0x10
0x0000 00000 (popcount.go:11) TEXT "".S+0(SB),$16-0
0x0000 00000 (popcount.go:11) MOVQ (TLS),CX
0x0009 00009 (popcount.go:11) CMPQ SP,(CX)
0x000c 00012 (popcount.go:11) JHI ,21
0x000e 00014 (popcount.go:11) CALL ,runtime.morestack00_noctxt(SB)
0x0013 00019 (popcount.go:11) JMP ,0
0x0015 00021 (popcount.go:11) SUBQ $16,SP
0x0019 00025 (popcount.go:11) FUNCDATA $2,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0019 00025 (popcount.go:11) FUNCDATA $3,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0019 00025 (popcount.go:12) MOVL $0,(SP)
0x0020 00032 (popcount.go:12) PCDATA $1,$0
0x0020 00032 (popcount.go:12) CALL ,"".popCount(SB)
0x0025 00037 (popcount.go:12) MOVL 8(SP),BX
0x0029 00041 (popcount.go:12) NOP ,
0x0029 00041 (popcount.go:14) ADDQ $16,SP
0x002d 00045 (popcount.go:14) RET ,
func S_G() {
pc := popCount_g(uint32(0))
_ = pc
}
"".S_G t=1 size=64 value=0 args=0x0 locals=0x8
0x0000 00000 (popcount.go:16) TEXT "".S_G+0(SB),4,$8-0
0x0000 00000 (popcount.go:16) SUBQ $8,SP
0x0004 00004 (popcount.go:16) FUNCDATA $2,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:16) FUNCDATA $3,gclocals·3280bececceccd33cb74587feedb1f9f+0(SB)
0x0004 00004 (popcount.go:17) MOVL $0,BP
0x0006 00006 (popcount.go:17) MOVL BP,BX
0x0008 00008 (popcount.go:17) SHRL $1,BX
0x000a 00010 (popcount.go:17) ANDL $1431655765,BX
0x0010 00016 (popcount.go:17) SUBL BX,BP
0x0012 00018 (popcount.go:17) MOVL BP,AX
0x0014 00020 (popcount.go:17) ANDL $858993459,AX
0x0019 00025 (popcount.go:17) SHRL $2,BP
0x001c 00028 (popcount.go:17) ANDL $858993459,BP
0x0022 00034 (popcount.go:17) ADDL BP,AX
0x0024 00036 (popcount.go:17) MOVL AX,BX
0x0026 00038 (popcount.go:17) SHRL $4,BX
0x0029 00041 (popcount.go:17) ADDL AX,BX
0x002b 00043 (popcount.go:17) ANDL $252645135,BX
0x0031 00049 (popcount.go:17) IMULL $16843009,BX
0x0037 00055 (popcount.go:17) SHRL $24,BX
0x003a 00058 (popcount.go:19) ADDQ $8,SP
0x003e 00062 (popcount.go:19) RET ,
答案2
得分: 4
如果你真的想要一个popcnt,它已经成为X86处理器上的本地指令一段时间了。你的编译器可能有一个内置函数来实现它,比如微软的__popcnt()(还有__popcnt16()和__popcnt64()),或者GCC的__builtin_popcount()(或者对于64位的__builtin_popcountll())。
英文:
If you actually want a popcnt, it's been a native instruction on X86 processors for a while now. Your compiler may have an intrinsic for it, such as Microsoft's __popcnt() (also __popcnt16() and __popcnt64() ), or GCC's __builtin_popcount() (or __builtin_popcountll() for 64 bit).
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论