英文:
Speed up strlen using SWAR in x86-64 assembly
问题
The asm function strlen
receives the link to a string as a char - Array. To do so, the function may use SWAR on a general-purpose register, but without using xmm registers or SSE instructions.
The function checks with bit manipulation: (v - 0x01010101) & ~(v & 0x80808080)
in 8-byte steps if the word contains a zero byte, which marks the end of the string. If it does, it iterates byte by byte until the zero to avoid a page fault.
The alignment works like in this GNU Libc implementation:
for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == 'for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == '\0'){
return char_ptr - str;
}
}
'){
return char_ptr - str;
}
}
Is there any way to make it faster?
; rdi <- const *char
; rax <- counter + return value
; r10 <- current array for computation
; rcx, r8 <- Bitmask
; rsi <- Arr for calculation
strlen:
PUSH rbp
SUB rsp, 8
XOR rax, rax
MOV r8, 31
alignment:
CMP byte [rdi+rax], 0
JE end
MOV rsi, rdi
ADD rsi, rax
AND rsi, r8
CMP rsi, 0
JE while_parallel
INC rax
JMP alignment
while_parallel:
MOV rcx, 0x01010101
MOV r8, 0x80808080
while_parallel_loop:
MOV r10, [rdi+rax]
MOV rsi, r10
NOT r10
AND r10, r8
SUB rsi, rcx
AND rsi, r10
CMP rsi, 0
JNE while_single
ADD rax, 8
JMP while_parallel_loop
while_single:
CMP byte [rdi+rax], 0
JE end
INC rax
JMP while_single
end:
ADD rsp, 8
POP rbp
RET
Note that there is no intention to use any SSE instructions or xmm registers.
英文:
The asm function strlen
receives the link to a string as a char - Array. To to so, the function may use SWAR on general purpose register, but without using xmm register or SSE instructions.
The function checks with the bit manipulation: (v - 0x01010101) & ~(v & 0x80808080)
in 8 byte steps,
if the word contains a zero byte, which marks the end of the string. If it does, it iterates byte per byte until the zero to avoid a page fault.
The alignment works like in this GNU Libc implementation:
for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == 'for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == '\0'){
return char_ptr - str;
}
}
'){
return char_ptr - str;
}
}
Is there any way to make it faster?
; rdi <- const *char
; rax <- counter + return value
; r10 <- current array for computation
; rcx,r8 <- Bitmask
; rsi <- Arr for calculation
strlen:
PUSH rbp
SUB rsp,8
XOR rax,rax
MOV r8,31
alignment:
CMP byte [rdi+rax],0
JE end
MOV rsi,rdi
ADD rsi,rax
AND rsi,r8
CMP rsi,0
JE while_parallel
INC rax
JMP alignment
while_parallel:
MOV rcx,0x01010101
MOV r8,0x80808080
while_parallel_loop:
MOV r10,[rdi+rax]
MOV rsi,r10
NOT r10
AND r10,r8
SUB rsi,rcx
AND rsi,r10
CMP rsi,0
JNE while_single
ADD rax,8
JMP while_parallel_loop
while_single:
CMP byte [rdi+rax],0
JE end
INC rax
JMP while_single
end:
ADD rsp,8
POP rbp
RET
Note that I do not intend to use any SSE instructions nor xmm register.
答案1
得分: 4
以下是对问题的代码的翻译:
问题中的代码似乎存在一个缺陷:Mycroft的魔术常数没有扩展到64位。至于效率:各种x86-64调用约定主要基于寄存器,因此对于简单的函数来说,没有必要维护堆栈帧。在发布的代码中的主循环似乎有点太复杂;请注意,大多数x86指令会设置可以用于分支的标志。与其按字节处理直到达到字符串起始的下一个QWORD边界,不如直接读取它所在的整个QWORD,并用0xff覆盖字符串起始之前的字节。然后,可以通过常规的QWORD处理循环处理生成的“掩码”数据。
以下是为Windows编写的x86-64代码,其中包含这些建议。将其调整为Linux使用的System V ABI只需要循环交换寄存器。请注意,在具有BMI指令集扩展的CPU上,可以通过将“not rcx”与后续的“and rcx, r9”合并为“andn rcx, rcx, r9”来减少64位处理循环的一个指令。
PUBLIC strglen
_TEXT SEGMENT
ALIGN 16
;; Alan Mycroft的零字节检测算法(newsgroup comp.lang.c,1987/04/08,
;; https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
;; null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
mycroft_magic1 EQU 0101010101010101h
mycroft_magic2 EQU 8080808080808080h
;; size_t strglen (const char *src);
;; Windows x86-64调用约定:
;; 函数参数:rcx、rdx、r8、r9
;; 函数返回值:rax
;; 临时寄存器:rax、rcx、rdx、r8、r9、r10、r11
strglen PROC
mov rax, rcx ; src指针
mov r8, rcx ; src指针
mov r9, mycroft_magic2 ; 0x8080808080808080
mov r10, -mycroft_magic1 ; -0x0101010101010101
and rcx, 7 ; ofs = src & 7;src按qword对齐?
jz process_qwords ; 是的,ofs为零,以qword为单位处理
sub rax, rcx ; src -= ofs
shl rcx, 3 ; ofs * CHAR_BIT
mov rdx, -1 ; ~0
shl rdx, cl ; ~(~0 << (ofs * CHAR_BIT))
mov rcx, qword ptr [rax] ; 加载对齐的qword
not rdx ; 掩码 = ~(~0 << (ofs * CHAR_BIT))
or rcx, rdx ; 将未使用的字节设置为0xff
jmp process_qwords_alt ; 使用替代循环入口
ALIGN 16
process_qwords:
mov rcx, qword ptr [rax] ; 加载对齐的qword
process_qwords_alt:
add rax, 8 ; src += 8
lea rdx, [rcx + r10] ; qword - 0x0101010101010101
not rcx ; ~qword
and rcx, r9 ; ~qword & 0x8080808080808080
and rcx, rdx ; (qword - 0x0101010101010101) & (~qword & 0x8080808080808080)
jz process_qwords ; 如果为零,未找到null字节,继续
bsf rcx, rcx ; 找到第一个置位位(null字节)
shr rcx, 3 ; 将位位置转换为字节位置
lea rax, [rax + rcx - 8] ; 反向预递减以计算字节数
sub rax, r8 ; src - 原始src = 长度
ret
strglen ENDP
ALIGN 16
_TEXT ENDS
END
我使用了下面的ISO-C99测试框架来检查上述strglen
实现的正确性。我使用Microsoft Macro Assembler和Intel C/C++编译器构建了它,如下所示:ml64 -c strglen.asm
,icl /Qstd=c99 /Ox /W4 strlen.c strglen.obj
。我还使用dumpbin /disasm strglen.obj
检查了生成的机器代码,主要是为了确保对齐指令按预期工作,并且通过REPT
宏进行的循环展开是否正确,因为我在过去二十年中没有使用过它。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern size_t strglen (const char* str);
int main (void)
{
const char a[] = "0123456789 the quick brown fox jumps over the lazy dog";
char* src = malloc (sizeof(a));
size_t ref, res;
printf ("src=%p\n", a);
for (int srcofs = 0; srcofs < 8; srcofs++) {
for (size_t len = 0; len < sizeof(a); len++) {
memcpy (src, a, sizeof(a));
src [len] = 0;
ref = strlen (src + srcofs);
res = strglen (src + srcofs);
if (res != ref) {
printf ("error @ srcofs=%d len=%zu ref=%zu res=%zu\n",
srcofs, len, ref, res);
return EXIT_FAILURE;
}
}
}
printf ("Test passed\n");
return EXIT_SUCCESS;
}
希望这可以帮助你理解问题中的代码。如果有任何进一步的问题,请随时提出。
英文:
The code from the question seems to have a defect: Mycroft's magic constants have not been extended to 64 bits. As for efficiency: the various x86-64 calling conventions are primarily register-based, so it is not necessary to maintain a stack frame for simple functions. The main loop in the posted code appears a bit too complex; note that most x86 instructions set flags that can be used for branching. Instead of processing byte-wise until reaching the next QWORD
boundary from the start of the string, simply read the entire QWORD
it falls in, and overwrite the bytes preceding start of the string with 0xff
. The resulting "masked' data can then be processed by the regular QWORD
processing loop.
Below is x86-64 code written for Windows that incorporates these suggestions. Adjusting it to the System V ABI used by Linux should require nothing more than a cyclical exchange of registers. Note that on CPUs with BMI instruction set extension the 64-bit processing loop can be reduced by one instructions by merging the not rcx
with the following and rcx, r9
into andn rcx, rcx, r9
.
PUBLIC strglen
_TEXT SEGMENT
ALIGN 16
;; Alan Mycroft's null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
;; https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
;; null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
mycroft_magic1 EQU 0101010101010101h
mycroft_magic2 EQU 8080808080808080h
;; size_t strglen (const char *src);
;; Windows x86-64 calling convention:
;; function arguments: rcx, rdx, r8, r9
;; function return value: rax
;; scratch registers: rax, rcx, rdx, r8, r9, r10, r11
strglen PROC
mov rax, rcx ; src pointer
mov r8, rcx ; src pointer
mov r9, mycroft_magic2 ; 0x8080808080808080
mov r10, -mycroft_magic1 ; -0x0101010101010101
and rcx, 7 ; ofs = src & 7; src qword aligned ?
jz process_qwords ; yes, ofs is zero, process qword wise
sub rax, rcx ; src -= ofs
shl rcx, 3 ; ofs * CHAR_BIT
mov rdx, -1 ; ~0
shl rdx, cl ; ~0 << (ofs * CHAR_BIT)
mov rcx, qword ptr [rax] ; load aligned qword
not rdx ; mask = ~(~0 << (ofs * CHAR_BIT))
or rcx, rdx ; set unused bytes to 0xff
jmp process_qwords_alt ; use alternate loop entry
ALIGN 16
process_qwords:
mov rcx, qword ptr [rax] ; load aligned qword
process_qwords_alt:
add rax, 8 ; src += 8
lea rdx, [rcx + r10] ; qword - 0x0101010101010101
not rcx ; ~qword
and rcx, r9 ; ~qword & 0x8080808080808080
and rcx, rdx ; (qword - 0x0101010101010101) & (~qword & 0x8080808080808080)
jz process_qwords ; if zero, no null byte found, continue
bsf rcx, rcx ; find first set bit (null byte)
shr rcx, 3 ; convert bit position to byte position
lea rax, [rax + rcx - 8] ; reverse pre-increment to compute byte count
sub rax, r8 ; src - original src = length
ret
strglen ENDP
ALIGN 16
_TEXT ENDS
END
I used the ISO-C99 test scaffolding below to check the correctness of the strglen
implementation above. I built with the Microsoft Macro Assembler and the Intel C/C++ compiler, as follows: ml64 -c strglen.asm
, icl /Qstd=c99 /Ox /W4 strlen.c strglen.obj
. I also inspected the generated machine code with dumpbin /disasm strglen.obj
, mostly to make sure the alignment directives work as desired and that loop unrolling via the REPT
macro works correctly, as I had not used that in the past twenty years.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern size_t strglen (const char* str);
int main (void)
{
const char a[] = "0123456789 the quick brown fox jumps over the lazy dog";
char* src = malloc (sizeof(a));
size_t ref, res;
printf ("src=%p\n", a);
for (int srcofs = 0; srcofs < 8; srcofs++) {
for (size_t len = 0; len < sizeof(a); len++) {
memcpy (src, a, sizeof(a));
src [len] = 0;
ref = strlen (src + srcofs);
res = strglen (src + srcofs);
if (res != ref) {
printf ("error @ srcofs=%d len=%zu ref=%zu res=%zu\n",
srcofs, len, ref, res);
return EXIT_FAILURE;
}
}
}
printf ("Test passed\n");
return EXIT_SUCCESS;
}
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论