2023年6月5日 00:53:54go评论66阅读模式

英文:

Speed up strlen using SWAR in x86-64 assembly

问题

The asm function strlen receives the link to a string as a char - Array. To do so, the function may use SWAR on a general-purpose register, but without using xmm registers or SSE instructions.

The function checks with bit manipulation: (v - 0x01010101) & ~(v & 0x80808080) in 8-byte steps if the word contains a zero byte, which marks the end of the string. If it does, it iterates byte by byte until the zero to avoid a page fault.

The alignment works like in this GNU Libc implementation:

for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
    if (*char_ptr == 'for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == '\0'){
return char_ptr - str;
}
}
'){
        return char_ptr - str;
    }
}

Is there any way to make it faster?

; rdi <- const *char
; rax <- counter + return value
; r10 <- current array for computation
; rcx, r8 <- Bitmask
; rsi <- Arr for calculation 

strlen:
    PUSH rbp
    SUB rsp, 8

    XOR rax, rax
    MOV r8, 31
alignment:
    CMP byte [rdi+rax], 0
    JE end

    MOV rsi, rdi
    ADD rsi, rax
    AND rsi, r8
    CMP rsi, 0
    JE while_parallel
    INC rax
    JMP alignment

while_parallel:
    MOV rcx, 0x01010101
    MOV r8, 0x80808080
while_parallel_loop:
    MOV r10, [rdi+rax]
    MOV rsi, r10

    NOT r10
    AND r10, r8
    SUB rsi, rcx
    AND rsi, r10

    CMP rsi, 0
    JNE while_single
    ADD rax, 8
    JMP while_parallel_loop

while_single:
    CMP byte [rdi+rax], 0
    JE end
    INC rax
    JMP while_single
end:
    ADD rsp, 8
    POP rbp
    RET

Note that there is no intention to use any SSE instructions or xmm registers.

英文:

The asm function strlen receives the link to a string as a char - Array. To to so, the function may use SWAR on general purpose register, but without using xmm register or SSE instructions.

The function checks with the bit manipulation: (v - 0x01010101) & ~(v & 0x80808080)in 8 byte steps,
if the word contains a zero byte, which marks the end of the string. If it does, it iterates byte per byte until the zero to avoid a page fault.

The alignment works like in this GNU Libc implementation:

for (char_ptr = str; ((unsigned long int) char_ptr &amp; (sizeof (longword) - 1)) != 0; ++char_ptr){
    if (*char_ptr == &#39;for (char_ptr = str; ((unsigned long int) char_ptr &amp; (sizeof (longword) - 1)) != 0; ++char_ptr){
if (*char_ptr == &#39;\0&#39;){
return char_ptr - str;
}
}
&#39;){
        return char_ptr - str;
    }
}

Is there any way to make it faster?

; rdi &lt;- const *char
; rax &lt;- counter + return value
; r10 &lt;- current array for computation
; rcx,r8 &lt;- Bitmask
; rsi &lt;- Arr for calculation 
            
strlen:
    PUSH rbp
    SUB rsp,8
 
    XOR rax,rax    
    MOV r8,31
alignment:
    CMP byte [rdi+rax],0
    JE end
    
    MOV rsi,rdi
    ADD rsi,rax
    AND rsi,r8
    CMP rsi,0
    JE while_parallel
    INC rax
    JMP alignment       
          
while_parallel:  
    MOV rcx,0x01010101
    MOV r8,0x80808080
while_parallel_loop:  
    MOV r10,[rdi+rax]
    MOV rsi,r10
    
    NOT r10
    AND r10,r8
    SUB rsi,rcx
    AND rsi,r10
    
    CMP rsi,0
    JNE while_single
    ADD rax,8
    JMP while_parallel_loop
    
while_single:
    CMP byte [rdi+rax],0
    JE  end
    INC rax
    JMP while_single    
end:
    ADD rsp,8
    POP rbp
    RET

Note that I do not intend to use any SSE instructions nor xmm register.

答案1

得分: 4

以下是对问题的代码的翻译：

问题中的代码似乎存在一个缺陷：Mycroft的魔术常数没有扩展到64位。至于效率：各种x86-64调用约定主要基于寄存器，因此对于简单的函数来说，没有必要维护堆栈帧。在发布的代码中的主循环似乎有点太复杂；请注意，大多数x86指令会设置可以用于分支的标志。与其按字节处理直到达到字符串起始的下一个QWORD边界，不如直接读取它所在的整个QWORD，并用0xff覆盖字符串起始之前的字节。然后，可以通过常规的QWORD处理循环处理生成的“掩码”数据。

以下是为Windows编写的x86-64代码，其中包含这些建议。将其调整为Linux使用的System V ABI只需要循环交换寄存器。请注意，在具有BMI指令集扩展的CPU上，可以通过将“not rcx”与后续的“and rcx, r9”合并为“andn rcx, rcx, r9”来减少64位处理循环的一个指令。

PUBLIC strglen

_TEXT SEGMENT

ALIGN 16

;; Alan Mycroft的零字节检测算法（newsgroup comp.lang.c，1987/04/08，
;; https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ）：
;; null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))

mycroft_magic1 EQU 0101010101010101h
mycroft_magic2 EQU 8080808080808080h

;; size_t strglen (const char *src);

;; Windows x86-64调用约定：
;; 函数参数：rcx、rdx、r8、r9
;; 函数返回值：rax
;; 临时寄存器：rax、rcx、rdx、r8、r9、r10、r11

strglen PROC
mov rax, rcx ; src指针
mov r8, rcx ; src指针
mov r9, mycroft_magic2 ; 0x8080808080808080
mov r10, -mycroft_magic1 ; -0x0101010101010101
and rcx, 7 ; ofs = src & 7；src按qword对齐？
jz process_qwords ; 是的，ofs为零，以qword为单位处理

sub rax, rcx ; src -= ofs
shl rcx, 3 ; ofs * CHAR_BIT
mov rdx, -1 ; ~0
shl rdx, cl ; ~(~0 << (ofs * CHAR_BIT))
mov rcx, qword ptr [rax] ; 加载对齐的qword
not rdx ; 掩码 = ~(~0 << (ofs * CHAR_BIT))
or rcx, rdx ; 将未使用的字节设置为0xff
jmp process_qwords_alt ; 使用替代循环入口

ALIGN 16

process_qwords:
mov rcx, qword ptr [rax] ; 加载对齐的qword
process_qwords_alt:
add rax, 8 ; src += 8
lea rdx, [rcx + r10] ; qword - 0x0101010101010101
not rcx ; ~qword
and rcx, r9 ; ~qword & 0x8080808080808080
and rcx, rdx ; (qword - 0x0101010101010101) & (~qword & 0x8080808080808080)
jz process_qwords ; 如果为零，未找到null字节，继续

bsf rcx, rcx ; 找到第一个置位位（null字节）
shr rcx, 3 ; 将位位置转换为字节位置
lea rax, [rax + rcx - 8] ; 反向预递减以计算字节数
sub rax, r8 ; src - 原始src = 长度
ret

strglen ENDP

ALIGN 16

_TEXT ENDS

END

我使用了下面的ISO-C99测试框架来检查上述strglen实现的正确性。我使用Microsoft Macro Assembler和Intel C/C++编译器构建了它，如下所示：ml64 -c strglen.asm，icl /Qstd=c99 /Ox /W4 strlen.c strglen.obj。我还使用dumpbin /disasm strglen.obj检查了生成的机器代码，主要是为了确保对齐指令按预期工作，并且通过REPT宏进行的循环展开是否正确，因为我在过去二十年中没有使用过它。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern size_t strglen (const char* str);

int main (void)
{
    const char a[] = "0123456789 the quick brown fox jumps over the lazy dog";
    char* src =  malloc (sizeof(a));
    size_t ref, res;

    printf ("src=%p\n", a);

    for (int srcofs = 0; srcofs < 8; srcofs++) {
        for (size_t len = 0; len < sizeof(a); len++) {
            memcpy (src, a, sizeof(a));
            src [len] = 0;
            ref = strlen (src + srcofs);
            res = strglen (src + srcofs);
            if (res != ref) {
                printf ("error @ srcofs=%d  len=%zu  ref=%zu  res=%zu\n", 
                        srcofs, len, ref, res);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("Test passed\n");
    return EXIT_SUCCESS;
}

希望这可以帮助你理解问题中的代码。如果有任何进一步的问题，请随时提出。

英文:

The code from the question seems to have a defect: Mycroft's magic constants have not been extended to 64 bits. As for efficiency: the various x86-64 calling conventions are primarily register-based, so it is not necessary to maintain a stack frame for simple functions. The main loop in the posted code appears a bit too complex; note that most x86 instructions set flags that can be used for branching. Instead of processing byte-wise until reaching the next QWORD boundary from the start of the string, simply read the entire QWORD it falls in, and overwrite the bytes preceding start of the string with 0xff. The resulting "masked' data can then be processed by the regular QWORD processing loop.

Below is x86-64 code written for Windows that incorporates these suggestions. Adjusting it to the System V ABI used by Linux should require nothing more than a cyclical exchange of registers. Note that on CPUs with BMI instruction set extension the 64-bit processing loop can be reduced by one instructions by merging the not rcx with the following and rcx, r9 into andn rcx, rcx, r9.

PUBLIC  strglen

_TEXT   SEGMENT

        ALIGN 16

;; Alan Mycroft&#39;s null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
;; https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
;; null_byte(x) = ((x - 0x01010101) &amp; (~x &amp; 0x80808080))
 
mycroft_magic1 EQU 0101010101010101h
mycroft_magic2 EQU 8080808080808080h 

;; size_t strglen (const char *src);

;; Windows x86-64 calling convention:
;; function arguments: rcx, rdx, r8, r9
;; function return value: rax
;; scratch registers: rax, rcx, rdx, r8, r9, r10, r11

strglen PROC
        mov    rax, rcx             ; src pointer
        mov    r8, rcx              ; src pointer
        mov    r9, mycroft_magic2   ; 0x8080808080808080
        mov    r10, -mycroft_magic1 ; -0x0101010101010101
        and    rcx, 7               ; ofs = src &amp; 7; src qword aligned ?
        jz     process_qwords       ; yes, ofs is zero, process qword wise

        sub    rax, rcx             ; src -= ofs
        shl    rcx, 3               ; ofs * CHAR_BIT
        mov    rdx, -1              ; ~0
        shl    rdx, cl              ; ~0 &lt;&lt; (ofs * CHAR_BIT)
        mov    rcx, qword ptr [rax] ; load aligned qword
        not    rdx                  ; mask = ~(~0 &lt;&lt; (ofs * CHAR_BIT))
        or     rcx, rdx             ; set unused bytes to 0xff
        jmp    process_qwords_alt   ; use alternate loop entry

        ALIGN 16

process_qwords:
        mov    rcx, qword ptr [rax] ; load aligned qword
process_qwords_alt:
        add    rax, 8               ; src += 8
        lea    rdx, [rcx + r10]     ; qword - 0x0101010101010101
        not    rcx                  ; ~qword
        and    rcx, r9              ; ~qword &amp; 0x8080808080808080
        and    rcx, rdx             ; (qword - 0x0101010101010101) &amp; (~qword &amp; 0x8080808080808080)
        jz     process_qwords       ; if zero, no null byte found, continue

        bsf    rcx, rcx             ; find first set bit (null byte)
        shr    rcx, 3               ; convert bit position to byte position
        lea    rax, [rax + rcx - 8] ; reverse pre-increment to compute byte count
        sub    rax, r8              ; src - original src = length
        ret

strglen ENDP

        ALIGN 16

_TEXT   ENDS

        END

I used the ISO-C99 test scaffolding below to check the correctness of the strglen implementation above. I built with the Microsoft Macro Assembler and the Intel C/C++ compiler, as follows: ml64 -c strglen.asm, icl /Qstd=c99 /Ox /W4 strlen.c strglen.obj. I also inspected the generated machine code with dumpbin /disasm strglen.obj, mostly to make sure the alignment directives work as desired and that loop unrolling via the REPT macro works correctly, as I had not used that in the past twenty years.

#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;string.h&gt;

extern size_t strglen (const char* str);

int main (void)
{
    const char a[] = &quot;0123456789 the quick brown fox jumps over the lazy dog&quot;;
    char* src =  malloc (sizeof(a));
    size_t ref, res;

    printf (&quot;src=%p\n&quot;, a);

    for (int srcofs = 0; srcofs &lt; 8; srcofs++) {
        for (size_t len = 0; len &lt; sizeof(a); len++) {
            memcpy (src, a, sizeof(a));
            src [len] = 0;
            ref = strlen (src + srcofs);
            res = strglen (src + srcofs);
            if (res != ref) {
                printf (&quot;error @ srcofs=%d  len=%zu  ref=%zu  res=%zu\n&quot;, 
                        srcofs, len, ref, res);
                return EXIT_FAILURE;
            }
        }
    }
    printf (&quot;Test passed\n&quot;);
    return EXIT_SUCCESS;
}

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

在x86-64汇编中使用SWAR加速strlen。

问题

答案1

如何在x86汇编中编写一个操作数，它是从一个N位内存位置加载的512位向量。

Clang能够被说服优化这个几乎是叶子函数吗？

Creating bootable image for VirtualBox

汇编浮点数运算

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论