在M1 Mac上搜索数组的最快方法

huangapple go评论70阅读模式
英文:

Fastest way to search an array on m1 mac

问题

我正在尝试从内存加载一个u16数组,并尽快在M1 Mac上找到第一个小于某个数字的元素。我查看了NEON指令,但没能找到实现这一目标的好方法。虽然有向量比较指令,但它们得到的是每个元素全为1或全为0的向量。如何把这样的结果放进通用寄存器?另外,有没有办法通过测试向量指令的结果来跳出循环?

英文:

I am trying to load an array of u16s from memory and find the first element that is less than some number, as fast as possible on an M1 mac. I have been looking through the NEON instructions, but I wasn't able to find a good way to do it. There are comparison vector instructions, but they leave you with vectors of elements of all 1 or 0. How would you get that into a general purpose register? And is there a way to break a loop by testing a vector instruction?

答案1

得分: 1

Here is the translated code without the comments and other non-translatable parts:

    // int32_t searchArrayU16(uint16_t *pArray, uint16_t threshold, uint32_t len);
    //
    // Returns the index of the first element with pArray[i] < threshold
    // (unsigned compare), or -1 when no such element exists in [0, len).
    //
    // In:    x0 = pArray, w1 = threshold (only the low 16 bits are used),
    //        w2 = len in elements
    // Out:   w0 = index of the first match, or -1
    // Clobb: x0, x1, x3-x7, x15, v0, v1, v16-v31, flags (leaf, no stack use)
    //
    // The array is processed in blocks of 128 elements (256 bytes), so memory
    // is read up to the next multiple of 128 elements. Intended contract:
    //   assert((len & 127) == 0);
    //   assert(len >= 128);
    // Outside that contract: len == 0 returns -1 without reading memory, and a
    // hit at an index >= len (possible only when len is not a multiple of 128)
    // is reported as -1; the buffer must still be readable to the end of the
    // last 128-element block.

    .arch armv8-a
    .global searchArrayU16
    .text

    pArray0 .req x0             // read pointer for the 1st and 3rd 64-byte load
    thresh .req w1              // threshold; dead after the dup below
    len .req w2                 // 32-bit view: the upper half of x2 is not defined for a uint32_t argument
    stride .req x3              // post-increment applied to both pointers (bytes)
    pArray1 .req x4             // read pointer for the 2nd and 4th 64-byte load
    count .req w5               // elements consumed so far (multiple of 128)
    val0 .req x6                // match bitmap, elements 0..63 of the block
    val0w .req w6
    val1 .req x7                // match bitmap, elements 64..127 of the block
    val1w .req w7

    .balign 64
    .func
    searchArrayU16:
        cbz len, 4f             // empty array: nothing to read, report "not found"
        adr x15, 2f
        dup v0.8h, thresh       // v0 = threshold in every 16-bit lane
        ld1r {v1.2d}, [x15]     // v1 = bytes 1,2,4,...,128 repeated in both halves
        mov stride, #128        // each pointer advances 128 bytes per load
        add pArray1, pArray0, #64
        mov count, #0

        b 1f                    // jump over the in-line constant

    .balign 8
    2:
    .byte 1, 2, 4, 8, 16, 32, 64, 128

    .balign 64
    1:
        // Load 128 elements: the two pointers interleave in 64-byte chunks.
        ld1 {v16.8h-v19.8h}, [pArray0], stride
        ld1 {v20.8h-v23.8h}, [pArray1], stride
        ld1 {v24.8h-v27.8h}, [pArray0], stride
        ld1 {v28.8h-v31.8h}, [pArray1], stride

        // lane = 0xFFFF where threshold > element (unsigned), else 0
        cmhi v16.8h, v0.8h, v16.8h
        cmhi v17.8h, v0.8h, v17.8h
        cmhi v18.8h, v0.8h, v18.8h
        cmhi v19.8h, v0.8h, v19.8h
        cmhi v20.8h, v0.8h, v20.8h
        cmhi v21.8h, v0.8h, v21.8h
        cmhi v22.8h, v0.8h, v22.8h
        cmhi v23.8h, v0.8h, v23.8h
        cmhi v24.8h, v0.8h, v24.8h
        cmhi v25.8h, v0.8h, v25.8h
        cmhi v26.8h, v0.8h, v26.8h
        cmhi v27.8h, v0.8h, v27.8h
        cmhi v28.8h, v0.8h, v28.8h
        cmhi v29.8h, v0.8h, v29.8h
        cmhi v30.8h, v0.8h, v30.8h
        cmhi v31.8h, v0.8h, v31.8h

        // Narrow each 16-bit mask to one byte: 16 elements per register.
        uzp1 v16.16b, v16.16b, v17.16b
        uzp1 v18.16b, v18.16b, v19.16b
        uzp1 v20.16b, v20.16b, v21.16b
        uzp1 v22.16b, v22.16b, v23.16b
        uzp1 v24.16b, v24.16b, v25.16b
        uzp1 v26.16b, v26.16b, v27.16b
        uzp1 v28.16b, v28.16b, v29.16b
        uzp1 v30.16b, v30.16b, v31.16b

        // Keep one distinct bit per byte: byte i keeps bit (i mod 8).
        and v16.16b, v16.16b, v1.16b
        and v18.16b, v18.16b, v1.16b
        and v20.16b, v20.16b, v1.16b
        and v22.16b, v22.16b, v1.16b
        and v24.16b, v24.16b, v1.16b
        and v26.16b, v26.16b, v1.16b
        and v28.16b, v28.16b, v1.16b
        and v30.16b, v30.16b, v1.16b

        // Three pairwise-add rounds pack the bits: 2, then 4, then 8 per byte.
        addp v16.16b, v16.16b, v18.16b
        addp v20.16b, v20.16b, v22.16b
        addp v24.16b, v24.16b, v26.16b
        addp v28.16b, v28.16b, v30.16b

        addp v16.16b, v16.16b, v20.16b
        addp v24.16b, v24.16b, v28.16b

        add count, count, #128
        addp v16.16b, v16.16b, v24.16b  // v16 = 128-bit bitmap, bit i = element i of the block

        mov val0, v16.d[0]
        mov val1, v16.d[1]

        orr x15, val0, val1
        cbnz x15, 3f            // at least one element of this block matched

        cmp len, count          // 32-bit compare; both operands are element counts
        b.hi 1b                 // more elements left

    .balign 16
    4:
        mov w0, #-1             // no match found
        ret

    .balign 16
    3:
        // rbit + clz = index of the lowest set bit of each bitmap half.
        rbit val0, val0
        rbit val1, val1
        cmp val0, #0            // ne: the first match is in the lower 64 elements
        sub w0, count, #128     // index of the block's first element
        sub w1, count, #64      // index of the block's 65th element
        clz val0, val0
        clz val1, val1
        add w0, w0, val0w
        add w1, w1, val1w
        csel w0, w0, w1, ne
        cmp w0, len
        csinv w0, w0, wzr, lo   // a hit at index >= len lies outside the array: return -1
        ret
    .endfunc
    .end

Please note that this is a direct translation, and the code's functionality is retained as per the original code provided.

英文:
// int32_t searchArrayU16(uint16_t *pArray, uint16_t threshold, uint32_t len);
//
// Returns the index of the first element with pArray[i] < threshold
// (unsigned compare), or -1 when no such element exists in [0, len).
//
// In:    x0 = pArray, w1 = threshold (only the low 16 bits are used),
//        w2 = len in elements
// Out:   w0 = index of the first match, or -1
// Clobb: x0, x1, x3-x7, x15, v0, v1, v16-v31, flags (leaf, no stack use)
//
// The array is processed in blocks of 128 elements (256 bytes), so memory is
// read up to the next multiple of 128 elements. Intended contract:
// assert((len & 127) == 0);
// assert(len >= 128);
// Outside that contract: len == 0 returns -1 without reading memory, and a
// hit at an index >= len (possible only when len is not a multiple of 128)
// is reported as -1; the buffer must still be readable to the end of the
// last 128-element block.
//
// NOTE(review): presumably written for GNU as / ELF; on macOS (Mach-O) the
// C-visible symbol normally needs a leading underscore - confirm for the
// target toolchain.

	.arch armv8-a
	.global	searchArrayU16
	.text


pArray0	.req	x0		// read pointer for the 1st and 3rd 64-byte load
thresh	.req	w1		// threshold; dead after the dup below
len		.req	w2		// 32-bit view: the upper half of x2 is not defined for a uint32_t argument
stride	.req	x3		// post-increment applied to both pointers (bytes)
pArray1	.req	x4		// read pointer for the 2nd and 4th 64-byte load
count	.req	w5		// elements consumed so far (multiple of 128)
val0	.req	x6		// match bitmap, elements 0..63 of the block
val0w	.req	w6
val1	.req	x7		// match bitmap, elements 64..127 of the block
val1w	.req	w7

.balign 64
.func
searchArrayU16:
	cbz		len, 4f		// empty array: nothing to read, report "not found"
	adr		x15, 2f
	dup		v0.8h, thresh	// v0 = threshold in every 16-bit lane
	ld1r	{v1.2d}, [x15]	// v1 = bytes 1,2,4,...,128 repeated in both halves
	mov		stride, #128	// each pointer advances 128 bytes per load
	add		pArray1, pArray0, #64
	mov		count, #0

	b		1f		// jump over the in-line constant

.balign 8
2:
.byte	1, 2, 4, 8, 16, 32, 64, 128

.balign 64
1:
	// Load 128 elements: the two pointers interleave in 64-byte chunks.
	ld1		{v16.8h-v19.8h}, [pArray0], stride
	ld1		{v20.8h-v23.8h}, [pArray1], stride
	ld1		{v24.8h-v27.8h}, [pArray0], stride
	ld1		{v28.8h-v31.8h}, [pArray1], stride

	// lane = 0xFFFF where threshold > element (unsigned), else 0
	cmhi	v16.8h, v0.8h, v16.8h
	cmhi	v17.8h, v0.8h, v17.8h
	cmhi	v18.8h, v0.8h, v18.8h
	cmhi	v19.8h, v0.8h, v19.8h
	cmhi	v20.8h, v0.8h, v20.8h
	cmhi	v21.8h, v0.8h, v21.8h
	cmhi	v22.8h, v0.8h, v22.8h
	cmhi	v23.8h, v0.8h, v23.8h
	cmhi	v24.8h, v0.8h, v24.8h
	cmhi	v25.8h, v0.8h, v25.8h
	cmhi	v26.8h, v0.8h, v26.8h
	cmhi	v27.8h, v0.8h, v27.8h
	cmhi	v28.8h, v0.8h, v28.8h
	cmhi	v29.8h, v0.8h, v29.8h
	cmhi	v30.8h, v0.8h, v30.8h
	cmhi	v31.8h, v0.8h, v31.8h

	// Narrow each 16-bit mask to one byte: 16 elements per register.
	uzp1	v16.16b, v16.16b, v17.16b
	uzp1	v18.16b, v18.16b, v19.16b
	uzp1	v20.16b, v20.16b, v21.16b
	uzp1	v22.16b, v22.16b, v23.16b
	uzp1	v24.16b, v24.16b, v25.16b
	uzp1	v26.16b, v26.16b, v27.16b
	uzp1	v28.16b, v28.16b, v29.16b
	uzp1	v30.16b, v30.16b, v31.16b

	// Keep one distinct bit per byte: byte i keeps bit (i mod 8).
	and		v16.16b, v16.16b, v1.16b
	and		v18.16b, v18.16b, v1.16b
	and		v20.16b, v20.16b, v1.16b
	and		v22.16b, v22.16b, v1.16b
	and		v24.16b, v24.16b, v1.16b
	and		v26.16b, v26.16b, v1.16b
	and		v28.16b, v28.16b, v1.16b
	and		v30.16b, v30.16b, v1.16b

	// Three pairwise-add rounds pack the bits: 2, then 4, then 8 per byte.
	addp	v16.16b, v16.16b, v18.16b
	addp	v20.16b, v20.16b, v22.16b
	addp	v24.16b, v24.16b, v26.16b
	addp	v28.16b, v28.16b, v30.16b

	addp	v16.16b, v16.16b, v20.16b
	addp	v24.16b, v24.16b, v28.16b

	add		count, count, #128
	addp	v16.16b, v16.16b, v24.16b	// v16 = 128-bit bitmap, bit i = element i of the block

// total pipeline stall here (vector -> general-purpose register transfer)

	mov		val0, v16.d[0]
	mov		val1, v16.d[1]

	orr		x15, val0, val1
	cbnz	x15, 3f	// found a match!!!

	cmp		len, count	// 32-bit compare; both operands are element counts
	b.hi	1b		// more elements left
.balign 16
4:
	mov		w0, #-1		// no match found
	ret

.balign 16
3:
	// rbit + clz = index of the lowest set bit of each bitmap half.
	rbit	val0, val0
	rbit	val1, val1
	cmp		val0, #0		// ne: the first match is in the lower 64 elements
	sub		w0, count, #128	// index of the block's first element
	sub		w1, count, #64	// index of the block's 65th element
	clz		val0, val0
	clz		val1, val1
	add		w0, w0, val0w
	add		w1, w1, val1w
	csel	w0, w0, w1, ne
	cmp		w0, len
	csinv	w0, w0, wzr, lo	// a hit at index >= len lies outside the array: return -1
	ret
.endfunc
.end

Here you are. It returns -1 when no match is found.
It should work on all armv8-a cores or above.

huangapple
  • 本文由 发表于 2023年1月9日 15:46:24
  • 转载请务必保留本文链接:https://go.coder-hub.com/75054362.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定