在M1 Mac上搜索数组的最快方法

huangapple go评论116阅读模式
英文:

Fastest way to search an array on m1 mac

问题

我想从内存中加载一个 u16 数组,并在 M1 Mac 上以尽可能快的速度找到第一个小于某个给定值的元素。我研究过 NEON 指令,但没有找到合适的实现方法。NEON 虽然提供了向量比较指令,但其结果是一个各元素要么全为 1、要么全为 0 的向量。怎样才能把这个结果转移到通用寄存器中?另外,有没有办法根据向量指令的结果来跳出循环?

英文:

I am trying to load an array of u16 values from memory and find the first element that is less than some number, as fast as possible on an M1 Mac. I have been looking through the NEON instructions, but I wasn't able to find a good way to do it. There are vector comparison instructions, but they leave you with a vector whose elements are each all ones or all zeros. How would you get that into a general-purpose register? And is there a way to break out of a loop based on the result of a vector instruction?

答案1

得分: 1

Here is the assembly listing from the answer (the original English version of the listing follows further below):

  // ---------------------------------------------------------------------------
  // int32_t searchArrayU16(uint16_t *pArray, uint16_t threshold, uint32_t len);
  //
  // Syntax: GNU as, AArch64 (Advanced SIMD).  ABI: AAPCS64, leaf function,
  // no stack usage.
  //
  // Returns the index of the first element of pArray that is strictly below
  // threshold (unsigned compare), or -1 when no element qualifies.
  //
  // Preconditions (not checked):
  //   (len & 127) == 0   the array is consumed in blocks of 128 elements, so a
  //                      partial trailing block would be read past its end.
  // len == 0 returns -1 without reading memory.
  //
  // In:    x0 = pArray, w1 = threshold, w2 = len
  // Out:   w0 = element index, or -1
  // Clobb: x0, x1, x3-x7, x15, v0, v1, v16-v31, flags
  //
  // NOTE(review): .func/.endfunc are GNU as directives; confirm the assembler
  // used for the target accepts them.  On macOS, C symbols normally carry a
  // leading underscore, so a Mach-O build presumably has to export
  // _searchArrayU16 instead -- verify against the actual toolchain.
  // ---------------------------------------------------------------------------
          .arch   armv8-a
          .global searchArrayU16
          .text

          .equ    BLOCK_ELEMS, 128        // u16 elements tested per loop iteration
          .equ    HALF_BLOCK_ELEMS, 64    // elements described by each 64-bit mask half
          .equ    CHUNK_BYTES, 64         // bytes fetched by one four-register ld1
          .equ    PTR_STRIDE_BYTES, 128   // post-increment applied to each read pointer

  pArray0 .req    x0                      // reads chunks 0 and 2 of every block
  thresh  .req    w1
  len     .req    w2                      // 32-bit view: bits 63:32 of x2 are
                                          // unspecified for a uint32_t argument
  stride  .req    x3
  pArray1 .req    x4                      // reads chunks 1 and 3 of every block
  count   .req    w5                      // elements consumed so far
  val0    .req    x6                      // match bits of block elements 0..63
  val0w   .req    w6
  val1    .req    x7                      // match bits of block elements 64..127
  val1w   .req    w7

          .balign 64
          .func
  searchArrayU16:
          cbz     len, 4f                 // empty array: report "not found"
          adr     x15, 2f
          dup     v0.8h, thresh           // threshold in all eight 16-bit lanes
          ld1r    {v1.2d}, [x15]          // bit-select bytes 1,2,4,...,128 in both halves
          mov     stride, #PTR_STRIDE_BYTES
          add     pArray1, pArray0, #CHUNK_BYTES
          mov     count, #0
          b       1f

          // Constant table kept next to the code so a single adr reaches it.
          .balign 8
  2:
          .byte   1, 2, 4, 8, 16, 32, 64, 128

          .balign 64
  1:
          // Fetch 256 bytes = 128 elements; the two pointers interleave 64-byte chunks.
          ld1     {v16.8h-v19.8h}, [pArray0], stride      // elements   0..31
          ld1     {v20.8h-v23.8h}, [pArray1], stride      // elements  32..63
          ld1     {v24.8h-v27.8h}, [pArray0], stride      // elements  64..95
          ld1     {v28.8h-v31.8h}, [pArray1], stride      // elements  96..127

          // Lane becomes 0xFFFF where threshold > element (unsigned), else 0.
          cmhi    v16.8h, v0.8h, v16.8h
          cmhi    v17.8h, v0.8h, v17.8h
          cmhi    v18.8h, v0.8h, v18.8h
          cmhi    v19.8h, v0.8h, v19.8h
          cmhi    v20.8h, v0.8h, v20.8h
          cmhi    v21.8h, v0.8h, v21.8h
          cmhi    v22.8h, v0.8h, v22.8h
          cmhi    v23.8h, v0.8h, v23.8h
          cmhi    v24.8h, v0.8h, v24.8h
          cmhi    v25.8h, v0.8h, v25.8h
          cmhi    v26.8h, v0.8h, v26.8h
          cmhi    v27.8h, v0.8h, v27.8h
          cmhi    v28.8h, v0.8h, v28.8h
          cmhi    v29.8h, v0.8h, v29.8h
          cmhi    v30.8h, v0.8h, v30.8h
          cmhi    v31.8h, v0.8h, v31.8h

          // Keep the even bytes of each register pair: one 0xFF/0x00 byte per element.
          uzp1    v16.16b, v16.16b, v17.16b
          uzp1    v18.16b, v18.16b, v19.16b
          uzp1    v20.16b, v20.16b, v21.16b
          uzp1    v22.16b, v22.16b, v23.16b
          uzp1    v24.16b, v24.16b, v25.16b
          uzp1    v26.16b, v26.16b, v27.16b
          uzp1    v28.16b, v28.16b, v29.16b
          uzp1    v30.16b, v30.16b, v31.16b

          // Byte i keeps only bit (i mod 8), so the adds below never carry.
          and     v16.16b, v16.16b, v1.16b
          and     v18.16b, v18.16b, v1.16b
          and     v20.16b, v20.16b, v1.16b
          and     v22.16b, v22.16b, v1.16b
          and     v24.16b, v24.16b, v1.16b
          and     v26.16b, v26.16b, v1.16b
          and     v28.16b, v28.16b, v1.16b
          and     v30.16b, v30.16b, v1.16b

          // Three rounds of pairwise adds fold 128 bytes into 128 bits, in element
          // order: bit k of v16 (d[0] low, d[1] high) describes block element k.
          addp    v16.16b, v16.16b, v18.16b
          addp    v20.16b, v20.16b, v22.16b
          addp    v24.16b, v24.16b, v26.16b
          addp    v28.16b, v28.16b, v30.16b
          addp    v16.16b, v16.16b, v20.16b
          addp    v24.16b, v24.16b, v28.16b
          add     count, count, #BLOCK_ELEMS
          addp    v16.16b, v16.16b, v24.16b

          mov     val0, v16.d[0]
          mov     val1, v16.d[1]
          orr     x15, val0, val1
          cbnz    x15, 3f                 // at least one element below threshold
          cmp     len, count              // 32-bit unsigned compare of both counters
          b.hi    1b                      // more blocks left

          .balign 16
  4:
          mov     w0, #-1                 // no match found
          ret

          .balign 16
  3:
          rbit    val0, val0              // bit-reverse so clz counts trailing zeros
          rbit    val1, val1
          cmp     val0, #0                // flags stay live until csel; nothing below sets them
          sub     w0, count, #BLOCK_ELEMS         // index of block element 0
          sub     w1, count, #HALF_BLOCK_ELEMS    // index of block element 64 (w1 reused as scratch)
          clz     val0, val0
          clz     val1, val1
          add     w0, w0, val0w
          add     w1, w1, val1w
          csel    w0, w0, w1, ne          // hit in the low half wins, else take the high half
          ret
          .endfunc
          .end

Please note that this is a direct translation, and the code's functionality is retained as per the original code provided.

英文:
  // ---------------------------------------------------------------------------
  // int32_t searchArrayU16(uint16_t *pArray, uint16_t threshold, uint32_t len);
  // assert((len & 127) == 0);
  //
  // Syntax: GNU as, AArch64 (Advanced SIMD).  ABI: AAPCS64, leaf function,
  // no stack usage.
  //
  // Returns the index of the first element of pArray that is strictly below
  // threshold (unsigned compare), or -1 when no element qualifies.
  //
  // The array is consumed in blocks of 128 elements, so len must be a multiple
  // of 128 (not checked); a partial trailing block would be read past its end.
  // len == 0 returns -1 without reading memory.
  //
  // In:    x0 = pArray, w1 = threshold, w2 = len
  // Out:   w0 = element index, or -1
  // Clobb: x0, x1, x3-x7, x15, v0, v1, v16-v31, flags
  //
  // NOTE(review): .func/.endfunc are GNU as directives; confirm the assembler
  // used for the target accepts them.  On macOS, C symbols normally carry a
  // leading underscore, so a Mach-O build presumably has to export
  // _searchArrayU16 instead -- verify against the actual toolchain.
  // ---------------------------------------------------------------------------
          .arch   armv8-a
          .global searchArrayU16
          .text

          .equ    BLOCK_ELEMS, 128        // u16 elements tested per loop iteration
          .equ    HALF_BLOCK_ELEMS, 64    // elements described by each 64-bit mask half
          .equ    CHUNK_BYTES, 64         // bytes fetched by one four-register ld1
          .equ    PTR_STRIDE_BYTES, 128   // post-increment applied to each read pointer

  pArray0 .req    x0                      // reads chunks 0 and 2 of every block
  thresh  .req    w1
  len     .req    w2                      // 32-bit view: bits 63:32 of x2 are
                                          // unspecified for a uint32_t argument
  stride  .req    x3
  pArray1 .req    x4                      // reads chunks 1 and 3 of every block
  count   .req    w5                      // elements consumed so far
  val0    .req    x6                      // match bits of block elements 0..63
  val0w   .req    w6
  val1    .req    x7                      // match bits of block elements 64..127
  val1w   .req    w7

          .balign 64
          .func
  searchArrayU16:
          cbz     len, 4f                 // empty array: report "not found"
          adr     x15, 2f
          dup     v0.8h, thresh           // threshold in all eight 16-bit lanes
          ld1r    {v1.2d}, [x15]          // bit-select bytes 1,2,4,...,128 in both halves
          mov     stride, #PTR_STRIDE_BYTES
          add     pArray1, pArray0, #CHUNK_BYTES
          mov     count, #0
          b       1f

          // Constant table kept next to the code so a single adr reaches it.
          .balign 8
  2:
          .byte   1, 2, 4, 8, 16, 32, 64, 128

          .balign 64
  1:
          // Fetch 256 bytes = 128 elements; the two pointers interleave 64-byte chunks.
          ld1     {v16.8h-v19.8h}, [pArray0], stride      // elements   0..31
          ld1     {v20.8h-v23.8h}, [pArray1], stride      // elements  32..63
          ld1     {v24.8h-v27.8h}, [pArray0], stride      // elements  64..95
          ld1     {v28.8h-v31.8h}, [pArray1], stride      // elements  96..127

          // Lane becomes 0xFFFF where threshold > element (unsigned), else 0.
          cmhi    v16.8h, v0.8h, v16.8h
          cmhi    v17.8h, v0.8h, v17.8h
          cmhi    v18.8h, v0.8h, v18.8h
          cmhi    v19.8h, v0.8h, v19.8h
          cmhi    v20.8h, v0.8h, v20.8h
          cmhi    v21.8h, v0.8h, v21.8h
          cmhi    v22.8h, v0.8h, v22.8h
          cmhi    v23.8h, v0.8h, v23.8h
          cmhi    v24.8h, v0.8h, v24.8h
          cmhi    v25.8h, v0.8h, v25.8h
          cmhi    v26.8h, v0.8h, v26.8h
          cmhi    v27.8h, v0.8h, v27.8h
          cmhi    v28.8h, v0.8h, v28.8h
          cmhi    v29.8h, v0.8h, v29.8h
          cmhi    v30.8h, v0.8h, v30.8h
          cmhi    v31.8h, v0.8h, v31.8h

          // Keep the even bytes of each register pair: one 0xFF/0x00 byte per element.
          uzp1    v16.16b, v16.16b, v17.16b
          uzp1    v18.16b, v18.16b, v19.16b
          uzp1    v20.16b, v20.16b, v21.16b
          uzp1    v22.16b, v22.16b, v23.16b
          uzp1    v24.16b, v24.16b, v25.16b
          uzp1    v26.16b, v26.16b, v27.16b
          uzp1    v28.16b, v28.16b, v29.16b
          uzp1    v30.16b, v30.16b, v31.16b

          // Byte i keeps only bit (i mod 8), so the adds below never carry.
          and     v16.16b, v16.16b, v1.16b
          and     v18.16b, v18.16b, v1.16b
          and     v20.16b, v20.16b, v1.16b
          and     v22.16b, v22.16b, v1.16b
          and     v24.16b, v24.16b, v1.16b
          and     v26.16b, v26.16b, v1.16b
          and     v28.16b, v28.16b, v1.16b
          and     v30.16b, v30.16b, v1.16b

          // Three rounds of pairwise adds fold 128 bytes into 128 bits, in element
          // order: bit k of v16 (d[0] low, d[1] high) describes block element k.
          addp    v16.16b, v16.16b, v18.16b
          addp    v20.16b, v20.16b, v22.16b
          addp    v24.16b, v24.16b, v26.16b
          addp    v28.16b, v28.16b, v30.16b
          addp    v16.16b, v16.16b, v20.16b
          addp    v24.16b, v24.16b, v28.16b
          add     count, count, #BLOCK_ELEMS
          addp    v16.16b, v16.16b, v24.16b

          // The original author notes a total pipeline stall here: the moves to
          // general-purpose registers wait on the whole vector reduction above.
          mov     val0, v16.d[0]
          mov     val1, v16.d[1]
          orr     x15, val0, val1
          cbnz    x15, 3f                 // found a match
          cmp     len, count              // 32-bit unsigned compare of both counters
          b.hi    1b                      // more blocks left

          .balign 16
  4:
          mov     w0, #-1                 // no match found
          ret

          .balign 16
  3:
          rbit    val0, val0              // bit-reverse so clz counts trailing zeros
          rbit    val1, val1
          cmp     val0, #0                // flags stay live until csel; nothing below sets them
          sub     w0, count, #BLOCK_ELEMS         // index of block element 0
          sub     w1, count, #HALF_BLOCK_ELEMS    // index of block element 64 (w1 reused as scratch)
          clz     val0, val0
          clz     val1, val1
          add     w0, w0, val0w
          add     w1, w1, val1w
          csel    w0, w0, w1, ne          // hit in the low half wins, else take the high half
          ret
          .endfunc
          .end

Here you are. It returns -1 when no match is found.
It should work on all armv8-a cores or above.

huangapple
  • 本文由 发表于 2023年1月9日 15:46:24
  • 转载请务必保留本文链接:https://go.coder-hub.com/75054362.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定