英文:
SIMD Intrinsics AVX. Tried to use _mm256_mullo_epi64. But got 0xC000001D: Illegal Instruction exception
问题
以下是代码部分的中文翻译:
我想使用SIMD来相乘两个NxN矩阵。我想要对64位整数执行矩阵乘法,并将一个矩阵的一个元素与具有相同索引的另一个元素相乘。例如:
c[1][1] = a[1][1] * b[1][1]
在使用 _mm256_mullo_epi64
操作进行乘法时出现错误。我无法弄清楚为什么会发生这种情况。我能否将结果值写入256位寄存器?
#include <iostream>
#include <immintrin.h>
using namespace std;
int avx_mult(__int64** A, __int64** B, __int64** C, int N) {
cout << "AVX mult:" << endl;
if (N < 4 || N % 4 != 0) return 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j += 4) {
// 将结果AMX寄存器填充为零
__m256i c_line = _mm256_setzero_si256();
// 从数组A加载4个长长整数元素到AMX寄存器
__m256i a_line = _mm256_loadu_si256((__m256i*) & A[i][j]);
// 从数组B加载4个长长整数元素到AMX寄存器
__m256i b_line = _mm256_loadu_si256((__m256i*) & B[i][j]);
// 将两个AVX寄存器相乘
c_line = _mm256_mullo_epi64(a_line, b_line);
}
}
}
int main() {
const unsigned int N = 4; // 数组大小
__int64** A = new __int64* [N];
__int64** B = new __int64* [N];
__int64** C = new __int64* [N];
for (int i = 0; i < N; i++) {
A[i] = new __int64[N];
B[i] = new __int64[N];
C[i] = new __int64[N];
}
for (int i = 0; i < N; i++) { // 填充数组
for (int j = 0; j < N; j++) {
A[i][j] = __int64(rand() % 10);
B[i][j] = __int64(rand() % 10);
C[i][j] = __int64(0);
}
}
avx_mult(A, B, C, N);
for (int i = 0; i < N; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
这段代码可以编译,但程序在以下行上停止:
c_line = _mm256_mullo_epi64(a_line, b_line);
... 并出现退出代码 0xC000001D
:非法指令异常。
Intel Intrinsics文档对于 _mm256_mullo_epi64
的描述如下:
概要
__m256i _mm256_mullo_epi64 (__m256i a, __m256i b) #include <immintrin.h> 指令:vpmullq ymm, ymm, ymm CPUID标志:AVX512DQ + AVX512VL
描述
将包含在
a
和b
中的64位整数相乘,生成中间的128位整数,并将中间整数的低64位存储在dst
中。
我的函数参数符合描述。还有其他错误吗?
英文:
I want to multiply two NxN matrices using SIMD. I want to do matrix multiplication for 64-bit integers, and multiply one element of a matrix with another element with the same index. For example:
c[1][1] = a[1][1] * b[1][1]
An error occurs when multiplying with the _mm256_mullo_epi64
operation. I can't figure out why this is happening. Can I write the resulting value into a 256-bit register?
#include <iostream>
#include <immintrin.h>
using namespace std;
int avx_mult(__int64** A, __int64** B, __int64** C, int N) {
cout << "AVX mult:" << endl;
if (N < 4 || N % 4 != 0) return 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j += 4) {
// filling the resulting AMX register with zeros
__m256i c_line = _mm256_setzero_si256();
// load 4 long long elements from array A into AMX register
__m256i a_line = _mm256_loadu_si256((__m256i*) & A[i][j]);
// load 4 long long elements from array B into AMX register
__m256i b_line = _mm256_loadu_si256((__m256i*) & B[i][j]);
// multiplying two AVX registers
c_line = _mm256_mullo_epi64(a_line, b_line);
}
}
}
int main() {
const unsigned int N = 4; // array size
__int64** A = new __int64* [N];
__int64** B = new __int64* [N];
__int64** C = new __int64* [N];
for (int i = 0; i < N; i++) {
A[i] = new __int64[N];
B[i] = new __int64[N];
C[i] = new __int64[N];
}
for (int i = 0; i < N; i++) { // filling arrays
for (int j = 0; j < N; j++) {
A[i][j] = __int64(rand() % 10);
B[i][j] = __int64(rand() % 10);
C[i][j] = __int64(0);
}
}
avx_mult(A, B, C, N);
for (int i = 0; i < N; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
The code compiles, but the program stops on this line:
c_line = _mm256_mullo_epi64(a_line, b_line);
... with exit code 0xC000001D
: Illegal Instruction exception.
The Intel Intrinsics Documentation for _mm256_mullo_epi64
says:
> #### Synopsis
> cpp
> __m256i _mm256_mullo_epi64 (__m256i a, __m256i b)
> #include <immintrin.h>
> Instruction: vpmullq ymm, ymm, ymm
> CPUID Flags: AVX512DQ + AVX512VL
>
> #### Description
> Multiply the packed 64-bit integers in a
and b
, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst
.
My function arguments fit the description. Or is there some mistake?
答案1
得分: 4
不是每个x86_64处理器都支持每个指令。特别是,_mm256_mullo_epi64
需要AVX-512扩展,如果您的其余代码正常工作,但运行此内联函数会导致执行非法指令,那么您很可能是在没有AVX-512的处理器上运行此代码。
您也可以只使用AVX2来实现打包的64位乘法:
__m256i mul64_haswell (__m256i a, __m256i b) {
__m256i bswap = _mm256_shuffle_epi32(b,0xB1);
__m256i prodlh = _mm256_mullo_epi32(a,bswap);
__m256i prodlh2 = _mm256_srli_epi64(prodlh, 32);
__m256i prodlh3 = _mm256_add_epi32(prodlh2, prodlh);
__m256i prodlh4 = _mm256_and_si256(prodlh3, _mm256_set1_epi64x(0x00000000FFFFFFFF));
__m256i prodll = _mm256_mul_epu32(a,b);
__m256i prod = _mm256_add_epi64(prodll,prodlh4);
return prod;
}
此代码取自@PeterCordes的回答 ,链接是Fastest way to multiply an array of int64_t?。
英文:
Not every x86_64 processor supports every instruction. Namely, _mm256_mullo_epi64
requires AVX-512 extensions, and if the rest of your code works, but running this intrinsic result in executing an illegal instruction, then you're most likely running this code on a processor without AVX-512.
You can implement packed 64-bit multiplication with just AVX2 as well:
__m256i mul64_haswell (__m256i a, __m256i b) {
__m256i bswap = _mm256_shuffle_epi32(b,0xB1);
__m256i prodlh = _mm256_mullo_epi32(a,bswap);
__m256i prodlh2 = _mm256_srli_epi64(prodlh, 32);
__m256i prodlh3 = _mm256_add_epi32(prodlh2, prodlh);
__m256i prodlh4 = _mm256_and_si256(prodlh3, _mm256_set1_epi64x(0x00000000FFFFFFFF));
__m256i prodll = _mm256_mul_epu32(a,b);
__m256i prod = _mm256_add_epi64(prodll,prodlh4);
return prod;
}
This code is taken from @PeterCordes' answer to Fastest way to multiply an array of int64_t?
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论