英文:
SIMD: how to find minimum values among 4 __m256d registers with its index
问题
我有4个 _m256d,如何找到所有16个值中的最小值?
我怎样知道最小值来自哪个 __m256d 变量?以及它是哪个元素?假设部分值在不同的 __m256d 变量中相同
我正在尝试,但不起作用:
#include <immintrin.h>
#include <float.h>
int main()
{
// 要么 v1[0] 要么 v3[2] 是答案。
__m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
__m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
__m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
__m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);
__m256d min = _mm256_set1_pd(DBL_MAX);
// 找出所有16个值中的最小值
min = _mm256_min_pd(min, v1);
min = _mm256_min_pd(min, v2);
min = _mm256_min_pd(min, v3);
min = _mm256_min_pd(min, v4);
// 获得最小元素的4位掩码
int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));
// 提取最小元素的索引
int index = __builtin_ffs(mask) - 1;
// 确定最小值来自哪个 __m256d 变量以及它是哪个元素
__m256d* v[4] = {&v1, &v2, &v3, &v4};
int v_index = index / 4;
int elem_index = index % 4;
printf("最小值为 %lf 来自 v%d 的元素 %d\n", min[elem_index], v_index + 1, elem_index);
return 0;
}
英文:
I have 4 _m256d, how can I find the minimum among all 16 values?
How can I know the minimum value come from which __m256d variable? and which element is it? assume part of values are the same in different __m256d variable
I'm trying but it doesn't work:
#include <immintrin.h>
#include <float.h>
int main()
{
// either v1[0] or v3[2] is the answer.
__m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
__m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
__m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
__m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);
__m256d min = _mm256_set1_pd(DBL_MAX);
// Find the minimum among all 16 values
min = _mm256_min_pd(min, v1);
min = _mm256_min_pd(min, v2);
min = _mm256_min_pd(min, v3);
min = _mm256_min_pd(min, v4);
// Get a 4-bit mask of the minimum elements
int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));
// Extract the index of the minimum element
int index = __builtin_ffs(mask) - 1;
// Determine which __m256d variable the minimum value comes from and which element it is
__m256d* v[4] = {&v1, &v2, &v3, &v4};
int v_index = index / 4;
int elem_index = index % 4;
printf("The minimum value is %lf from v%d at element %d\n", min[elem_index], v_index + 1, elem_index);
return 0;
}
答案1
得分: 1
以下是您要翻译的内容:
假设您有AVX1但没有AVX2,我会这样做。
#include <immintrin.h>
struct sMin16
{
// The minimum value
double val;
// Index of the first minimum element
int index;
};
// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors
sMin16 min16(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
{
// 计算4个向量的垂直最小值
__m256d t0 = _mm256_min_pd(v0, v1);
__m256d t1 = _mm256_min_pd(v2, v3);
t0 = _mm256_min_pd(t0, t1);
// 计算广播的水平最小值,交换16字节块,计算最小值
t1 = _mm256_permute2f128_pd(t0, t0, 0x01);
t0 = _mm256_min_pd(t0, t1);
// 成对交换元素,计算最小值
t1 = _mm256_shuffle_pd(t0, t0, 0b0101);
t0 = _mm256_min_pd(t0, t1);
// 存储最小值
sMin16 result;
result.val = _mm256_cvtsd_f64(t0);
// 将数字与广播的最小值比较,生成结果的位图
uint32_t mask;
mask = (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v0, _CMP_EQ_OQ));
mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v1, _CMP_EQ_OQ)) << 4;
mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v2, _CMP_EQ_OQ)) << 8;
mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v3, _CMP_EQ_OQ)) << 12;
// 我们有一个16位的位图,一个位表示等于最小值的元素
// 计算第一个等于最小值的元素的索引
#ifdef _MSC_VER
unsigned long idx;
_BitScanForward(&idx, mask);
result.index = idx;
#else
result.index = __builtin_ctz(mask);
#endif
return result;
}
计算的索引的最低2位包含向量中的通道索引,索引的位[2 .. 3]包含具有第一个最小元素的向量的索引。
请注意,_mm256_set_pd
内联函数会反转参数的顺序,所以 _mm256_set_pd(1, 2, 3, 4)
创建一个具有值 [4, 3, 2, 1] 的向量,而不是 [1, 2, 3, 4]。 由于这个原因,对于您的测试用例,该函数返回最小索引= 3,这是 v0
向量中的最后一个通道。
英文:
Assuming you have AVX1 but not AVX2, I would do it like that.
<!-- language-all: lang-cpp -->
#include <immintrin.h>
struct sMin16
{
// The minimum value
double val;
// Index of the first minimum element
int index;
};
// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors
sMin16 min16( __m256d v0, __m256d v1, __m256d v2, __m256d v3 )
{
// Compute vertical minimum of the 4 vectors
__m256d t0 = _mm256_min_pd( v0, v1 );
__m256d t1 = _mm256_min_pd( v2, v3 );
t0 = _mm256_min_pd( t0, t1 );
// Compute broadcasted horizontal minimum of `t0` vector
// Swap 16-byte pieces, compute minimum
t1 = _mm256_permute2f128_pd( t0, t0, 0x01 );
t0 = _mm256_min_pd( t0, t1 );
// Swap elements pairwise, compute minimum
t1 = _mm256_shuffle_pd( t0, t0, 0b0101 );
t0 = _mm256_min_pd( t0, t1 );
// Store the minimum value
sMin16 result;
result.val = _mm256_cvtsd_f64( t0 );
// Compare numbers for equality with the broadcasted minimum,
// and make a bitmap of the results
uint32_t mask;
mask = (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v0, _CMP_EQ_OQ ) );
mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v1, _CMP_EQ_OQ ) ) << 4;
mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v2, _CMP_EQ_OQ ) ) << 8;
mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v3, _CMP_EQ_OQ ) ) << 12;
// We have a bitmap of 16 bits, a bit is set for element展开收缩 equal to the minimum
// Compute index of the first element equal to the minimum
#ifdef _MSC_VER
unsigned long idx;
_BitScanForward( &idx, mask );
result.index = idx;
#else
result.index = __builtin_ctz( mask );
#endif
return result;
}
The lowest 2 bits of the computed index contain lane index in a vector, and bits [ 2 .. 3 ] of the index contain index of the vector with the first minimum element.
Note that _mm256_set_pd
intrinsic reverses the order of the arguments, so _mm256_set_pd( 1, 2, 3, 4 )
creates a vector with the values [ 4, 3, 2, 1 ], not [ 1, 2, 3, 4 ].<br/>
For this reason, the function returns minimum index = 3 for your test case, that’s last lane in the v0
vector.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论