SIMD:如何在4个__m256d寄存器中找到最小值及其索引

huangapple go评论61阅读模式
英文:

SIMD: how to find minimum values among 4 __m256d registers with its index

问题

我有4个 __m256d,如何找到所有16个值中的最小值?
我怎样知道最小值来自哪个 __m256d 变量?以及它是哪个元素?假设部分值在不同的 __m256d 变量中相同
我正在尝试,但不起作用:

#include <immintrin.h>
#include <float.h>

// Sample program from the question: tries to find the minimum of the 16
// doubles held in four __m256d vectors and to report which vector and lane
// it came from.
// NOTE(review): this is the asker's non-working attempt, kept exactly as
// posted; the NOTE(review) comments below mark where it goes wrong.
int main()
{
   // Asker expects either v1[0] or v3[2] to be the answer.
   // NOTE(review): _mm256_set_pd stores its arguments in reverse lane order,
   // so the 1.0 values actually sit in lane 3 of v1 and lane 1 of v3.
    __m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
    __m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
    __m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
    __m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);

    __m256d min = _mm256_set1_pd(DBL_MAX);

    // Intended: find the minimum among all 16 values.
    // NOTE(review): _mm256_min_pd works lane by lane, so this leaves four
    // per-lane minima in `min`, not one overall minimum.
    min = _mm256_min_pd(min, v1);
    min = _mm256_min_pd(min, v2);
    min = _mm256_min_pd(min, v3);
    min = _mm256_min_pd(min, v4);

    // Intended: get a 4-bit mask of the minimum elements.
    // NOTE(review): `min` is compared with itself, so every lane compares
    // equal (mask is 0xF for the inputs above) and nothing is identified.
    int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));

    // Index of the lowest set bit of the mask (-1 if the mask is empty)
    int index = __builtin_ffs(mask) - 1;

    // Intended: determine which __m256d variable the minimum value comes from
    // and which element it is.
    // NOTE(review): `v` is never used, and because the mask has only 4 bits
    // `index` is at most 3, so `v_index` is always 0.
    __m256d* v[4] = {&v1, &v2, &v3, &v4};
    int v_index = index / 4;
    int elem_index = index % 4;

    // NOTE(review): printf is used but <stdio.h> is not among the includes
    // shown with this snippet; subscripting a __m256d presumably relies on a
    // compiler vector extension - confirm for the target compiler.
    printf("最小值为 %lf 来自 v%d 的元素 %d\n", min[elem_index], v_index + 1, elem_index);

    return 0;
}
英文:

I have 4 __m256d, how can I find the minimum among all 16 values?
How can I tell which __m256d variable the minimum value comes from, and which element it is? Assume some of the values are equal across different __m256d variables.

I'm trying but it doesn't work:

#include <immintrin.h>
#include <float.h>

// Sample program from the question: tries to find the minimum of the 16
// doubles held in four __m256d vectors and to report which vector and lane
// it came from.
// NOTE(review): this is the asker's non-working attempt. Only the HTML-entity
// damage (&amp; / &quot;) has been repaired so the snippet reads as code again;
// the logic is left exactly as posted, and the NOTE(review) comments below
// mark where it goes wrong.
int main()
{
   // Asker expects either v1[0] or v3[2] to be the answer.
   // NOTE(review): _mm256_set_pd stores its arguments in reverse lane order,
   // so the 1.0 values actually sit in lane 3 of v1 and lane 1 of v3.
    __m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
    __m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
    __m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
    __m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);

    __m256d min = _mm256_set1_pd(DBL_MAX);

    // Intended: find the minimum among all 16 values.
    // NOTE(review): _mm256_min_pd works lane by lane, so this leaves four
    // per-lane minima in `min`, not one overall minimum.
    min = _mm256_min_pd(min, v1);
    min = _mm256_min_pd(min, v2);
    min = _mm256_min_pd(min, v3);
    min = _mm256_min_pd(min, v4);

    // Intended: get a 4-bit mask of the minimum elements.
    // NOTE(review): `min` is compared with itself, so every lane compares
    // equal (mask is 0xF for the inputs above) and nothing is identified.
    int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));

    // Index of the lowest set bit of the mask (-1 if the mask is empty)
    int index = __builtin_ffs(mask) - 1;

    // Intended: determine which __m256d variable the minimum value comes from
    // and which element it is.
    // NOTE(review): `v` is never used, and because the mask has only 4 bits
    // `index` is at most 3, so `v_index` is always 0.
    __m256d* v[4] = {&v1, &v2, &v3, &v4};
    int v_index = index / 4;
    int elem_index = index % 4;

    // NOTE(review): printf is used but <stdio.h> is not among the includes
    // shown with this snippet; subscripting a __m256d presumably relies on a
    // compiler vector extension - confirm for the target compiler.
    printf("The minimum value is %lf from v%d at element %d\n", min[elem_index], v_index + 1, elem_index);

    return 0;
}

答案1

得分: 1

假设您有 AVX1 但没有 AVX2,我会这样实现。

	#include <immintrin.h>

	// Result returned by min16(): the smallest of the 16 values and where it
	// was first found.
	struct sMin16
	{
		// The minimum value
		double val;
		// Index of the first minimum element: the lowest 2 bits are the lane
		// within a vector, bits [2 .. 3] are the index of the vector
		int index;
	};

	// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors.
	//
	// Returns the minimum in `val` and the position of its first occurrence in
	// `index`: the lowest 2 bits are the lane within the vector, bits [2 .. 3]
	// are the vector number (0 for v0 ... 3 for v3).
	// `index` is -1 when no element compares equal to the reduced minimum; the
	// comparison used is ordered, so this can only happen when the inputs
	// contain NaN.
	sMin16 min16(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
	{
		// Vertical (lane-wise) minimum of the 4 vectors
		__m256d t0 = _mm256_min_pd(v0, v1);
		__m256d t1 = _mm256_min_pd(v2, v3);
		t0 = _mm256_min_pd(t0, t1);

		// Horizontal minimum of `t0`, broadcast to every lane:
		// swap the two 16-byte halves and take the minimum...
		t1 = _mm256_permute2f128_pd(t0, t0, 0x01);
		t0 = _mm256_min_pd(t0, t1);
		// ...then swap the elements of each pair and take the minimum.
		// 0x5 is 0b0101, written in hex because binary literals are not
		// accepted in every language mode.
		t1 = _mm256_shuffle_pd(t0, t0, 0x5);
		t0 = _mm256_min_pd(t0, t1);

		// Store the minimum value
		sMin16 result;
		result.val = _mm256_cvtsd_f64(t0);

		// Compare every input with the broadcast minimum and pack the results
		// into a 16-bit bitmap: 4 bits per vector, v0 in the lowest 4 bits.
		// Plain `unsigned` holds 16 bits and needs no extra header.
		unsigned mask;
		mask = (unsigned)_mm256_movemask_pd(_mm256_cmp_pd(t0, v0, _CMP_EQ_OQ));
		mask |= (unsigned)_mm256_movemask_pd(_mm256_cmp_pd(t0, v1, _CMP_EQ_OQ)) << 4;
		mask |= (unsigned)_mm256_movemask_pd(_mm256_cmp_pd(t0, v2, _CMP_EQ_OQ)) << 8;
		mask |= (unsigned)_mm256_movemask_pd(_mm256_cmp_pd(t0, v3, _CMP_EQ_OQ)) << 12;

		// The bit scans below have no defined result for an empty bitmap, so
		// report "not found" explicitly instead of scanning.
		if (mask == 0)
		{
			result.index = -1;
			return result;
		}

		// A set bit marks an element equal to the minimum; the lowest set bit
		// is the first such element.
	#ifdef _MSC_VER
		unsigned long idx;
		_BitScanForward(&idx, mask);
		result.index = (int)idx;
	#else
		result.index = __builtin_ctz(mask);
	#endif
		return result;
	}

计算的索引的最低2位包含向量中的通道索引,索引的位[2 .. 3]包含具有第一个最小元素的向量的索引。

请注意,_mm256_set_pd 内联函数会反转参数的顺序,所以 _mm256_set_pd(1, 2, 3, 4) 创建一个具有值 [4, 3, 2, 1] 的向量,而不是 [1, 2, 3, 4]。 由于这个原因,对于您的测试用例,该函数返回最小索引= 3,这是 v0 向量中的最后一个通道。

英文:

Assuming you have AVX1 but not AVX2, I would do it like this.
<!-- language-all: lang-cpp -->

#include <immintrin.h>

// Result returned by min16(): the smallest of the 16 values and where it was
// first found.
struct sMin16
{
	// The minimum value
	double val;
	// Index of the first minimum element: the lowest 2 bits are the lane
	// within a vector, bits [2 .. 3] are the index of the vector
	int index;
};

// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors.
//
// Returns the minimum in `val` and the position of its first occurrence in
// `index`: the lowest 2 bits are the lane within the vector, bits [2 .. 3]
// are the vector number (0 for v0 ... 3 for v3).
// `index` is -1 when no element compares equal to the reduced minimum; the
// comparison used is ordered, so this can only happen when the inputs
// contain NaN.
sMin16 min16( __m256d v0, __m256d v1, __m256d v2, __m256d v3 )
{
	// Compute vertical (lane-wise) minimum of the 4 vectors
	__m256d t0 = _mm256_min_pd( v0, v1 );
	__m256d t1 = _mm256_min_pd( v2, v3 );
	t0 = _mm256_min_pd( t0, t1 );

	// Compute broadcasted horizontal minimum of `t0` vector
	// Swap 16-byte pieces, compute minimum
	t1 = _mm256_permute2f128_pd( t0, t0, 0x01 );
	t0 = _mm256_min_pd( t0, t1 );
	// Swap elements pairwise, compute minimum.
	// 0x5 is 0b0101, written in hex because binary literals are not
	// accepted in every language mode.
	t1 = _mm256_shuffle_pd( t0, t0, 0x5 );
	t0 = _mm256_min_pd( t0, t1 );

	// Store the minimum value
	sMin16 result;
	result.val = _mm256_cvtsd_f64( t0 );

	// Compare numbers for equality with the broadcasted minimum,
	// and make a bitmap of the results: 4 bits per vector, v0 in the
	// lowest 4 bits. Plain `unsigned` holds 16 bits and needs no extra header.
	unsigned mask;
	mask = (unsigned)_mm256_movemask_pd( _mm256_cmp_pd( t0, v0, _CMP_EQ_OQ ) );
	mask |= (unsigned)_mm256_movemask_pd( _mm256_cmp_pd( t0, v1, _CMP_EQ_OQ ) ) << 4;
	mask |= (unsigned)_mm256_movemask_pd( _mm256_cmp_pd( t0, v2, _CMP_EQ_OQ ) ) << 8;
	mask |= (unsigned)_mm256_movemask_pd( _mm256_cmp_pd( t0, v3, _CMP_EQ_OQ ) ) << 12;

	// The bit scans below have no defined result for an empty bitmap, so
	// report "not found" explicitly instead of scanning.
	if( mask == 0 )
	{
		result.index = -1;
		return result;
	}

	// We have a bitmap of 16 bits, a bit is set for element equal to the minimum
	// Compute index of the first element equal to the minimum
#ifdef _MSC_VER
	unsigned long idx;
	_BitScanForward( &idx, mask );
	result.index = (int)idx;
#else
	result.index = __builtin_ctz( mask );
#endif
	return result;
}

The lowest 2 bits of the computed index contain the lane index within a vector, and bits [ 2 .. 3 ] of the index contain the index of the vector that holds the first minimum element.

Note that the _mm256_set_pd intrinsic reverses the order of its arguments, so _mm256_set_pd( 1, 2, 3, 4 ) creates a vector with the values [ 4, 3, 2, 1 ], not [ 1, 2, 3, 4 ].
For this reason, the function returns minimum index = 3 for your test case, which is the last lane of the v0 vector.

huangapple
  • 本文由 发表于 2023年5月10日 23:47:44
  • 转载请务必保留本文链接:https://go.coder-hub.com/76220406.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定