问题

我有4个 _m256d，如何找到所有16个值中的最小值？
我怎样知道最小值来自哪个 __m256d 变量？以及它是哪个元素？假设部分值在不同的 __m256d 变量中相同
我正在尝试，但不起作用：

#include <immintrin.h>
#include <float.h>

int main()
{
   // 要么 v1[0] 要么 v3[2] 是答案。
    __m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
    __m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
    __m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
    __m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);

    __m256d min = _mm256_set1_pd(DBL_MAX);

    // 找出所有16个值中的最小值
    min = _mm256_min_pd(min, v1);
    min = _mm256_min_pd(min, v2);
    min = _mm256_min_pd(min, v3);
    min = _mm256_min_pd(min, v4);

    // 获得最小元素的4位掩码
    int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));

    // 提取最小元素的索引
    int index = __builtin_ffs(mask) - 1;

    // 确定最小值来自哪个 __m256d 变量以及它是哪个元素
    __m256d* v[4] = {&v1, &v2, &v3, &v4};
    int v_index = index / 4;
    int elem_index = index % 4;

    printf("最小值为 %lf 来自 v%d 的元素 %d\n", min[elem_index], v_index + 1, elem_index);

    return 0;
}

英文:

I have 4 _m256d, how can I find the minimum among all 16 values?
How can I know the minimum value come from which __m256d variable? and which element is it? assume part of values are the same in different __m256d variable

I'm trying but it doesn't work:

#include &lt;immintrin.h&gt;
#include &lt;float.h&gt;

int main()
{
   // either v1[0] or v3[2] is the answer.
    __m256d v1 = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
    __m256d v2 = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
    __m256d v3 = _mm256_set_pd(3.0, 4.0, 1.0, 2.0);
    __m256d v4 = _mm256_set_pd(6.0, 5.0, 8.0, 7.0);

    __m256d min = _mm256_set1_pd(DBL_MAX);

    // Find the minimum among all 16 values
    min = _mm256_min_pd(min, v1);
    min = _mm256_min_pd(min, v2);
    min = _mm256_min_pd(min, v3);
    min = _mm256_min_pd(min, v4);

    // Get a 4-bit mask of the minimum elements
    int mask = _mm256_movemask_pd(_mm256_cmp_pd(min, min, _CMP_EQ_OQ));

    // Extract the index of the minimum element
    int index = __builtin_ffs(mask) - 1;

    // Determine which __m256d variable the minimum value comes from and which element it is
    __m256d* v[4] = {&amp;v1, &amp;v2, &amp;v3, &amp;v4};
    int v_index = index / 4;
    int elem_index = index % 4;

    printf(&quot;The minimum value is %lf from v%d at element %d\n&quot;, min[elem_index], v_index + 1, elem_index);

    return 0;
}

答案1

得分: 1

以下是您要翻译的内容：

假设您有AVX1但没有AVX2，我会这样做。

	#include <immintrin.h>

	struct sMin16
	{
		// The minimum value
		double val;
		// Index of the first minimum element
		int index;
	};

	// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors
	sMin16 min16(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
	{
		// 计算4个向量的垂直最小值
		__m256d t0 = _mm256_min_pd(v0, v1);
		__m256d t1 = _mm256_min_pd(v2, v3);
		t0 = _mm256_min_pd(t0, t1);

		// 计算广播的水平最小值，交换16字节块，计算最小值
		t1 = _mm256_permute2f128_pd(t0, t0, 0x01);
		t0 = _mm256_min_pd(t0, t1);
		// 成对交换元素，计算最小值
		t1 = _mm256_shuffle_pd(t0, t0, 0b0101);
		t0 = _mm256_min_pd(t0, t1);

		// 存储最小值
		sMin16 result;
		result.val = _mm256_cvtsd_f64(t0);

		// 将数字与广播的最小值比较，生成结果的位图
		uint32_t mask;
		mask = (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v0, _CMP_EQ_OQ));
		mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v1, _CMP_EQ_OQ)) << 4;
		mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v2, _CMP_EQ_OQ)) << 8;
		mask |= (uint32_t)_mm256_movemask_pd(_mm256_cmp_pd(t0, v3, _CMP_EQ_OQ)) << 12;

		// 我们有一个16位的位图，一个位表示等于最小值的元素
		// 计算第一个等于最小值的元素的索引
	#ifdef _MSC_VER
		unsigned long idx;
		_BitScanForward(&idx, mask);
		result.index = idx;
	#else
		result.index = __builtin_ctz(mask);
	#endif
		return result;
	}

计算的索引的最低2位包含向量中的通道索引，索引的位[2 .. 3]包含具有第一个最小元素的向量的索引。

请注意，_mm256_set_pd 内联函数会反转参数的顺序，所以 _mm256_set_pd(1, 2, 3, 4) 创建一个具有值 [4, 3, 2, 1] 的向量，而不是 [1, 2, 3, 4]。由于这个原因，对于您的测试用例，该函数返回最小索引= 3，这是 v0 向量中的最后一个通道。

英文:

Assuming you have AVX1 but not AVX2, I would do it like that.

#include &lt;immintrin.h&gt;

struct sMin16
{
	// The minimum value
	double val;
	// Index of the first minimum element
	int index;
};

// Compute minimum of 16 FP64 numbers, stored in 4 AVX vectors
sMin16 min16( __m256d v0, __m256d v1, __m256d v2, __m256d v3 )
{
	// Compute vertical minimum of the 4 vectors
	__m256d t0 = _mm256_min_pd( v0, v1 );
	__m256d t1 = _mm256_min_pd( v2, v3 );
	t0 = _mm256_min_pd( t0, t1 );

	// Compute broadcasted horizontal minimum of `t0` vector
	// Swap 16-byte pieces, compute minimum
	t1 = _mm256_permute2f128_pd( t0, t0, 0x01 );
	t0 = _mm256_min_pd( t0, t1 );
	// Swap elements pairwise, compute minimum
	t1 = _mm256_shuffle_pd( t0, t0, 0b0101 );
	t0 = _mm256_min_pd( t0, t1 );

	// Store the minimum value
	sMin16 result;
	result.val = _mm256_cvtsd_f64( t0 );

	// Compare numbers for equality with the broadcasted minimum,
	// and make a bitmap of the results
	uint32_t mask;
	mask = (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v0, _CMP_EQ_OQ ) );
	mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v1, _CMP_EQ_OQ ) ) &lt;&lt; 4;
	mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v2, _CMP_EQ_OQ ) ) &lt;&lt; 8;
	mask |= (uint32_t)_mm256_movemask_pd( _mm256_cmp_pd( t0, v3, _CMP_EQ_OQ ) ) &lt;&lt; 12;

	// We have a bitmap of 16 bits, a bit is set for element展开收缩 equal to the minimum
	// Compute index of the first element equal to the minimum
#ifdef _MSC_VER
	unsigned long idx;
	_BitScanForward( &amp;idx, mask );
	result.index = idx;
#else
	result.index = __builtin_ctz( mask );
#endif
	return result;
}

The lowest 2 bits of the computed index contain lane index in a vector, and bits [ 2 .. 3 ] of the index contain index of the vector with the first minimum element.

Note that _mm256_set_pd intrinsic reverses the order of the arguments, so _mm256_set_pd( 1, 2, 3, 4 ) creates a vector with the values [ 4, 3, 2, 1 ], not [ 1, 2, 3, 4 ].<br/>
For this reason, the function returns minimum index = 3 for your test case, that’s last lane in the v0 vector.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

SIMD：如何在4个__m256d寄存器中找到最小值及其索引

问题

答案1

ARM NEON：为什么向量代码比标量代码慢？

如何使用AVX-512实现向量化的“exp”和“log”基数2函数。

能否在不使用 for 循环的情况下填充空单元格？

如何将数据框中的每一列分别乘以不同的值每列。

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论