Aberth–Ehrlich method GPU implementation
Question
My current implementation is not stable and/or correct. The result may vary from one run to another.
I only want to solve 5th degree polynomials.
Edited shader:
#version 450
#define EPSILON 1e-05
#define MAXITERATION 200
layout (binding = 0) uniform ParameterUBO {
vec2 P_0, p1, p2, p3;
int maxIndex, width, height, polynomSize, curveIndex;
} ubo;
layout(std140, binding = 1) readonly buffer CoefficientIn
{
vec2 Coefficients[ ][6]; // buffer storing the coefficients of the polynomial.
};
layout(std140, binding = 2) buffer ParticleSSBOIn
{
volatile vec4 approximation[][5]; // buffer storing the current approximations.
};
vec2 mul(const vec2 z, const vec2 zp) // complex multiplication
{
vec2 result;
result.x = z.x * zp.x - z.y * zp.y;
result.y = z.y * zp.x + z.x * zp.y;
return result;
}
vec2 div(const vec2 z, const vec2 zp) // complex division
{
float bottom = zp.x * zp.x + zp.y * zp.y;
vec2 result = mul(z, vec2(zp.x, -zp.y));
return result / bottom;
}
vec2 f(const vec2 z, const in vec2[6] coeff) // evaluate the polynomial at z
{
vec2 tmp = vec2(1, 0);
vec2 result = coeff[0];
for(uint i = 1; i < ubo.polynomSize; i++)
{
tmp = mul(tmp, z);
result += mul(coeff[i], tmp);
}
return result;
}
vec2 fp(const vec2 z, const in vec2[6] coeff) // evaluate the derivative at z
{
vec2 tmp = vec2(1, 0);
vec2 result = coeff[1];
for(uint i = 2; i < ubo.polynomSize; i++)
{
tmp = mul(tmp, z);
result += mul(coeff[i] * float(i), tmp);
}
return result;
}
float absComplex(const vec2 z)
{
return sqrt(z.x * z.x + z.y * z.y);
}
vec2 sumOfApproximation(const uint index, const in vec2[5] values)
{
vec2 result = vec2(0);
for(uint j = 0; j < ubo.polynomSize - 1; j++)
if (j != index)
result += div(vec2(1, 0), values[index] - values[j]);
return result;
}
vec2[5] getApproximationsCase0(const uint indexGroup)
{
vec2[5] result;
for(uint i = 0; i < 5; i++)
result[i] = approximation[indexGroup][i].xy;
return result;
}
vec2[5] getApproximationsCase1(const uint indexGroup)
{
vec2[5] result;
for(uint i = 0; i < 5; i++)
result[i] = approximation[indexGroup][i].zw;
return result;
}
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
void main()
{
uint index = gl_GlobalInvocationID.x;
uint subIndex = index % (5);
uint indexGroup = (index - subIndex) / (5);
if(indexGroup < ubo.maxIndex)
{
bool switchCase = false;
for(uint i = 0; i < MAXITERATION; i++)
{
const vec2[5] CurrentApproximation = switchCase ? getApproximationsCase1(indexGroup) : getApproximationsCase0(indexGroup);
const vec2 z = CurrentApproximation[subIndex];
const vec2 p = f(z, Coefficients[indexGroup]);
const vec2 pp = fp(z, Coefficients[indexGroup]);
const vec2 POverPP = div(p, pp);
const vec2 sum = sumOfApproximation(subIndex, CurrentApproximation);
const vec2 zPlusOne = z - div(POverPP, vec2(1, 0) - mul(POverPP, sum));
if(switchCase)
approximation[indexGroup][subIndex].zw = zPlusOne;
else
approximation[indexGroup][subIndex].xy = zPlusOne;
if(absComplex(z - zPlusOne) < EPSILON || absComplex(z) < EPSILON)
{
approximation[indexGroup][subIndex].xy = zPlusOne;
break;
}
switchCase = !switchCase; // toggle which half of the vec4 is used next iteration
}
}
}
A full example is available in this repo.
I tried to:
- add a new buffer
- define the buffer as volatile
- make one step per call
but all of these changes destroyed the performance.
Currently it can solve 2073600 different polynomials in 15 ms on a GTX 1050.
Edit:
After reviewing my post, I realize that it may be confusing. My original question was, 'How do I keep my 5 threads in sync?'
The 'approximation' buffer is filled with the starting points of the method.
Since my initial post, I have edited the shader and tested it on several GPUs without any failures.
I discovered that I had been computing 414720 polynomials instead of the intended 2073600. Additionally, I noticed that abs(f(approximation)) is greater than 1e-05 because the step becomes too small near the roots, which may be a problem.
Answer 1
Score: 1
The basic problem you have is that you are trying to do cross-thread operations without any synchronization. Specifically,
sumOfApproximation()
is iterating over 5 values written by 5 different threads, and because 5 is not a multiple of the hardware subgroup (warp/wave) size, you have no guarantee when those 5 threads run relative to each other. Indeed, some of the 5 may not even have started running at the point the other threads in the set of 5 run.
If you want any guarantees about thread scheduling, you MUST do one of the following:
(1) Ensure that all of the threads that exchange data are inside the same work group (in your current code that is 64 threads). You can then synchronize threads within the workgroup with barrier() operations (and possibly memoryBarrier() operations too) to ensure each thread has reached that point before continuing. The downside is that barriers can be slow. For best performance you want your workgroup size to be an integer multiple of the hardware subgroup size.
If you are sharing data inside the same workgroup, you may then get some performance boost from using shared memory rather than global memory for the intermediate values, at least on desktop hardware.
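For example, here is a minimal sketch of option (1), reusing the mul/div/f/fp helpers and the buffer declarations from the question. The workgroup size of 5 and the dispatch of one workgroup per polynomial are illustrative assumptions; in practice you would pack several groups of 5 into a workgroup that is a multiple of the subgroup size:
shared vec2 sharedRoots[5]; // current approximations, visible to the whole workgroup
layout (local_size_x = 5, local_size_y = 1, local_size_z = 1) in;
void main()
{
    uint subIndex = gl_LocalInvocationID.x; // which of the 5 roots this invocation owns
    uint indexGroup = gl_WorkGroupID.x;     // assumes exactly ubo.maxIndex workgroups are dispatched
    sharedRoots[subIndex] = approximation[indexGroup][subIndex].xy;
    barrier(); // all 5 starting points are visible before iterating
    for (uint i = 0; i < MAXITERATION; i++)
    {
        const vec2 z = sharedRoots[subIndex];
        const vec2 POverPP = div(f(z, Coefficients[indexGroup]), fp(z, Coefficients[indexGroup]));
        vec2 sum = vec2(0);
        for (uint j = 0; j < 5; j++)
            if (j != subIndex)
                sum += div(vec2(1, 0), z - sharedRoots[j]);
        const vec2 zPlusOne = z - div(POverPP, vec2(1, 0) - mul(POverPP, sum));
        barrier(); // every invocation has read the old roots before anyone overwrites them
        sharedRoots[subIndex] = zPlusOne;
        barrier(); // every invocation sees the new roots before the next iteration
    }
    approximation[indexGroup][subIndex].xy = sharedRoots[subIndex];
}
Note that barrier() must be reached in uniform control flow, which is why this sketch runs a fixed number of iterations rather than keeping the original early-exit test; breaking out early would need a per-group convergence flag in shared memory that all 5 invocations check together.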
(2) Ensure that all the threads you care about are running inside the same hardware subgroup, and then use subgroup operations (GL_KHR_shader_subgroup) to exchange data as needed. This can be faster, as you avoid barriers, but the subgroup size is vendor-specific, so you may have to handle that variability.
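As an illustration of option (2), here is a sketch assuming GL_KHR_shader_subgroup_shuffle support, a subgroup size that is a multiple of 8, and the usual linear mapping of invocations to lanes; the 8-lane stride and the idle padding lanes are assumptions, not part of the original layout:
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_shuffle : require
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
void main()
{
    // 8 lanes are reserved per polynomial so a group of 5 roots never
    // straddles a subgroup boundary; lanes 5..7 of each slice stay idle.
    const uint indexGroup = gl_GlobalInvocationID.x / 8;
    const uint subIndex = gl_SubgroupInvocationID % 8;
    const uint baseLane = gl_SubgroupInvocationID - subIndex; // lane holding root 0
    const bool live = subIndex < 5 && indexGroup < ubo.maxIndex;
    vec2 z = live ? approximation[indexGroup][subIndex].xy : vec2(0);
    for (uint i = 0; i < MAXITERATION; i++)
    {
        vec2 sum = vec2(0);
        for (uint j = 0; j < 5; j++)
        {
            // Every lane executes the shuffle, so the source lanes are active when read.
            const vec2 other = subgroupShuffle(z, baseLane + j);
            if (live && j != subIndex)
                sum += div(vec2(1, 0), z - other);
        }
        if (live)
        {
            const vec2 POverPP = div(f(z, Coefficients[indexGroup]), fp(z, Coefficients[indexGroup]));
            z -= div(POverPP, vec2(1, 0) - mul(POverPP, sum));
        }
    }
    if (live)
        approximation[indexGroup][subIndex].xy = z;
}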
(3) The other approach would be to skip the cross-thread part entirely and complete each group of 5 inside a single shader invocation, just as you would write single-threaded code for a CPU. If you can merge threads by a factor of 5 and still have a big enough problem space to fill the GPU with threads, this may not be a major issue.
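Finally, a sketch of option (3), again reusing the question's helpers: each invocation owns all 5 roots of one polynomial, so no cross-thread synchronization is needed. Updating roots[k] in place means later roots in the sweep already see refreshed values, a Gauss–Seidel-style variant of the iteration:
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
void main()
{
    const uint indexGroup = gl_GlobalInvocationID.x; // one polynomial per invocation
    if (indexGroup >= ubo.maxIndex)
        return;
    vec2 roots[5];
    for (uint k = 0; k < 5; k++)
        roots[k] = approximation[indexGroup][k].xy;
    for (uint i = 0; i < MAXITERATION; i++)
    {
        float maxStep = 0.0;
        for (uint k = 0; k < 5; k++)
        {
            const vec2 z = roots[k];
            const vec2 POverPP = div(f(z, Coefficients[indexGroup]), fp(z, Coefficients[indexGroup]));
            vec2 sum = vec2(0);
            for (uint j = 0; j < 5; j++)
                if (j != k)
                    sum += div(vec2(1, 0), z - roots[j]);
            roots[k] = z - div(POverPP, vec2(1, 0) - mul(POverPP, sum));
            maxStep = max(maxStep, absComplex(roots[k] - z));
        }
        if (maxStep < EPSILON) // every root moved less than EPSILON: converged
            break;
    }
    for (uint k = 0; k < 5; k++)
        approximation[indexGroup][k].xy = roots[k];
}
Since nothing outside the invocation depends on intermediate values, the early convergence exit is safe here.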