英文:
Why is my compute shader not adjusting the positions of all of the input particles? Vulkan/GLSL/C++
问题
I have just about figured out compute shader implementation with Vulkan. However, I am struggling to understand why only a fraction of the particles that I input to the shader are updating. int PARTICLE_COUNT = 32000
in the video.
[YouTube upload of the problem.][1] Please forgive me for not using Imgur, it has not been working for me for the last few hours and will not let me create any accessible uploads.
The compute shader code is below:
#version 450
struct camera {
mat4 view;
mat4 proj;
vec3 position;
};
layout(binding = 0) uniform UniformBufferObject {
float dt;
mat4 model;
camera cam;
} ubo;
struct Particle {
vec4 position;
vec4 color;
vec4 velocity;
};
layout(std140, set = 2, binding = 0) readonly buffer inSSBO {
Particle particlesIn[ ];
};
layout(std140, set = 2, binding = 1) buffer outSSBO {
Particle particlesOut[ ];
};
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
// Organization and Indexing
uvec3 nWG = gl_NumWorkGroups;
uvec3 sWG = gl_WorkGroupSize;
uint i = gl_WorkGroupID.x + (nWG.x * gl_WorkGroupID.y) + (nWG.x * nWG.y * gl_WorkGroupID.z);
uint j = gl_LocalInvocationID.x + (sWG.x * gl_LocalInvocationID.y) + (sWG.x * sWG.y *gl_LocalInvocationID.z);
// Globals
const float c = 1.0f;
// Calculates acceleration towards a position
vec3 Gravity(vec3 p1, vec3 p2, float m1, float m2) {
vec3 rN = normalize(p2 - p1);
float dist2 = distance(p2, p1);
dist2 *= dist2;
return rN * ((m1 * m2) / (dist2));
}
void main()
{
particlesOut[i].position.xyz = particlesIn[i].position.xyz;
// Kinematic Motion of the Elements of the System
vec3 Acceleration;
if (i != j)
{// Particle Interaction Calculations
// Interacting Particle Properties
float m0 = 1.f;
float m1 = 1.f;
vec3 p0 = particlesIn[i].position.xyz;
vec3 p1 = particlesIn[j].position.xyz;
// Velocity Calculation
particlesOut[i].velocity.xyz += Gravity(p0, p1, m0, m1) * ubo.dt;
if (length(particlesOut[i].velocity) > c/2)
{// Sets the Velocity Maximum to the Speed of Light (divided by two bc ITS TOO FAST)
normalize(particlesOut[i].velocity);
particlesOut[i].velocity *= c/2;
}
particlesOut[i].position.xyz += particlesOut[i].velocity.xyz * ubo.dt;
// Flip movement at volume border
if ((particlesOut[i].position.x <= -1.0) || (particlesOut[i].position.x >= 1.0)) {
particlesOut[i].velocity.x = -particlesOut[i].velocity.x;
}
if ((particlesOut[i].position.y <= -1.0) || (particlesOut[i].position.y >= 1.0)) {
particlesOut[i].velocity.y = -particlesOut[i].velocity.y;
}
if ((particlesOut[i].position.z <= -1.0) || (particlesOut[i].position.z >= 1.0)) {
particlesOut[i].velocity.z = -particlesOut[i].velocity.z;
}
}
}
The compute shader dispatch code is below:
void computeCommand(VkCommandBuffer& commandBuffer, uint32_t setCount, VkDescriptorSet* sets) {
VkCommandBufferBeginInfo beginInfo
{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) {
throw std.runtime_error("failed to begin recording command buffer!");
}
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mLayout, 0, setCount, sets, 0, nullptr);
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));
if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) {
throw std.runtime_error("failed to record compute command buffer!");
}
}
and the final potential culprits, the Particle struct
and the data buffer code:
struct Particle {
glm::vec4 position;
glm::vec4 color;
glm::vec4 velocity;
const static VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
static VkVertexInputBindingDescription vkCreateBindings() {
VkVertexInputBindingDescription bindingDescription{};
bindingDescription.binding = 0;
bindingDescription.stride = sizeof(Particle);
bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
return bindingDescription;
}
static std.array<VkVertexInputAttributeDescription, 2> vkCreateAttributes() {
std.array<VkVertexInputAttributeDescription, 2> attributeDescriptions{};
attributeDescriptions[0].binding = 0;
attributeDescriptions[0].location = 0;
attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[0].offset = offsetof(Particle, position);
attributeDescriptions[1].binding = 0;
attributeDescriptions[1].location = 1;
attributeDescriptions[1].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[1].offset = offsetof(Particle, color);
return attributeDescriptions;
}
static VkPipelineVertexInputStateCreateInfo vkCreateVertexInput() {
static auto bindingDescription = vkCreateBindings();
static auto attributeDescriptions = vkCreateAttributes();
VkPipelineVertexInputStateCreateInfo vertexInputInfo
{ VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
vertexInputInfo.vertexBindingDescriptionCount = 1;
vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
return vertexInputInfo;
}
};
// SSBO struct initializes and stores an std.vector<Particle> particles;
void createDataBuffer(SSBO& ssbo) {
void* data;
VkBuffer stagingBuffer;
VkDeviceMemory stagingBufferMemory;
Buffer.resize(MAX_FRAMES_IN_FLIGHT);
Memory.resize(MAX_FRAMES_IN_FLIGHT);
bufferSize = sizeof(Particle)*PARTICLE_COUNT;
createBuffer(stagingBuffer, stagingBufferMemory,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
<details>
<summary>英文:</summary>
I have just about figured out compute shader implementation with Vulkan. However, I am struggling to understand why only a fraction of the particles that I input to the shader are updating. `int PARTICLE_COUNT = 32000` in the video.
[YouTube upload of the problem.][1] Please forgive me for not using Imgur, it has not been working for me for the last few hours and will not let me create any accessible uploads.
The compute shader code is below:
#version 450
struct camera {
mat4 view;
mat4 proj;
vec3 position;
};
layout(binding = 0) uniform UniformBufferObject {
float dt;
mat4 model;
camera cam;
} ubo;
struct Particle {
vec4 position;
vec4 color;
vec4 velocity;
};
layout(std140, set = 2, binding = 0) readonly buffer inSSBO {
Particle particlesIn[ ];
};
layout(std140, set = 2, binding = 1) buffer outSSBO {
Particle particlesOut[ ];
};
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
// Organization and Indexing
uvec3 nWG = gl_NumWorkGroups;
uvec3 sWG = gl_WorkGroupSize;
uint i = gl_WorkGroupID.x + (nWG.x * gl_WorkGroupID.y) + (nWG.x * nWG.y * gl_WorkGroupID.z);
uint j = gl_LocalInvocationID.x + (sWG.x * gl_LocalInvocationID.y) + (sWG.x * sWG.y *gl_LocalInvocationID.z);
// Globals
const float c = 1.0f;
// Calculates acceleration towards a position
vec3 Gravity(vec3 p1, vec3 p2, float m1, float m2) {
vec3 rN = normalize(p2 - p1);
float dist2 = distance(p2, p1);
dist2 *= dist2;
return rN * ((m1 * m2) / (dist2));
}
void main()
{
particlesOut[i].position.xyz = particlesIn[i].position.xyz;
// Kinematic Motion of the Elements of the System
vec3 Acceleration;
if (i != j)
{// Particle Interaction Calculations
// Interacting Particle Properties
float m0 = 1.f;
float m1 = 1.f;
vec3 p0 = particlesIn[i].position.xyz;
vec3 p1 = particlesIn[j].position.xyz;
// Velocity Calculation
particlesOut[i].velocity.xyz += Gravity(p0, p1, m0, m1) * ubo.dt;
if (length(particlesOut[i].velocity) > c/2)
{// Sets the Velocity Maximum to the Speed of Light (divided by two bc ITS TOO FAST)
normalize(particlesOut[i].velocity);
particlesOut[i].velocity *= c/2;
}
particlesOut[i].position.xyz += particlesOut[i].velocity.xyz * ubo.dt;
// Flip movement at volume border
if ((particlesOut[i].position.x <= -1.0) || (particlesOut[i].position.x >= 1.0)) {
particlesOut[i].velocity.x = -particlesOut[i].velocity.x;
}
if ((particlesOut[i].position.y <= -1.0) || (particlesOut[i].position.y >= 1.0)) {
particlesOut[i].velocity.y = -particlesOut[i].velocity.y;
}
if ((particlesOut[i].position.z <= -1.0) || (particlesOut[i].position.z >= 1.0)) {
particlesOut[i].velocity.z = -particlesOut[i].velocity.z;
}
}
}
The compute shader dispatch code is below:
void computeCommand(VkCommandBuffer& commandBuffer, uint32_t setCount, VkDescriptorSet* sets) {
VkCommandBufferBeginInfo beginInfo
{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) {
throw std::runtime_error("failed to begin recording command buffer!");
}
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mLayout, 0, setCount, sets, 0, nullptr);
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));
if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) {
throw std::runtime_error("failed to record compute command buffer!");
}
}
and the final potential culprits, the Particle `struct` and the data buffer code:
struct Particle {
glm::vec4 position;
glm::vec4 color;
glm::vec4 velocity;
const static VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
static VkVertexInputBindingDescription vkCreateBindings() {
VkVertexInputBindingDescription bindingDescription{};
bindingDescription.binding = 0;
bindingDescription.stride = sizeof(Particle);
bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
return bindingDescription;
}
static std::array<VkVertexInputAttributeDescription, 2> vkCreateAttributes() {
std::array<VkVertexInputAttributeDescription, 2> attributeDescriptions{};
attributeDescriptions[0].binding = 0;
attributeDescriptions[0].location = 0;
attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[0].offset = offsetof(Particle, position);
attributeDescriptions[1].binding = 0;
attributeDescriptions[1].location = 1;
attributeDescriptions[1].format = VK_FORMAT_R32G32B32A32_SFLOAT;
attributeDescriptions[1].offset = offsetof(Particle, color);
return attributeDescriptions;
}
static VkPipelineVertexInputStateCreateInfo vkCreateVertexInput() {
static auto bindingDescription = vkCreateBindings();
static auto attributeDescriptions = vkCreateAttributes();
VkPipelineVertexInputStateCreateInfo vertexInputInfo
{ VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
vertexInputInfo.vertexBindingDescriptionCount = 1;
vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
return vertexInputInfo;
}
};
// SSBO struct initializes and stores an std::vector<Particle> particles;
void createDataBuffer(SSBO& ssbo) {
void* data;
VkBuffer stagingBuffer;
VkDeviceMemory stagingBufferMemory;
Buffer.resize(MAX_FRAMES_IN_FLIGHT);
Memory.resize(MAX_FRAMES_IN_FLIGHT);
bufferSize = sizeof(Particle)*PARTICLE_COUNT;
createBuffer(stagingBuffer, stagingBufferMemory,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
vkMapMemory(VkGPU::device, stagingBufferMemory, 0, bufferSize, 0, &data);
memcpy(data, ssbo.particles.data(), (size_t)bufferSize);
vkUnmapMemory(VkGPU::device, stagingBufferMemory);
for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
createBuffer(Buffer[i], Memory[i],
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
copyBuffer(stagingBuffer, Buffer[i]);
}
vkDestroyBuffer(VkGPU::device, stagingBuffer, nullptr);
vkFreeMemory(VkGPU::device, stagingBufferMemory, nullptr);
}
The problem is much more apparent when `PARTICLE_COUNT = 2000`. [Even fewer particles are moving.][2] At most ten out of the 2000 particles. Again, please forgive my use of YouTube for my uploads.
I have a feeling the problem is with my indexing within the compute shader, but I am not totally sure. My other thought was that the number of particles distributed to the workgroups might have been a source of the problem, but decreasing the particle count only made it more apparent.
Edit: Fixed a line in the compute shader that did not accurately update the positions of the `ParticlesOut[]` object, but that was a remnant from testing the SSBOs `readonly` property. Fixing that line has not made any difference in fixing the problem.
[1]: https://youtu.be/8bW30t1Pir4
[2]: https://youtu.be/WmOudT61lJg
</details>
# 答案1
**得分**: 0
有时,在辛苦工作后休息一下是很好的。吃了顿美味的饭菜,休息了一会儿后,我能够用崭新的眼光审视我的代码,找出问题所在。
在调度计算着色器时,我给每个工作组轴分配了太少的粒子来处理。我的原始调度命令如下:
```c
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));
而我的计算着色器布局是:
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
将调度命令更新为以下代码行,可以分配适当数量的粒子进行处理:
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (100), PARTICLE_COUNT / (100), PARTICLE_COUNT / (100));
我认为问题源于一开始给每个工作组的粒子太少,因此并没有处理每个粒子。我通过将调度命令的 groupCount
降低到 PARTICLE_COUNT / (10)
进行了测试,结果导致帧率严重下降,因为每次调用都必须处理多10倍的粒子。
我并不完全清楚为什么或者3D工作组背后的数学运算方式,但似乎与其他两个工作组轴的大小有关,其中除数等于其他本地工作组的乘积。即 local_size_y = 10
和 local_size_z = 10
,因此除数等于 10*10
或 100
。如果有人能更好地解释计算 groupCount
背后的数学原理,我将不胜感激,因为我并不完全理解,除了我在这里所能解释的内容。
英文:
Sometimes it's good to take a break after working hard. After a good meal and some rest, I was able to look at my code with fresh eyes and figure out what I was doing wrong.
In dispatching the compute shader, I gave each work-group axis too few particles to work on. My original dispatch command was:
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));
Whereas the layout for my compute shader was:
layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;
Updating the dispatch command to the following line of code gives the appropriate number of particles to work on:
vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (100), PARTICLE_COUNT / (100), PARTICLE_COUNT / (100));
I believe the problem stemmed from giving each workgroup too few particles at first, so not every particle was being processed. I tested this by lowering the dispatch command's groupCount
to PARTICLE_COUNT / (10)
which resulted in a horrendous framerate drop since each invocation had to process 10x more particles.
I am not totally clear as to why or how the math works behind the 3D workgroups, but it seems to have to do with the size of the other two workgroup axes, where the divisor is equal to the product of the other local workgroups. I.E. local_size_y = 10
and local_size_z = 10
, so the divisor is equal to 10*10
or 100
. I would appreciate it if someone could better explain the math behind calculating the groupCount
as I do not fully understand it beyond what I could explain here.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论