WebGPU compute shader gives unexpected result
Question
I made a compute shader that gives an unexpected result:
@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}
Only one thread will write to resultBuffer, so every element should be 1. The function any_compute() has no side effects. I guess the heavy computation causes this problem. The results are correct most of the time, but sometimes they are unexpected: a few 2s or 3s appear in resultBuffer.
The full test program:
const cWGSL = `
@group(0) @binding(0) var<storage, read_write> resultBuffer: array<f32>;

const PI = 3.14159265358979323846;
const inv4PI = 0.25 / PI;
const eps = 1e-6;

fn any_compute(a : vec4<f32>, b : vec4<f32>) -> vec3f
{
    let dist = a.xyz - b.xyz;
    let invDist = inverseSqrt(dot(dist, dist) + eps);
    let invDistCube = invDist * invDist * invDist;
    let s = b.w * invDistCube;
    return -s * inv4PI * dist;
}

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}`;

async function main() {
    const adapter = await navigator.gpu.requestAdapter();
    const device = await adapter.requestDevice();
    // to-do: check limit
    console.log(adapter);
    const maxGPUThread = 256;

    const resultBufferGPU = device.createBuffer({
        size: 4000,
        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
    });
    const readBufferGPU = device.createBuffer({
        size: 4000,
        usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
    });
    const shaderModule = device.createShaderModule({
        code: cWGSL,
    });
    const computePipeline = device.createComputePipeline({
        layout: 'auto', // infer from shader code.
        compute: {
            module: shaderModule,
            entryPoint: "test"
        }
    });

    const bindGroupLayout = computePipeline.getBindGroupLayout(0);
    const buffers = [resultBufferGPU];
    const entries = buffers.map((b, i) => {
        return { binding: i, resource: { buffer: b } };
    });
    const bindGroup = device.createBindGroup({
        layout: bindGroupLayout,
        entries: entries
    });

    const commandEncoder = device.createCommandEncoder();
    const computePassEncoder = commandEncoder.beginComputePass();
    computePassEncoder.setPipeline(computePipeline);
    computePassEncoder.setBindGroup(0, bindGroup);
    computePassEncoder.dispatchWorkgroups(256, 1);
    computePassEncoder.end();
    commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);
    const gpuCommands = commandEncoder.finish();
    device.queue.submit([gpuCommands]);

    await device.queue.onSubmittedWorkDone();
    await readBufferGPU.mapAsync(GPUMapMode.READ);
    const arrayBuffer = readBufferGPU.getMappedRange();
    const result = new Float32Array(arrayBuffer);
    console.log(result);
    const resultCount = {};
    Array.from(new Set(result)).forEach(v => resultCount[v] = result.filter(x => x == v).length);
    console.log(resultCount);
}
main();
There is also a jsfiddle version. I tested it in Chrome Canary.
The count of each value's occurrences is printed to the console. The expected output is {1: 1000}.
Answer 1
Score: 2
The issue is, I think, that you're confusing workgroup size and dispatchWorkgroups.
@workgroup_size defines how many individual processes run per workgroup. In other words, if you set @workgroup_size(256, 1) in your shader, then dispatchWorkgroups(1) will run 256 processes.
You issue the command computePassEncoder.dispatchWorkgroups(256, 1), which means you're running 256 processes 256 times (256 × 256 = 65,536 processes in total).
Since thread = id.x and id.x comes from workgroup_id, when the first workgroup runs, workgroup_id.x will be 0 for all 256 of its threads (because workgroup_id is the ID of the workgroup, not of the individual thread).
So there are 256 threads for which thread == 0, which means there's a race between threads trying to update resultBuffer[c]. This is why the result is random.
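A minimal sketch of one possible fix, assuming the goal is that exactly one invocation in the whole dispatch increments each element: switch the builtin from workgroup_id to global_invocation_id, which is unique per invocation across the entire dispatch.

@compute @workgroup_size(256, 1)
fn test(@builtin(global_invocation_id) id : vec3<u32>) {
    // global_invocation_id is unique per invocation, so exactly one
    // invocation in the entire dispatch has id.x == 0.
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}

Dispatching a single workgroup and keying off local_invocation_id instead would work as well.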
There are 3 IDs:

- local_invocation_id (the ID within a workgroup)
- workgroup_id (the ID of a workgroup)
- global_invocation_id (workgroup_id * workgroup_size + local_invocation_id)
There's one convenience builtin:

- local_invocation_index

rowSize = workgroup_size.x
sliceSize = workgroup_size.x * workgroup_size.y
local_invocation_index =
    local_invocation_id.x +
    local_invocation_id.y * rowSize +
    local_invocation_id.z * sliceSize
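For example, with the @workgroup_size(2, 3, 4) used in the code below, rowSize = 2 and sliceSize = 6, so local_invocation_id (1, 2, 3) gets local_invocation_index 1 + 2 * 2 + 3 * 6 = 23, the last of the workgroup's 24 invocations.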
Here's some code to hopefully illustrate the point.
const dispatchCount = [4, 3, 2];
const workgroupSize = [2, 3, 4];
const arrayProd = arr => arr.reduce((a, b) => a * b);
const numThreadsPerWorkgroup = arrayProd(workgroupSize);

const code = `
// NOTE!: vec3u is padded by 4 bytes (each array element takes 16 bytes)
@group(0) @binding(0) var<storage, read_write> workgroupResult: array<vec3u>;
@group(0) @binding(1) var<storage, read_write> localResult: array<vec3u>;
@group(0) @binding(2) var<storage, read_write> globalResult: array<vec3u>;

const wg_size = ${numThreadsPerWorkgroup};

@compute @workgroup_size(${workgroupSize})
fn test(
    @builtin(workgroup_id) workgroup_id : vec3<u32>,
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
    @builtin(num_workgroups) num_workgroups: vec3<u32>
) {
    let workgroup_index =
        workgroup_id.x +
        workgroup_id.y * num_workgroups.x +
        workgroup_id.z * num_workgroups.x * num_workgroups.y;
    let global_invocation_index =
        workgroup_index * wg_size +
        local_invocation_index;
    workgroupResult[global_invocation_index] = workgroup_id;
    localResult[global_invocation_index] = local_invocation_id;
    globalResult[global_invocation_index] = global_invocation_id;
}
`;

async function main() {
    const adapter = await navigator.gpu?.requestAdapter();
    const device = await adapter?.requestDevice();
    if (!device) {
        console.error('need WebGPU');
        return;
    }

    const numWorkgroups = dispatchCount[0] * dispatchCount[1] * dispatchCount[2];
    const numResults = numWorkgroups * numThreadsPerWorkgroup;
    const size = numResults * 4 * 4; // one padded vec3u = 4 u32s × 4 bytes

    let usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
    const workgroupBufferGPU = device.createBuffer({size, usage});
    const localBufferGPU = device.createBuffer({size, usage});
    const globalBufferGPU = device.createBuffer({size, usage});

    usage = GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST;
    const workgroupReadBufferGPU = device.createBuffer({size, usage});
    const localReadBufferGPU = device.createBuffer({size, usage});
    const globalReadBufferGPU = device.createBuffer({size, usage});

    const shaderModule = device.createShaderModule({code});
    const computePipeline = device.createComputePipeline({
        layout: 'auto',
        compute: {
            module: shaderModule,
            entryPoint: "test"
        }
    });

    const bindGroup = device.createBindGroup({
        layout: computePipeline.getBindGroupLayout(0),
        entries: [
            { binding: 0, resource: { buffer: workgroupBufferGPU }},
            { binding: 1, resource: { buffer: localBufferGPU }},
            { binding: 2, resource: { buffer: globalBufferGPU }},
        ],
    });

    const commandEncoder = device.createCommandEncoder();
    const computePassEncoder = commandEncoder.beginComputePass();
    computePassEncoder.setPipeline(computePipeline);
    computePassEncoder.setBindGroup(0, bindGroup);
    computePassEncoder.dispatchWorkgroups(...dispatchCount);
    computePassEncoder.end();
    commandEncoder.copyBufferToBuffer(workgroupBufferGPU, 0, workgroupReadBufferGPU, 0, size);
    commandEncoder.copyBufferToBuffer(localBufferGPU, 0, localReadBufferGPU, 0, size);
    commandEncoder.copyBufferToBuffer(globalBufferGPU, 0, globalReadBufferGPU, 0, size);
    device.queue.submit([commandEncoder.finish()]);

    await Promise.all([
        workgroupReadBufferGPU.mapAsync(GPUMapMode.READ),
        localReadBufferGPU.mapAsync(GPUMapMode.READ),
        globalReadBufferGPU.mapAsync(GPUMapMode.READ),
    ]);
    const workgroup = new Uint32Array(workgroupReadBufferGPU.getMappedRange());
    const local = new Uint32Array(localReadBufferGPU.getMappedRange());
    const global = new Uint32Array(globalReadBufferGPU.getMappedRange());

    const get3 = (arr, i) => {
        const off = i * 4;
        return `${arr[off]},${arr[off + 1]},${arr[off + 2]}`;
    };

    for (let i = 0; i < numResults; ++i) {
        if (i % numThreadsPerWorkgroup === 0) {
            log(`g-index workgroup local global dispatch: ${i / numThreadsPerWorkgroup}`);
        }
        log(`${i.toString().padStart(3)}: ${get3(workgroup, i)} ${get3(local, i)} ${get3(global, i)}`);
    }
}

function log(...args) {
    const elem = document.createElement('pre');
    elem.textContent = args.join(' ');
    document.body.appendChild(elem);
}

main();
The runnable snippet also includes a bit of CSS, pre { margin: 0; }, to keep the logged lines compact.
Notice that in the first dispatch, global and local are the same; for the rest of the dispatches they aren't. Also notice that workgroup_id is the same for all threads in the same workgroup. I don't know all the reasons why it's this way, but within a single workgroup the threads can share var<workgroup> variables.
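As a minimal sketch of that sharing (the names partial, wgSum, and reduce are made up for illustration): each invocation writes one slot of a workgroup-shared array, workgroupBarrier() makes the writes visible, and invocation 0 combines them.

// Workgroup-shared scratch space, visible to all 256 invocations of a workgroup.
var<workgroup> partial : array<f32, 256>;

@group(0) @binding(0) var<storage, read_write> wgSum : array<f32>;

@compute @workgroup_size(256)
fn reduce(@builtin(local_invocation_index) lid : u32,
          @builtin(workgroup_id) wid : vec3<u32>) {
    partial[lid] = f32(lid);   // each invocation fills its own slot
    workgroupBarrier();        // wait until all 256 slots are written
    if (lid == 0) {
        var total = 0.0;
        for (var i = 0u; i < 256u; i++) {
            total += partial[i];
        }
        wgSum[wid.x] = total;  // one summed result per workgroup
    }
}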
Note: you do not need to call await device.queue.onSubmittedWorkDone. From the spec, under step 4 of mapAsync:

> after the completion of currently-enqueued operations that use this,
>
> ...
>
> Issue the map success steps on the contentTimeline.
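So, as a sketch against the question's program (reusing its variable names), the readback can simply be:

device.queue.submit([gpuCommands]);
// mapAsync resolves only after the submitted work that uses the buffer completes.
await readBufferGPU.mapAsync(GPUMapMode.READ);
const result = new Float32Array(readBufferGPU.getMappedRange());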