WebGPU计算着色器产生意外结果

huangapple go评论56阅读模式
英文:

WebGPU compute shader gives unexpected result

问题

我创建了一个着色器,它产生了意外的结果:

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for(var c = 0u; c<1000; c++){
    any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
    if(thread==0){
      resultBuffer[c]+=1;
    }
  }
}

只有一个线程会写入resultBuffer
因此,每个元素应该都是1。

函数any_compute()没有副作用。我猜测是因为计算量大导致了这个问题。

大多数情况下结果是正确的。但有时会出现意外的结果。resultBuffer中会出现一些2或3。

完整的测试程序:

// WGSL source for the question's compute shader (quoted as posted; the bug it
// demonstrates is dissected in the answer below).
// NOTE(review): `test` guards its buffer writes with @builtin(workgroup_id).x,
// which is identical for all 256 invocations within a workgroup — so in
// workgroup 0 every invocation sees thread == 0 and races on resultBuffer[c].
// The answer explains the guard should be based on a per-invocation id
// (local_invocation_id / global_invocation_id) instead.
const cWGSL = `
@group(0) @binding(0) var<storage, read_write> resultBuffer: array<f32>;

const PI = 3.14159265358979323846;
const inv4PI = 0.25/PI;

const eps = 1e-6;
fn any_compute(a : vec4<f32>, b : vec4<f32>) -> vec3f
{
  let dist = a.xyz - b.xyz;
  let invDist = inverseSqrt(dot(dist, dist) + eps); 
  let invDistCube = invDist * invDist * invDist;
  let s = b.w * invDistCube;
  return -s * inv4PI * dist;
}

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for(var c = 0u; c<1000; c++){
    any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
    if(thread==0){
    resultBuffer[c]+=1;
    }
  }
}`;

/**
 * Reproduces the race described in the question: creates a 1000-element f32
 * storage buffer, runs the buggy shader over it, copies the result back to a
 * mappable staging buffer, and logs a histogram of the values read back.
 * Expected output (if the shader were correct) is {1: 1000}; the question
 * reports occasional 2s and 3s instead.
 */
async function main(){
const adapter = await navigator.gpu.requestAdapter();
  const device = await adapter.requestDevice();

  // to-do: check limit
  console.log(adapter);
  // NOTE(review): declared but never used in this snippet.
  const maxGPUThread = 256;

  // 4000 bytes = 1000 f32 slots, one per loop iteration in the shader.
  const resultBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
  });
  // Staging buffer so the result can be mapped for CPU readback.
  const readBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
  });

  const shaderModule = device.createShaderModule({
    code: cWGSL,
  })
  const computePipeline = device.createComputePipeline({
    layout: 'auto', // inferred from the shader code
    compute: {
      module: shaderModule,
      entryPoint: "test"
    }
  });

  const bindGroupLayout = computePipeline.getBindGroupLayout(0);
  const buffers = [resultBufferGPU];
  const entries = buffers.map((b, i) => {
    return { binding: i, resource: { buffer: b } };
  });
  const bindGroup = device.createBindGroup({
    layout: bindGroupLayout,
    entries: entries
  });
  const commandEncoder = device.createCommandEncoder();
  const computePassEncoder = commandEncoder.beginComputePass();
  computePassEncoder.setPipeline(computePipeline);
  computePassEncoder.setBindGroup(0, bindGroup);
  // NOTE(review): 256 workgroups x @workgroup_size(256, 1) = 65,536 total
  // invocations. Combined with the workgroup_id-based guard in the shader,
  // this is the source of the racy writes (see the answer below).
  computePassEncoder.dispatchWorkgroups(256, 1);
  computePassEncoder.end();

  commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);


  const gpuCommands = commandEncoder.finish();
  device.queue.submit([gpuCommands]);
  // NOTE(review): per the answer (quoting the spec), this wait is not needed:
  // mapAsync already resolves only after currently-enqueued operations that
  // use the buffer have completed.
  await device.queue.onSubmittedWorkDone();
  await readBufferGPU.mapAsync(GPUMapMode.READ);
  const arrayBuffer = readBufferGPU.getMappedRange();
  const result = new Float32Array(arrayBuffer);
  console.log(result);
  // Build a {value: occurrence-count} histogram of the readback.
  const resultCount = {};
  Array.from(new Set(result)).forEach(v => resultCount[v] = result.filter(x => x == v).length);

  console.log(resultCount);

}

main();

也可以在 jsfiddle 中测试。我在Chrome Canary中进行了测试。

出现次数的计数将打印在控制台中。预期输出是 {1: 1000}

英文:

I made a shader which gives unexpected result:

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
let thread = id.x;
for(var c = 0u; c<1000; c++){
any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
if(thread==0){
resultBuffer[c]+=1;
}
}
}

Only one thread will write to resultBuffer.
So every element should be 1.

The function any_compute() has no side effects. I guess heavy computation causes this problem.

The results are correct most of the time. But sometimes there are unexpected results. There will be some 2 or 3 in resultBuffer.

The full test program:

<!-- begin snippet: js hide: false console: true babel: false -->

<!-- language: lang-js -->

const cWGSL = `
@group(0) @binding(0) var&lt;storage, read_write&gt; resultBuffer: array&lt;f32&gt;;
const PI = 3.14159265358979323846;
const inv4PI = 0.25/PI;
const eps = 1e-6;
fn any_compute(a : vec4&lt;f32&gt;, b : vec4&lt;f32&gt;) -&gt; vec3f
{
let dist = a.xyz - b.xyz;
let invDist = inverseSqrt(dot(dist, dist) + eps); 
let invDistCube = invDist * invDist * invDist;
let s = b.w * invDistCube;
return -s * inv4PI * dist;
}
@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3&lt;u32&gt;) {
let thread = id.x;
for(var c = 0u; c&lt;1000; c++){
any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
if(thread==0){
resultBuffer[c]+=1;
}
}
}`;
async function main(){
const adapter = await navigator.gpu.requestAdapter();
const device = await adapter.requestDevice();
// to-do: check limit
console.log(adapter);
const maxGPUThread = 256;
const resultBufferGPU = device.createBuffer({
size: 4000,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
});
const readBufferGPU = device.createBuffer({
size: 4000,
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
});
const shaderModule = device.createShaderModule({
code: cWGSL,
})
const computePipeline = device.createComputePipeline({
layout: &#39;auto&#39;, // infer from shader code.
compute: {
module: shaderModule,
entryPoint: &quot;test&quot;
}
});
const bindGroupLayout = computePipeline.getBindGroupLayout(0);
const buffers = [resultBufferGPU];
const entries = buffers.map((b, i) =&gt; {
return { binding: i, resource: { buffer: b } };
});
const bindGroup = device.createBindGroup({
layout: bindGroupLayout,
entries: entries
});
const commandEncoder = device.createCommandEncoder();
const computePassEncoder = commandEncoder.beginComputePass();
computePassEncoder.setPipeline(computePipeline);
computePassEncoder.setBindGroup(0, bindGroup);
computePassEncoder.dispatchWorkgroups(256, 1);
computePassEncoder.end();
commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);
const gpuCommands = commandEncoder.finish();
device.queue.submit([gpuCommands]);
await device.queue.onSubmittedWorkDone();
await readBufferGPU.mapAsync(GPUMapMode.READ);
const arrayBuffer = readBufferGPU.getMappedRange();
const result = new Float32Array(arrayBuffer);
console.log(result);
const resultCount = {};
Array.from(new Set(result)).forEach(v =&gt; resultCount[v] = result.filter(x =&gt; x == v).length);
console.log(resultCount);
}
main();

<!-- end snippet -->

Also a jsfiddle version. I tested it in Chrome Canary.

The count of occurrence will be printed in console. The expected output is {1: 1000}.

答案1

得分: 2

以下是答案内容:

问题是,我认为您混淆了工作组大小和dispatchWorkgroups

@workgroup_size 定义了每个工作组运行的单个进程数量。换句话说,如果在您的着色器中设置了 @workgroup_size(256, 1),那么 dispatchWorkgroups(1) 将运行 256 个进程。

您发出了命令 computePassEncoder.dispatchWorkgroups(256, 1);,这意味着您运行了 256 个进程,共运行了 256 次(总共运行了 64k 个进程)。

由于 thread = id.x,而 id.x 来自于 workgroup_id,这意味着当第一个工作组运行时,所有 256 个线程的 workgroup_id.x 都将为 0(因为 workgroup_id 是工作组的 ID)。
因此,有 256 次 thread == 0,这意味着线程之间存在竞争,试图更新 resultBuffer[c]。这就是结果随机的原因。

有三个 ID

  • local_invocation_id(在工作组内的 ID)
  • workgroup_id(工作组的 ID)
  • global_invocation_id(workgroup_id * workgroup_size + local_invocation_id)

还有一个方便的内置

  • local_invocation_index
rowSize = workgroup_size.x
sliceSize = workgroup_size.x * workgroup_size.y
local_invocation_index =
   local_invocation_id.x + 
   local_invocation_id.y * rowSize +
   local_invocation_id.z * sliceSize

以下是一些代码,希望能说明问题。

// Dispatch 4*3*2 = 24 workgroups, each of size 2*3*4 = 24 invocations.
const dispatchCount = [4, 3, 2];
const workgroupSize = [2, 3, 4];

// Product of all array elements, e.g. [2, 3, 4] -> 24.
const arrayProd = arr => arr.reduce((a, b) => a * b);

const numThreadsPerWorkgroup = arrayProd(workgroupSize);

// Shader that records, for every invocation, its workgroup_id,
// local_invocation_id and global_invocation_id into three parallel buffers
// (indexed by a computed global invocation index) so they can be compared.
const code = `
// 注意:vec3u 被填充为 4 个字节
@group(0) @binding(0) var<storage, read_write> workgroupResult: array<vec3u>;
@group(0) @binding(1) var<storage, read_write> localResult: array<vec3u>;
@group(0) @binding(2) var<storage, read_write> globalResult: array<vec3u>;

const wg_size = ${numThreadsPerWorkgroup};
@compute @workgroup_size(${workgroupSize})
fn test(
    @builtin(workgroup_id) workgroup_id : vec3<u32>,
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
    @builtin(num_workgroups) num_workgroups: vec3<u32>
) {
  let workgroup_index =  
     workgroup_id.x +
     workgroup_id.y * num_workgroups.x +
     workgroup_id.z * num_workgroups.x * num_workgroups.y;
  let global_invocation_index =
     workgroup_index * wg_size +
     local_invocation_index;
  workgroupResult[global_invocation_index] = workgroup_id;
  localResult[global_invocation_index] = local_invocation_id;
  globalResult[global_invocation_index] = global_invocation_id;
}
`;

// ...rest of the code elided — see the full snippet in the English answer below...

在第一个分派中,请注意 globallocal 是相同的。对于其余的分派,它们是不同的。

还请注意,同一工作组中的线程可以共享 var<workgroup> 变量。

请注意:您不需要调用 await device.queue.onSubmittedWorkDone。根据规范的描述:

在 mapAsync 的第 4 步中:

在使用此内容的当前排队操作完成之后,
...
在 contentTimeline 上发出映射成功步骤。

英文:

The issue is I think you're confusing workgroup size and dispatchWorkgroups.

@workgroup_size defines how many individual processes to run per workgroup. In other words, if you set @workgroup_size(256, 1) in your shader, then dispatchWorkgroups(1) will run 256 processes.

You issue the command computePassEncoder.dispatchWorkgroups(256, 1); which means you're running 256 processes, 256 times (64k processes in total).

Since thread = id.x and id.x comes from workgroup_id, that means when the first workgroup is run, workgroup_id.x will be 0 for all 256 threads. (because workgroup_id is the id of the workgroup)
So, there are 256 times that thread == 0 which means there's a race between threads trying to update resultBuffer[c]. This is why the result is random.

There are 3 ids

  • local_invocation_id (the ID with in a workgroup)
  • workgroup_id (the ID of a workgroup)
  • global_invocation_id (workgroup_id * workgroup_size + local_invocation_id)

There's one convenience builtin

  • local_invocation_index

    rowSize = workgroup_size.x
    sliceSize = workgroup_size.x * workgroup_size.y
    local_invocation_index =
    local_invocation_id.x + 
    local_invocation_id.y * rowSize +
    local_invocation_id.z * sliceSize
    

Here's some code to hopefully illustrate the point

<!-- begin snippet: js hide: false console: true babel: false -->

<!-- language: lang-js -->

const dispatchCount = [4, 3, 2];
const workgroupSize = [2, 3, 4];
const arrayProd = arr =&gt; arr.reduce((a, b) =&gt; a * b);
const numThreadsPerWorkgroup = arrayProd(workgroupSize);
const code = `
// NOTE!: vec3u is padded by 4 bytes (16-byte stride) in storage arrays
@group(0) @binding(0) var&lt;storage, read_write&gt; workgroupResult: array&lt;vec3u&gt;;
@group(0) @binding(1) var&lt;storage, read_write&gt; localResult: array&lt;vec3u&gt;;
@group(0) @binding(2) var&lt;storage, read_write&gt; globalResult: array&lt;vec3u&gt;;
const wg_size = ${numThreadsPerWorkgroup};
@compute @workgroup_size(${workgroupSize})
fn test(
@builtin(workgroup_id) workgroup_id : vec3&lt;u32&gt;,
@builtin(local_invocation_id) local_invocation_id : vec3&lt;u32&gt;,
@builtin(global_invocation_id) global_invocation_id : vec3&lt;u32&gt;,
@builtin(local_invocation_index) local_invocation_index: u32,
@builtin(num_workgroups) num_workgroups: vec3&lt;u32&gt;
) {
let workgroup_index =  
workgroup_id.x +
workgroup_id.y * num_workgroups.x +
workgroup_id.z * num_workgroups.x * num_workgroups.y;
let global_invocation_index =
workgroup_index * wg_size +
local_invocation_index;
workgroupResult[global_invocation_index] = workgroup_id;
localResult[global_invocation_index] = local_invocation_id;
globalResult[global_invocation_index] = global_invocation_id;
}
`;
async function main(){
const adapter = await navigator.gpu?.requestAdapter();
const device = await adapter?.requestDevice();
if (!device) {
console.error(&#39;need WebGPU&#39;);
return;
}
const numWorkgroups = dispatchCount[0] * dispatchCount[1] * dispatchCount[2];
const numResults = numWorkgroups * numThreadsPerWorkgroup;
const size = numResults * 4 * 4;  // vec3f * u32
let usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
const workgroupBufferGPU = device.createBuffer({size, usage});
const localBufferGPU = device.createBuffer({size, usage});
const globalBufferGPU = device.createBuffer({size, usage});
usage = GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST;
const workgroupReadBufferGPU = device.createBuffer({size, usage});
const localReadBufferGPU = device.createBuffer({size, usage});
const globalReadBufferGPU = device.createBuffer({size, usage});
const shaderModule = device.createShaderModule({code});
const computePipeline = device.createComputePipeline({
layout: &#39;auto&#39;,
compute: {
module: shaderModule,
entryPoint: &quot;test&quot;
}
});
const bindGroup = device.createBindGroup({
layout: computePipeline.getBindGroupLayout(0),
entries: [
{ binding: 0, resource: { buffer: workgroupBufferGPU }},
{ binding: 1, resource: { buffer: localBufferGPU }},
{ binding: 2, resource: { buffer: globalBufferGPU }},
],
});
const commandEncoder = device.createCommandEncoder();
const computePassEncoder = commandEncoder.beginComputePass();
computePassEncoder.setPipeline(computePipeline);
computePassEncoder.setBindGroup(0, bindGroup);
computePassEncoder.dispatchWorkgroups(...dispatchCount);
computePassEncoder.end();
commandEncoder.copyBufferToBuffer(workgroupBufferGPU, 0, workgroupReadBufferGPU, 0, size);
commandEncoder.copyBufferToBuffer(localBufferGPU, 0, localReadBufferGPU, 0, size);
commandEncoder.copyBufferToBuffer(globalBufferGPU, 0, globalReadBufferGPU, 0, size);
device.queue.submit([commandEncoder.finish()]);
await Promise.all([
workgroupReadBufferGPU.mapAsync(GPUMapMode.READ),
localReadBufferGPU.mapAsync(GPUMapMode.READ),
globalReadBufferGPU.mapAsync(GPUMapMode.READ),
]);
const workgroup = new Uint32Array(workgroupReadBufferGPU.getMappedRange());
const local = new Uint32Array(localReadBufferGPU.getMappedRange());
const global = new Uint32Array(globalReadBufferGPU.getMappedRange());
const get3 = (arr, i) =&gt; {
const off = i * 4;
return `${arr[off]},${arr[off + 1]},${arr[off + 2]}`;
};
for (let i = 0; i &lt; numResults; ++i) {
if (i % numThreadsPerWorkgroup === 0) {
log(`g-index   workgroup  local  global   dispatch: ${i / numThreadsPerWorkgroup}`);
}
log(`${i.toString().padStart(3)}:      ${get3(workgroup, i)}      ${get3(local, i)}   ${get3(global, i)}`)
}
}
function log(...args) {
const elem = document.createElement(&#39;pre&#39;);
elem.textContent = args.join(&#39; &#39;);
document.body.appendChild(elem);
}
main();

<!-- language: lang-css -->

pre { margin: 0; }

<!-- end snippet -->

notice in the first dispatch global and local are the same. For the rest of the dispatches they aren't

Also notice, workgroup_id is the same for all threads in the same workgroup.

I don't know all the reasons why it's this way but, within a single workgroup the threads can share var&lt;workgroup&gt; variables.

note: you do not need to call await device.queue.onSubmittedWorkDone. From the spec:

Under step 4 of mapAsync:

> after the completion of currently-enqueued operations that use this,
>
> ...
>
> Issue the map success steps on the contentTimeline.

huangapple
  • 本文由 发表于 2023年6月1日 09:03:12
  • 转载请务必保留本文链接:https://go.coder-hub.com/76378081.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定