WebGPU compute shader gives unexpected result
Question
I made a compute shader that gives an unexpected result:
@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}
Only one thread will write to resultBuffer, so every element should be 1. The function any_compute() has no side effects. I guess the heavy computation causes this problem. The results are correct most of the time, but sometimes they are unexpected: a few 2s or 3s appear in resultBuffer.
The full test program:
const cWGSL = `
@group(0) @binding(0) var<storage, read_write> resultBuffer: array<f32>;

const PI = 3.14159265358979323846;
const inv4PI = 0.25 / PI;
const eps = 1e-6;

fn any_compute(a : vec4<f32>, b : vec4<f32>) -> vec3f
{
    let dist = a.xyz - b.xyz;
    let invDist = inverseSqrt(dot(dist, dist) + eps);
    let invDistCube = invDist * invDist * invDist;
    let s = b.w * invDistCube;
    return -s * inv4PI * dist;
}

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}`;

async function main() {
    const adapter = await navigator.gpu.requestAdapter();
    const device = await adapter.requestDevice();
    // to-do: check limit
    console.log(adapter);
    const maxGPUThread = 256;

    const resultBufferGPU = device.createBuffer({
        size: 4000,
        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
    });
    const readBufferGPU = device.createBuffer({
        size: 4000,
        usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
    });
    const shaderModule = device.createShaderModule({
        code: cWGSL,
    });
    const computePipeline = device.createComputePipeline({
        layout: 'auto', // infer from shader code.
        compute: {
            module: shaderModule,
            entryPoint: "test"
        }
    });

    const bindGroupLayout = computePipeline.getBindGroupLayout(0);
    const buffers = [resultBufferGPU];
    const entries = buffers.map((b, i) => {
        return { binding: i, resource: { buffer: b } };
    });
    const bindGroup = device.createBindGroup({
        layout: bindGroupLayout,
        entries: entries
    });

    const commandEncoder = device.createCommandEncoder();
    const computePassEncoder = commandEncoder.beginComputePass();
    computePassEncoder.setPipeline(computePipeline);
    computePassEncoder.setBindGroup(0, bindGroup);
    computePassEncoder.dispatchWorkgroups(256, 1);
    computePassEncoder.end();
    commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);
    const gpuCommands = commandEncoder.finish();
    device.queue.submit([gpuCommands]);

    await device.queue.onSubmittedWorkDone();
    await readBufferGPU.mapAsync(GPUMapMode.READ);
    const arrayBuffer = readBufferGPU.getMappedRange();
    const result = new Float32Array(arrayBuffer);
    console.log(result);
    const resultCount = {};
    Array.from(new Set(result)).forEach(v => resultCount[v] = result.filter(x => x == v).length);
    console.log(resultCount);
}
main();
There is also a jsfiddle version. I tested it in Chrome Canary.
The count of each value's occurrences is printed to the console. The expected output is {1: 1000}.
Answer 1
Score: 2
The issue is, I think, that you're confusing workgroup size and dispatchWorkgroups.
@workgroup_size defines how many individual processes run per workgroup. In other words, if you set @workgroup_size(256, 1) in your shader, then dispatchWorkgroups(1) will run 256 processes.
You issue the command computePassEncoder.dispatchWorkgroups(256, 1), which means you're running 256 processes 256 times (256 × 256 = 65,536 processes in total).
Since thread = id.x and id.x comes from workgroup_id, when the first workgroup runs, workgroup_id.x will be 0 for all 256 of its threads (because workgroup_id is the ID of the workgroup, not of the individual thread).
So there are 256 threads for which thread == 0, which means there's a race between threads trying to update resultBuffer[c]. This is why the result is random.
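A minimal sketch of one possible fix, assuming the goal is that exactly one invocation in the whole dispatch increments each element: switch the builtin from workgroup_id to global_invocation_id, which is unique per invocation across the entire dispatch.

@compute @workgroup_size(256, 1)
fn test(@builtin(global_invocation_id) id : vec3<u32>) {
    // global_invocation_id is unique per invocation, so exactly one
    // invocation in the entire dispatch has id.x == 0.
    let thread = id.x;
    for (var c = 0u; c < 1000; c++) {
        any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
        if (thread == 0) {
            resultBuffer[c] += 1;
        }
    }
}

Dispatching a single workgroup and keying off local_invocation_id instead would work as well.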
There are 3 IDs:

- local_invocation_id (the ID within a workgroup)
- workgroup_id (the ID of a workgroup)
- global_invocation_id (workgroup_id * workgroup_size + local_invocation_id)
There's one convenience builtin:

- local_invocation_index

rowSize = workgroup_size.x
sliceSize = workgroup_size.x * workgroup_size.y
local_invocation_index =
    local_invocation_id.x +
    local_invocation_id.y * rowSize +
    local_invocation_id.z * sliceSize
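For example, with the @workgroup_size(2, 3, 4) used in the code below, rowSize = 2 and sliceSize = 6, so local_invocation_id (1, 2, 3) gets local_invocation_index 1 + 2 * 2 + 3 * 6 = 23, the last of the workgroup's 24 invocations.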
Here's some code to hopefully illustrate the point.
const dispatchCount = [4, 3, 2];
const workgroupSize = [2, 3, 4];
const arrayProd = arr => arr.reduce((a, b) => a * b);
const numThreadsPerWorkgroup = arrayProd(workgroupSize);

const code = `
// NOTE!: vec3u is padded by 4 bytes (each array element takes 16 bytes)
@group(0) @binding(0) var<storage, read_write> workgroupResult: array<vec3u>;
@group(0) @binding(1) var<storage, read_write> localResult: array<vec3u>;
@group(0) @binding(2) var<storage, read_write> globalResult: array<vec3u>;

const wg_size = ${numThreadsPerWorkgroup};

@compute @workgroup_size(${workgroupSize})
fn test(
    @builtin(workgroup_id) workgroup_id : vec3<u32>,
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
    @builtin(num_workgroups) num_workgroups: vec3<u32>
) {
    let workgroup_index =
        workgroup_id.x +
        workgroup_id.y * num_workgroups.x +
        workgroup_id.z * num_workgroups.x * num_workgroups.y;
    let global_invocation_index =
        workgroup_index * wg_size +
        local_invocation_index;
    workgroupResult[global_invocation_index] = workgroup_id;
    localResult[global_invocation_index] = local_invocation_id;
    globalResult[global_invocation_index] = global_invocation_id;
}
`;

async function main() {
    const adapter = await navigator.gpu?.requestAdapter();
    const device = await adapter?.requestDevice();
    if (!device) {
        console.error('need WebGPU');
        return;
    }

    const numWorkgroups = dispatchCount[0] * dispatchCount[1] * dispatchCount[2];
    const numResults = numWorkgroups * numThreadsPerWorkgroup;
    const size = numResults * 4 * 4; // one padded vec3u = 4 u32s × 4 bytes

    let usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
    const workgroupBufferGPU = device.createBuffer({size, usage});
    const localBufferGPU = device.createBuffer({size, usage});
    const globalBufferGPU = device.createBuffer({size, usage});

    usage = GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST;
    const workgroupReadBufferGPU = device.createBuffer({size, usage});
    const localReadBufferGPU = device.createBuffer({size, usage});
    const globalReadBufferGPU = device.createBuffer({size, usage});

    const shaderModule = device.createShaderModule({code});
    const computePipeline = device.createComputePipeline({
        layout: 'auto',
        compute: {
            module: shaderModule,
            entryPoint: "test"
        }
    });

    const bindGroup = device.createBindGroup({
        layout: computePipeline.getBindGroupLayout(0),
        entries: [
            { binding: 0, resource: { buffer: workgroupBufferGPU }},
            { binding: 1, resource: { buffer: localBufferGPU }},
            { binding: 2, resource: { buffer: globalBufferGPU }},
        ],
    });

    const commandEncoder = device.createCommandEncoder();
    const computePassEncoder = commandEncoder.beginComputePass();
    computePassEncoder.setPipeline(computePipeline);
    computePassEncoder.setBindGroup(0, bindGroup);
    computePassEncoder.dispatchWorkgroups(...dispatchCount);
    computePassEncoder.end();
    commandEncoder.copyBufferToBuffer(workgroupBufferGPU, 0, workgroupReadBufferGPU, 0, size);
    commandEncoder.copyBufferToBuffer(localBufferGPU, 0, localReadBufferGPU, 0, size);
    commandEncoder.copyBufferToBuffer(globalBufferGPU, 0, globalReadBufferGPU, 0, size);
    device.queue.submit([commandEncoder.finish()]);

    await Promise.all([
        workgroupReadBufferGPU.mapAsync(GPUMapMode.READ),
        localReadBufferGPU.mapAsync(GPUMapMode.READ),
        globalReadBufferGPU.mapAsync(GPUMapMode.READ),
    ]);
    const workgroup = new Uint32Array(workgroupReadBufferGPU.getMappedRange());
    const local = new Uint32Array(localReadBufferGPU.getMappedRange());
    const global = new Uint32Array(globalReadBufferGPU.getMappedRange());

    const get3 = (arr, i) => {
        const off = i * 4;
        return `${arr[off]},${arr[off + 1]},${arr[off + 2]}`;
    };

    for (let i = 0; i < numResults; ++i) {
        if (i % numThreadsPerWorkgroup === 0) {
            log(`g-index workgroup local global dispatch: ${i / numThreadsPerWorkgroup}`);
        }
        log(`${i.toString().padStart(3)}: ${get3(workgroup, i)} ${get3(local, i)} ${get3(global, i)}`);
    }
}

function log(...args) {
    const elem = document.createElement('pre');
    elem.textContent = args.join(' ');
    document.body.appendChild(elem);
}

main();
The runnable snippet also includes a bit of CSS, pre { margin: 0; }, to keep the logged lines compact.
Notice that in the first dispatch, global and local are the same; for the rest of the dispatches they aren't. Also notice that workgroup_id is the same for all threads in the same workgroup. I don't know all the reasons why it's this way, but within a single workgroup the threads can share var<workgroup> variables.
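As a minimal sketch of that sharing (the names partial, wgSum, and reduce are made up for illustration): each invocation writes one slot of a workgroup-shared array, workgroupBarrier() makes the writes visible, and invocation 0 combines them.

// Workgroup-shared scratch space, visible to all 256 invocations of a workgroup.
var<workgroup> partial : array<f32, 256>;

@group(0) @binding(0) var<storage, read_write> wgSum : array<f32>;

@compute @workgroup_size(256)
fn reduce(@builtin(local_invocation_index) lid : u32,
          @builtin(workgroup_id) wid : vec3<u32>) {
    partial[lid] = f32(lid);   // each invocation fills its own slot
    workgroupBarrier();        // wait until all 256 slots are written
    if (lid == 0) {
        var total = 0.0;
        for (var i = 0u; i < 256u; i++) {
            total += partial[i];
        }
        wgSum[wid.x] = total;  // one summed result per workgroup
    }
}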
Note: you do not need to call await device.queue.onSubmittedWorkDone. From the spec, under step 4 of mapAsync:

> after the completion of currently-enqueued operations that use this,
>
> ...
>
> Issue the map success steps on the contentTimeline.
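So, as a sketch against the question's program (reusing its variable names), the readback can simply be:

device.queue.submit([gpuCommands]);
// mapAsync resolves only after the submitted work that uses the buffer completes.
await readBufferGPU.mapAsync(GPUMapMode.READ);
const result = new Float32Array(readBufferGPU.getMappedRange());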