CUDA转置核心随机失败

huangapple go评论72阅读模式
英文:

CUDA transpose kernel fails randomly

问题

我尝试转置一个矩阵。对于一些值,它按预期工作,但对于较大的值,甚至在程序执行之间崩溃。

我尝试的目标是将矩阵分割成n×n块,每个块启动SEGMENT_SIZE线程,每个线程处理其块的一列。目前,我假设MATRIX_SIZE可被SEGMENT_SIZE整除。

调用和参数:

#define MATRIX_SIZE 64
#define SEGMENT_SIZE 32

int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;

dim3 blocksPerGrid(n_block, n_block);
dim3 threadsPerBlock(SEGMENT_SIZE);
transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_data, MATRIX_SIZE);

核函数:

// In-place tiled matrix transpose (the question's kernel).
// Launch: (mat_dim/blockDim.x)^2 blocks, blockDim.x threads each, with
// blockDim.x * blockDim.x * sizeof(float) dynamic shared memory.
// NOTE(review): this kernel is the source of the random failures described
// above — __syncthreads() only synchronizes threads WITHIN a block, so with
// more than one block pair, a block can write its transposed tile over input
// data that the mirror block has not yet read.
__global__ void transposeMatrix(float *d_data, int mat_dim)
{
	//  Tile staged in shared memory
	extern __shared__ float sdata[];
	
	// Each thread copies one column of this block's tile into shared memory.
	for (int i = 0; i < blockDim.x; i++)
	{

		sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
	}
	// Block-local barrier only; it does NOT order this block against others.
	__syncthreads();

	// Write the tile back transposed — in place, so this clobbers the mirror
	// tile that a different block may still need to read (the race).
	for (int i = 0; i < blockDim.x; i++)
	{
		d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
	}
}

对于MATRIX_SIZE为64和SEGMENT_SIZE为32,按预期工作,从不失败,但对于较大的MATRIX_SIZE值(如480),它开始表现出意外行为,并在某些执行中生成的矩阵不正确。

英文:

I am trying to transpose a matrix. It works as expected for some values and starts crashing with bigger ones or even between executions of the program.

What I am trying to make is to split the matrix in n by n blocks, each one launches SEGMENT_SIZE threads and each thread processes a column of its block. For now I am assuming that MATRIX_SIZE is divisible by SEGMENT_SIZE.

Invocation and parameters:

#define MATRIX_SIZE 64
#define SEGMENT_SIZE 32

int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;

dim3 blocksPerGrid(n_block, n_block);
dim3 threadsPerBlock(SEGMENT_SIZE);
transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_data, MATRIX_SIZE);

Kernel:

__global__ void transposeMatrix(float *d_data, int mat_dim)
{
	//  Array in Shared Memory
	extern __shared__ float sdata[];
	
	for (int i = 0; i < blockDim.x; i++)
	{

		sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
	}
	__syncthreads();

	for (int i = 0; i < blockDim.x; i++)
	{
		d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
	}
}

For MATRIX_SIZE 64 and SEGMENT_SIZE 32 works as expected and never fails, but for bigger values of MATRIX_SIZE (like 480) it starts behaving unexpectedly and on some executions the resulting matrix is not correct.

答案1

得分: 2

以下是翻译好的部分:

按照你目前的设计,就地转置恐怕无法正确工作。"__syncthreads()" 不会同步整个网格,而只会同步每个块内部的线程。因此,在更大的测试案例中,一些线程块会把自己的输出写到其他线程块尚未读入的输入数据之上。

两种选择可能是:

  1. 使输出矩阵与输入矩阵不同,

或:

  2. 分配每个块来处理对角线两侧成对的数据块。您首先将这两个块加载到共享内存中,然后执行 __syncthreads(),然后写出这两个块。这仅适用于方形数组,但您的问题正是针对方形数组。

(这里没有涵盖的第三种选择是用适当的协作内核网格范围同步替换您对__syncthreads()的使用。)

这是一个演示这两个想法的工作示例:

$ cat t24.cu
#include <iostream>

// Out-of-place tiled transpose: each block stages its blockDim.x-square tile
// of i_data in shared memory, then writes it transposed into o_data. Because
// input and output buffers are distinct, no cross-block ordering is needed.
// Requires blockDim.x * blockDim.x * sizeof(float) dynamic shared memory.
__global__ void transposeMatrix(float *o_data, float *i_data,  int mat_dim)
{
    // Dynamically sized shared-memory tile.
    extern __shared__ float sdata[];

    const int tile    = blockDim.x;
    const int rowBase = blockIdx.y * tile;  // first input row of this tile
    const int colBase = blockIdx.x * tile;  // first input column of this tile

    // Each thread loads one column of the tile (coalesced row reads across
    // the warp for each r).
    for (int r = 0; r < tile; r++)
        sdata[r * tile + threadIdx.x] =
            i_data[(rowBase + r) * mat_dim + colBase + threadIdx.x];

    // All loads must land before any thread reads the tile back out.
    __syncthreads();

    // Emit the tile transposed: element (r, threadIdx.x) of the input tile
    // goes to (threadIdx.x, r) of the mirrored tile in the output.
    for (int r = 0; r < tile; r++)
        o_data[(colBase + threadIdx.x) * mat_dim + rowBase + r] =
            sdata[r * tile + threadIdx.x];
}

// In-place transpose, corrected: each block with blockIdx.x > blockIdx.y
// handles BOTH its own tile and the mirror tile across the diagonal;
// diagonal blocks (x == y) handle just their own. Staging both tiles in
// shared memory before any write removes the cross-block race.
// Requires 2 * blockDim.x * blockDim.x * sizeof(float) dynamic shared memory
// (first half: this block's tile; second half: the mirror tile).
__global__ void transposeMatrix_ip(float *d_data,  int mat_dim)
{
    //  Two tiles staged back to back in shared memory
    extern __shared__ float sdata[];
    // blockIdx is uniform across the block, so this branch does not diverge
    // within a block and the __syncthreads() below is reached by all threads
    // of every participating block.
    if (blockIdx.x >= blockIdx.y){  // only blocks on or "above" the diagonal participate
      for (int i = 0; i < blockDim.x; i++)
      {

        // Stage row i of the tile at (row-block y, col-block x) in the first half.
        sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
        // Off-diagonal blocks also stage the mirror tile (row-block x,
        // col-block y) in the second half.
        if (blockIdx.x != blockIdx.y) sdata[(blockDim.x*blockDim.x)+blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.x * blockDim.x) * mat_dim + (blockDim.x * blockIdx.y + threadIdx.x)];
      }
      // Both tiles are fully in shared memory before any write-back, so the
      // in-place stores below cannot destroy unread input of this block pair.
      __syncthreads();

      for (int i = 0; i < blockDim.x; i++)
      {
        // First staged tile, transposed, overwrites the mirror location ...
        d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
        // ... and the mirror tile, transposed, overwrites the original location.
        if (blockIdx.x != blockIdx.y) d_data[(i + blockIdx.x * blockDim.x) + (threadIdx.x + blockIdx.y * blockDim.x) * mat_dim] = sdata[(blockDim.x*blockDim.x)+ threadIdx.x + i * blockDim.x];
      }
    }
}

// Test driver: fills a MATRIX_SIZE x MATRIX_SIZE matrix with sequential
// values, runs the out-of-place transpose then the in-place transpose, and
// verifies both results on the host.
int main(){

#define MATRIX_SIZE 4000
#define SEGMENT_SIZE 32

  // Ceiling division so a trailing partial tile would still get a block.
  int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;
  float *d_idata, *d_odata, *h_idata, *h_odata;
  cudaMalloc(&d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
  cudaMalloc(&d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
  h_idata = new float[MATRIX_SIZE*MATRIX_SIZE];
  h_odata = new float[MATRIX_SIZE*MATRIX_SIZE];
  // Row-major fill with 0,1,2,...; MATRIX_SIZE^2-1 = 15,999,999 is still
  // exactly representable in a float, so the equality checks are reliable.
  int q = 0;
  for (int i = 0; i < MATRIX_SIZE; i++)
    for (int j = 0; j < MATRIX_SIZE; j++)
      h_idata[i*MATRIX_SIZE+j] = q++;
  cudaMemcpy(d_idata, h_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyHostToDevice);
  dim3 blocksPerGrid(n_block, n_block);
  dim3 threadsPerBlock(SEGMENT_SIZE);
  // Out-of-place version: one tile of dynamic shared memory per block.
  transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_odata, d_idata, MATRIX_SIZE);
  // Kernel launches return no status; surface launch-config errors explicitly.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << "launch 1 failed: " << cudaGetErrorString(err) << std::endl;
  cudaMemcpy(h_odata, d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
  q = 0;
  // BUGFIX: the original "break; break;" only exited the inner loop (the
  // second break was dead code), so after the first mismatch the outer loop
  // continued with q out of sync and printed a spurious report per row.
  // A flag now stops both loops at the first mismatch.
  bool mismatch = false;
  for (int i = 0; i < MATRIX_SIZE && !mismatch; i++)
    for (int j = 0; j < MATRIX_SIZE && !mismatch; j++)
      if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "1mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; mismatch = true;}

  // In-place version: needs room for TWO tiles in shared memory.
  transposeMatrix_ip<<<blocksPerGrid, threadsPerBlock, 2*threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_idata, MATRIX_SIZE);
  err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << "launch 2 failed: " << cudaGetErrorString(err) << std::endl;
  cudaMemcpy(h_odata, d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
  q = 0;
  mismatch = false;
  for (int i = 0; i < MATRIX_SIZE && !mismatch; i++)
    for (int j = 0; j < MATRIX_SIZE && !mismatch; j++)
      if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "2mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; mismatch = true;}

  // Release device and host buffers (the original leaked all four).
  cudaFree(d_idata);
  cudaFree(d_odata);
  delete[] h_idata;
  delete[] h_odata;
  return 0;
}
$ nvcc -o t24 t24.cu
$ compute-sanitizer ./t24
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

这种就地方法让每个块处理2个数据块。因此,我们只需要位于对角线上或对角线一侧的块参与工作。这些块每个加载2个数据块到共享内存中,然后写出这两个数据块。因此,每个块需要的共享内存是原来的两倍。对于对角线上的块(blockIdx.x == blockIdx.y),每个块只需要一个数据块操作而不是两个。

还要注意,这里选择的矩阵大小(4000)是经过精心挑选的,以避免超出 float 类型能够精确存储的整数范围。如果把这个测试案例做得更大,它很可能会因为测试中涉及的整数过大而失败,而不是因为该方法本身有缺陷。

英文:

It's doubtful that an in-place transpose could work the way you have it designed here. __syncthreads() does not synchronize the entire grid, only each block. Therefore, in a larger test case, you will have some threadblocks writing their output over input data that has not yet been read in by some other threadblock.

Two options would be to:

  1. make the output matrix different than the input matrix,

or:

  2. assign each block to handle the data tile on each side of the diagonal. You would first load both tiles into shared memory, then do __syncthreads(), then write out both tiles. This only works for square arrays, but your problem has square arrays in view.

(A third option not covered here would be to replace your usage of __syncthreads() with an appropriate cooperative kernels grid-wide sync.)

Here is a worked example demonstrating both ideas:

$ cat t24.cu
#include <iostream>
__global__ void transposeMatrix(float *o_data, float *i_data,  int mat_dim)
{
//  Array in Shared Memory
extern __shared__ float sdata[];
for (int i = 0; i < blockDim.x; i++)
{
sdata[blockDim.x * i + threadIdx.x] = i_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
}
__syncthreads();
for (int i = 0; i < blockDim.x; i++)
{
o_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
}
}
__global__ void transposeMatrix_ip(float *d_data,  int mat_dim)
{
//  Array in Shared Memory
extern __shared__ float sdata[];
if (blockIdx.x >= blockIdx.y){  // only need blocks on or "above" the diagonal
for (int i = 0; i < blockDim.x; i++)
{
sdata[blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.y * blockDim.x) * mat_dim + (blockDim.x * blockIdx.x + threadIdx.x)];
if (blockIdx.x != blockIdx.y) sdata[(blockDim.x*blockDim.x)+blockDim.x * i + threadIdx.x] = d_data[(i + blockIdx.x * blockDim.x) * mat_dim + (blockDim.x * blockIdx.y + threadIdx.x)];
}
__syncthreads();
for (int i = 0; i < blockDim.x; i++)
{
d_data[(i + blockIdx.y * blockDim.x) + (threadIdx.x + blockIdx.x * blockDim.x) * mat_dim] = sdata[threadIdx.x + i * blockDim.x];
if (blockIdx.x != blockIdx.y) d_data[(i + blockIdx.x * blockDim.x) + (threadIdx.x + blockIdx.y * blockDim.x) * mat_dim] = sdata[(blockDim.x*blockDim.x)+ threadIdx.x + i * blockDim.x];
}
}
}
int main(){
#define MATRIX_SIZE 4000
#define SEGMENT_SIZE 32
int n_block = (MATRIX_SIZE % SEGMENT_SIZE) ? MATRIX_SIZE / SEGMENT_SIZE + 1 : MATRIX_SIZE / SEGMENT_SIZE;
float *d_idata, *d_odata, *h_idata, *h_odata;
cudaMalloc(&d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
cudaMalloc(&d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float));
h_idata = new float[MATRIX_SIZE*MATRIX_SIZE];
h_odata = new float[MATRIX_SIZE*MATRIX_SIZE];
int q = 0;
for (int i = 0; i < MATRIX_SIZE; i++)
for (int j = 0; j < MATRIX_SIZE; j++)
h_idata[i*MATRIX_SIZE+j] = q++;
cudaMemcpy(d_idata, h_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksPerGrid(n_block, n_block);
dim3 threadsPerBlock(SEGMENT_SIZE);
transposeMatrix<<<blocksPerGrid, threadsPerBlock, threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_odata, d_idata, MATRIX_SIZE);
cudaMemcpy(h_odata, d_odata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
q = 0;
for (int i = 0; i < MATRIX_SIZE; i++)
for (int j = 0; j < MATRIX_SIZE; j++)
if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "1mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; break; break;}
transposeMatrix_ip<<<blocksPerGrid, threadsPerBlock, 2*threadsPerBlock.x * threadsPerBlock.x * sizeof(float)>>>(d_idata, MATRIX_SIZE);
cudaMemcpy(h_odata, d_idata, MATRIX_SIZE*MATRIX_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
q = 0;
for (int i = 0; i < MATRIX_SIZE; i++)
for (int j = 0; j < MATRIX_SIZE; j++)
if (h_odata[j*MATRIX_SIZE+i] != q++) {std::cout << "2mismatch at: " << i << "," << j << " was: " << h_odata[j*MATRIX_SIZE+i] << " should be: " << q-1 << std::endl; break; break;}
}
$ nvcc -o t24 t24.cu
$ compute-sanitizer ./t24
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

this in-place approach has each block handling 2 tiles. Therefore we only need blocks on or on one side of the diagonal to act. Those blocks each load 2 tiles into shared memory, and then write out those two tiles. Therefore twice as much shared mem per block is needed. For the blocks on the diagonal (blockIdx.x == blockIdx.y) we only need one tile operation per block not two.

Also note that the matrix size (4000) here is chosen carefully to avoid overflowing the integer-storage capacity of a float quantity. If you make this test case much larger, it will probably fail due to the larger integers involved in the test case, not because the method is flawed.

huangapple
  • 本文由 发表于 2023年5月31日 23:59:18
  • 转载请务必保留本文链接:https://go.coder-hub.com/76375367.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定