我编译CUDA代码时出现分段错误。

huangapple go评论60阅读模式
英文:

I get a segmentation fault when I run my CUDA program

问题

程序试图使用CUDA计算π值,但在尝试显示计算结果时出现了分段错误。发生这种情况的原因是什么以及如何正确分配内存?

// System includes
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <iostream>

using namespace std;

#define CUDA_FLOAT float
#define dx 0.1f
#define BLOCK_SIZE 16
#define GRID_SIZE 1
#define THREAD_SIZE 1

// Computes per-thread partial integrals of sqrt(1 - x^2) via the trapezoid
// rule. Thread n integrates THREAD_SIZE steps of width dx starting at its
// own x0 and writes its partial sum to res[n]; the host is expected to
// reduce the partials afterwards.
// Launch layout assumed: 1-D grid of 1-D blocks of BLOCK_SIZE threads,
// with res holding at least gridDim.x * BLOCK_SIZE elements.
__global__ void pi_kern(CUDA_FLOAT *res)
{
    int n = threadIdx.x + blockIdx.x * BLOCK_SIZE; // global thread index
    // Left edge of this thread's integration segment.
    CUDA_FLOAT x0 = n * 1.f / (BLOCK_SIZE * GRID_SIZE * THREAD_SIZE);
    CUDA_FLOAT s = 0.f; // accumulated integral for this thread
    CUDA_FLOAT x1, y1;
    // BUG FIX: y0 was declared but never initialized, so the first loop
    // iteration read garbage. Seed it with the curve value at x0.
    // fmaxf clamps the radicand so x slightly above 1 does not yield NaN.
    CUDA_FLOAT y0 = sqrtf(fmaxf(0.f, 1.f - x0 * x0));
    for (int i = 0; i < THREAD_SIZE; i++)
    {
        x1 = x0 + dx;
        y1 = sqrtf(fmaxf(0.f, 1.f - x1 * x1));
        s += (y0 + y1) * dx / 2.f; // trapezoid rule: average height * width
        x0 = x1;
        y0 = y1;
    }
    res[n] = s;
}

// Allocates a device buffer, launches pi_kern, copies the per-thread
// partial sums back to the host, and prints them.
// Returns 0 on success, 1 on any allocation/launch/copy failure.
int main(int argc, char **argv)
{
    printf("[center-of-mass] - Starting\n");

    const int N = 256; // device buffer capacity (elements)
    // Only this many entries are actually written by the launch below.
    const int written = BLOCK_SIZE * GRID_SIZE;

    // BUG FIX (the reported segfault): the result buffer is dereferenced on
    // the host (cout << data[i]), so it must be HOST memory. The original
    // code allocated it with cudaMalloc, making the host-side read
    // dereference a device pointer.
    CUDA_FLOAT *data = (CUDA_FLOAT *)malloc(N * sizeof(CUDA_FLOAT));
    if (data == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    CUDA_FLOAT *d_data = NULL;
    cudaError_t err = cudaMalloc((void **)&d_data, N * sizeof(CUDA_FLOAT));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(data);
        return 1;
    }
    // NOTE: the original host->device cudaMemcpy of uninitialized data was
    // pointless (the kernel overwrites its output) and has been removed.

    // BUG FIX: the original grid size GRID_SIZE*THREAD_SIZE/BLOCK_SIZE
    // evaluates to 0 (1*1/16) — an invalid launch configuration.
    dim3 block(BLOCK_SIZE);
    dim3 grid(GRID_SIZE);

    pi_kern<<<grid, block>>>(d_data);
    err = cudaGetLastError();          // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize(); // catch asynchronous execution errors
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_data);
        free(data);
        return 1;
    }

    // Copy back only the entries the kernel actually produced.
    err = cudaMemcpy(data, d_data, written * sizeof(CUDA_FLOAT),
                     cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_data);
        free(data);
        return 1;
    }

    for (int i = 0; i < written; i++)
        cout << data[i] << " ";
    cout << endl;

    cudaFree(d_data);
    free(data);
    return 0;
}

我不知道问题出在哪里,这是我的第一个CUDA项目。

英文:

The program is supposed to compute pi using CUDA, but I get a segmentation fault when I try to print the results of the calculation. Why does this happen, and how should I allocate the memory properly?

// System includes
#include &lt;stdio.h&gt;
#include &lt;assert.h&gt;
#include &lt;cuda.h&gt;
#include &lt;iostream&gt;

using namespace std;

#define CUDA_FLOAT float
#define dx 0.1f
#define BLOCK_SIZE 16
#define GRID_SIZE 1
#define THREAD_SIZE 1
__global__ void pi_kern(CUDA_FLOAT *res)
{
    int n = threadIdx.x + blockIdx.x * BLOCK_SIZE; //start
    CUDA_FLOAT x0 = n * 1.f / (BLOCK_SIZE * GRID_SIZE * THREAD_SIZE); //integrating step
    CUDA_FLOAT s = 0; // Value of integrating on the segment for given thread
    CUDA_FLOAT x1, y1;
    CUDA_FLOAT y0;
    for (int i=0; i &lt; THREAD_SIZE; i++)
    {
        x1 = x0 + dx;
        y1 = sqrtf(1 - x1 * x1);
        s += (y0 + y1) * dx / 2.f;
        x0 = x1; y0 = y1;
    }
    res[n] = s;
}


int main(int argc, char **argv)
{   
    printf(&quot;[center-of-mass] - Starting\n&quot;);
    CUDA_FLOAT* data;
    CUDA_FLOAT* d_data;
    cudaMalloc ((void **) &amp;data, 256*sizeof(CUDA_FLOAT));
    cudaMalloc ((void **) &amp;d_data, 256*sizeof(CUDA_FLOAT));
    cudaMemcpy(d_data, data,256*sizeof(CUDA_FLOAT), cudaMemcpyHostToDevice); 
    dim3 block = dim3(BLOCK_SIZE);
    dim3 grid = dim3(GRID_SIZE*THREAD_SIZE/BLOCK_SIZE);
    pi_kern&lt;&lt;&lt;grid, block&gt;&gt;&gt; (d_data);
    cudaMemcpy(data, d_data, 256*sizeof( float ),cudaMemcpyDeviceToHost );
    //cudaFree(d_data);
    for (int i = 0; i &lt; 256; i++)
    {
        cout &lt;&lt; data[i]; 
    }
}

I don't know where the problem is, it's my first cuda project.

答案1

得分: 0

data 和 d_data 一样是用 cudaMalloc 在 GPU 上分配的。因此主机端在打印结果时对 data[i] 的解引用访问的是设备指针,这就是分段错误的原因。

为了避免这种错误,您可以使用cudaMallocManaged。这允许您从主机和设备读/写缓冲区,而无需担心内存管理。不再需要cudaMemcpy!

阅读更多信息:https://developer.nvidia.com/blog/unified-memory-cuda-beginners/

英文:

data is allocated on the GPU with cudaMalloc, just like d_data. So when the host later dereferences data[i] to print the results, it is reading through a device pointer — that host-side read is what causes the segmentation fault.

To avoid this kind of error, you can use cudaMallocManaged. This allows you to read/write to your buffer from the host and the device without having to worry about memory management. No more need of cudamemcpy!

Read more here : https://developer.nvidia.com/blog/unified-memory-cuda-beginners/

huangapple
  • 本文由 发表于 2023年4月6日 20:04:39
  • 转载请务必保留本文链接:https://go.coder-hub.com/75949321.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定