英文:
I get segmentation fault, when i compile cuda code
问题
程序试图使用CUDA计算π值,但在尝试显示计算结果时出现了分段错误。发生这种情况的原因是什么以及如何正确分配内存?
// System includes
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <iostream>
using namespace std;
#define CUDA_FLOAT float
#define dx 0.1f
#define BLOCK_SIZE 16
#define GRID_SIZE 1
#define THREAD_SIZE 1
__global__ void pi_kern(CUDA_FLOAT *res)
{
int n = threadIdx.x + blockIdx.x * BLOCK_SIZE; //起始
CUDA_FLOAT x0 = n * 1.f / (BLOCK_SIZE * GRID_SIZE * THREAD_SIZE); //积分步长
CUDA_FLOAT s = 0; //给定线程段上的积分值
CUDA_FLOAT x1, y1;
CUDA_FLOAT y0;
for (int i = 0; i < THREAD_SIZE; i++)
{
x1 = x0 + dx;
y1 = sqrtf(1 - x1 * x1);
s += (y0 + y1) * dx / 2.f;
x0 = x1;
y0 = y1;
}
res[n] = s;
}
int main(int argc, char **argv)
{
printf("[center-of-mass] - Starting\n");
CUDA_FLOAT* data;
CUDA_FLOAT* d_data;
cudaMalloc((void**)&data, 256 * sizeof(CUDA_FLOAT));
cudaMalloc((void**)&d_data, 256 * sizeof(CUDA_FLOAT));
cudaMemcpy(d_data, data, 256 * sizeof(CUDA_FLOAT), cudaMemcpyHostToDevice);
dim3 block = dim3(BLOCK_SIZE);
dim3 grid = dim3(GRID_SIZE * THREAD_SIZE / BLOCK_SIZE);
pi_kern<<<grid, block>>>(d_data);
cudaMemcpy(data, d_data, 256 * sizeof(float), cudaMemcpyDeviceToHost);
//cudaFree(d_data);
for (int i = 0; i < 256; i++)
{
cout << data[i];
}
}
我不知道问题出在哪里,这是我的第一个CUDA项目。
英文:
Programm ought to count pi using CUDA, i get segmentation error, when i try to show result of calculations. Why does it happens and how i should allocate memory properly?
// System includes
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <iostream>
using namespace std;
#define CUDA_FLOAT float
#define dx 0.1f
#define BLOCK_SIZE 16
#define GRID_SIZE 1
#define THREAD_SIZE 1
__global__ void pi_kern(CUDA_FLOAT *res)
{
int n = threadIdx.x + blockIdx.x * BLOCK_SIZE; //start
CUDA_FLOAT x0 = n * 1.f / (BLOCK_SIZE * GRID_SIZE * THREAD_SIZE); //integrating step
CUDA_FLOAT s = 0; // Value of integrating on the segment for given thread
CUDA_FLOAT x1, y1;
CUDA_FLOAT y0;
for (int i=0; i < THREAD_SIZE; i++)
{
x1 = x0 + dx;
y1 = sqrtf(1 - x1 * x1);
s += (y0 + y1) * dx / 2.f;
x0 = x1; y0 = y1;
}
res[n] = s;
}
int main(int argc, char **argv)
{
printf("[center-of-mass] - Starting\n");
CUDA_FLOAT* data;
CUDA_FLOAT* d_data;
cudaMalloc ((void **) &data, 256*sizeof(CUDA_FLOAT));
cudaMalloc ((void **) &d_data, 256*sizeof(CUDA_FLOAT));
cudaMemcpy(d_data, data,256*sizeof(CUDA_FLOAT), cudaMemcpyHostToDevice);
dim3 block = dim3(BLOCK_SIZE);
dim3 grid = dim3(GRID_SIZE*THREAD_SIZE/BLOCK_SIZE);
pi_kern<<<grid, block>>> (d_data);
cudaMemcpy(data, d_data, 256*sizeof( float ),cudaMemcpyDeviceToHost );
//cudaFree(d_data);
for (int i = 0; i < 256; i++)
{
cout << data[i];
}
}
I don't know where the problem is, it's my first cuda project.
答案1
得分: 0
数据在GPU上分配,类似于d_data。因此,使用cudaMemcpy和cudaMemcpyDeviceToHost将导致崩溃。
为了避免这种错误,您可以使用cudaMallocManaged。这允许您从主机和设备读/写缓冲区,而无需担心内存管理。不再需要cudaMemcpy!
阅读更多信息:https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
英文:
data is allocated on GPU like d_data. So cudamemcpy with cudaMemcpyDeviceToHost will crash.
To avoid this kind of error, you can use cudaMallocManaged. This allows you to read/write to your buffer from the host and the device without having to worry about memory management. No more need of cudamemcpy!
Read more here : https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论