英文:
cudaErrorIllegalAddress(700) and I can't figure out why
问题
我能看到你在使用CUDA/C++创建一个基本的粒子模拟程序,同时也在Rust中实现了相同的粒子系统并利用了线程。现在你想在CUDA中使用GPU来完成相同的任务。
你遇到了一个问题,似乎是与函数update_particles_GPU
的实现有关,导致了尝试写入无效内存地址的错误。你尝试过使用设备管理的内存,但遇到了illegalMemoryAddress
问题。你也尝试了添加打印语句来查看哪个线程出错,但没有太大帮助。
此外,不同的线程配置表现也不同,例如1个块,3333个线程在粒子只更新一次时可以工作,但在模拟循环中,尝试访问无效内存位置。
你还提供了代码,我可以看到其中包括了结构体、函数以及主程序的实现。如果你对特定部分有任何疑问或需要进一步的帮助,请告诉我。
英文:
I'm creating a basic particle simulation in CUDA/C++ as part of a university assignment. The other part of the assignment was to make the same particle system in Rust making use of threading. Now, I need to do the same thing but making use of the GPU in CUDA.
I'm having some problems with it that just aren't really making sense to me, I'm doing things exactly how we've done them before for lab assignments and yet I'm facing issues that never came up before. I keep getting an error that I'm attempting to write to an invalid memory address and I'm fairly certain it all comes down to my implementation of the function update_particles_GPU
.
Originally, I was using device managed memory (initializing the arrays on the GPU and not having to worry about copying the data back and forth between host and device) but started facing this illegalMemoryAddress
problem, this was also being done in a while loop that was locked to 60fps and had multiple aerosol cans, thousands of particles, etc. so I tried to go simple with it and try doing just one can, with a small amount of particles, no simulation loop and manually copying the data to see if the problem became any clearer but... it didn't. I don't really know how to approach this problem anymore.
I also tried adding prints to the kernel function to see which thread it was throwing an error on but this did little to help me.
Another strange thing to note is how different thread configurations seem to work and others don't. For example, 1 block of 3333 threads works when the particles are only updated once, but when this is done in a simulation loop, where particles are created and updated every frame, it eventually tries to access an invalid memory location.
Same with 1, 2000.
2, 1000 has an immediate error.
And using num_blocks
, block_size
also has an immediate error.
Breakdown of the code:
-
Create
AerosolCan
red_can
-
Create a null pointer
dev_red_particles
-
Spray red can (creates a batch of particles and adds them to the
red_can
's particles array) <- possible point of error -
Allocate memory on GPU
-
Copy host
red_can.particles
(array) to device -
Kernel function
update_particles_GPU
<- possible point of error -
cudaDeviceSynchronize()
-
Copy updated device array back to host
-
Print updated particle positions
The code:
#define _USE_MATH_DEFINES
#include <iostream>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <thread>
#include <stdio.h>
#include <vector>
#include <windows.h>
const float DRAG = 0.05;
const float GRAVITY = 9.8;
const float BLEND = 0.1;
struct Colour {
float red = 1;
float green = 1;
float blue = 1;
};
struct Particle {
Colour colour = Colour();
float x = 0;
float y = 0;
float z = 0;
float velocity_x = 0;
float velocity_y = 0;
float velocity_z = 0;
bool collided = false;
bool landed_on_paper = false;
};
const int max_particles_per_can = 3333;
__global__ void update_particles_GPU(Particle* particles, const float* time_step, const uint32_t* particles_created) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
//printf("index : (%d)", i);
if (i > *particles_created) {
//printf("\n");
return;
}
Particle* particle;
if (&particles[i] == NULL) {
//printf("\n");
return;
}
else {
particle = &particles[i];
}
if (!particle->collided) {
float GRAVITY = 9.8;
float DRAG = 0.05;
float acceleration_x = -DRAG * (particle->velocity_x * particle->velocity_x);
float distance_x = particle->velocity_x * *time_step + 0.5 * acceleration_x * (*time_step * *time_step);
float acceleration_y = GRAVITY - DRAG * (particle->velocity_y * particle->velocity_y);
float distance_y = particle->velocity_y * *time_step + 0.5 * acceleration_y * (*time_step * *time_step);
float acceleration_z = -DRAG * (particle->velocity_z * particle->velocity_z);
float distance_z = particle->velocity_z * *time_step + 0.5 * acceleration_z * (*time_step * *time_step);
particle->x += distance_x;
particle->y += distance_y;
particle->z += distance_z;
if (particle->y < 0) {
particle->y = 0;
}
if (particle->velocity_x < 0) {
particle->velocity_x += -acceleration_x * *time_step;
}
else {
particle->velocity_x += acceleration_x * *time_step;
}
particle->velocity_y += -acceleration_y * *time_step;
if (particle->velocity_z < 0) {
particle->velocity_z += -acceleration_z * *time_step;
}
else {
particle->velocity_z += acceleration_z * *time_step;
}
// Collision
if (particle->y == 0) {
particle->collided = true;
}
}
//printf("\n");
}
struct AerosolCan {
//std::vector<Particle> particles = std::vector<Particle>();
Particle* particles = new Particle[max_particles_per_can];
float x = 0;
float y = 0;
float z = 0;
float base_velocity_x = 0;
float base_velocity_y = 0;
float base_velocity_z = 0;
Colour colour = Colour();
float spray_radius = 0;
uint32_t particles_created = 0;
void print_particles() {
int index = 1;
for (int i = 0; i < particles_created; i++) {
Particle* particle = &particles[i];
std::cout << "Particle " << i + 1 << " | X: " << particle->x << " | Y: "
<< particle->y << " | Z: " << particle->z << " | Hit = " << particle->landed_on_paper << std::endl;
}
std::cout << "" << std::endl;
}
void spray(Particle* particles, uint32_t number_of_particles) {
float radius = spray_radius;
while (radius > 0.0 && number_of_particles > 0) {
// Create new particles
for (int i = 0; i < number_of_particles; i++) {
float horizontal_angle = i / number_of_particles * 2.0 * M_PI;
float vertical_angle = i / number_of_particles * M_PI;
float new_x = x + radius * cos(horizontal_angle) * sin(vertical_angle);
float new_y = y + radius * sin(horizontal_angle) * sin(vertical_angle);
float new_z = z + radius * cos(vertical_angle);
Particle new_particle = Particle();
new_particle.colour = colour;
new_particle.x = new_x;
new_particle.y = new_y;
new_particle.z = new_z;
new_particle.velocity_x = base_velocity_x;
new_particle.velocity_y = base_velocity_y;
new_particle.velocity_z = base_velocity_z;
//reinterpret_cast<Particle*>(&particles)[particles_created] = new_particle;
particles[particles_created] = new_particle;
particles_created++;
}
radius = radius / 2.0;
number_of_particles = number_of_particles / 2.0;
}
}
};
const int block_size = 256;
int main() {
int num_blocks = (max_particles_per_can + block_size - 1) / block_size;
std::cout << "Blocks: " << num_blocks << " | Block size: " << block_size << std::endl;
Colour red_colour;
red_colour.red = 1;
red_colour.green = 0;
red_colour.blue = 0;
AerosolCan red_can = AerosolCan();
red_can.colour = red_colour;
red_can.x = -25;
red_can.y = 30;
red_can.z = 60;
red_can.base_velocity_x = 125;
red_can.base_velocity_y = 10;
red_can.base_velocity_z = 0;
red_can.spray_radius = 15;
Particle* dev_red_particles = nullptr;
// Spray particles
red_can.spray(red_can.particles, 5);
// Copy data to GPU
cudaError_t cuda_status;
cuda_status = cudaMalloc((void**)&dev_red_particles, max_particles_per_can * sizeof(Particle));
if (cuda_status != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed");
goto Error;
}
cuda_status = cudaMemcpy(dev_red_particles, red_can.particles, max_particles_per_can * sizeof(Particle), cudaMemcpyHostToDevice);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed");
goto Error;
}
float time_step = 0.001;
// Update particles
update_particles_GPU << <num_blocks, block_size >> > (dev_red_particles, &time_step, &red_can.particles_created);
cuda_status = cudaDeviceSynchronize();
if (cuda_status != cudaSuccess) {
fprintf(stderr, "cudaSync failed");
goto Error;
}
// Copy data back to host
cuda_status = cudaMemcpy(red_can.particles, dev_red_particles, max_particles_per_can * sizeof(Particle), cudaMemcpyDeviceToHost);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed");
goto Error;
}
// Print particles
red_can.print_particles();
Error:
cudaFree(dev_red_particles);
return 0;
}
答案1
得分: 1
非常愚蠢的错误我犯了。GPU 函数被传递了两个只存在于 CPU 上的指针,particles_created 和 time_step。
__global__ void update_particles_GPU(Particle* particles, const float* time_step, const uint32_t* particles_created)
将这些按值传递而非指针修复了我的问题,简直不敢相信我没有注意到这一点。
英文:
Very silly mistake I made. GPU function was being passed two pointers that only existed on the CPU, particles_created and time_step
__global__ void update_particles_GPU(Particle* particles, const float* time_step, const uint32_t* particles_created)
Passing these by value instead of with a pointer fixed the issue for me, can't believe I didn't notice this
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论