英文:
need help parallelizing c++ program with OpenMP
问题
我有这段代码,我尝试用OpenMP实现并行化。它使用一个点(X、Y和Z坐标)加载的向量的向量,然后旋转并移动这些点,并在网格中找到最低的点。
这是代码:
double min_x = *min_element(points[0].begin(), points[0].end());
double min_y = *min_element(points[1].begin(), points[1].end());
double min_z = *min_element(points[2].begin(), points[2].end());
// reduction of coordinates
vector<vector<double>> reduced_points(3);
for (int i = 0; i < 3; i++) {
reduced_points[i].resize(points[i].size());
for (unsigned long int j = 0; j < points[i].size(); j++) {
reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
}
}
// conversion of rotation angles to radians
#pragma omp parallel for collapse(3)
for (int m = 0; m < num_a; m++) {
for (int n = 0; n < num_b; n++) {
for (int o = 0; o < num_g; o++) {
double alpha = (alpha_gon[m] / 200) * numbers::pi;
double beta = (beta_gon[n] / 200) * numbers::pi;
double gamma = (gamma_gon[o] / 200) * numbers::pi;
// define rotation matrices for x, y, and z axes
vector<vector<double>> x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
vector<vector<double>> y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
vector<vector<double>> z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};
// apply rotations to points in reduced_points and store in rotated_points
vector<vector<double>> rotated_points(3);
for (int i = 0; i < 3; i++) {
rotated_points[i].resize(reduced_points[i].size());
for (unsigned long int j = 0; j < reduced_points[i].size(); j++) {
double x = reduced_points[0][j];
double y = reduced_points[1][j];
double z = reduced_points[2][j];
rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
}
}
// shifting to origin
double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());
vector<vector<double>> to_origin_points(3);
for (int i = 0; i < 3; i++) {
to_origin_points[i].resize(rotated_points[i].size());
for (unsigned long int j = 0; j < rotated_points[i].size(); j++) {
to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
}
}
//adding shift
for (int q = 0; q < number_of_shifts; q++) {
for (int r = 0; r < number_of_shifts; r++) {
vector<vector<double>> shifted_points(3);
for (int i = 0; i < 3; i++) {
shifted_points[i].resize(to_origin_points[i].size());
for (unsigned long int j = 0; j < to_origin_points[i].size(); j++) {
shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
}
}
// find maximum values in each column
double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());
//determine raster size
int limit_x = ceil(max_x/raster_size);
int limit_y = ceil(max_y/raster_size);
vector<vector<double>> grid(limit_x, vector<double>(limit_y));
// finding ground
int grid_x;
int grid_y;
for (unsigned long int i = 0; i < shifted_points[0].size(); i++) {
grid_x = floor(shifted_points[0][i]/raster_size);
grid_y = floor(shifted_points[1][i]/raster_size);
if (grid[grid_x][grid_y] == 0)
{
grid[grid_x][grid_y] = i;
} else if (shifted_points[2][i] < shifted_points[2][grid[grid_x][grid_y]]) {
grid[grid_x][grid_y] = i;
}
}
for (int i = 0; i < grid.size(); i++) {
for (int j = 0; j < grid[i].size(); j++) {
int index = grid[i][j];
if (index != 0) {
#pragma omp critical
{
all_lowest_points.insert(index);
count_map[index]++;
}
}
}
}
}
}
}
}
}
英文:
I have this piece of code, that I'm trying to parallelize with OpenMP. It uses a vector of vectors loaded with points (X, Y and Z coordinates) and then it rotates and shift the points and finds the lowest ones in a grid.
the code:
double min_x = *min_element(points[0].begin(), points[0].end());
double min_y = *min_element(points[1].begin(), points[1].end());
double min_z = *min_element(points[2].begin(), points[2].end());
// reduction of coordinates
vector<vector<double>> reduced_points(3);
for (int i = 0; i < 3; i++) {
reduced_points[i].resize(points[i].size());
for (unsigned long int j = 0; j < points[i].size(); j++) {
reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
}
}
// conversion of rotation angles to radians
#pragma omp parallel for collapse(3)
for (int m = 0; m < num_a; m++) {
for (int n = 0; n < num_b; n++) {
for (int o = 0; o < num_g; o++) {
double alpha = (alpha_gon[m] / 200) * numbers::pi;
double beta = (beta_gon[n] / 200) * numbers::pi;
double gamma = (gamma_gon[o] / 200) * numbers::pi;
// define rotation matrices for x, y, and z axes
vector<vector<double>> x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
vector<vector<double>> y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
vector<vector<double>> z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};
// apply rotations to points in reduced_points and store in rotated_points
vector<vector<double>> rotated_points(3);
for (int i = 0; i < 3; i++) {
rotated_points[i].resize(reduced_points[i].size());
for (unsigned long int j = 0; j < reduced_points[i].size(); j++) {
double x = reduced_points[0][j];
double y = reduced_points[1][j];
double z = reduced_points[2][j];
rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
}
}
// shifting to origin
double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());
vector<vector<double>> to_origin_points(3);
for (int i = 0; i < 3; i++) {
to_origin_points[i].resize(rotated_points[i].size());
for (unsigned long int j = 0; j < rotated_points[i].size(); j++) {
to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
}
}
//adding shift
for (int q = 0; q < number_of_shifts; q++) {
for (int r = 0; r < number_of_shifts; r++) {
vector<vector<double>> shifted_points(3);
for (int i = 0; i < 3; i++) {
shifted_points[i].resize(to_origin_points[i].size());
for (unsigned long int j = 0; j < to_origin_points[i].size(); j++) {
shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
}
}
// find maximum values in each column
double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());
//determine raster size
int limit_x = ceil(max_x/raster_size);
int limit_y = ceil(max_y/raster_size);
vector<vector<double>> grid(limit_x, vector<double>(limit_y));
// finding ground
int grid_x;
int grid_y;
for (unsigned long int i = 0; i < shifted_points[0].size(); i++) {
grid_x = floor(shifted_points[0][i]/raster_size);
grid_y = floor(shifted_points[1][i]/raster_size);
if (grid[grid_x][grid_y] == 0)
{
grid[grid_x][grid_y] = i;
} else if (shifted_points[2][i] < shifted_points[2][grid[grid_x][grid_y]]) {
grid[grid_x][grid_y] = i;
}
}
for (int i = 0; i < grid.size(); i++) {
for (int j = 0; j < grid[i].size(); j++) {
int index = grid[i][j];
if (index != 0) {
#pragma omp critical
{
all_lowest_points.insert(index);
count_map[index]++;
}
}
}
}
}
}
}
}
}
This solution works, but as I am a beginner I have a feeling there may be a better one. Also if a 'raster_size' is set to a value less then 1, the computation gets suddenly much slower and only a fraction of CPU power is used. I'm looking for any impovements to my code.
答案1
得分: 2
根据Victor Eijkhout的建议,最好的解决方案是定义一个自定义的omp reduction。另一种选择是进行手动reduction,虽然效率稍低,但可能仍比你的初始代码要高得多,因为关键部分仅在每个线程中遇到一次(而不是在循环中多次)。
首先,你需要拆分omp parallel for
,并声明将应用reduction的对象的本地/私有版本。
#pragma omp parallel
{
// 在这里声明与全局版本相同特征的all_lowest_points_local和count_map_local
#pragma omp for collapse(3)
for (int m = 0; m < num_a; m++) {
for (int n = 0; n < num_b; n++) {
for (int o = 0; o < num_g; o++) {
在你的omp critical
区域,更新本地版本而不是全局版本。这样,你可以删除critical
pragma。
在并行部分结束之前,更新全局版本。
} // o 循环结束
} // n 循环结束
} // m 循环结束
#pragma omp critical
{
// 在这里使用all_lowest_points_local和count_map_local更新all_lowest_points和count_map
}
} // omp parallel 结束
英文:
As suggested by Victor Eijkhout the best solution is to define a custom omp reduction. An alternative is to do a manual reduction, which is a bit less efficient, but probably still much more efficient than your initial code, as the critical is then encountered only one per thread (instead many times in the loops).
First, you have to split the omp parallel for
and to declare local/private versions of the objects you apply the reduction to.
#pragma omp parallel
{
// declare here all_lowest_points_local count_map_local
// with the same characteristics as the global version
#pragma omp for collapse(3)
for (int m = 0; m < num_a; m++) {
for (int n = 0; n < num_b; n++) {
for (int o = 0; o < num_g; o++) {
In your omp critical
region, you update the local versions instead of the global ones. That way, you can remove the critical
pragma
And just before the end of the parallel section you update the global versions.
} // end of o loop
} // end of n loop
} // end of m loop
#pragma omp critical
{
// here you update all_lowest_points and count_map
// using all_lowest_points_local and count_map_local
}
} // end of omp parallel
答案2
得分: 0
-
你要并行处理的数据点数偏低。
-
你对标记这两行为
critical
是正确的,不幸的是,这对性能不利。由于你正在使用C++,你可以将其转化为一个归约操作,如果你定义一个归约运算符来合并lowest_points
和count
数组。我会编写一个low_points
类,并在其上定义operator+
。然后一切都会运行得非常快,几乎完全并行。
英文:
-
Your number of data points to parallelize over is on the low side.
-
You are right to mark the two lines
critical
, unfortunately that is not good for performance. Since you're using C++ you can turn this into a reduction if you define a reduction operator that merges thelowest_points
andcount
arrays. I would write a classlow_points
and define theoperator+
on that. Then everything will run extremely fast and almost fully parallel.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论