需要帮助使用OpenMP并行化C++程序。

huangapple go评论54阅读模式
英文:

need help parallelizing c++ program with OpenMP

问题

我有这段代码,我尝试用OpenMP实现并行化。它使用一个点(X、Y和Z坐标)加载的向量的向量,然后旋转并移动这些点,并在网格中找到最低的点。
这是代码:

        double min_x = *min_element(points[0].begin(), points[0].end());
        double min_y = *min_element(points[1].begin(), points[1].end());
        double min_z = *min_element(points[2].begin(), points[2].end()); 


        // reduction of coordinates
        vector<vector<double>> reduced_points(3);
        for (int i = 0; i < 3; i++) {
            reduced_points[i].resize(points[i].size());
            for (unsigned long int j = 0; j < points[i].size(); j++) {
                reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
            }
        }

        // conversion of rotation angles to radians

        #pragma omp parallel for collapse(3)
        for (int m = 0; m < num_a; m++) {
            for (int n = 0; n < num_b; n++) {
                for (int o = 0; o < num_g; o++) {

                    double alpha = (alpha_gon[m] / 200) * numbers::pi;
                    double beta = (beta_gon[n] / 200) * numbers::pi;
                    double gamma = (gamma_gon[o] / 200) * numbers::pi;

                    // define rotation matrices for x, y, and z axes
                    vector<vector<double>> x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
                    vector<vector<double>> y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
                    vector<vector<double>> z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};

                    // apply rotations to points in reduced_points and store in rotated_points
                    vector<vector<double>> rotated_points(3);
                    for (int i = 0; i < 3; i++) {
                        rotated_points[i].resize(reduced_points[i].size());
                        for (unsigned long int j = 0; j < reduced_points[i].size(); j++) {
                            double x = reduced_points[0][j];
                            double y = reduced_points[1][j];
                            double z = reduced_points[2][j];
                            rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
                            rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
                            rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
                        }
                    }

                    // shifting to origin
                    double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
                    double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
                    double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());

                    vector<vector<double>> to_origin_points(3);
                    for (int i = 0; i < 3; i++) {
                        to_origin_points[i].resize(rotated_points[i].size());
                        for (unsigned long int j = 0; j < rotated_points[i].size(); j++) {
                            to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
                        }
                    }

                    //adding shift

                    for (int q = 0; q < number_of_shifts; q++) {
                        for (int r = 0; r < number_of_shifts; r++) {

                            vector<vector<double>> shifted_points(3);
                            for (int i = 0; i < 3; i++) {
                                shifted_points[i].resize(to_origin_points[i].size());
                                for (unsigned long int j = 0; j < to_origin_points[i].size(); j++) {
                                    shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
                                }
                            }    

                            // find maximum values in each column
                            double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
                            double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());

                            //determine raster size
                            int limit_x = ceil(max_x/raster_size);
                            int limit_y = ceil(max_y/raster_size);
                            vector<vector<double>> grid(limit_x, vector<double>(limit_y));

                            // finding ground
                            int grid_x;
                            int grid_y;
                            for (unsigned long int i = 0; i < shifted_points[0].size(); i++) {
                                grid_x = floor(shifted_points[0][i]/raster_size);
                                grid_y = floor(shifted_points[1][i]/raster_size);

                                if (grid[grid_x][grid_y] == 0)
                                {
                                    grid[grid_x][grid_y] = i;
                                } else if (shifted_points[2][i] < shifted_points[2][grid[grid_x][grid_y]]) {
                                    grid[grid_x][grid_y] = i;
                                }
                            }

                            for (int i = 0; i < grid.size(); i++) {
                                for (int j = 0; j < grid[i].size(); j++) {
                                    int index = grid[i][j];
                                    if (index != 0) {
                                        #pragma omp critical
                                        {
                                        all_lowest_points.insert(index);
                                        count_map[index]++;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
英文:

I have this piece of code, that I'm trying to parallelize with OpenMP. It uses a vector of vectors loaded with points (X, Y and Z coordinates) and then it rotates and shift the points and finds the lowest ones in a grid.
the code:

        double min_x = *min_element(points[0].begin(), points[0].end());
        double min_y = *min_element(points[1].begin(), points[1].end());
        double min_z = *min_element(points[2].begin(), points[2].end()); 


        // reduction of coordinates
        vector&lt;vector&lt;double&gt;&gt; reduced_points(3);
        for (int i = 0; i &lt; 3; i++) {
            reduced_points[i].resize(points[i].size());
            for (unsigned long int j = 0; j &lt; points[i].size(); j++) {
                reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
            }
        }

        // conversion of rotation angles to radians

        #pragma omp parallel for collapse(3)
        for (int m = 0; m &lt; num_a; m++) {
            for (int n = 0; n &lt; num_b; n++) {
                for (int o = 0; o &lt; num_g; o++) {

                    double alpha = (alpha_gon[m] / 200) * numbers::pi;
                    double beta = (beta_gon[n] / 200) * numbers::pi;
                    double gamma = (gamma_gon[o] / 200) * numbers::pi;

                    // define rotation matrices for x, y, and z axes
                    vector&lt;vector&lt;double&gt;&gt; x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
                    vector&lt;vector&lt;double&gt;&gt; y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
                    vector&lt;vector&lt;double&gt;&gt; z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};

                    // apply rotations to points in reduced_points and store in rotated_points
                    vector&lt;vector&lt;double&gt;&gt; rotated_points(3);
                    for (int i = 0; i &lt; 3; i++) {
                        rotated_points[i].resize(reduced_points[i].size());
                        for (unsigned long int j = 0; j &lt; reduced_points[i].size(); j++) {
                            double x = reduced_points[0][j];
                            double y = reduced_points[1][j];
                            double z = reduced_points[2][j];
                            rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
                            rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
                            rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
                        }
                    }

                    // shifting to origin
                    double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
                    double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
                    double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());

                    vector&lt;vector&lt;double&gt;&gt; to_origin_points(3);
                    for (int i = 0; i &lt; 3; i++) {
                        to_origin_points[i].resize(rotated_points[i].size());
                        for (unsigned long int j = 0; j &lt; rotated_points[i].size(); j++) {
                            to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
                        }
                    }

                    //adding shift

                    for (int q = 0; q &lt; number_of_shifts; q++) {
                        for (int r = 0; r &lt; number_of_shifts; r++) {

                            vector&lt;vector&lt;double&gt;&gt; shifted_points(3);
                            for (int i = 0; i &lt; 3; i++) {
                                shifted_points[i].resize(to_origin_points[i].size());
                                for (unsigned long int j = 0; j &lt; to_origin_points[i].size(); j++) {
                                    shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
                                }
                            }    

                            // find maximum values in each column
                            double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
                            double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());

                            //determine raster size
                            int limit_x = ceil(max_x/raster_size);
                            int limit_y = ceil(max_y/raster_size);
                            vector&lt;vector&lt;double&gt;&gt; grid(limit_x, vector&lt;double&gt;(limit_y));

                            // finding ground
                            int grid_x;
                            int grid_y;
                            for (unsigned long int i = 0; i &lt; shifted_points[0].size(); i++) {
                                grid_x = floor(shifted_points[0][i]/raster_size);
                                grid_y = floor(shifted_points[1][i]/raster_size);

                                if (grid[grid_x][grid_y] == 0)
                                {
                                    grid[grid_x][grid_y] = i;
                                } else if (shifted_points[2][i] &lt; shifted_points[2][grid[grid_x][grid_y]]) {
                                    grid[grid_x][grid_y] = i;
                                }
                            }

                            for (int i = 0; i &lt; grid.size(); i++) {
                                for (int j = 0; j &lt; grid[i].size(); j++) {
                                    int index = grid[i][j];
                                    if (index != 0) {
                                        #pragma omp critical
                                        {
                                        all_lowest_points.insert(index);
                                        count_map[index]++;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

This solution works, but as I am a beginner I have a feeling there may be a better one. Also if a 'raster_size' is set to a value less then 1, the computation gets suddenly much slower and only a fraction of CPU power is used. I'm looking for any impovements to my code.

答案1

得分: 2

根据Victor Eijkhout的建议,最好的解决方案是定义一个自定义的omp reduction。另一种选择是进行手动reduction,虽然效率稍低,但可能仍比你的初始代码要高得多,因为关键部分仅在每个线程中遇到一次(而不是在循环中多次)。

首先,你需要拆分omp parallel for,并声明将应用reduction的对象的本地/私有版本。

#pragma omp parallel
{
    // 在这里声明与全局版本相同特征的all_lowest_points_local和count_map_local

    #pragma omp for collapse(3)
    for (int m = 0; m < num_a; m++) {
        for (int n = 0; n < num_b; n++) {
            for (int o = 0; o < num_g; o++) {

在你的omp critical区域,更新本地版本而不是全局版本。这样,你可以删除critical pragma。

在并行部分结束之前,更新全局版本。

        } // o 循环结束
    } // n 循环结束
} // m 循环结束
#pragma omp critical
{
    // 在这里使用all_lowest_points_local和count_map_local更新all_lowest_points和count_map
}
} // omp parallel 结束
英文:

As suggested by Victor Eijkhout the best solution is to define a custom omp reduction. An alternative is to do a manual reduction, which is a bit less efficient, but probably still much more efficient than your initial code, as the critical is then encountered only one per thread (instead many times in the loops).

First, you have to split the omp parallel for and to declare local/private versions of the objects you apply the reduction to.

    #pragma omp parallel
    {
        // declare here all_lowest_points_local count_map_local
        // with the same characteristics as the global version

        #pragma omp for collapse(3)
        for (int m = 0; m &lt; num_a; m++) {
            for (int n = 0; n &lt; num_b; n++) {
                for (int o = 0; o &lt; num_g; o++) {

In your omp critical region, you update the local versions instead of the global ones. That way, you can remove the critical pragma

And just before the end of the parallel section you update the global versions.

                } // end of o loop
            } // end of n loop
        } // end of m loop
        #pragma omp critical
        {
            // here you update all_lowest_points and  count_map
            // using all_lowest_points_local and count_map_local
        }
    } // end of omp parallel

答案2

得分: 0

  1. 你要并行处理的数据点数偏低。

  2. 你对标记这两行为critical是正确的,不幸的是,这对性能不利。由于你正在使用C++,你可以将其转化为一个归约操作,如果你定义一个归约运算符来合并lowest_pointscount数组。我会编写一个low_points类,并在其上定义operator+。然后一切都会运行得非常快,几乎完全并行。

英文:
  1. Your number of data points to parallelize over is on the low side.

  2. You are right to mark the two lines critical, unfortunately that is not good for performance. Since you're using C++ you can turn this into a reduction if you define a reduction operator that merges the lowest_points and count arrays. I would write a class low_points and define the operator+ on that. Then everything will run extremely fast and almost fully parallel.

huangapple
  • 本文由 发表于 2023年4月13日 17:07:41
  • 转载请务必保留本文链接:https://go.coder-hub.com/76003667.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定