2023年4月13日 17:07:41go评论77阅读模式

英文:

need help parallelizing c++ program with OpenMP

问题

我有这段代码，我尝试用OpenMP实现并行化。它使用一个点（X、Y和Z坐标）加载的向量的向量，然后旋转并移动这些点，并在网格中找到最低的点。
这是代码：

        double min_x = *min_element(points[0].begin(), points[0].end());
        double min_y = *min_element(points[1].begin(), points[1].end());
        double min_z = *min_element(points[2].begin(), points[2].end()); 
        // reduction of coordinates
        vector<vector<double>> reduced_points(3);
        for (int i = 0; i < 3; i++) {
            reduced_points[i].resize(points[i].size());
            for (unsigned long int j = 0; j < points[i].size(); j++) {
                reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
            }
        }
        // conversion of rotation angles to radians
        #pragma omp parallel for collapse(3)
        for (int m = 0; m < num_a; m++) {
            for (int n = 0; n < num_b; n++) {
                for (int o = 0; o < num_g; o++) {
                    double alpha = (alpha_gon[m] / 200) * numbers::pi;
                    double beta = (beta_gon[n] / 200) * numbers::pi;
                    double gamma = (gamma_gon[o] / 200) * numbers::pi;
                    // define rotation matrices for x, y, and z axes
                    vector<vector<double>> x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
                    vector<vector<double>> y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
                    vector<vector<double>> z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};
                    // apply rotations to points in reduced_points and store in rotated_points
                    vector<vector<double>> rotated_points(3);
                    for (int i = 0; i < 3; i++) {
                        rotated_points[i].resize(reduced_points[i].size());
                        for (unsigned long int j = 0; j < reduced_points[i].size(); j++) {
                            double x = reduced_points[0][j];
                            double y = reduced_points[1][j];
                            double z = reduced_points[2][j];
                            rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
                            rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
                            rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
                        }
                    }
                    // shifting to origin
                    double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
                    double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
                    double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());
                    vector<vector<double>> to_origin_points(3);
                    for (int i = 0; i < 3; i++) {
                        to_origin_points[i].resize(rotated_points[i].size());
                        for (unsigned long int j = 0; j < rotated_points[i].size(); j++) {
                            to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
                        }
                    }
                    //adding shift
                    for (int q = 0; q < number_of_shifts; q++) {
                        for (int r = 0; r < number_of_shifts; r++) {
                            vector<vector<double>> shifted_points(3);
                            for (int i = 0; i < 3; i++) {
                                shifted_points[i].resize(to_origin_points[i].size());
                                for (unsigned long int j = 0; j < to_origin_points[i].size(); j++) {
                                    shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
                                }
                            }    
                            // find maximum values in each column
                            double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
                            double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());
                            //determine raster size
                            int limit_x = ceil(max_x/raster_size);
                            int limit_y = ceil(max_y/raster_size);
                            vector<vector<double>> grid(limit_x, vector<double>(limit_y));
                            // finding ground
                            int grid_x;
                            int grid_y;
                            for (unsigned long int i = 0; i < shifted_points[0].size(); i++) {
                                grid_x = floor(shifted_points[0][i]/raster_size);
                                grid_y = floor(shifted_points[1][i]/raster_size);
                                if (grid[grid_x][grid_y] == 0)
                                {
                                    grid[grid_x][grid_y] = i;
                                } else if (shifted_points[2][i] < shifted_points[2][grid[grid_x][grid_y]]) {
                                    grid[grid_x][grid_y] = i;
                                }
                            }
                            for (int i = 0; i < grid.size(); i++) {
                                for (int j = 0; j < grid[i].size(); j++) {
                                    int index = grid[i][j];
                                    if (index != 0) {
                                        #pragma omp critical
                                        {
                                        all_lowest_points.insert(index);
                                        count_map[index]++;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

英文:

I have this piece of code, that I'm trying to parallelize with OpenMP. It uses a vector of vectors loaded with points (X, Y and Z coordinates) and then it rotates and shift the points and finds the lowest ones in a grid.
the code:

        double min_x = *min_element(points[0].begin(), points[0].end());
        double min_y = *min_element(points[1].begin(), points[1].end());
        double min_z = *min_element(points[2].begin(), points[2].end()); 
        // reduction of coordinates
        vector&lt;vector&lt;double&gt;&gt; reduced_points(3);
        for (int i = 0; i &lt; 3; i++) {
            reduced_points[i].resize(points[i].size());
            for (unsigned long int j = 0; j &lt; points[i].size(); j++) {
                reduced_points[i][j] = points[i][j] - (i == 0 ? min_x : i == 1 ? min_y : min_z);
            }
        }
        // conversion of rotation angles to radians
        #pragma omp parallel for collapse(3)
        for (int m = 0; m &lt; num_a; m++) {
            for (int n = 0; n &lt; num_b; n++) {
                for (int o = 0; o &lt; num_g; o++) {
                    double alpha = (alpha_gon[m] / 200) * numbers::pi;
                    double beta = (beta_gon[n] / 200) * numbers::pi;
                    double gamma = (gamma_gon[o] / 200) * numbers::pi;
                    // define rotation matrices for x, y, and z axes
                    vector&lt;vector&lt;double&gt;&gt; x_rot_matrix = {{1, 0, 0}, {0, cos(alpha), sin(alpha)}, {0, -sin(alpha), cos(alpha)}};
                    vector&lt;vector&lt;double&gt;&gt; y_rot_matrix = {{cos(beta), 0, -sin(beta)}, {0, 1, 0}, {sin(beta), 0, cos(beta)}};
                    vector&lt;vector&lt;double&gt;&gt; z_rot_matrix = {{cos(gamma), sin(gamma), 0}, {-sin(gamma), cos(gamma), 0}, {0, 0, 1}};
                    // apply rotations to points in reduced_points and store in rotated_points
                    vector&lt;vector&lt;double&gt;&gt; rotated_points(3);
                    for (int i = 0; i &lt; 3; i++) {
                        rotated_points[i].resize(reduced_points[i].size());
                        for (unsigned long int j = 0; j &lt; reduced_points[i].size(); j++) {
                            double x = reduced_points[0][j];
                            double y = reduced_points[1][j];
                            double z = reduced_points[2][j];
                            rotated_points[i][j] = x_rot_matrix[i][0] * x + x_rot_matrix[i][1] * y + x_rot_matrix[i][2] * z;
                            rotated_points[i][j] += y_rot_matrix[i][0] * x + y_rot_matrix[i][1] * y + y_rot_matrix[i][2] * z;
                            rotated_points[i][j] += z_rot_matrix[i][0] * x + z_rot_matrix[i][1] * y + z_rot_matrix[i][2] * z;
                        }
                    }
                    // shifting to origin
                    double to_origin_x = *min_element(rotated_points[0].begin(), rotated_points[0].end());
                    double to_origin_y = *min_element(rotated_points[1].begin(), rotated_points[1].end());
                    double to_origin_z = *min_element(rotated_points[2].begin(), rotated_points[2].end());
                    vector&lt;vector&lt;double&gt;&gt; to_origin_points(3);
                    for (int i = 0; i &lt; 3; i++) {
                        to_origin_points[i].resize(rotated_points[i].size());
                        for (unsigned long int j = 0; j &lt; rotated_points[i].size(); j++) {
                            to_origin_points[i][j] = rotated_points[i][j] - (i == 0 ? to_origin_x : i == 1 ? to_origin_y : to_origin_z);
                        }
                    }
                    //adding shift
                    for (int q = 0; q &lt; number_of_shifts; q++) {
                        for (int r = 0; r &lt; number_of_shifts; r++) {
                            vector&lt;vector&lt;double&gt;&gt; shifted_points(3);
                            for (int i = 0; i &lt; 3; i++) {
                                shifted_points[i].resize(to_origin_points[i].size());
                                for (unsigned long int j = 0; j &lt; to_origin_points[i].size(); j++) {
                                    shifted_points[i][j] = to_origin_points[i][j] + (i == 0 ? shift_x * q : i == 1 ? shift_y * r : 0);
                                }
                            }    
                            // find maximum values in each column
                            double max_x = *max_element(shifted_points[0].begin(), shifted_points[0].end());
                            double max_y = *max_element(shifted_points[1].begin(), shifted_points[1].end());
                            //determine raster size
                            int limit_x = ceil(max_x/raster_size);
                            int limit_y = ceil(max_y/raster_size);
                            vector&lt;vector&lt;double&gt;&gt; grid(limit_x, vector&lt;double&gt;(limit_y));
                            // finding ground
                            int grid_x;
                            int grid_y;
                            for (unsigned long int i = 0; i &lt; shifted_points[0].size(); i++) {
                                grid_x = floor(shifted_points[0][i]/raster_size);
                                grid_y = floor(shifted_points[1][i]/raster_size);
                                if (grid[grid_x][grid_y] == 0)
                                {
                                    grid[grid_x][grid_y] = i;
                                } else if (shifted_points[2][i] &lt; shifted_points[2][grid[grid_x][grid_y]]) {
                                    grid[grid_x][grid_y] = i;
                                }
                            }
                            for (int i = 0; i &lt; grid.size(); i++) {
                                for (int j = 0; j &lt; grid[i].size(); j++) {
                                    int index = grid[i][j];
                                    if (index != 0) {
                                        #pragma omp critical
                                        {
                                        all_lowest_points.insert(index);
                                        count_map[index]++;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

This solution works, but as I am a beginner I have a feeling there may be a better one. Also if a 'raster_size' is set to a value less then 1, the computation gets suddenly much slower and only a fraction of CPU power is used. I'm looking for any impovements to my code.

答案1

得分: 2

根据Victor Eijkhout的建议，最好的解决方案是定义一个自定义的omp reduction。另一种选择是进行手动reduction，虽然效率稍低，但可能仍比你的初始代码要高得多，因为关键部分仅在每个线程中遇到一次（而不是在循环中多次）。

首先，你需要拆分omp parallel for，并声明将应用reduction的对象的本地/私有版本。

#pragma omp parallel
{
    // 在这里声明与全局版本相同特征的all_lowest_points_local和count_map_local
    #pragma omp for collapse(3)
    for (int m = 0; m < num_a; m++) {
        for (int n = 0; n < num_b; n++) {
            for (int o = 0; o < num_g; o++) {

在你的omp critical区域，更新本地版本而不是全局版本。这样，你可以删除critical pragma。

在并行部分结束之前，更新全局版本。

        } // o 循环结束
    } // n 循环结束
} // m 循环结束
#pragma omp critical
{
    // 在这里使用all_lowest_points_local和count_map_local更新all_lowest_points和count_map
}
} // omp parallel 结束

英文:

As suggested by Victor Eijkhout the best solution is to define a custom omp reduction. An alternative is to do a manual reduction, which is a bit less efficient, but probably still much more efficient than your initial code, as the critical is then encountered only one per thread (instead many times in the loops).

First, you have to split the omp parallel for and to declare local/private versions of the objects you apply the reduction to.

    #pragma omp parallel
    {
        // declare here all_lowest_points_local count_map_local
        // with the same characteristics as the global version
        #pragma omp for collapse(3)
        for (int m = 0; m &lt; num_a; m++) {
            for (int n = 0; n &lt; num_b; n++) {
                for (int o = 0; o &lt; num_g; o++) {

In your omp critical region, you update the local versions instead of the global ones. That way, you can remove the critical pragma

And just before the end of the parallel section you update the global versions.

                } // end of o loop
            } // end of n loop
        } // end of m loop
        #pragma omp critical
        {
            // here you update all_lowest_points and  count_map
            // using all_lowest_points_local and count_map_local
        }
    } // end of omp parallel

答案2

得分: 0

你要并行处理的数据点数偏低。
你对标记这两行为critical是正确的，不幸的是，这对性能不利。由于你正在使用C++，你可以将其转化为一个归约操作，如果你定义一个归约运算符来合并lowest_points和count数组。我会编写一个low_points类，并在其上定义operator+。然后一切都会运行得非常快，几乎完全并行。

英文:

Your number of data points to parallelize over is on the low side.
You are right to mark the two lines critical, unfortunately that is not good for performance. Since you're using C++ you can turn this into a reduction if you define a reduction operator that merges the lowest_points and count arrays. I would write a class low_points and define the operator+ on that. Then everything will run extremely fast and almost fully parallel.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

需要帮助使用OpenMP并行化C++程序。

问题

答案1

答案2

Parallel multiway merge sort for Unix

virtual and constexpr before up to C++17

如何在编译时将相同长度的数组相加？

D语言适合用于编写追踪即时编译器吗？

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。