Problems in implementing adaptive thresholding using CUDA

huangapple go评论107阅读模式
英文:

Problems in implementing adaptive thresholding using CUDA

问题

以下是代码的翻译部分:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <stdio.h>
  4. #include <iostream>
  5. #include <opencv2/core/core.hpp>
  6. #include <opencv2/highgui/highgui.hpp>
  7. #include <opencv2/imgproc/imgproc.hpp>
  8. using namespace cv;
  9. using namespace std;
  10. // CUDA kernel (question's version): each thread handles one pixel (row j, column i) and writes 0 or 255 into binary, depending on whether image[j][i] lies below the box sum read from iimage divided by blockSize * blockSize, minus threshold.
  11. __global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
  12. // Global pixel coordinates: j (row) comes from the y launch dimension, i (column) from the x dimension
  13. int j = blockIdx.y * blockDim.y + threadIdx.y;
  14. int i = blockIdx.x * blockDim.x + threadIdx.x;
  15. // Skip threads at or beyond the bottom/right margin. NOTE(review): there is no matching check for j or i being smaller than halfSize, so the (j - halfSize) and [i - halfSize] offsets below can be negative.
  16. if (j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
  17. return;
  18. // Row addresses of the input, the output and the two integral-image rows. NOTE(review): iimage rows are stepped by nc here; the answer further down explains that cv::integral produces nc + 1 columns, which is why the output comes out slanted.
  19. const uchar* data = image + j * nc;
  20. uchar* binaryRow = binary + j * nc;
  21. const int* idata1 = iimage + (j - halfSize) * nc;
  22. const int* idata2 = iimage + (j + halfSize + 1) * nc;
  23. // Four-corner difference on the integral image, divided by blockSize * blockSize
  24. int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
  25. // Apply the adaptive threshold: 0 when the pixel is more than threshold below that value, otherwise 255
  26. if (data[i] < (sum - threshold))
  27. binaryRow[i] = 0;
  28. else
  29. binaryRow[i] = 255;
  30. }
  31. int main() // Question's host code: loads an image, builds its integral image, uploads both and launches the kernel
  32. {
  33. Mat image = imread("image/test.jpg", 0);
  34. if (!image.data) return 0; // Nothing to do when the image could not be loaded
  35. resize(image, image, Size(), 1.0, 1.0); // Scale factors are 1.0 in both directions
  36. namedWindow("Original Image");
  37. imshow("Original Image", image);
  38. /* Adaptive thresholding setup */
  39. int blockSize = 16; // Neighborhood size
  40. int threshold = 10; // Pixel comparison threshold
  41. Mat binary = image.clone();
  42. int nl = binary.rows; // Number of lines
  43. int nc = binary.cols; // Total number of elements per line
  44. Mat iimage;
  45. integral(image, iimage, CV_32S); // NOTE(review): per the answer below, iimage has nl + 1 rows and nc + 1 columns
  46. // Allocate memory on device (GPU)
  47. uchar* imageDevice;
  48. uchar* binaryDevice;
  49. int* iimageDevice;
  50. cudaMalloc((void**)&imageDevice, sizeof(uchar) * nl * nc);
  51. cudaMalloc((void**)&binaryDevice, sizeof(uchar) * nl * nc);
  52. cudaMalloc((void**)&iimageDevice, sizeof(int) * nl * nc); // NOTE(review): sized for nl * nc ints, smaller than the integral image described in the answer below
  53. // Copy input data from host to device
  54. cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
  55. cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
  56. cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice); // NOTE(review): copies only nl * nc ints of iimage
  57. // Define grid and block dimensions
  58. dim3 blockDim(16, 16); // You may need to adjust the block dimensions
  59. dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y); // Rounded up so the grid covers every column and row
  60. // Launch the CUDA kernel; halfSize is passed as blockSize / 2. NOTE(review): main ends right after the launch, so no result is copied back and no device memory is freed.
  61. adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
  62. }

希望这能帮助你。如果你有任何其他问题,请随时提出。

英文:

If you run the code below, the applied image will look like this: What do I need to fix to make it work correctly?

  1. #include &quot;cuda_runtime.h&quot;
  2. #include &quot;device_launch_parameters.h&quot;
  3. #include &lt;stdio.h&gt;
  4. #include &lt;iostream&gt;
  5. #include &lt;opencv2/core/core.hpp&gt;
  6. #include &lt;opencv2/highgui/highgui.hpp&gt;
  7. #include &lt;opencv2/imgproc/imgproc.hpp&gt;
  8. using namespace cv;
  9. using namespace std;
  10. // CUDA kernel function (question's version): each thread handles one pixel (row j, column i) and writes 0 or 255 into binary, depending on whether image[j][i] lies below the box sum read from iimage divided by blockSize * blockSize, minus threshold.
  11. __global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
  12. // Calculate global indices: j (row) comes from the y launch dimension, i (column) from the x dimension
  13. int j = blockIdx.y * blockDim.y + threadIdx.y;
  14. int i = blockIdx.x * blockDim.x + threadIdx.x;
  15. // Check boundary conditions on the bottom/right margin. NOTE(review): there is no matching check for j or i being smaller than halfSize, so the (j - halfSize) and [i - halfSize] offsets below can be negative.
  16. if (j &gt;= nl - halfSize - 1 || i &gt;= nc - halfSize - 1)
  17. return;
  18. // Get row addresses of the input, the output and the two integral-image rows. NOTE(review): iimage rows are stepped by nc here; the answer further down explains that cv::integral produces nc + 1 columns, which is why the output comes out slanted.
  19. const uchar* data = image + j * nc;
  20. uchar* binaryRow = binary + j * nc;
  21. const int* idata1 = iimage + (j - halfSize) * nc;
  22. const int* idata2 = iimage + (j + halfSize + 1) * nc;
  23. // Calculate sum: four-corner difference on the integral image, divided by blockSize * blockSize
  24. int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
  25. // Apply adaptive threshold: 0 when the pixel is more than threshold below that value, otherwise 255
  26. if (data[i] &lt; (sum - threshold))
  27. binaryRow[i] = 0;
  28. else
  29. binaryRow[i] = 255;
  30. }
  31. int main() // Question's host code: loads an image, builds its integral image, uploads both and launches the kernel
  32. {
  33. Mat image = imread(&quot;image/test.jpg&quot;, 0);
  34. if (!image.data) return 0; // Nothing to do when the image could not be loaded
  35. resize(image, image, Size(), 1.0, 1.0); // Scale factors are 1.0 in both directions
  36. namedWindow(&quot;Original Image&quot;);
  37. imshow(&quot;Original Image&quot;, image);
  38. /* Adaptive thresholding setup */
  39. int blockSize = 16; // Neighborhood size
  40. int threshold = 10; // Pixel comparison threshold
  41. Mat binary = image.clone();
  42. int nl = binary.rows; // Number of lines
  43. int nc = binary.cols; // Total number of elements per line
  44. Mat iimage;
  45. &gt; integral(image, iimage, CV_32S); // NOTE(review): per the answer below, iimage has nl + 1 rows and nc + 1 columns
  46. // Allocate memory on device (GPU)
  47. uchar* imageDevice;
  48. uchar* binaryDevice;
  49. int* iimageDevice;
  50. cudaMalloc((void**)&amp;imageDevice, sizeof(uchar) * nl * nc);
  51. cudaMalloc((void**)&amp;binaryDevice, sizeof(uchar) * nl * nc);
  52. cudaMalloc((void**)&amp;iimageDevice, sizeof(int) * nl * nc); // NOTE(review): sized for nl * nc ints, smaller than the integral image described in the answer below
  53. // Copy input data from host to device
  54. cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
  55. cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
  56. cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice); // NOTE(review): copies only nl * nc ints of iimage
  57. // Define grid and block dimensions
  58. dim3 blockDim(16, 16); // You may need to adjust the block dimensions
  59. dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y); // Rounded up so the grid covers every column and row
  60. // Launch the CUDA kernel; halfSize is passed as blockSize / 2. NOTE(review): main ends right after the launch, so no result is copied back and no device memory is freed.
  61. adaptiveThresholdCUDA &lt;&lt; &lt;gridDim, blockDim &gt;&gt; &gt; (imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
  62. }

Problems in implementing adaptive thresholding using CUDA

Problems in implementing adaptive thresholding using CUDA

When I change the size of blockDim, the thresholding may not be applied at all if the block is set too large or too small.

答案1

得分: 3

以下是翻译好的部分:

问题在于 iimage 比 image 多了一行和一列。

根据cv::integral的文档:

src - 输入图像,大小为W×H,8位或浮点型(32f或64f)。
sum - 积分图像,大小为(W+1)×(H+1),32位整数或浮点型(32f或64f)。

发布示例的图像大小为332列×302行。
iimage 的图像大小为 333 列 × 303 行。


我们需要把多出一行和一列的 iimage 完整地复制到设备上:

  1. // The integral image is one row and one column larger than the source image,
  2. // so take its dimensions from iimage itself instead of reusing nl and nc.
  3. int iimage_nl = iimage.rows; // = nl + 1
  4. int iimage_nc = iimage.cols; // = nc + 1
  5. // Size both the device allocation and the upload from those dimensions
  6. cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);
  7. cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

我们需要根据iimage的实际大小来修复内核:

  1. __global__ void adaptiveThresholdCUDA(...) {
  2. ...
  3. int iimage_nc = nc + 1; // The number of columns in iimage equals (nc+1), so this is the row stride to use for iimage
  4. const int* idata1 = iimage + (j - halfSize) * iimage_nc;
  5. const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
  6. ...
  7. }

更新的代码示例:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <stdio.h>
  4. #include <iostream>
  5. #include "opencv2/opencv.hpp"
  6. using namespace cv;
  7. using namespace std;
  8. // CUDA kernel: one thread per pixel (row j from the y dimension, column i from x). For interior pixels it writes 0 to binary when image[j][i] is more than threshold below the window sum from iimage divided by blockSize * blockSize, otherwise 255. Pixels closer than halfSize to the top/left edge, or halfSize + 1 to the bottom/right edge, are not written.
  9. __global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
  10.     // Global pixel coordinates of this thread
  11.     int j = blockIdx.y * blockDim.y + threadIdx.y;
  12.     int i = blockIdx.x * blockDim.x + threadIdx.x;
  13.     // Bounds check on all four sides: without the j < halfSize / i < halfSize test, (j - halfSize) and (i - halfSize) below go negative and read before the start of iimage
  14.     if (j < halfSize || i < halfSize || j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
  15.         return;
  16.     // Row addresses; image and binary have nc elements per row, the integral image has its own stride
  17.     int iimage_nc = nc + 1; // The number of columns in iimage equals (nc+1)
  18.     const uchar* data = image + j * nc;
  19.     uchar* binaryRow = binary + j * nc;
  20.     const int* idata1 = iimage + (j - halfSize) * iimage_nc;
  21.     const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
  22.     // Window sum from four integral-image corners (rows j - halfSize .. j + halfSize, same for columns), then divide. NOTE(review): the divisor is blockSize * blockSize, which is not the window's pixel count (2 * halfSize + 1)^2 when blockSize is even - confirm this is intended.
  23.     int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
  24.     // Apply the adaptive threshold
  25.     if (data[i] < (sum - threshold))
  26.         binaryRow[i] = 0;
  27.     else
  28.         binaryRow[i] = 255;
  29. }
  30. // Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
  31. static bool cudaOk(cudaError_t err, const char* what)
  32. {
  33.     if (err == cudaSuccess) return true;
  34.     fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
  35.     return false;
  36. }
  37. // Loads test.png, thresholds it on the GPU with adaptiveThresholdCUDA, shows the result and saves it
  38. // as output.png. Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
  39. int main()
  40. {
  41.     // Read the image
  42.     Mat image = imread("test.png", 0);
  43.     if (image.empty()) {
  44.         fprintf(stderr, "Could not read test.png\n");
  45.         return 1;
  46.     }
  47.     resize(image, image, Size(), 1.0, 1.0);
  48.     namedWindow("Original Image");
  49.     imshow("Original Image", image);
  50.     // Adaptive threshold parameters
  51.     int blockSize = 16; // Neighborhood size
  52.     int threshold = 10; // Pixel comparison threshold
  53.     Mat binary = image.clone();
  54.     int nl = binary.rows; // Number of lines
  55.     int nc = binary.cols; // Total number of elements per line
  56.     Mat iimage;
  57.     integral(image, iimage, CV_32S);
  58.     int iimage_nl = iimage.rows; // = nl + 1
  59.     int iimage_nc = iimage.cols; // = nc + 1
  60.     // Buffer sizes in bytes; the integral image uses its own, larger dimensions
  61.     const size_t imageBytes = (size_t)nl * nc * sizeof(uchar);
  62.     const size_t iimageBytes = (size_t)iimage_nl * iimage_nc * sizeof(int);
  63.     // Allocate memory on the device (GPU); pointers start as NULL so the cleanup below is safe after a partial failure
  64.     uchar* imageDevice = NULL;
  65.     uchar* binaryDevice = NULL;
  66.     int* iimageDevice = NULL;
  67.     bool ok = cudaOk(cudaMalloc((void**)&imageDevice, imageBytes), "cudaMalloc(image)")
  68.         && cudaOk(cudaMalloc((void**)&binaryDevice, imageBytes), "cudaMalloc(binary)")
  69.         && cudaOk(cudaMalloc((void**)&iimageDevice, iimageBytes), "cudaMalloc(iimage)");
  70.     // Copy input data from host to device. binary starts as a copy of the image, so pixels the kernel
  71.     // does not write keep their original gray value.
  72.     ok = ok
  73.         && cudaOk(cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(image)")
  74.         && cudaOk(cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(binary)")
  75.         && cudaOk(cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(iimage)");
  76.     Mat output = image.clone();
  77.     if (ok) {
  78.         // Define grid and block dimensions (named so they do not shadow the device built-ins blockDim/gridDim)
  79.         dim3 threadsPerBlock(16, 16); // You may need to adjust the block dimensions
  80.         dim3 numBlocks((nc + threadsPerBlock.x - 1) / threadsPerBlock.x, (nl + threadsPerBlock.y - 1) / threadsPerBlock.y);
  81.         // Launch the CUDA kernel
  82.         adaptiveThresholdCUDA<<<numBlocks, threadsPerBlock>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
  83.         // cudaGetLastError reports a bad launch configuration; the blocking copy back to the host
  84.         // then reports any error raised while the kernel was running.
  85.         ok = cudaOk(cudaGetLastError(), "kernel launch")
  86.             && cudaOk(cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(output)");
  87.     }
  88.     // Release device memory on every path
  89.     cudaFree(imageDevice);
  90.     cudaFree(binaryDevice);
  91.     cudaFree(iimageDevice);
  92.     if (!ok) return 1;
  93.     imshow("Output Image", output);
  94.     waitKey();
  95.     destroyAllWindows();
  96.     imwrite("output.png", output); // Save the output
  97.     return 0;
  98. }

请注意,以上解决方案仅演示了可能的解决方案(意图是显示为什么输出图像是“倾斜的”)。

英文:

The issue is that iimage has one extra row and one extra column compared to image.

According to the documentation of cv::integral:

>src - input image as W×H, 8-bit or floating-point (32f or 64f).
sum - integral image as (W+1)×(H+1) , 32-bit integer or floating-point (32f or 64f).

The image size of the posted sample is 332 columns by 302 rows.
The size of iimage is 333 columns by 303 rows.


We have to copy iimage to device with one more row and column:

  1. // The integral image is one row and one column larger than the source image,
  2. // so take its dimensions from iimage itself instead of reusing nl and nc.
  3. int iimage_nl = iimage.rows; // = nl + 1
  4. int iimage_nc = iimage.cols; // = nc + 1
  5. // Size both the device allocation and the upload from those dimensions
  6. cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);
  7. cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

We have to fix the kernel according to the actual size of iimage:

  1. __global__ void adaptiveThresholdCUDA(...) {
  2. ...
  3. int iimage_nc = nc + 1; // Number of columns in iimage equals (nc+1), so this is the row stride to use for iimage
  4. const int* idata1 = iimage + (j - halfSize) * iimage_nc;
  5. const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
  6. ...

Updated code sample:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <stdio.h>
  4. #include <iostream>
  5. //#include <opencv2/core/core.hpp>
  6. //#include <opencv2/highgui/highgui.hpp>
  7. //#include <opencv2/imgproc/imgproc.hpp>
  8. #include "opencv2/opencv.hpp"
  9. using namespace cv;
  10. using namespace std;
  11. // CUDA kernel function: adaptive threshold, one pixel per thread.
  12. //   image     - input pixels, read as rows of nc bytes
  13. //   binary    - output with the same layout as image; receives 0 or 255
  14. //   iimage    - integral image of the input, read as rows of (nc + 1) ints
  15. //   nl, nc    - number of rows / columns of image and binary
  16. //   halfSize  - half-width of the window around each pixel
  17. //   blockSize - the window sum is divided by blockSize * blockSize
  18. //   threshold - a pixel becomes 0 when it is below (sum - threshold)
  19. // Expects a 2D launch in which x covers the columns and y covers the rows. Pixels closer than
  20. // halfSize to the top/left edge, or halfSize + 1 to the bottom/right edge, are not written.
  21. __global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
  22.     // Calculate global indices
  23.     int j = blockIdx.y * blockDim.y + threadIdx.y;
  24.     int i = blockIdx.x * blockDim.x + threadIdx.x;
  25.     // Check boundary conditions on all four sides. The lower bounds keep (j - halfSize) and
  26.     // (i - halfSize) from going negative, which would read before the start of iimage.
  27.     if (j < halfSize || i < halfSize || j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
  28.         return;
  29.     // Get row addresses
  30.     int iimage_nc = nc + 1; //Number of columns in iimage equals (nc+1)
  31.     const uchar* data = image + j * nc;
  32.     uchar* binaryRow = binary + j * nc;
  33.     const int* idata1 = iimage + (j - halfSize) * iimage_nc;
  34.     const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
  35.     // Calculate sum: four integral-image corners give the sum over rows j - halfSize .. j + halfSize
  36.     // and the same range of columns. NOTE(review): the divisor is blockSize * blockSize, which is not
  37.     // the window's pixel count (2 * halfSize + 1)^2 when blockSize is even - confirm this is intended.
  38.     int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
  39.     // Apply adaptive threshold
  40.     if (data[i] < (sum - threshold))
  41.         binaryRow[i] = 0;
  42.     else
  43.         binaryRow[i] = 255;
  44. }
  45. // Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
  46. static bool cudaOk(cudaError_t err, const char* what)
  47. {
  48.     if (err == cudaSuccess) return true;
  49.     fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
  50.     return false;
  51. }
  52. // Loads test.png, thresholds it on the GPU with adaptiveThresholdCUDA, shows the result and saves it
  53. // as output.png. Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
  54. int main()
  55. {
  56.     //Mat image = imread("image/test.jpg", 0);
  57.     Mat image = imread("test.png", 0);
  58.     if (image.empty()) {
  59.         fprintf(stderr, "Could not read test.png\n");
  60.         return 1;
  61.     }
  62.     resize(image, image, Size(), 1.0, 1.0);
  63.     namedWindow("Original Image");
  64.     imshow("Original Image", image);
  65.     /* Function for Adaptive Thresholding */
  66.     int blockSize = 16; // Neighborhood size
  67.     int threshold = 10; // Pixel comparison threshold
  68.     Mat binary = image.clone();
  69.     //For the posted sample (332 columns by 302 rows): nl = 302, nc = 332
  70.     int nl = binary.rows; // Number of lines
  71.     int nc = binary.cols; // Total number of elements per line
  72.     Mat iimage;
  73.     integral(image, iimage, CV_32S);
  74.     //https://docs.opencv.org/3.4/d7/d1b/group__imgproc__misc.html#gadeaf38d7701d7ad371278d663c50c77d
  75.     //src input image as W×H, 8-bit or floating-point (32f or 64f).
  76.     //sum integral image as (W+1)×(H+1) , 32-bit integer or floating-point (32f or 64f).
  77.     //For the posted sample: iimage_nl = 303, iimage_nc = 333
  78.     int iimage_nl = iimage.rows; // = nl + 1
  79.     int iimage_nc = iimage.cols; // = nc + 1
  80.     // Buffer sizes in bytes; the integral image uses its own, larger dimensions
  81.     const size_t imageBytes = (size_t)nl * nc * sizeof(uchar);
  82.     const size_t iimageBytes = (size_t)iimage_nl * iimage_nc * sizeof(int);
  83.     // Allocate memory on device (GPU); pointers start as NULL so the cleanup below is safe after a partial failure
  84.     uchar* imageDevice = NULL;
  85.     uchar* binaryDevice = NULL;
  86.     int* iimageDevice = NULL;
  87.     bool ok = cudaOk(cudaMalloc((void**)&imageDevice, imageBytes), "cudaMalloc(image)")
  88.         && cudaOk(cudaMalloc((void**)&binaryDevice, imageBytes), "cudaMalloc(binary)")
  89.         && cudaOk(cudaMalloc((void**)&iimageDevice, iimageBytes), "cudaMalloc(iimage)");
  90.     // Copy input data from host to device. binary starts as a copy of the image, so pixels the kernel
  91.     // does not write keep their original gray value.
  92.     ok = ok
  93.         && cudaOk(cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(image)")
  94.         && cudaOk(cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(binary)")
  95.         && cudaOk(cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(iimage)");
  96.     Mat output = image.clone();
  97.     if (ok) {
  98.         // Define grid and block dimensions (named so they do not shadow the device built-ins blockDim/gridDim)
  99.         dim3 threadsPerBlock(16, 16); // You may need to adjust the block dimensions
  100.         dim3 numBlocks((nc + threadsPerBlock.x - 1) / threadsPerBlock.x, (nl + threadsPerBlock.y - 1) / threadsPerBlock.y);
  101.         // Launch the CUDA kernel
  102.         adaptiveThresholdCUDA<<<numBlocks, threadsPerBlock>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
  103.         // cudaGetLastError reports a bad launch configuration; the blocking copy back to the host
  104.         // then reports any error raised while the kernel was running.
  105.         ok = cudaOk(cudaGetLastError(), "kernel launch")
  106.             && cudaOk(cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(output)"); //Copy from device to host
  107.     }
  108.     // Release device memory on every path
  109.     cudaFree(imageDevice);
  110.     cudaFree(binaryDevice);
  111.     cudaFree(iimageDevice);
  112.     if (!ok) return 1;
  113.     imshow("Output Image", output);
  114.     waitKey();
  115.     destroyAllWindows();
  116.     imwrite("output.png", output); //Save the output
  117.     return 0;
  118. }

test.png (input image):
Problems in implementing adaptive thresholding using CUDA

output.png (output image):
Problems in implementing adaptive thresholding using CUDA


Note that the above solution just demonstrates a possible solution (the intention is to show why the output image is "slanted").

huangapple
  • 本文由 发表于 2023年6月13日 01:43:39
  • 转载请务必保留本文链接:https://go.coder-hub.com/76459109.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定