2023年6月13日 01:43:39go评论107阅读模式

英文:

Problems in implementing adaptive thresholding using CUDA

问题

以下是代码的翻译部分：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
using namespace cv;
using namespace std;
// CUDA kernel: adaptive threshold driven by an integral image.
// Each thread handles one pixel: i is the column (x), j is the row (y).
//   image     - source pixels, indexed as rows of nc bytes
//   binary    - destination, same indexing; receives 0 or 255
//   iimage    - integral image, indexed here as rows of nc ints
//   nl, nc    - row count and column count of image
//   halfSize  - half-width of the averaging window
//   blockSize - the window sum is divided by blockSize * blockSize
//   threshold - a pixel becomes 0 when it is below (local mean - threshold)
// NOTE(review): this is the listing exactly as posted in the question. The host code builds
// iimage with cv::integral; if that output is (nc + 1) ints wide, the nc row stride used
// below walks the wrong rows -- TODO confirm.
// NOTE(review): nothing rejects j < halfSize or i < halfSize, so (j - halfSize) and
// (i - halfSize) can be negative offsets.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Global pixel coordinates of this thread
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Skip threads whose window would run past the bottom/right edge
    if (j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;
    // Row pointers: current image/output row, plus the integral-image rows above and below the window
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * nc;
    const int* idata2 = iimage + (j + halfSize + 1) * nc;
    // Four-corner integral-image lookup, divided by blockSize^2 to get the local mean
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
    // Apply the adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}
// Host driver as posted in the question: loads an image, builds its integral image,
// uploads both to the GPU and launches adaptiveThresholdCUDA.
// NOTE(review): the listing ends right after the launch -- the result is never copied back,
// no CUDA return code is checked, and the device buffers are never freed.
int main()
{
    // Flag 0 loads the file as a single-channel grayscale image
    Mat image = imread("image/test.jpg", 0);
    if (!image.data) return 0;
    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);
    /* Adaptive thresholding */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold
    Mat binary = image.clone();
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line
    Mat iimage;
    integral(image, iimage, CV_32S);
    // Allocate memory on the device (GPU)
    uchar* imageDevice;
    uchar* binaryDevice;
    int* iimageDevice;
    cudaMalloc((void**)&imageDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&binaryDevice, sizeof(uchar) * nl * nc);
    // NOTE(review): sized as nl * nc ints; verify against iimage.rows / iimage.cols
    cudaMalloc((void**)&iimageDevice, sizeof(int) * nl * nc);
    // Copy input data from host to device
    cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice);
    // Grid and block dimensions: ceil-divide so the grid covers every pixel
    dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
    dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);
    // Launch the CUDA kernel
    adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
}

希望这能帮助你。如果你有任何其他问题，请随时提出。

英文:

If you run the code below, the applied image will look like this: What do I need to fix to make it work correctly?

#include &quot;cuda_runtime.h&quot;
#include &quot;device_launch_parameters.h&quot;
#include &lt;stdio.h&gt;
#include &lt;iostream&gt;
#include &lt;opencv2/core/core.hpp&gt;
#include &lt;opencv2/highgui/highgui.hpp&gt;
#include &lt;opencv2/imgproc/imgproc.hpp&gt;
using namespace cv;
using namespace std;
// CUDA kernel: adaptive threshold driven by an integral image.
// Each thread handles one pixel: i is the column (x), j is the row (y).
//   image     - source pixels, indexed as rows of nc bytes
//   binary    - destination, same indexing; receives 0 or 255
//   iimage    - integral image, indexed here as rows of nc ints
//   nl, nc    - row count and column count of image
//   halfSize  - half-width of the averaging window
//   blockSize - the window sum is divided by blockSize * blockSize
//   threshold - a pixel becomes 0 when it is below (local mean - threshold)
// NOTE(review): this is the question's listing with only the HTML escaping repaired; its
// logic is intentionally left as posted. The host code builds iimage with cv::integral; if
// that output is (nc + 1) ints wide, the nc row stride used below walks the wrong rows.
// NOTE(review): nothing rejects j < halfSize or i < halfSize, so (j - halfSize) and
// (i - halfSize) can be negative offsets.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Calculate global indices
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Check boundary conditions (bottom/right edge only)
    if (j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;
    // Get row addresses
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * nc;
    const int* idata2 = iimage + (j + halfSize + 1) * nc;
    // Four-corner integral-image lookup, divided by blockSize^2 to get the local mean
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);
    // Apply adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}
// Host driver as posted in the question: loads an image, builds its integral image,
// uploads both to the GPU and launches adaptiveThresholdCUDA.
// NOTE(review): only the HTML escaping and the mangled launch syntax are repaired here; the
// logic is intentionally left as posted. The listing ends right after the launch -- the
// result is never copied back, no CUDA return code is checked, and nothing is freed.
int main()
{
    // Flag 0 loads the file as a single-channel grayscale image
    Mat image = imread("image/test.jpg", 0);
    if (!image.data) return 0;
    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);
    /*          Function for Adaptive Thresholding          */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold
    Mat binary = image.clone();
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line
    Mat iimage;
    integral(image, iimage, CV_32S);
    // Allocate memory on device (GPU)
    uchar* imageDevice;
    uchar* binaryDevice;
    int* iimageDevice;
    cudaMalloc((void**)&imageDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&binaryDevice, sizeof(uchar) * nl * nc);
    // NOTE(review): sized as nl * nc ints; verify against iimage.rows / iimage.cols
    cudaMalloc((void**)&iimageDevice, sizeof(int) * nl * nc);
    // Copy input data from host to device
    cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice);
    // Define grid and block dimensions: ceil-divide so the grid covers every pixel
    dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
    dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);
    // Launch the CUDA kernel
    adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
}

When I change the size of blockDim, the threshold is sometimes not applied at all if the block is set too large or too small.

答案1

得分: 3

以下是翻译好的部分：

问题在于iimage比image多了一行和一列。

根据cv::integral的文档：

src - 输入图像，大小为W×H，8位或浮点型（32f或64f）。
sum - 积分图像，大小为(W+1)×(H+1)，32位整数或浮点型（32f或64f）。

发布示例的图像大小为332列×302行。
iimage的大小为333列×303行。

我们需要按iimage的实际大小（多出一行和一列）在设备上分配内存并完整复制：

// iimage is one row and one column larger than the source image, so size the
// device buffer and the copy from iimage's own dimensions.
int iimage_nl = iimage.rows; // = nl + 1
int iimage_nc = iimage.cols; // = nc + 1
cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);
cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

我们需要根据iimage的实际大小来修复内核：

// Kernel excerpt ("..." marks elided code): index the integral image with its own row stride.
__global__ void adaptiveThresholdCUDA(...) {
    ...
    int iimage_nc = nc + 1; // iimage has (nc + 1) columns
    
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
    ...
}

更新的代码示例：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
// CUDA kernel: adaptive threshold driven by an integral image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; x maps to the column (i) and
// y to the row (j). The grid may overshoot the image; surplus threads exit immediately.
// No shared memory is used.
//
//   image     - nl x nc grayscale pixels, tightly packed rows (stride nc)
//   binary    - nl x nc output, tightly packed; every pixel is written with 0 or 255
//   iimage    - integral image of `image` as produced by cv::integral:
//               (nl + 1) x (nc + 1) ints, tightly packed (stride nc + 1)
//   nl, nc    - row count and column count of `image`
//   halfSize  - half-width of the averaging window; must be >= 0
//   blockSize - kept for interface compatibility only; the mean is computed from the
//               window's real pixel count (see below)
//   threshold - a pixel becomes 0 when it is below (local mean - threshold), else 255
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    (void)blockSize;
    // Global pixel coordinates of this thread
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads outside the image (grid round-up) have nothing to do
    if (j >= nl || i >= nc || halfSize < 0)
        return;
    // Window [top, bottom) x [left, right), clamped to the image so that pixels near the
    // border never index outside iimage (the unclamped form read negative offsets there)
    const int top = (j - halfSize > 0) ? (j - halfSize) : 0;
    const int left = (i - halfSize > 0) ? (i - halfSize) : 0;
    const int bottom = (j + halfSize + 1 < nl) ? (j + halfSize + 1) : nl;
    const int right = (i + halfSize + 1 < nc) ? (i + halfSize + 1) : nc;
    // iimage has one more column than image
    const int iimage_nc = nc + 1;
    const int* idata1 = iimage + top * iimage_nc;
    const int* idata2 = iimage + bottom * iimage_nc;
    // Four-corner lookup gives the pixel sum over the window; divide by the number of
    // pixels actually covered. The full window is (2 * halfSize + 1) pixels per side, which
    // is not blockSize when blockSize is even.
    const int area = (bottom - top) * (right - left);
    const int mean = (idata2[right] - idata2[left] - idata1[right] + idata1[left]) / area;
    // Apply the adaptive threshold
    const int p = j * nc + i;
    binary[p] = (image[p] < (mean - threshold)) ? 0 : 255;
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: loads test.png as grayscale, builds its integral image, runs
// adaptiveThresholdCUDA on the GPU, then shows the result and saves it as output.png.
// Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
int main()
{
    // Read the image; flag 0 loads it as single-channel grayscale
    Mat image = imread("test.png", 0);
    if (!image.data) {
        fprintf(stderr, "Could not read test.png\n");
        return 1;
    }
    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    // Adaptive threshold parameters
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold
    Mat binary = image.clone();
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    // cv::integral returns an image one row and one column larger than its input
    Mat iimage;
    integral(image, iimage, CV_32S);
    int iimage_nl = iimage.rows; // = nl + 1
    int iimage_nc = iimage.cols; // = nc + 1

    // The flat cudaMemcpy calls below assume tightly packed rows
    if (!image.isContinuous()) image = image.clone();
    if (!iimage.isContinuous()) iimage = iimage.clone();

    const size_t imageBytes = sizeof(uchar) * nl * nc;
    const size_t iimageBytes = sizeof(int) * iimage_nl * iimage_nc;
    Mat output = image.clone();

    // Start from NULL so the cudaFree calls at the end are safe on every path
    uchar* imageDevice = NULL;
    uchar* binaryDevice = NULL;
    int* iimageDevice = NULL;

    // Allocate device memory and upload the inputs; && stops at the first failure.
    // binary is uploaded too, so any pixel the kernel leaves unwritten keeps its source value.
    bool ok = cudaOk(cudaMalloc((void**)&imageDevice, imageBytes), "cudaMalloc image")
        && cudaOk(cudaMalloc((void**)&binaryDevice, imageBytes), "cudaMalloc binary")
        && cudaOk(cudaMalloc((void**)&iimageDevice, iimageBytes), "cudaMalloc iimage")
        && cudaOk(cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice), "upload image")
        && cudaOk(cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice), "upload binary")
        && cudaOk(cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice), "upload iimage");

    if (ok) {
        // Grid and block dimensions: ceil-divide so the grid covers every pixel
        dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
        dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);
        // Launch the CUDA kernel
        adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
        // cudaGetLastError catches a bad launch configuration; the blocking copy back to
        // the host then waits for the kernel and reports any fault raised while it ran
        ok = cudaOk(cudaGetLastError(), "kernel launch")
            && cudaOk(cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost), "download result");
    }

    cudaFree(imageDevice);
    cudaFree(binaryDevice);
    cudaFree(iimageDevice);
    if (!ok)
        return 1;

    imshow("Output Image", output);
    waitKey();
    destroyAllWindows();
    imwrite("output.png", output);  // Save the output
    return 0;
}

请注意，以上代码仅演示一种可能的修复方式（目的是说明输出图像为什么会“倾斜”）。

英文:

The issue is that iimage has one extra row and one extra column compared to image.

According to the documentation of cv::integral:

>src - input image as W×H, 8-bit or floating-point (32f or 64f).
sum - integral image as (W+1)×(H+1) , 32-bit integer or floating-point (32f or 64f).

The image size of the posted sample is 332 columns by 302 rows.
The size of iimage is 333 columns by 303 rows.

We have to allocate and copy iimage on the device with its one extra row and column:

// iimage is one row and one column larger than the source image, so size the
// device buffer and the copy from iimage's own dimensions.
int iimage_nl = iimage.rows; // = nl + 1
int iimage_nc = iimage.cols; // = nc + 1
cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);
cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

We have to fix the kernel according to the actual size of iimage:

// Kernel excerpt ("..." marks elided code): index the integral image with its own row stride.
__global__ void adaptiveThresholdCUDA(...) {
    ...
    int iimage_nc = nc + 1; // Number of columns in iimage equals (nc + 1)
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
    ...
}

Updated code sample:

#include &quot;cuda_runtime.h&quot;
#include &quot;device_launch_parameters.h&quot;
#include &lt;stdio.h&gt;
#include &lt;iostream&gt;
//#include &lt;opencv2/core/core.hpp&gt;
//#include &lt;opencv2/highgui/highgui.hpp&gt;
//#include &lt;opencv2/imgproc/imgproc.hpp&gt;
#include &quot;opencv2/opencv.hpp&quot;
using namespace cv;
using namespace std;
// CUDA kernel: adaptive threshold driven by an integral image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; x maps to the column (i) and
// y to the row (j). The grid may overshoot the image; surplus threads exit immediately.
// No shared memory is used.
//
//   image     - nl x nc grayscale pixels, tightly packed rows (stride nc)
//   binary    - nl x nc output, tightly packed; every pixel is written with 0 or 255
//   iimage    - integral image of `image` as produced by cv::integral:
//               (nl + 1) x (nc + 1) ints, tightly packed (stride nc + 1)
//   nl, nc    - row count and column count of `image`
//   halfSize  - half-width of the averaging window; must be >= 0
//   blockSize - kept for interface compatibility only; the mean is computed from the
//               window's real pixel count (see below)
//   threshold - a pixel becomes 0 when it is below (local mean - threshold), else 255
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    (void)blockSize;
    // Calculate global indices
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads outside the image (grid round-up) have nothing to do
    if (j >= nl || i >= nc || halfSize < 0)
        return;
    // Window [top, bottom) x [left, right), clamped to the image so that pixels near the
    // border never index outside iimage (the unclamped form read negative offsets there)
    const int top = (j - halfSize > 0) ? (j - halfSize) : 0;
    const int left = (i - halfSize > 0) ? (i - halfSize) : 0;
    const int bottom = (j + halfSize + 1 < nl) ? (j + halfSize + 1) : nl;
    const int right = (i + halfSize + 1 < nc) ? (i + halfSize + 1) : nc;
    // Number of columns in iimage equals (nc + 1)
    const int iimage_nc = nc + 1;
    const int* idata1 = iimage + top * iimage_nc;
    const int* idata2 = iimage + bottom * iimage_nc;
    // Four-corner lookup gives the pixel sum over the window; divide by the number of
    // pixels actually covered. The full window is (2 * halfSize + 1) pixels per side, which
    // is not blockSize when blockSize is even.
    const int area = (bottom - top) * (right - left);
    const int mean = (idata2[right] - idata2[left] - idata1[right] + idata1[left]) / area;
    // Apply adaptive threshold
    const int p = j * nc + i;
    binary[p] = (image[p] < (mean - threshold)) ? 0 : 255;
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: loads test.png as grayscale, builds its integral image, runs
// adaptiveThresholdCUDA on the GPU, then shows the result and saves it as output.png.
// Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
int main()
{
    //Mat image = imread("image/test.jpg", 0);
    // Flag 0 loads the file as single-channel grayscale
    Mat image = imread("test.png", 0);
    if (!image.data) {
        fprintf(stderr, "Could not read test.png\n");
        return 1;
    }
    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    /*          Function for Adaptive Thresholding          */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold
    Mat binary = image.clone();
    // For the posted sample: nl = 302, nc = 332
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    Mat iimage;
    integral(image, iimage, CV_32S);
    //https://docs.opencv.org/3.4/d7/d1b/group__imgproc__misc.html#gadeaf38d7701d7ad371278d663c50c77d
    //src	input image as W x H, 8-bit or floating-point (32f or 64f).
    //sum	integral image as (W+1) x (H+1), 32-bit integer or floating-point (32f or 64f).
    // For the posted sample: iimage_nl = 303, iimage_nc = 333
    int iimage_nl = iimage.rows; // = nl + 1
    int iimage_nc = iimage.cols; // = nc + 1

    // The flat cudaMemcpy calls below assume tightly packed rows
    if (!image.isContinuous()) image = image.clone();
    if (!iimage.isContinuous()) iimage = iimage.clone();

    const size_t imageBytes = sizeof(uchar) * nl * nc;
    const size_t iimageBytes = sizeof(int) * iimage_nl * iimage_nc;
    Mat output = image.clone();

    // Start from NULL so the cudaFree calls at the end are safe on every path
    uchar* imageDevice = NULL;
    uchar* binaryDevice = NULL;
    int* iimageDevice = NULL;

    // Allocate memory on device (GPU) and copy the inputs; && stops at the first failure.
    // binary is uploaded too, so any pixel the kernel leaves unwritten keeps its source value.
    bool ok = cudaOk(cudaMalloc((void**)&imageDevice, imageBytes), "cudaMalloc image")
        && cudaOk(cudaMalloc((void**)&binaryDevice, imageBytes), "cudaMalloc binary")
        && cudaOk(cudaMalloc((void**)&iimageDevice, iimageBytes), "cudaMalloc iimage")
        && cudaOk(cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice), "upload image")
        && cudaOk(cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice), "upload binary")
        && cudaOk(cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice), "upload iimage");

    if (ok) {
        // Define grid and block dimensions: ceil-divide so the grid covers every pixel
        dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
        dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);
        // Launch the CUDA kernel
        adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
        // cudaGetLastError catches a bad launch configuration; the blocking copy back to
        // the host then waits for the kernel and reports any fault raised while it ran
        ok = cudaOk(cudaGetLastError(), "kernel launch")
            && cudaOk(cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost), "download result");
    }

    cudaFree(imageDevice);
    cudaFree(binaryDevice);
    cudaFree(iimageDevice);
    if (!ok)
        return 1;

    imshow("Output Image", output);
    waitKey();
    destroyAllWindows();
    imwrite("output.png", output);  //Save the output
    return 0;
}

test.png (input image):

output.png (output image):

Note that the above solution just demonstrates a possible solution (the intention is to show why the output image is "slanted").

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

Problems in implementing adaptive thresholding using CUDA

问题

答案1

修复射线投射算法

If statement with rendering illusions before/after depending on position relative to set y point not working

Error when trying to make 4d array in C++ (using std::vector and Eigen matrices)

获取OpenCV成功匹配和拼接后的图像坐标。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。