Problems in implementing adaptive thresholding using CUDA

huangapple go评论69阅读模式
英文:

Problems in implementing adaptive thresholding using CUDA

问题

以下是代码的翻译部分:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
using namespace std;

// CUDA kernel (as posted in the question): thresholds each pixel against the
// mean of its neighbourhood, where the neighbourhood sum is read from the
// integral image `iimage`.
//
// Launched on a 2D grid: the x dimension indexes columns (i), the y dimension
// indexes rows (j).
//
//   image     - input pixels, read with a row stride of nc
//   binary    - output pixels (0 or 255), written with a row stride of nc
//   iimage    - integral image, read here with a row stride of nc
//   nl, nc    - number of rows / columns of `image`
//   halfSize  - half of the neighbourhood size
//   blockSize - neighbourhood size; blockSize * blockSize is the divisor
//   threshold - offset subtracted from the local mean before comparing
//
// NOTE(review): the answer on this page reports that cv::integral produces
// nc + 1 columns, so the nc stride used for `iimage` below is what makes the
// output look slanted. The code is intentionally left as posted.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Compute the global indices of this thread (j = row, i = column)
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Boundary check: only the upper bounds are tested.
    // NOTE(review): for j < halfSize or i < halfSize the offsets
    // (j - halfSize) and (i - halfSize) used below are negative.
    if (j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;

    // Get the row addresses
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * nc;
    const int* idata2 = iimage + (j + halfSize + 1) * nc;

    // Neighbourhood sum from four integral-image corners, divided by blockSize^2
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);

    // Apply the adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}

// Host driver (as posted in the question): loads an image, builds its integral
// image, uploads both to the GPU and launches adaptiveThresholdCUDA.
// Nothing is copied back or displayed after the launch in this version.
int main()
{
    // Flag 0 requests a single-channel (grayscale) image from imread
    Mat image = imread("image/test.jpg", 0);
    if (!image.data) return 0;

    // Scale factors of 1.0 in both directions
    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    /* Adaptive thresholding */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold

    Mat binary = image.clone();

    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    Mat iimage;
    integral(image, iimage, CV_32S);

    // Allocate memory on the device (GPU)
    // NOTE(review): all three buffers are sized nl * nc; the answer on this
    // page points out that iimage is (nl + 1) x (nc + 1).
    uchar* imageDevice;
    uchar* binaryDevice;
    int* iimageDevice;
    cudaMalloc((void**)&imageDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&binaryDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&iimageDevice, sizeof(int) * nl * nc);

    // Copy the input data from the host to the device
    cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice);

    // Define the grid and block dimensions (ceil-division so the grid covers the whole image)
    dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
    dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);

    // Launch the CUDA kernel; halfSize is passed as blockSize / 2
    adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);
}

希望这能帮助你。如果你有任何其他问题,请随时提出。

英文:

If you run the code below, the applied image will look like this: What do I need to fix to make it work correctly?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
using namespace std;

// CUDA kernel function (as posted in the question; HTML-escaped operators restored).
//
// Thresholds each pixel against the mean of its neighbourhood, where the
// neighbourhood sum is read from the integral image `iimage`.
// Launched on a 2D grid: x indexes columns (i), y indexes rows (j).
//
//   image     - input pixels, read with a row stride of nc
//   binary    - output pixels (0 or 255), written with a row stride of nc
//   iimage    - integral image, read here with a row stride of nc
//   nl, nc    - number of rows / columns of `image`
//   halfSize  - half of the neighbourhood size
//   blockSize - neighbourhood size; blockSize * blockSize is the divisor
//   threshold - offset subtracted from the local mean before comparing
//
// NOTE(review): the logic is intentionally kept exactly as asked in the
// question. The answer below explains that cv::integral yields nc + 1 columns,
// so the nc stride used for `iimage` is what produces the slanted output.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Calculate global indices (j = row, i = column)
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Check boundary conditions (upper bounds only)
    if (j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;

    // Get row addresses
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * nc;
    const int* idata2 = iimage + (j + halfSize + 1) * nc;

    // Calculate sum: four integral-image corners, divided by blockSize^2
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);

    // Apply adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}

// Host driver (as posted in the question; HTML escapes, the stray quote marker
// before integral() and the broken <<< >>> launch syntax are repaired).
// Loads an image, builds its integral image, uploads both to the GPU and
// launches adaptiveThresholdCUDA. The program logic is left as asked.
int main()
{
    // Flag 0 requests a single-channel (grayscale) image from imread
    Mat image = imread("image/test.jpg", 0);
    if (!image.data) return 0;

    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    /*          Function for Adaptive Thresholding          */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold

    Mat binary = image.clone();

    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    Mat iimage;
    integral(image, iimage, CV_32S);

    // Allocate memory on device (GPU)
    // NOTE(review): all three buffers are sized nl * nc here; the answer below
    // points out that iimage is actually (nl + 1) x (nc + 1).
    uchar* imageDevice;
    uchar* binaryDevice;
    int* iimageDevice;
    cudaMalloc((void**)&imageDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&binaryDevice, sizeof(uchar) * nl * nc);
    cudaMalloc((void**)&iimageDevice, sizeof(int) * nl * nc);

    // Copy input data from host to device
    cudaMemcpy(imageDevice, image.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(binaryDevice, binary.data, sizeof(uchar) * nl * nc, cudaMemcpyHostToDevice);
    cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * nl * nc, cudaMemcpyHostToDevice);

    // Define grid and block dimensions (ceil-division so the grid covers the whole image)
    dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
    dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);

    // Launch the CUDA kernel; halfSize is passed as blockSize / 2
    adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);

}

Problems in implementing adaptive thresholding using CUDA

Problems in implementing adaptive thresholding using CUDA

Also, when I change the size of blockDim, the thresholding may not be applied at all if the block is set too large or too small.

答案1

得分: 3

以下是翻译好的部分:

问题在于 iimage 比 image 多了一行和一列。

根据cv::integral的文档:

src - 输入图像,大小为W×H,8位或浮点型(32f或64f)。
sum - 积分图像,大小为(W+1)×(H+1),32位整数或浮点型(32f或64f)。

发布示例的图像大小为332列×302行。
iimage 的大小是 333 列 × 303 行。


我们需要把多出一行和一列的 iimage 完整地复制到设备上:

// Size the device buffer from the integral image itself, not from the source image.
int iimage_nl = iimage.rows; // = nl + 1
int iimage_nc = iimage.cols; // = nc + 1

cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);

cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

我们需要根据iimage的实际大小来修复内核:

__global__ void adaptiveThresholdCUDA(...) {
    ...
    int iimage_nc = nc + 1; // Number of columns in iimage equals (nc + 1)
    
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
    ...
}

更新的代码示例:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include "opencv2/opencv.hpp"

using namespace cv;
using namespace std;

// CUDA kernel: adaptive threshold of an 8-bit image using its integral image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; the x dimension
// indexes columns (i) and the y dimension indexes rows (j). No shared memory.
//
//   image     - input pixels, nl rows with a row stride of nc
//   binary    - output pixels, same layout; interior pixels are set to 0 or 255
//   iimage    - integral image with a row stride of nc + 1 (cv::integral output)
//   nl, nc    - number of rows / columns of `image`
//   halfSize  - half of the neighbourhood size
//   blockSize - neighbourhood size; blockSize * blockSize is the divisor (must be non-zero)
//   threshold - offset subtracted from the local mean before comparing
//
// Pixels closer than halfSize to the top/left edge, or closer than
// halfSize + 1 to the bottom/right edge, are not written: `binary` keeps
// whatever the caller put there for those positions.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Global indices of this thread (j = row, i = column)
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Boundary check. The lower bounds matter as much as the upper ones:
    // without them (j - halfSize) and (i - halfSize) go negative and the
    // integral-image reads below land before the start of the buffer.
    if (j < halfSize || i < halfSize || j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;

    // Row addresses
    int iimage_nc = nc + 1; // Number of columns in iimage equals (nc + 1)
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;

    // Neighbourhood sum from four integral-image corners, divided by blockSize^2.
    // NOTE(review): the corners span (2 * halfSize + 1) rows and columns while
    // the divisor is blockSize * blockSize; kept as in the original answer.
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);

    // Apply the adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}

// Host driver: loads a grayscale image, builds its integral image, runs
// adaptiveThresholdCUDA on the GPU, then shows and saves the result.
// Returns 0 on success (or when the input image cannot be read) and 1 when a
// CUDA call fails.
int main()
{
    // Read the image; flag 0 requests a single-channel (grayscale) image
    Mat image = imread("test.png", 0);
    if (!image.data) return 0;

    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    // Adaptive threshold parameters
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold

    Mat binary = image.clone();
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    Mat iimage;
    integral(image, iimage, CV_32S);

    // The integral image has one more row and one more column than the source
    int iimage_nl = iimage.rows; // = nl + 1
    int iimage_nc = iimage.cols; // = nc + 1

    // Buffer sizes in bytes.
    // NOTE(review): the flat copies below assume the Mat data is stored
    // contiguously (no row padding) - confirm with Mat::isContinuous() if the
    // input may be a sub-matrix.
    const size_t imageBytes = sizeof(uchar) * nl * nc;
    const size_t iimageBytes = sizeof(int) * iimage_nl * iimage_nc;

    // Allocate memory on the device (GPU). Pointers start as NULL so the
    // cleanup below is safe even if an allocation fails part-way through.
    uchar* imageDevice = NULL;
    uchar* binaryDevice = NULL;
    int* iimageDevice = NULL;
    cudaError_t err = cudaMalloc((void**)&imageDevice, imageBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&binaryDevice, imageBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&iimageDevice, iimageBytes);

    // Copy the input data from the host to the device. `binary` is uploaded
    // too so that any pixel the kernel skips keeps a defined value.
    if (err == cudaSuccess) err = cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Define the grid and block dimensions (ceil-division so the grid covers the whole image)
        dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
        dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);

        // Launch the CUDA kernel; halfSize is passed as blockSize / 2
        adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);

        // A launch does not return a status; an invalid configuration
        // (e.g. a block that is too large) is only visible through this call.
        err = cudaGetLastError();
    }

    // Copy from device to host; this blocking copy also reports errors raised
    // while the kernel was executing.
    Mat output = image.clone();
    if (err == cudaSuccess) err = cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path (cudaFree on a NULL pointer does nothing)
    cudaFree(imageDevice);
    cudaFree(binaryDevice);
    cudaFree(iimageDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    imshow("Output Image", output);
    waitKey();
    destroyAllWindows();

    imwrite("output.png", output);  // Save the output
    return 0;
}

请注意,以上解决方案仅演示了可能的解决方案(意图是显示为什么输出图像是“倾斜的”)。

英文:

The issue is that iimage has one extra row and one extra column compared to image.

According to the documentation of cv::integral:

src - input image as W×H, 8-bit or floating-point (32f or 64f).
sum - integral image as (W+1)×(H+1), 32-bit integer or floating-point (32f or 64f).

The image size of the posted sample is 332 columns by 302 rows.
The image size iimage is 333 columns by 303 rows.


We have to copy iimage to device with one more row and column:

// Size the device buffer from the integral image itself, not from the source image.
int iimage_nl = iimage.rows; // = nl + 1
int iimage_nc = iimage.cols; // = nc + 1
cudaMalloc((void**)&iimageDevice, sizeof(int) * iimage_nl * iimage_nc);
cudaMemcpy(iimageDevice, iimage.data, sizeof(int) * iimage_nl * iimage_nc, cudaMemcpyHostToDevice);

We have to fix the kernel according to the actual size of iimage:

__global__ void adaptiveThresholdCUDA(...) {
    ...
    int iimage_nc = nc + 1; // Number of columns in iimage equals (nc + 1)
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;
    ...
}

Updated code sample:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
//#include <opencv2/core/core.hpp>
//#include <opencv2/highgui/highgui.hpp>
//#include <opencv2/imgproc/imgproc.hpp>
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;
// CUDA kernel function: adaptive threshold of an 8-bit image using its integral image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; the x dimension
// indexes columns (i) and the y dimension indexes rows (j). No shared memory.
//
//   image     - input pixels, nl rows with a row stride of nc
//   binary    - output pixels, same layout; interior pixels are set to 0 or 255
//   iimage    - integral image with a row stride of nc + 1 (cv::integral output)
//   nl, nc    - number of rows / columns of `image`
//   halfSize  - half of the neighbourhood size
//   blockSize - neighbourhood size; blockSize * blockSize is the divisor (must be non-zero)
//   threshold - offset subtracted from the local mean before comparing
//
// Pixels closer than halfSize to the top/left edge, or closer than
// halfSize + 1 to the bottom/right edge, are not written: `binary` keeps
// whatever the caller put there for those positions.
__global__ void adaptiveThresholdCUDA(const uchar* image, uchar* binary, const int* iimage, int nl, int nc, int halfSize, int blockSize, int threshold) {
    // Calculate global indices (j = row, i = column)
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Check boundary conditions. The lower bounds matter as much as the upper
    // ones: without them (j - halfSize) and (i - halfSize) go negative and the
    // integral-image reads below land before the start of the buffer.
    if (j < halfSize || i < halfSize || j >= nl - halfSize - 1 || i >= nc - halfSize - 1)
        return;

    // Get row addresses
    int iimage_nc = nc + 1; // Number of columns in iimage equals (nc + 1)
    const uchar* data = image + j * nc;
    uchar* binaryRow = binary + j * nc;
    const int* idata1 = iimage + (j - halfSize) * iimage_nc;
    const int* idata2 = iimage + (j + halfSize + 1) * iimage_nc;

    // Calculate sum: four integral-image corners, divided by blockSize^2.
    // NOTE(review): the corners span (2 * halfSize + 1) rows and columns while
    // the divisor is blockSize * blockSize; kept as in the original answer.
    int sum = (idata2[i + halfSize + 1] - idata2[i - halfSize] - idata1[i + halfSize + 1] + idata1[i - halfSize]) / (blockSize * blockSize);

    // Apply adaptive threshold
    if (data[i] < (sum - threshold))
        binaryRow[i] = 0;
    else
        binaryRow[i] = 255;
}
// Host driver: loads a grayscale image, builds its integral image, runs
// adaptiveThresholdCUDA on the GPU, then shows and saves the result.
// Returns 0 on success (or when the input image cannot be read) and 1 when a
// CUDA call fails.
int main()
{
    //Mat image = imread("image/test.jpg", 0);
    // Flag 0 requests a single-channel (grayscale) image from imread
    Mat image = imread("test.png", 0);
    if (!image.data) return 0;

    resize(image, image, Size(), 1.0, 1.0);
    namedWindow("Original Image");
    imshow("Original Image", image);

    /*          Function for Adaptive Thresholding          */
    int blockSize = 16; // Neighborhood size
    int threshold = 10; // Pixel comparison threshold

    Mat binary = image.clone();

    // NOTE(review): the answer text describes the sample image as 332 columns
    // by 302 rows, which would give nl = 302 and nc = 332; the original
    // comment here listed the two values the other way round - confirm.
    int nl = binary.rows; // Number of lines
    int nc = binary.cols; // Total number of elements per line

    Mat iimage;
    integral(image, iimage, CV_32S);

    //https://docs.opencv.org/3.4/d7/d1b/group__imgproc__misc.html#gadeaf38d7701d7ad371278d663c50c77d
    //src	input image as W×H, 8-bit or floating-point (32f or 64f).
    //sum	integral image as (W+1)×(H+1) , 32-bit integer or floating-point (32f or 64f).
    int iimage_nl = iimage.rows; // = nl + 1
    int iimage_nc = iimage.cols; // = nc + 1

    // Buffer sizes in bytes.
    // NOTE(review): the flat copies below assume the Mat data is stored
    // contiguously (no row padding) - confirm with Mat::isContinuous() if the
    // input may be a sub-matrix.
    const size_t imageBytes = sizeof(uchar) * nl * nc;
    const size_t iimageBytes = sizeof(int) * iimage_nl * iimage_nc;

    // Allocate memory on device (GPU). Pointers start as NULL so the cleanup
    // below is safe even if an allocation fails part-way through.
    uchar* imageDevice = NULL;
    uchar* binaryDevice = NULL;
    int* iimageDevice = NULL;
    cudaError_t err = cudaMalloc((void**)&imageDevice, imageBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&binaryDevice, imageBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&iimageDevice, iimageBytes);

    // Copy input data from host to device. `binary` is uploaded too so that
    // any pixel the kernel skips keeps a defined value.
    if (err == cudaSuccess) err = cudaMemcpy(imageDevice, image.data, imageBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(binaryDevice, binary.data, imageBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(iimageDevice, iimage.data, iimageBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Define grid and block dimensions (ceil-division so the grid covers the whole image)
        dim3 blockDim(16, 16);  // You may need to adjust the block dimensions
        dim3 gridDim((nc + blockDim.x - 1) / blockDim.x, (nl + blockDim.y - 1) / blockDim.y);

        // Launch the CUDA kernel; halfSize is passed as blockSize / 2
        adaptiveThresholdCUDA<<<gridDim, blockDim>>>(imageDevice, binaryDevice, iimageDevice, nl, nc, blockSize / 2, blockSize, threshold);

        // A launch does not return a status; an invalid configuration
        // (e.g. a block that is too large) is only visible through this call.
        err = cudaGetLastError();
    }

    // Copy from device to host; this blocking copy also reports errors raised
    // while the kernel was executing.
    Mat output = image.clone();
    if (err == cudaSuccess) err = cudaMemcpy(output.data, binaryDevice, imageBytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path (cudaFree on a NULL pointer does nothing)
    cudaFree(imageDevice);
    cudaFree(binaryDevice);
    cudaFree(iimageDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    imshow("Output Image", output);
    waitKey();
    destroyAllWindows();
    imwrite("output.png", output);  //Save the output
    return 0;
}

test.png (input image):
Problems in implementing adaptive thresholding using CUDA

output.png (output image):
Problems in implementing adaptive thresholding using CUDA


Note that the above solution just demonstrates a possible solution (the intention is to show why the output image is "slanted").

huangapple
  • 本文由 发表于 2023年6月13日 01:43:39
  • 转载请务必保留本文链接:https://go.coder-hub.com/76459109.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定