问题

I've translated the code portion you provided, excluding the parts related to code comments and output messages:

// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site. Evaluates to gpuAssert's return value.
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)

// Reports a failed CUDA runtime call on stderr.
//   code - status returned by the CUDA runtime call
//   file - source file of the call site
//   line - source line of the call site
// Returns 0 when code is cudaSuccess, otherwise prints a diagnostic and returns 1.
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess)
        return 0;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    return 1;
}

// Applies a winSize x winSize median filter (nppiFilterMedian_32f_C1R) to an
// M-row by N-column float image that lives in host memory.
//   pSrc    - host input image, read with a row stride of N * sizeof(float)
//   pDst    - host output image, written with the same row stride
//   M, N    - image height and width (oSizeROI is built as {N, M})
//   winSize - side length of the square mask; the anchor is winSize / 2 on both axes
// Returns 1 on success and 0 after any CUDA or NPP failure.
//
// NOTE(review): the ROI covers the whole image while the pointers passed to
// NPP address the very first pixel, so mask placements at the image edges read
// outside the d_in allocation. This matches the "Invalid __global__ read ...
// 4 bytes before the nearest allocation" reports from compute-sanitizer.
// NOTE(review): d_in, d_out and pBuffer are only freed on the error paths; the
// success path returns without releasing them.
int gpuMedfilt2(const float* pSrc, float* pDst, int M, int N, int winSize)
{
    NppStatus status;
    Npp32f* d_in, *d_out;
    // Host images are densely packed: one row is N floats.
    Npp32s nSrcStep = N * sizeof(float), nDstStep = N * sizeof(float);
    NppiSize oSizeROI = {N, M};
    NppiSize oMaskSize = {winSize, winSize};
    NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
    Npp8u* pBuffer;
    Npp32u pBufferSize;
    // Row strides (in bytes) chosen by cudaMallocPitch for the device images.
    size_t d_in_pitch, d_out_pitch;

    // Pitched device allocations for the input and output images.
    if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, N * sizeof(float), M)))
        return 0;

    if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, N * sizeof(float), M)))
    {
        cudaFree((void*)d_in);
        return 0;
    }
    // Upload the packed host image into the pitched device image.
    if (gpuErrchk(cudaMemcpy2D((void*)d_in, d_in_pitch, (const void*)pSrc, nSrcStep, N * sizeof(float), M, cudaMemcpyHostToDevice)))
    {
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    // Ask NPP how much scratch memory the filter needs for this ROI and mask.
    if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
    {
        fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    // Run the filter over the full {N, M} ROI (see the NOTE(review) above).
    if ((status = nppiFilterMedian_32f_C1R(d_in, d_in_pitch, d_out, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    // Download the pitched device result into the packed host output.
    if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)d_out, d_out_pitch, sizeof(float) * N, M, cudaMemcpyDeviceToHost)))
    {
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    return 1;
}

Please note that this translation preserves the C/C++ code structure and does not include code comments or error messages.

英文:

I was wondering if anyone more versed in calling this NPP CUDA function could tell me where the mistake is occurring.


// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site. Evaluates to gpuAssert's return value.
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)

// Reports a failed CUDA runtime call on stderr.
// Returns 1 (after printing a diagnostic) when code is not cudaSuccess, 0 otherwise.
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        return 1;
    }
    return 0;
}
int gpuMedfilt2(const float* pSrc, float* pDst, int M, int N, int winSize)
{
NppStatus status;
Npp32f* d_in, *d_out;
Npp32s nSrcStep = N * sizeof(float), nDstStep = N * sizeof(float);
NppiSize oSizeROI = {N, M};
NppiSize oMaskSize = {winSize, winSize};
NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
Npp8u* pBuffer;
Npp32u pBufferSize;
size_t d_in_pitch, d_out_pitch;
if (gpuErrchk(cudaMallocPitch((void**)&amp;d_in, &amp;d_in_pitch, N * sizeof(float), M)))
return 0;
if (gpuErrchk(cudaMallocPitch((void**)&amp;d_out, &amp;d_out_pitch, N * sizeof(float), M)))
{
cudaFree((void*)d_in);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)d_in, d_in_pitch, (const void*)pSrc, nSrcStep, N * sizeof(float), M, cudaMemcpyHostToDevice)))
{
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &amp;pBufferSize)) != NPP_SUCCESS)
{
fprintf(stderr, &quot;NPP Error: Failed to calculate buffer space for median filter operation\n&quot;);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMalloc((void**)&amp;pBuffer, pBufferSize)))
{
fprintf(stderr, &quot;NPP Error: Failed to allocate buffer space for median filter operation\n&quot;);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedian_32f_C1R(d_in, d_in_pitch, d_out, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
{
fprintf(stderr, &quot;NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n&quot;);
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)d_out, d_out_pitch, sizeof(float) * N, M, cudaMemcpyDeviceToHost)))
{
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
return 1;
}

compute-sanitizer shows many errors.

My main looks like this,

float* in = malloc(sizeof(float) * M * N);
float* out = malloc(sizeof(float) * M * N);
gpuMedfilt2(in, out, M, N, 5);

Thank you for any insight.

I feel like something is wrong with the cudaMallocPitch and cudaMemcpy2D calls. Am I not getting the memory sizes right?

Here is output from compute-sanitizer, this occurs around ~100+ times.

========= Invalid __global__ read of size 4 bytes
=========     at 0x2b8 in void FilterMedianKernelSortingNetworkShared::RunKernel5x5<float, (int)1, (int)1, (int)25>(Pixel<T1, T2> *, int, NppiSize, NppiSize, const Pixel<T1, T2> *, int, int)
=========     by thread (1,0,0) in block (0,2,0)
=========     Address 0x701d1fffc is out of bounds
=========     and is 4 bytes before the nearest allocation at 0x701d20000 of size 25,600 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x30b442]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x393adb]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x3ef278]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x140046]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x1401cb]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame:nppiFilterMedian_32f_C1R [0x12266f]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame:gpuMedfilt2 [0x1601]
=========                in /home/rctodd/code/cuda/cuMedfilt2/libgpuMedfilt2.so
=========     Host Frame:main [0x127e]
=========                in /home/rctodd/code/cuda/cuMedfilt2/./app
=========     Host Frame:../csu/libc-start.c:342:__libc_start_main [0x24083]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0x110e]
=========                in /home/rctodd/code/cuda/cuMedfilt2/./app
=========

ldd of my shared library I compile against

ldd libgpuMedfilt2.so 
linux-vdso.so.1 (0x00007fff6533d000)
libcudart.so.11.0 =&gt; /home/rctodd/cuda11.7/lib/libcudart.so.11.0 (0x00007f1bb2911000)
libnppif.so.11 =&gt; /home/rctodd/cuda11.7/lib/libnppif.so.11 (0x00007f1badfbc000)
libstdc++.so.6 =&gt; /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f1baddc8000)
libc.so.6 =&gt; /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1badbd6000)
/lib64/ld-linux-x86-64.so.2 (0x00007f1bb2bbd000)
libdl.so.2 =&gt; /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f1badbd0000)
libpthread.so.0 =&gt; /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f1badbad000)
librt.so.1 =&gt; /lib/x86_64-linux-gnu/librt.so.1 (0x00007f1badba1000)
libnppc.so.11 =&gt; /home/rctodd/cuda11.7/lib/libnppc.so.11 (0x00007f1bad813000)
libm.so.6 =&gt; /lib/x86_64-linux-gnu/libm.so.6 (0x00007f1bad6c4000)
libgcc_s.so.1 =&gt; /home/rctodd/cuda11.7/lib/libgcc_s.so.1 (0x00007f1bad6ab000)

Here is my compile commands

# Shared library
/./home/rctodd/cuda11.7/bin/nvcc -o libgpuMedfilt2.so -shared gpuMedfilt2.cu --compiler-options '-fPIC' -Xlinker -L/home/rctodd/cuda11.7/lib -Xlinker -rpath=/home/rctodd/cuda11.7/lib -lcudart -lnppif -arch=sm_50
# Application
gcc -o app main.c -L$(pwd) -Wl,-rpath=$(pwd) -lgpuMedfilt2

答案1

得分: 1

以下是您提供的代码部分的翻译：

"NPP"提供的“普通”滤波功能期望掩码/滤波核的任何放置都会落在图像中正确定义的像素上。这意味着您不能以这种方式对输入图像进行边缘到边缘的滤波。您必须留下一个未经滤波的边界，其大小将取决于您的掩码/滤波核维度。([某些](https://stackoverflow.com/questions/76323245/nppifiltergaussborder-8u-c1r-osrcsize-and-osrcoffset-parameters) NPP滤波函数提供了一个`Boundary`变体，它将对边界像素进行“自动”处理，即计算所需但位于定义图像之外的像素，但[中值滤波不是其中之一](https://docs.nvidia.com/cuda/npp/group__image__rank__filters.html)。)

您的代码违反了这一期望，因此`compute-sanitizer`报告了非法的越界访问。

解决这一期望的一种典型方法是将滤波器限制在“适合”原始图像的区域，留下足够的已定义像素边界区域，以使滤波器在滤波器区域内始终选择定义像素（来自原始图像）。

滤波核锚定像素的选择将影响这一点，但您选择了滤波核中心的“典型”锚点。

因此，在您的情况下，我们可以滤波一个“中心区域”，在原始图像的顶部、底部、左侧和右侧留下2个像素未经滤波，从而使水平和垂直的滤波图像比原始尺寸少4个像素。

这里有一个实际示例。中值滤波有几个有趣之处。其中之一是在仍然为具有某些特性的“噪音”提供“低通”滤波效果的同时，保持图像边缘“不变”。以下示例演示了这一点：

$ cat t19.cu
#include <npp.h>
#include <nppi.h>
#include <cstdio>
#include <iostream>

// Checks the status of a CUDA runtime call and tags any diagnostic with the
// call site's file and line. Evaluates to gpuAssert's return value.
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)

// Prints a diagnostic to stderr when a CUDA runtime call failed.
//   code - status returned by the CUDA runtime call
//   file - source file of the call site
//   line - source line of the call site
// Returns 1 if code signals a failure, 0 if it is cudaSuccess.
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    const int failed = (code != cudaSuccess) ? 1 : 0;
    if (failed)
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    return failed;
}

int gpuMedfilt2(const float* pSrc, float* pDst, int ih, int iw, int winSize)
{

    NppStatus status;
    Npp32f* d_in, *d_out;
    Npp32s nSrcStep = iw * sizeof(float), nDstStep = iw * sizeof(float);
    NppiSize oMaskSize = {winSize, winSize};
    NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
    NppiSize oSizeROI = {iw-2*oAnchor.x, ih-2*oAnchor.y};
    Npp8u* pBuffer;
    Npp32u pBufferSize;
    size_t d_in_pitch, d_out_pitch;

    if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, iw * sizeof(float), ih)))
        return 0;

    if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, iw * sizeof(float), ih)))
    {
        cudaFree((void*)d_in);
        return 0;
    }
    if (gpuErrchk(cudaMemcpy2D((void*)(d_in), d_in_pitch, (const void*)pSrc, nSrcStep, iw * sizeof(float), ih, cudaMemcpyHostToDevice)))
    {
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
    {
        fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }
    cudaMemset(d_out, 0, ih*d_out_pitch); // filter will not touch every output pixel
    if ((status = nppiFilterMedian_32f_C1R((float *)((unsigned char *)d_in+oAnchor.y*d_in_pitch)+oAnchor.x, d_in_pitch, (float *)((unsigned char *)d_out+oAnchor.y*d_out_pitch)+oAnchor.x, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)(d_out), d_out_pitch, sizeof(float) * iw, ih, cudaMemcpyDeviceToHost)))
    {
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    return (int)status;
}

int main(){
    const int sz = 36;
    const int iw = sz;
    const int ih = sz;
    const int ms = 5;
    float* in = (float *)malloc(sizeof(float) * ih * iw);
    float* out = (float *)malloc(sizeof(float) * ih * iw);
    for (int i = 0; i < sz; i++)
      for (int j = 0; j < sz; j++) {


<details>
<summary>英文:</summary>


The "ordinary" filtering functions provided by NPP expect that any placement of the mask/filter kernel will land on properly defined pixels in the image. The ramification of this is that you cannot filter an input image edge-to-edge this way. You must leave an unfiltered boundary, the size of which will depend on your mask/filter kernel dimensions. ([Some](https://stackoverflow.com/questions/76323245/nppifiltergaussborder-8u-c1r-osrcsize-and-osrcoffset-parameters) NPP filter functions provide a `Boundary` variant which will have "automatic" handling of boundary pixels, i.e. pixels needed for calculation but which fall outside of the defined image, but the [median filter is not one of those](https://docs.nvidia.com/cuda/npp/group__image__rank__filters.html).)


Your code violates this expectation, so it's not surprising that `compute-sanitizer` reports illegal, out-of-bounds accesses.

A typical method to address this expectation is to restrict the filter to a region that "fits within" the original image, leaving enough border area of defined pixels so that the placement of the filter within the filter region always selects defined pixels (from the original image) within the filter kernel area.

The choice of filter kernel anchor pixel will affect this, but you have chosen a "typical" anchor at the center of the filter kernel.

Therefore, in your case we can filter a "central region", leaving a boundary of 2 pixels unfiltered at the top, bottom, left, and right of the original image, resulting in a filtered region that is 4 pixels smaller than the original in both the horizontal and vertical dimensions.

Here is a worked example.  The median filter is interesting for several reasons.  One of its capabilities is to leave image edges "intact" while still offering something like a "low-pass" filter effect for "noise" having certain properties.  The following example demonstrates that:

    $ cat t19.cu
    #include <npp.h>
    #include <nppi.h>
    #include <cstdio>
    #include <iostream>
    
    // Wraps a CUDA runtime call so that a failure is reported together with the
    // file and line of the call site. Evaluates to gpuAssert's return value.
    #define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)

    // Reports a failed CUDA runtime call on stderr.
    // Returns 1 (after printing a diagnostic) when code is not cudaSuccess, 0 otherwise.
    inline int gpuAssert(cudaError_t code, const char *file, int line)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            return 1;
        }
        return 0;
    }
    
    int gpuMedfilt2(const float* pSrc, float* pDst, int ih, int iw, int winSize)
    {
    
        NppStatus status;
        Npp32f* d_in, *d_out;
        Npp32s nSrcStep = iw * sizeof(float), nDstStep = iw * sizeof(float);
        NppiSize oMaskSize = {winSize, winSize};
        NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
        NppiSize oSizeROI = {iw-2*oAnchor.x, ih-2*oAnchor.y};
        Npp8u* pBuffer;
        Npp32u pBufferSize;
        size_t d_in_pitch, d_out_pitch;
    
        if (gpuErrchk(cudaMallocPitch((void**)&amp;d_in, &amp;d_in_pitch, iw * sizeof(float), ih)))
            return 0;
    
        if (gpuErrchk(cudaMallocPitch((void**)&amp;d_out, &amp;d_out_pitch, iw * sizeof(float), ih)))
        {
            cudaFree((void*)d_in);
            return 0;
        }
        if (gpuErrchk(cudaMemcpy2D((void*)(d_in), d_in_pitch, (const void*)pSrc, nSrcStep, iw * sizeof(float), ih, cudaMemcpyHostToDevice)))
        {
            cudaFree((void*)d_in);
            cudaFree((void*)d_out);
            return 0;
        }
    
        if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &amp;pBufferSize)) != NPP_SUCCESS)
        {
            fprintf(stderr, &quot;NPP Error: Failed to calculate buffer space for median filter operation\n&quot;);
            cudaFree((void*)d_in);
            cudaFree((void*)d_out);
            return 0;
        }
    
        if (gpuErrchk(cudaMalloc((void**)&amp;pBuffer, pBufferSize)))
        {
            fprintf(stderr, &quot;NPP Error: Failed to allocate buffer space for median filter operation\n&quot;);
            cudaFree((void*)d_in);
            cudaFree((void*)d_out);
            return 0;
        }
        cudaMemset(d_out, 0, ih*d_out_pitch); // filter will not touch every output pixel
        if ((status = nppiFilterMedian_32f_C1R((float *)((unsigned char *)d_in+oAnchor.y*d_in_pitch)+oAnchor.x, d_in_pitch, (float *)((unsigned char *)d_out+oAnchor.y*d_out_pitch)+oAnchor.x, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
        {
            fprintf(stderr, &quot;NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n&quot;);
            cudaFree((void*)pBuffer);
            cudaFree((void*)d_in);
            cudaFree((void*)d_out);
            return 0;
        }
    
        if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)(d_out), d_out_pitch, sizeof(float) * iw, ih, cudaMemcpyDeviceToHost)))
        {
            cudaFree((void*)pBuffer);
            cudaFree((void*)d_in);
            cudaFree((void*)d_out);
            return 0;
        }
    
        return (int)status;
    }
    
    // Demo driver: builds a 36 x 36 test image containing a vertical edge plus
    // sparse additive noise, runs the 5 x 5 median filter on it and prints the
    // filtered image row by row.
    int main(){
        const int sz = 36;
        const int iw = sz;
        const int ih = sz;
        const int ms = 5;
        float* in = (float *)malloc(sizeof(float) * ih * iw);
        // Zero-initialised so that well-defined values are printed even if the
        // filter call fails before writing the output.
        float* out = (float *)calloc((size_t)ih * iw, sizeof(float));
        if (in == NULL || out == NULL) {
          fprintf(stderr, "main: host allocation failed\n");
          free(in);
          free(out);
          return 1;
        }
        for (int i = 0; i < sz; i++)
          for (int j = 0; j < sz; j++) {
            float pix_val = (j>(sz/2))?1.0f:0;  // create image with edge
            if ((j%ms==0)&&(i%ms==0)) pix_val += 0.1f; // additive noise
            in[i*sz+j] = pix_val;}
        gpuMedfilt2(in, out, ih, iw, ms);
        for (int i = 0; i < sz; i++) {
          for (int j = 0; j < sz; j++)
            std::cout << out[i*sz+j] << " ";
          std::cout << std::endl;
          }
        free(out);
        free(in);
        return 0;
    }
    $ nvcc -o t19 t19.cu -lnpps -lnppif
    $ compute-sanitizer ./t19
    ========= COMPUTE-SANITIZER
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    ========= ERROR SUMMARY: 0 errors
    $


We note that the additive noise is gone, and the vertical edge is "intact" (i.e. unchanged) at the center of the image.  We also note that there appears to be a boundary of 2 pixels all the way around, which are unfiltered (and set to 0 by the `cudaMemset` operation).  Another possible method to handle the output border region (rather than setting it to zero) would be to copy the input image to the output image, before the filtering operation, in place of the `cudaMemset` operation, effectively setting output pixels equal to input pixels, in the border region.

</details>

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

Cuda NPP中的中值滤波函数

问题

答案1

使用CUDA Thrust进行矢量的替换/合并操作

比全局内存具有更高带宽的纹理内存

Cuda 使用模板类 / 将 Lambda 传递给非类函数

Cublas gemms不尊重NaN输入

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论