Cuda 使用模板类 / 将 Lambda 传递给非类函数

huangapple go评论51阅读模式
英文:

Cuda using template class / passing lambdas to non-class function

问题

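第一个程序(ts0.cu)编译和运行成功,产生了预期的结果,但第二个程序(ts1.cu)出现了编译错误。错误信息指出在ts1.cu文件中的‘::operator+’没有被声明。这并不是因为lambda中的 l + r 没有被识别为设备代码(device code),而很可能与nvcc对扩展lambda(--extended-lambda)的限制有关:包含扩展lambda的外层函数必须是有名字、可以取地址的函数,而在类体内定义的友元operator+只能通过参数依赖查找(ADL)找到,无法以‘::operator+’的形式被引用。为了解决这个问题,应将operator+的定义移到类体之外的命名空间作用域(类内只保留友元声明),可行的实现见下文答案1。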

以下是对ts1.cu代码的修改,以使其编译成功:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <cassert>

/**
 * Element-wise binary kernel: every thread combines one pair of inputs.
 *
 * The element index is taken from threadIdx.x alone and is not bounds-checked,
 * so the launch must use a single block whose thread count equals the number
 * of elements in a, b and c.
 *
 * @param a  device pointer to the left-hand operands
 * @param b  device pointer to the right-hand operands
 * @param c  device pointer receiving f(a[i], b[i])
 * @param f  callable invoked on the device with one element from a and one from b
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    T *const out = c + slot;

    *out = f(a[slot], b[slot]);
}

/**
 * Fixed-size vector of N elements of type T, stored on the host.
 *
 * operator+ adds two vectors element-wise by staging the operands in
 * temporary device buffers and launching do_op with an extended __device__
 * lambda. operator+ is only DECLARED as a friend here and is DEFINED after the
 * class at namespace scope: nvcc rejected the lambda when the enclosing
 * function was a friend defined inside the class body
 * ("'::operator+' has not been declared").
 */
template <typename T, unsigned int N>
class vector
{
private:
    T _v[N]; // host-side element storage

    // Reports a failed CUDA runtime call on std::cerr and trips an assert in
    // debug builds; does nothing when status is cudaSuccess.
    static void check(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            assert(false && "CUDA runtime call failed");
        }
    }

public:
    // Value-initializes every element.
    vector() : _v{} {}

    // Copies all N elements from src.
    vector(const vector<T, N> &src)
    {
        std::copy(src._v, src._v + N, this->_v);
    }

    // Builds the vector from exactly N listed values (asserted).
    vector(std::initializer_list<T> src)
    {
        assert(src.size() == N);
        std::copy(src.begin(), src.end(), this->_v);
    }

    // Host-only element-wise sum; defined below the class.
    template <typename S, unsigned int M>
    friend vector<S, M> operator+(const vector<S, M> &lhs, const vector<S, M> &rhs);

    // Streams the elements as "[e0,e1,...]".
    friend std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
    {
        for (unsigned int i = 0; i < N; ++i)
        {
            os << (i == 0 ? '[' : ',') << vec._v[i];
        }
        os << ']';

        return os;
    }
};

/**
 * Element-wise sum of two vectors, computed by the do_op kernel.
 *
 * Host function: it allocates three device buffers of N * sizeof(T) bytes,
 * copies both operands to the device, launches do_op with one block of N
 * threads, copies the result back and frees the buffers. The device-to-host
 * cudaMemcpy is a blocking copy, so the result is complete on return.
 *
 * @param lhs  left operand
 * @param rhs  right operand
 * @return     vector whose i-th element is lhs[i] + rhs[i]
 */
template <typename T, unsigned int N>
vector<T, N> operator+(const vector<T, N> &lhs, const vector<T, N> &rhs)
{
    const auto bytes = N * sizeof(T);
    vector<T, N> vec;

    T *cudaLS = nullptr;
    T *cudaRS = nullptr;
    T *cudaRV = nullptr;

    vector<T, N>::check(cudaMalloc(&cudaLS, bytes), "cudaMalloc(lhs)");
    vector<T, N>::check(cudaMalloc(&cudaRS, bytes), "cudaMalloc(rhs)");
    vector<T, N>::check(cudaMalloc(&cudaRV, bytes), "cudaMalloc(result)");

    vector<T, N>::check(cudaMemcpy(cudaLS, lhs._v, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(lhs)");
    vector<T, N>::check(cudaMemcpy(cudaRS, rhs._v, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(rhs)");

    do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
                       { return l + r; });
    // A kernel launch returns no status; launch failures surface here.
    vector<T, N>::check(cudaGetLastError(), "do_op launch");

    vector<T, N>::check(cudaMemcpy(vec._v, cudaRV, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    vector<T, N>::check(cudaFree(cudaLS), "cudaFree(lhs)");
    vector<T, N>::check(cudaFree(cudaRS), "cudaFree(rhs)");
    vector<T, N>::check(cudaFree(cudaRV), "cudaFree(result)");

    return vec;
}

// Demo driver: builds two equal vectors, adds them, and prints all three.
int main()
{
    // One vector from a literal list, a second as a copy of it.
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1(v0);

    // Element-wise sum via operator+.
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;

    return 0;
}

需要注意:仅仅将operator+函数标记为__host__ __device__并不能解决这个编译错误——该函数负责分配设备内存、在主机与设备之间拷贝数据并启动核函数,本质上是主机端代码。要使第二个程序能够成功编译和运行,应将operator+的定义移到类体之外的命名空间作用域(类内只保留友元声明),如下文答案1所示。

英文:

I am trying to learn Cuda programming and have written a couple test programs in the process. The first of these works as I expected:

ts0.cu:

#include &quot;cuda_runtime.h&quot;
#include &quot;device_launch_parameters.h&quot;
#include &lt;iostream&gt;
#include &lt;cassert&gt;
// Element-wise binary kernel: thread i stores f(a[i], b[i]) into c[i].
// The index comes from threadIdx.x only and is not bounds-checked, so the
// launch must use one block with exactly as many threads as elements.
template &lt;typename T, typename F&gt;
__global__ void do_op(T *a, T *b, T *c, F f)
{
int i = threadIdx.x;
c[i] = f(a[i], b[i]);
}
// Adds two 3-element int arrays with do_op and prints the result as "[x,y,z]".
int main()
{
int a[] = {1, 2, 3};
int b[] = {4, 5, 6};
int c[sizeof(a) / sizeof(int)] = {0};
int *cudaA = 0;
int *cudaB = 0;
int *cudaC = 0;
// Device buffers sized in bytes (sizeof of each host array).
// NOTE(review): none of the CUDA return codes below are checked.
cudaMalloc(&amp;cudaA, sizeof(a));
cudaMalloc(&amp;cudaB, sizeof(b));
cudaMalloc(&amp;cudaC, sizeof(c));
cudaMemcpy(cudaA, a, sizeof(a), cudaMemcpyHostToDevice);
cudaMemcpy(cudaB, b, sizeof(b), cudaMemcpyHostToDevice);
// One block, one thread per element; the extended lambda is defined directly in main.
do_op&lt;int&gt;&lt;&lt;&lt;1, sizeof(a) / sizeof(int)&gt;&gt;&gt;(cudaA, cudaB, cudaC, [] __device__(int l, int r)
{ return l + r; }); // nvcc has no difficulty identifying this + operator
// Copies sizeof(b) bytes back; b and c have the same size here.
cudaMemcpy(c, cudaC, sizeof(b), cudaMemcpyDeviceToHost);
for (unsigned int i = 0; i &lt; sizeof(c) / sizeof(int); ++i)
{
std::cout &lt;&lt; (i == 0 ? &#39;[&#39; : &#39;,&#39;) &lt;&lt; c[i];
}
std::cout &lt;&lt; &#39;]&#39; &lt;&lt; std::endl;
// NOTE(review): cudaA, cudaB and cudaC are not passed to cudaFree before returning.
return 0;
}

Compiling and running yield the following:

$ nvcc ts0.cu --extended-lambda -o ts0
$ ./ts0 
[5,7,9]

I then tried wrapping the code above in a vector class I'm defining (again for learning purposes) by doing the following:

ts1.cu:

#include &quot;cuda_runtime.h&quot;
#include &quot;device_launch_parameters.h&quot;
#include &lt;iostream&gt;
#include &lt;cassert&gt;
// Element-wise binary kernel: thread i stores f(a[i], b[i]) into c[i].
// The index comes from threadIdx.x only and is not bounds-checked, so the
// launch must use one block with exactly as many threads as elements.
template &lt;typename T, typename F&gt;
__global__ void do_op(T *a, T *b, T *c, F f)
{
int i = threadIdx.x;
c[i] = f(a[i], b[i]);
}
// Fixed-size vector of N elements of type T held in a host array.
// This is the version from the question that fails to compile with nvcc.
template &lt;typename T, unsigned int N&gt;
class vector
{
private:
T _v[N];
public:
// Zero-initializes the storage.
vector() : _v{0} {}
// Copies all N elements from src.
vector(const vector&lt;T, N&gt; &amp;src)
{
std::copy(src._v, src._v + N, this-&gt;_v);
}
// Builds the vector from a list that must hold exactly N values (asserted).
vector(std::initializer_list&lt;T&gt; src)
{
assert(size(src) == N);
std::copy(src.begin(), src.end(), this-&gt;_v);
}
// Element-wise sum computed with the do_op kernel.
// The friend is defined inside the class body and contains the extended
// lambda; the compile error quoted below the listing points at this function.
friend vector&lt;T, N&gt; operator+(const vector&lt;T, N&gt; &amp;lhs, const vector&lt;T, N&gt; &amp;rhs)
{
vector&lt;T, N&gt; vec;
T *cudaLS = 0;
T *cudaRS = 0;
T *cudaRV = 0;
// NOTE(review): the size arguments below are N, an element count; cudaMalloc
// and cudaMemcpy take byte counts, so N * sizeof(T) looks intended - confirm.
cudaMalloc(&amp;cudaLS, N);
cudaMalloc(&amp;cudaRS, N);
cudaMalloc(&amp;cudaRV, N);
cudaMemcpy(cudaLS, lhs._v, N, cudaMemcpyHostToDevice);
cudaMemcpy(cudaRS, rhs._v, N, cudaMemcpyHostToDevice);
do_op&lt;T&gt;&lt;&lt;&lt;1, N&gt;&gt;&gt;(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
{ return l + r; }); // nvcc doesn&#39;t recognize this + operator
cudaMemcpy(vec._v, cudaRV, N, cudaMemcpyDeviceToHost);
cudaFree(cudaLS);
cudaFree(cudaRS);
cudaFree(cudaRV);
return vec;
}
// Streams the elements as "[e0,e1,...]".
friend std::ostream &amp;operator&lt;&lt;(std::ostream &amp;os, const vector&lt;T, N&gt; &amp;vec)
{
for (unsigned int i = 0; i &lt; N; ++i)
{
os &lt;&lt; (i == 0 ? &#39;[&#39; : &#39;,&#39;) &lt;&lt; vec._v[i];
}
os &lt;&lt; &#39;]&#39;;
return os;
}
};
// Builds a vector, copies it, adds the two, and prints all three.
int main()
{
vector&lt;int, 3&gt; v0 = {1, 4, 9};
vector&lt;int, 3&gt; v1 = v0;
vector&lt;int, 3&gt; v2 = v0 + v1;
std::cout &lt;&lt; v0 &lt;&lt; std::endl;
std::cout &lt;&lt; v1 &lt;&lt; std::endl;
std::cout &lt;&lt; v2 &lt;&lt; std::endl;
return 0;
}

And when I compile I get the following error:

$ nvcc ts1.cu --extended-lambda -o ts1
ts1.cu: In function ‘vector&lt;T, N&gt; operator+(const vector&lt;T, N&gt;&amp;, const vector&lt;T, N&gt;&amp;)’:
ts1.cu:49:171: error: ‘::operator+’ has not been declared
49 |         do_op&lt;T&gt;&lt;&lt;&lt;1, N&gt;&gt;&gt;(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
|                                                                                                                                                                           
... followed by a large number of suggestions.
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0

OS: Fedora 37

Why does the second approach fail to compile while the first one succeeds and how can I modify the second to get this general approach to compile and run successfully?

答案1

得分: 1

@paleonix提供了一些建议,这些建议有所帮助... 以下实现有效:

ts1.cu:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <cassert>

/**
 * Fixed-size vector of N elements of type T with a host array (_v) and a
 * device buffer (_cv) of the same size.
 *
 * Every object allocates its own device buffer on construction and releases
 * it in the destructor. Copy construction and copy assignment duplicate the
 * host elements only; the device buffer is refreshed by upload() when an
 * operation needs it.
 */
template <typename T, unsigned int N>
class vector
{
private:
    T _v[N]; // host-side elements
    T *_cv;  // device buffer of N elements, owned by this object

    // Reports a failed CUDA runtime call on std::cerr and trips an assert in
    // debug builds; does nothing when status is cudaSuccess.
    static void check(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            assert(false && "CUDA runtime call failed");
        }
    }

    // Allocates the device buffer; _cv stays null if the allocation fails.
    void allocate()
    {
        this->_cv = nullptr;
        check(cudaMalloc(&(this->_cv), N * sizeof(T)), "cudaMalloc");
    }

    // Copies the host elements into the device buffer.
    void upload()
    {
        check(cudaMemcpy(this->_cv, this->_v, N * sizeof(T), cudaMemcpyHostToDevice), "cudaMemcpy(upload)");
    }

    // Copies the device buffer back into the host elements.
    void download()
    {
        check(cudaMemcpy(this->_v, this->_cv, N * sizeof(T), cudaMemcpyDeviceToHost), "cudaMemcpy(download)");
    }

public:
    // Value-initializes the host elements and allocates the device buffer.
    vector() : _v{}, _cv(nullptr)
    {
        allocate();
    }

    // Copies the host elements of src and allocates a separate device buffer.
    vector(const vector<T, N> &src) : _cv(nullptr)
    {
        std::copy(src._v, src._v + N, this->_v);
        allocate();
    }

    // Builds the vector from exactly N listed values (asserted).
    vector(std::initializer_list<T> src) : _cv(nullptr)
    {
        assert(src.size() == N);
        std::copy(src.begin(), src.end(), this->_v);
        allocate();
    }

    // Copies host elements only. Each object keeps its own device buffer, so
    // the destructor never frees the same pointer twice.
    vector &operator=(const vector<T, N> &src)
    {
        if (this != &src)
        {
            std::copy(src._v, src._v + N, this->_v);
        }
        return *this;
    }

    // Releases the device buffer; cudaFree accepts a null pointer.
    ~vector()
    {
        check(cudaFree(this->_cv), "cudaFree");
    }

    template <typename S, unsigned int M>
    friend vector<S, M> operator+(vector<S, M> &a, vector<S, M> &b);

    template <typename S, unsigned int M>
    friend std::ostream &operator<<(std::ostream &os, const vector<S, M> &vec);
};

/**
 * Applies a binary callable to matching elements of two device arrays.
 *
 * Thread t writes f(a[t], b[t]) to c[t], where t is threadIdx.x. There is no
 * bounds check, so callers launch one block with one thread per element.
 *
 * @param a  device pointer to the first operand array
 * @param b  device pointer to the second operand array
 * @param c  device pointer to the output array
 * @param f  callable invoked on the device with one element from each input
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int t = threadIdx.x;
    T &dst = c[t];

    dst = f(a[t], b[t]);
}

/**
 * Writes the host elements of vec to os as "[e0,e1,...]".
 *
 * @param os   output stream to write to
 * @param vec  vector whose host elements are printed
 * @return     os, to allow chaining
 */
template <typename T, unsigned int N>
std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
{
    // The opening bracket precedes the first element; commas precede the rest.
    char lead = '[';
    for (unsigned int idx = 0; idx != N; ++idx)
    {
        os << lead << vec._v[idx];
        lead = ',';
    }

    return os << ']';
}

/**
 * Element-wise sum of two vectors, computed by the do_op kernel.
 *
 * Both operands are uploaded to their device buffers, do_op is launched with
 * one block of N threads, and the result is downloaded into the returned
 * vector. The operands are taken by non-const reference because upload() is a
 * non-const member; their host elements are not modified.
 *
 * @param a  left operand
 * @param b  right operand
 * @return   vector whose i-th element is a[i] + b[i]
 */
template <typename T, unsigned int N>
vector<T, N> operator+(vector<T, N> &a, vector<T, N> &b)
{
    vector<T, N> c;

    a.upload();
    b.upload();
    do_op<<<1, N>>>(a._cv, b._cv, c._cv, [] __device__(T l, T r)
                    { return l + r; });

    // A kernel launch returns no status; a bad launch only shows up here.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        std::cerr << "do_op launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
        assert(false && "do_op launch failed");
    }

    c.download();

    return c;
}

// Demo driver: adds two 3-element vectors and prints operands and result.
int main()
{
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1{5, 10, 15};

    // Element-wise sum via operator+.
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;

    return 0;
}

输出:

$ nvcc ts1.cu --extended-lambda -o ts1
$ ./ts1
[1,4,9]
[5,10,15]
[6,14,24]
英文:

@paleonix provided some suggestions which helped... The following implementation works:

ts1.cu:

#include &quot;cuda_runtime.h&quot;
#include &quot;device_launch_parameters.h&quot;
#include &lt;iostream&gt;
#include &lt;cassert&gt;
// Fixed-size vector of N elements of type T with a host array (_v) and a
// device buffer (_cv) that is allocated in every constructor.
// NOTE(review): no destructor is declared, so _cv is never passed to cudaFree.
template &lt;typename T, unsigned int N&gt;
class vector
{
private:
T _v[N];
T *_cv;
// Copies the host elements into the device buffer.
void upload()
{
cudaMemcpy(this-&gt;_cv, this-&gt;_v, N * sizeof(T), cudaMemcpyHostToDevice);
}
// Copies the device buffer back into the host elements.
void download()
{
cudaMemcpy(this-&gt;_v, this-&gt;_cv, N * sizeof(T), cudaMemcpyDeviceToHost);
}
public:
// Zero-initializes the host elements and allocates the device buffer.
vector() : _v{0}, _cv(0)
{
cudaMalloc(&amp;(this-&gt;_cv), N * sizeof(T));
}
// Copies the host elements of src and allocates a separate device buffer.
vector(const vector&lt;T, N&gt; &amp;src) : _cv(0)
{
std::copy(src._v, src._v + N, this-&gt;_v);
cudaMalloc(&amp;(this-&gt;_cv), N * sizeof(T));
}
// Builds the vector from a list that must hold exactly N values (asserted).
vector(std::initializer_list&lt;T&gt; src)
{
assert(size(src) == N);
std::copy(src.begin(), src.end(), this-&gt;_v);
cudaMalloc(&amp;(this-&gt;_cv), N * sizeof(T));
}
// Both operators are only declared here and defined after the class.
template &lt;typename S, unsigned int M&gt;
friend vector&lt;S, M&gt; operator+(vector&lt;S, M&gt; &amp;a, vector&lt;S, M&gt; &amp;b);
template &lt;typename S, unsigned int M&gt;
friend std::ostream &amp;operator&lt;&lt;(std::ostream &amp;os, const vector&lt;S, M&gt; &amp;vec);
};
// Element-wise binary kernel: thread i stores f(a[i], b[i]) into c[i].
// The index comes from threadIdx.x only and is not bounds-checked, so the
// launch must use one block with exactly as many threads as elements.
template &lt;typename T, typename F&gt;
__global__ void do_op(T *a, T *b, T *c, F f)
{
int i = threadIdx.x;
c[i] = f(a[i], b[i]);
}
// Streams the host elements of vec as "[e0,e1,...]" and returns os.
template &lt;typename T, unsigned int N&gt;
std::ostream &amp;operator&lt;&lt;(std::ostream &amp;os, const vector&lt;T, N&gt; &amp;vec)
{
for (unsigned int i = 0; i &lt; N; ++i)
{
// Opening bracket before the first element, comma before each later one.
os &lt;&lt; (i == 0 ? &#39;[&#39; : &#39;,&#39;) &lt;&lt; vec._v[i];
}
os &lt;&lt; &#39;]&#39;;
return os;
}
// Element-wise sum of a and b computed by the do_op kernel.
// Defined at namespace scope (not inside the class body), which is the
// arrangement that compiled with the extended lambda.
template &lt;typename T, unsigned int N&gt;
vector&lt;T, N&gt; operator+(vector&lt;T, N&gt; &amp;a, vector&lt;T, N&gt; &amp;b)
{
vector&lt;T, N&gt; c;
// Refresh both device buffers from the host elements before the launch.
a.upload();
b.upload();
// One block of N threads; NOTE(review): the launch status is not checked.
do_op&lt;&lt;&lt;1, N&gt;&gt;&gt;(a._cv, b._cv, c._cv, [] __device__(T l, T r)
{ return l + r; });
// Copy the device result into the host elements of c.
c.download();
return c;
}
// Adds two 3-element vectors and prints both operands and the sum.
int main()
{
vector&lt;int, 3&gt; v0 = {1, 4, 9};
vector&lt;int, 3&gt; v1 = {5, 10, 15};
vector&lt;int, 3&gt; v2 = v0 + v1;
std::cout &lt;&lt; v0 &lt;&lt; std::endl;
std::cout &lt;&lt; v1 &lt;&lt; std::endl;
std::cout &lt;&lt; v2 &lt;&lt; std::endl;
return 0;
}

Output:

$ nvcc ts1.cu --extended-lambda -o ts1
$ ./ts1
[1,4,9]
[5,10,15]
[6,14,24]

huangapple
  • 本文由 发表于 2023年2月19日 14:40:04
  • 转载请务必保留本文链接:https://go.coder-hub.com/75498429.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定