Cuda 使用模板类 / 将 Lambda 传递给非类函数

huangapple go评论86阅读模式
英文:

Cuda using template class / passing lambdas to non-class function

问题

第一个程序(ts0.cu)能够成功编译并运行,产生了预期的结果,但第二个程序(ts1.cu)出现了编译错误。错误信息指出 ts1.cu 中的 operator+ 没有被声明。这是因为在 CUDA 中,类内定义的 operator+ 不会自动被识别为设备代码(device code)。为了解决这个问题,你需要将 operator+ 函数标记为设备函数。

以下是对ts1.cu代码的修改,以使其编译成功:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <cassert>
  5. template <typename T, typename F>
  6. __global__ void do_op(T *a, T *b, T *c, F f)
  7. {
  8. int i = threadIdx.x;
  9. c[i] = f(a[i], b[i]);
  10. }
  11. template <typename T, unsigned int N>
  12. class vector
  13. {
  14. private:
  15. T _v[N];
  16. public:
  17. vector() : _v{0} {}
  18. vector(const vector<T, N> &src)
  19. {
  20. std::copy(src._v, src._v + N, this->_v);
  21. }
  22. vector(std::initializer_list<T> src)
  23. {
  24. assert(src.size() == N);
  25. std::copy(src.begin(), src.end(), this->_v);
  26. }
  27. __host__ __device__ friend vector<T, N> operator+(const vector<T, N> &lhs, const vector<T, N> &rhs)
  28. {
  29. vector<T, N> vec;
  30. T *cudaLS = 0;
  31. T *cudaRS = 0;
  32. T *cudaRV = 0;
  33. cudaMalloc(&cudaLS, N * sizeof(T));
  34. cudaMalloc(&cudaRS, N * sizeof(T));
  35. cudaMalloc(&cudaRV, N * sizeof(T));
  36. cudaMemcpy(cudaLS, lhs._v, N * sizeof(T), cudaMemcpyHostToDevice);
  37. cudaMemcpy(cudaRS, rhs._v, N * sizeof(T), cudaMemcpyHostToDevice);
  38. do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
  39. { return l + r; });
  40. cudaMemcpy(vec._v, cudaRV, N * sizeof(T), cudaMemcpyDeviceToHost);
  41. cudaFree(cudaLS);
  42. cudaFree(cudaRS);
  43. cudaFree(cudaRV);
  44. return vec;
  45. }
  46. friend std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
  47. {
  48. for (unsigned int i = 0; i < N; ++i)
  49. {
  50. os << (i == 0 ? '[' : ',') << vec._v[i];
  51. }
  52. os << ']';
  53. return os;
  54. }
  55. };
  56. int main()
  57. {
  58. vector<int, 3> v0 = {1, 4, 9};
  59. vector<int, 3> v1 = v0;
  60. vector<int, 3> v2 = v0 + v1;
  61. std::cout << v0 << std::endl;
  62. std::cout << v1 << std::endl;
  63. std::cout << v2 << std::endl;
  64. return 0;
  65. }

在这个修改后的代码中,operator+ 函数被标记为 __host__ __device__,以允许其在主机和设备代码中使用。此外,代码还修正了原版本中的两处错误:cudaMalloc 和 cudaMemcpy 的字节数从 N 改为 N * sizeof(T)(原代码按元素个数而不是字节数来分配和拷贝内存),并将 assert(size(src) == N) 改为 assert(src.size() == N)。这些修改将解决编译错误,并使你的第二个程序能够成功编译和运行。

英文:

I am trying to learn Cuda programming and have written a couple test programs in the process. The first of these works as I expected:

ts0.cu:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <cassert>
  5. template <typename T, typename F>
  6. __global__ void do_op(T *a, T *b, T *c, F f)
  7. {
  8. int i = threadIdx.x;
  9. c[i] = f(a[i], b[i]);
  10. }
  11. int main()
  12. {
  13. int a[] = {1, 2, 3};
  14. int b[] = {4, 5, 6};
  15. int c[sizeof(a) / sizeof(int)] = {0};
  16. int *cudaA = 0;
  17. int *cudaB = 0;
  18. int *cudaC = 0;
  19. cudaMalloc(&cudaA, sizeof(a));
  20. cudaMalloc(&cudaB, sizeof(b));
  21. cudaMalloc(&cudaC, sizeof(c));
  22. cudaMemcpy(cudaA, a, sizeof(a), cudaMemcpyHostToDevice);
  23. cudaMemcpy(cudaB, b, sizeof(b), cudaMemcpyHostToDevice);
  24. do_op<int><<<1, sizeof(a) / sizeof(int)>>>(cudaA, cudaB, cudaC, [] __device__(int l, int r)
  25. { return l + r; }); // nvcc has no difficulty identifying this + operator
  26. cudaMemcpy(c, cudaC, sizeof(b), cudaMemcpyDeviceToHost);
  27. for (unsigned int i = 0; i < sizeof(c) / sizeof(int); ++i)
  28. {
  29. std::cout << (i == 0 ? '[' : ',') << c[i];
  30. }
  31. std::cout << ']' << std::endl;
  32. return 0;
  33. }

Compiling and running yield the following:

  1. $ nvcc ts0.cu --extended-lambda -o ts0
  2. $ ./ts0
  3. [5,7,9]

I then tried wrapping the code above in a vector class I'm defining (again for learning purposes) by doing the following:

ts1.cu:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <cassert>
  5. template <typename T, typename F>
  6. __global__ void do_op(T *a, T *b, T *c, F f)
  7. {
  8. int i = threadIdx.x;
  9. c[i] = f(a[i], b[i]);
  10. }
  11. template <typename T, unsigned int N>
  12. class vector
  13. {
  14. private:
  15. T _v[N];
  16. public:
  17. vector() : _v{0} {}
  18. vector(const vector<T, N> &src)
  19. {
  20. std::copy(src._v, src._v + N, this->_v);
  21. }
  22. vector(std::initializer_list<T> src)
  23. {
  24. assert(size(src) == N);
  25. std::copy(src.begin(), src.end(), this->_v);
  26. }
  27. friend vector<T, N> operator+(const vector<T, N> &lhs, const vector<T, N> &rhs)
  28. {
  29. vector<T, N> vec;
  30. T *cudaLS = 0;
  31. T *cudaRS = 0;
  32. T *cudaRV = 0;
  33. cudaMalloc(&cudaLS, N);
  34. cudaMalloc(&cudaRS, N);
  35. cudaMalloc(&cudaRV, N);
  36. cudaMemcpy(cudaLS, lhs._v, N, cudaMemcpyHostToDevice);
  37. cudaMemcpy(cudaRS, rhs._v, N, cudaMemcpyHostToDevice);
  38. do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
  39. { return l + r; }); // nvcc doesn't recognize this + operator
  40. cudaMemcpy(vec._v, cudaRV, N, cudaMemcpyDeviceToHost);
  41. cudaFree(cudaLS);
  42. cudaFree(cudaRS);
  43. cudaFree(cudaRV);
  44. return vec;
  45. }
  46. friend std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
  47. {
  48. for (unsigned int i = 0; i < N; ++i)
  49. {
  50. os << (i == 0 ? '[' : ',') << vec._v[i];
  51. }
  52. os << ']';
  53. return os;
  54. }
  55. };
  56. int main()
  57. {
  58. vector<int, 3> v0 = {1, 4, 9};
  59. vector<int, 3> v1 = v0;
  60. vector<int, 3> v2 = v0 + v1;
  61. std::cout << v0 << std::endl;
  62. std::cout << v1 << std::endl;
  63. std::cout << v2 << std::endl;
  64. return 0;
  65. }

And when I compile I get the following error:

  1. $ nvcc ts1.cu --extended-lambda -o ts1
  2. ts1.cu: In function ‘vector<T, N> operator+(const vector<T, N>&, const vector<T, N>&)’:
  3. ts1.cu:49:171: error: ‘::operator+’ has not been declared
  4. 49 | do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
  5. |
  6. ... followed by a large number of suggestions.
  7. $ nvcc --version
  8. nvcc: NVIDIA (R) Cuda compiler driver
  9. Copyright (c) 2005-2023 NVIDIA Corporation
  10. Built on Fri_Jan__6_16:45:21_PST_2023
  11. Cuda compilation tools, release 12.0, V12.0.140
  12. Build cuda_12.0.r12.0/compiler.32267302_0

OS: Fedora 37

Why does the second approach fail to compile while the first one succeeds and how can I modify the second to get this general approach to compile and run successfully?

答案1

得分: 1

@paleonix提供了一些建议,这些建议有所帮助... 以下实现有效:

ts1.cu:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <cassert>
  5. template <typename T, unsigned int N>
  6. class vector
  7. {
  8. private:
  9. T _v[N];
  10. T *_cv;
  11. void upload()
  12. {
  13. cudaMemcpy(this->_cv, this->_v, N * sizeof(T), cudaMemcpyHostToDevice);
  14. }
  15. void download()
  16. {
  17. cudaMemcpy(this->_v, this->_cv, N * sizeof(T), cudaMemcpyDeviceToHost);
  18. }
  19. public:
  20. vector() : _v{0}, _cv(0)
  21. {
  22. cudaMalloc(&(this->_cv), N * sizeof(T));
  23. }
  24. vector(const vector<T, N> &src) : _cv(0)
  25. {
  26. std::copy(src._v, src._v + N, this->_v);
  27. cudaMalloc(&(this->_cv), N * sizeof(T));
  28. }
  29. vector(std::initializer_list<T> src)
  30. {
  31. assert(src.size() == N);
  32. std::copy(src.begin(), src.end(), this->_v);
  33. cudaMalloc(&(this->_cv), N * sizeof(T));
  34. }
  35. template <typename S, unsigned int M>
  36. friend vector<S, M> operator+(vector<S, M> &a, vector<S, M> &b);
  37. template <typename S, unsigned int M>
  38. friend std::ostream &operator<<(std::ostream &os, const vector<S, M> &vec);
  39. };
  40. template <typename T, typename F>
  41. __global__ void do_op(T *a, T *b, T *c, F f)
  42. {
  43. int i = threadIdx.x;
  44. c[i] = f(a[i], b[i]);
  45. }
  46. template <typename T, unsigned int N>
  47. std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
  48. {
  49. for (unsigned int i = 0; i < N; ++i)
  50. {
  51. os << (i == 0 ? '[' : ',') << vec._v[i];
  52. }
  53. os << ']';
  54. return os;
  55. }
  56. template <typename T, unsigned int N>
  57. vector<T, N> operator+(vector<T, N> &a, vector<T, N> &b)
  58. {
  59. vector<T, N> c;
  60. a.upload();
  61. b.upload();
  62. do_op<<<1, N>>>(a._cv, b._cv, c._cv, [] __device__(T l, T r)
  63. { return l + r; });
  64. c.download();
  65. return c;
  66. }
  67. int main()
  68. {
  69. vector<int, 3> v0 = {1, 4, 9};
  70. vector<int, 3> v1 = {5, 10, 15};
  71. vector<int, 3> v2 = v0 + v1;
  72. std::cout << v0 << std::endl;
  73. std::cout << v1 << std::endl;
  74. std::cout << v2 << std::endl;
  75. return 0;
  76. }

输出:

  1. $ nvcc ts1.cu --extended-lambda -o ts1
  2. $ ./ts1
  3. [1,4,9]
  4. [5,10,15]
  5. [6,14,24]
英文:

@paleonix provided some suggestions which helped... The following implementation works:

ts1.cu:

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <cassert>
  5. template <typename T, unsigned int N>
  6. class vector
  7. {
  8. private:
  9. T _v[N];
  10. T *_cv;
  11. void upload()
  12. {
  13. cudaMemcpy(this->_cv, this->_v, N * sizeof(T), cudaMemcpyHostToDevice);
  14. }
  15. void download()
  16. {
  17. cudaMemcpy(this->_v, this->_cv, N * sizeof(T), cudaMemcpyDeviceToHost);
  18. }
  19. public:
  20. vector() : _v{0}, _cv(0)
  21. {
  22. cudaMalloc(&(this->_cv), N * sizeof(T));
  23. }
  24. vector(const vector<T, N> &src) : _cv(0)
  25. {
  26. std::copy(src._v, src._v + N, this->_v);
  27. cudaMalloc(&(this->_cv), N * sizeof(T));
  28. }
  29. vector(std::initializer_list<T> src)
  30. {
  31. assert(size(src) == N);
  32. std::copy(src.begin(), src.end(), this->_v);
  33. cudaMalloc(&(this->_cv), N * sizeof(T));
  34. }
  35. template <typename S, unsigned int M>
  36. friend vector<S, M> operator+(vector<S, M> &a, vector<S, M> &b);
  37. template <typename S, unsigned int M>
  38. friend std::ostream &operator<<(std::ostream &os, const vector<S, M> &vec);
  39. };
  40. template <typename T, typename F>
  41. __global__ void do_op(T *a, T *b, T *c, F f)
  42. {
  43. int i = threadIdx.x;
  44. c[i] = f(a[i], b[i]);
  45. }
  46. template <typename T, unsigned int N>
  47. std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
  48. {
  49. for (unsigned int i = 0; i < N; ++i)
  50. {
  51. os << (i == 0 ? '[' : ',') << vec._v[i];
  52. }
  53. os << ']';
  54. return os;
  55. }
  56. template <typename T, unsigned int N>
  57. vector<T, N> operator+(vector<T, N> &a, vector<T, N> &b)
  58. {
  59. vector<T, N> c;
  60. a.upload();
  61. b.upload();
  62. do_op<<<1, N>>>(a._cv, b._cv, c._cv, [] __device__(T l, T r)
  63. { return l + r; });
  64. c.download();
  65. return c;
  66. }
  67. int main()
  68. {
  69. vector<int, 3> v0 = {1, 4, 9};
  70. vector<int, 3> v1 = {5, 10, 15};
  71. vector<int, 3> v2 = v0 + v1;
  72. std::cout << v0 << std::endl;
  73. std::cout << v1 << std::endl;
  74. std::cout << v2 << std::endl;
  75. return 0;
  76. }

Output:

  1. $ nvcc ts1.cu --extended-lambda -o ts1
  2. $ ./ts1
  3. [1,4,9]
  4. [5,10,15]
  5. [6,14,24]

huangapple
  • 本文由 发表于 2023年2月19日 14:40:04
  • 转载请务必保留本文链接:https://go.coder-hub.com/75498429.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定