英文:
Cuda using template class / passing lambdas to non-class function
问题
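第一个程序(ts0.cu)编译和运行成功,产生了预期的结果,但第二个程序(ts1.cu)出现了编译错误,错误信息指出 ts1.cu 中的 ‘::operator+’ 没有被声明。原因在于:扩展 lambda(带 __device__ 的 lambda)写在了类体内定义的友元函数 operator+ 中。nvcc 要求包含扩展 lambda 的函数必须能够在命名空间作用域中被命名并取地址,而类内定义的友元函数只能通过参数依赖查找(ADL)找到,无法以 ::operator+ 的形式引用。仅把 operator+ 标记为设备函数(例如 __host__ __device__)并不能消除这个错误;可靠的做法是在类内只保留友元声明,把 operator+ 定义在类外的命名空间作用域中(参见下文答案1)。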
以下是对ts1.cu代码的修改,以使其编译成功:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <initializer_list>
#include <iostream>
/**
 * Element-wise binary kernel: each thread combines one pair of inputs with `f`.
 *
 * Indexing uses threadIdx.x only (blockIdx is ignored) and there is no bounds
 * check, so the caller must launch a single block with exactly one thread per
 * element.
 *
 * @param a  left-hand operands, read at index threadIdx.x
 * @param b  right-hand operands, read at index threadIdx.x
 * @param c  output; element threadIdx.x receives f(a[k], b[k])
 * @param f  callable invoked from device code with the two operands
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    *(c + slot) = f(*(a + slot), *(b + slot));
}
/**
 * Fixed-size vector of N elements stored on the host.
 *
 * Addition is carried out on the GPU by the namespace-scope operator+ defined
 * after the class; streaming to std::ostream prints "[e0,e1,...]".
 */
template <typename T, unsigned int N>
class vector
{
private:
    T _v[N]; // host-side element storage

public:
    /** Zero-initialises all elements. */
    vector() : _v{0} {}

    /** Copies the N host elements of `src`. */
    vector(const vector<T, N> &src)
    {
        std::copy(src._v, src._v + N, this->_v);
    }

    /** Builds the vector from exactly N listed values (asserted). */
    vector(std::initializer_list<T> src)
    {
        assert(src.size() == N);
        std::copy(src.begin(), src.end(), this->_v);
    }

    // Only *declared* here. The definition lives at namespace scope because it
    // contains an extended __device__ lambda: nvcc must be able to name the
    // enclosing function as ::operator+, which is impossible for a friend that
    // is defined inside the class body (it is reachable through ADL only).
    template <typename S, unsigned int M>
    friend vector<S, M> operator+(const vector<S, M> &lhs, const vector<S, M> &rhs);

    /** Writes the elements as "[e0,e1,...]" to `os` and returns `os`. */
    friend std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
    {
        for (unsigned int i = 0; i < N; ++i)
        {
            os << (i == 0 ? '[' : ',') << vec._v[i];
        }
        os << ']';
        return os;
    }
};

/** Aborts with a diagnostic on stderr when a CUDA runtime call did not succeed. */
inline void vector_cuda_check(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::abort();
    }
}

/**
 * Element-wise sum of two vectors, computed on the device.
 *
 * Host-only function: it allocates three temporary device buffers, copies both
 * operands over, launches do_op with one block of N threads, copies the result
 * back (the blocking copy also waits for the kernel) and frees the buffers.
 *
 * @param lhs  left operand
 * @param rhs  right operand
 * @return     vector whose element k is lhs[k] + rhs[k]
 */
template <typename T, unsigned int N>
vector<T, N> operator+(const vector<T, N> &lhs, const vector<T, N> &rhs)
{
    constexpr auto bytes = N * sizeof(T); // CUDA sizes are byte counts

    vector<T, N> vec;
    T *cudaLS = 0;
    T *cudaRS = 0;
    T *cudaRV = 0;

    vector_cuda_check(cudaMalloc(&cudaLS, bytes), "cudaMalloc(lhs)");
    vector_cuda_check(cudaMalloc(&cudaRS, bytes), "cudaMalloc(rhs)");
    vector_cuda_check(cudaMalloc(&cudaRV, bytes), "cudaMalloc(result)");
    vector_cuda_check(cudaMemcpy(cudaLS, lhs._v, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(lhs)");
    vector_cuda_check(cudaMemcpy(cudaRS, rhs._v, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(rhs)");

    do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
                       { return l + r; });
    // A launch does not return a status; launch-configuration errors surface here.
    vector_cuda_check(cudaGetLastError(), "do_op launch");

    vector_cuda_check(cudaMemcpy(vec._v, cudaRV, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");
    vector_cuda_check(cudaFree(cudaLS), "cudaFree(lhs)");
    vector_cuda_check(cudaFree(cudaRS), "cudaFree(rhs)");
    vector_cuda_check(cudaFree(cudaRV), "cudaFree(result)");
    return vec;
}
/** Demo driver: copies a vector, adds the two copies, and prints all three. */
int main()
{
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1(v0);
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;
    return 0;
}
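原回答在这里建议将 operator+ 函数标记为 __host__ __device__,以允许其在主机和设备代码中使用。需要注意的是,仅靠这一标记并不能解决该编译错误:operator+ 内部调用了 cudaMalloc、cudaMemcpy 并启动内核,这些都是主机端操作;真正需要的是把包含扩展 lambda 的 operator+ 定义在类外(命名空间作用域),类内只保留友元声明,这样第二个程序才能成功编译和运行。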
英文:
I am trying to learn Cuda programming and have written a couple test programs in the process. The first of these works as I expected:
ts0.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
/**
 * Element-wise binary kernel: each thread combines one pair of inputs with `f`.
 *
 * Indexing uses threadIdx.x only (blockIdx is ignored) and there is no bounds
 * check, so the caller must launch a single block with exactly one thread per
 * element.
 *
 * @param a  left-hand operands, read at index threadIdx.x
 * @param b  right-hand operands, read at index threadIdx.x
 * @param c  output; element threadIdx.x receives f(a[k], b[k])
 * @param f  callable invoked from device code with the two operands
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    *(c + slot) = f(*(a + slot), *(b + slot));
}
/**
 * Adds two small int arrays on the GPU and prints the result as "[x,y,z]".
 *
 * Every CUDA runtime call is checked; on failure a message is written to
 * stderr, the device buffers are released and the process exits with status 1.
 */
int main()
{
    int a[] = {1, 2, 3};
    int b[] = {4, 5, 6};
    constexpr auto count = sizeof(a) / sizeof(a[0]);
    int c[count] = {0};
    int *cudaA = 0;
    int *cudaB = 0;
    int *cudaC = 0;

    // Reports a failed runtime call and tells the caller whether to continue.
    auto ok = [](cudaError_t status, const char *what) -> bool
    {
        if (status == cudaSuccess)
        {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // && short-circuits, so nothing further is attempted after the first failure.
    bool good = ok(cudaMalloc(&cudaA, sizeof(a)), "cudaMalloc(a)") &&
                ok(cudaMalloc(&cudaB, sizeof(b)), "cudaMalloc(b)") &&
                ok(cudaMalloc(&cudaC, sizeof(c)), "cudaMalloc(c)") &&
                ok(cudaMemcpy(cudaA, a, sizeof(a), cudaMemcpyHostToDevice), "cudaMemcpy(a)") &&
                ok(cudaMemcpy(cudaB, b, sizeof(b), cudaMemcpyHostToDevice), "cudaMemcpy(b)");

    if (good)
    {
        do_op<int><<<1, count>>>(cudaA, cudaB, cudaC, [] __device__(int l, int r)
                                 { return l + r; }); // nvcc has no difficulty identifying this + operator
        // Launch errors are only visible through cudaGetLastError(); the
        // blocking copy afterwards also waits for the kernel to finish.
        good = ok(cudaGetLastError(), "do_op launch") &&
               ok(cudaMemcpy(c, cudaC, sizeof(c), cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(cudaA);
    cudaFree(cudaB);
    cudaFree(cudaC);

    if (!good)
    {
        return 1;
    }

    for (unsigned int i = 0; i < count; ++i)
    {
        std::cout << (i == 0 ? '[' : ',') << c[i];
    }
    std::cout << ']' << std::endl;
    return 0;
}
Compiling and running yield the following:
$ nvcc ts0.cu --extended-lambda -o ts0
$ ./ts0
[5,7,9]
I then tried wrapping the code above in a vector class I'm defining (again for learning purposes) by doing the following:
ts1.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
/**
 * Element-wise binary kernel: each thread combines one pair of inputs with `f`.
 *
 * Indexing uses threadIdx.x only (blockIdx is ignored) and there is no bounds
 * check, so the caller must launch a single block with exactly one thread per
 * element.
 *
 * @param a  left-hand operands, read at index threadIdx.x
 * @param b  right-hand operands, read at index threadIdx.x
 * @param c  output; element threadIdx.x receives f(a[k], b[k])
 * @param f  callable invoked from device code with the two operands
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    *(c + slot) = f(*(a + slot), *(b + slot));
}
// Fixed-size vector of N host-resident elements. operator+ offloads the
// element-wise addition to the GPU through do_op; operator<< prints the
// elements as "[e0,e1,...]".
// NOTE(review): this is the listing that produces the compile error quoted
// after it, so the code is left untouched and only annotated.
template <typename T, unsigned int N>
class vector
{
private:
// Host-side element storage.
T _v[N];
public:
// Zero-initialises the elements.
vector() : _v{0} {}
// Copies the N elements of src.
vector(const vector<T, N> &src)
{
std::copy(src._v, src._v + N, this->_v);
}
// Builds the vector from a braced list; the list length is asserted to be N.
vector(std::initializer_list<T> src)
{
// NOTE(review): unqualified size() presumably resolves to std::size via ADL
// (C++17) - confirm; src.size() would not depend on that.
assert(size(src) == N);
std::copy(src.begin(), src.end(), this->_v);
}
// Element-wise sum: copies both operands to temporary device buffers, runs
// do_op with one block of N threads, and copies the result back.
// NOTE(review): the function is a friend *defined inside the class body* and
// it contains a __device__ lambda; the reported error
// "'::operator+' has not been declared" points at this function.
friend vector<T, N> operator+(const vector<T, N> &lhs, const vector<T, N> &rhs)
{
vector<T, N> vec;
T *cudaLS = 0;
T *cudaRS = 0;
T *cudaRV = 0;
// NOTE(review): the size passed here (and to the cudaMemcpy calls below) is
// N, i.e. N bytes, not N * sizeof(T) bytes.
cudaMalloc(&cudaLS, N);
cudaMalloc(&cudaRS, N);
cudaMalloc(&cudaRV, N);
cudaMemcpy(cudaLS, lhs._v, N, cudaMemcpyHostToDevice);
cudaMemcpy(cudaRS, rhs._v, N, cudaMemcpyHostToDevice);
do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
{ return l + r; }); // nvcc doesn't recognize this + operator
cudaMemcpy(vec._v, cudaRV, N, cudaMemcpyDeviceToHost);
// None of the CUDA calls in this function have their return status checked.
cudaFree(cudaLS);
cudaFree(cudaRS);
cudaFree(cudaRV);
return vec;
}
// Streams the elements as "[e0,e1,...]" and returns os.
friend std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
{
for (unsigned int i = 0; i < N; ++i)
{
// '[' precedes the first element, ',' precedes every later one.
os << (i == 0 ? '[' : ',') << vec._v[i];
}
os << ']';
return os;
}
};
/** Demo driver: copies a vector, adds the two copies, and prints all three. */
int main()
{
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1(v0);
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;
    return 0;
}
And when I compile I get the following error:
$ nvcc ts1.cu --extended-lambda -o ts1
ts1.cu: In function ‘vector<T, N> operator+(const vector<T, N>&, const vector<T, N>&)’:
ts1.cu:49:171: error: ‘::operator+’ has not been declared
49 | do_op<T><<<1, N>>>(cudaLS, cudaRS, cudaRV, [] __device__(T l, T r)
|
... followed by a large number of suggestions.
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0
OS: Fedora 37
Why does the second approach fail to compile while the first one succeeds and how can I modify the second to get this general approach to compile and run successfully?
答案1
得分: 1
@paleonix提供了一些建议,这些建议有所帮助... 以下实现有效:
ts1.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
/**
 * Fixed-size vector of N elements with a host copy (_v) and a device buffer
 * (_cv) that this object owns.
 *
 * The device buffer is allocated in every constructor and released in the
 * destructor. Copying duplicates the host data only; each object keeps its own
 * device buffer. operator+ and operator<< are namespace-scope function
 * templates befriended below.
 */
template <typename T, unsigned int N>
class vector
{
private:
    T _v[N]; // host-side elements
    T *_cv;  // device buffer holding N elements, owned by this object

    /** Allocates the device buffer; failure trips an assert in debug builds. */
    void allocate()
    {
        cudaError_t status = cudaMalloc(&(this->_cv), N * sizeof(T));
        assert(status == cudaSuccess);
        (void)status; // silences the unused warning when NDEBUG is defined
    }

    /** Copies the host elements into the device buffer. */
    void upload()
    {
        cudaError_t status = cudaMemcpy(this->_cv, this->_v, N * sizeof(T), cudaMemcpyHostToDevice);
        assert(status == cudaSuccess);
        (void)status;
    }

    /** Copies the device buffer back into the host elements (blocking copy). */
    void download()
    {
        cudaError_t status = cudaMemcpy(this->_v, this->_cv, N * sizeof(T), cudaMemcpyDeviceToHost);
        assert(status == cudaSuccess);
        (void)status;
    }

public:
    /** Zero-initialises the host elements and allocates the device buffer. */
    vector() : _v{0}, _cv(0)
    {
        allocate();
    }

    /** Copies the host elements of `src` and allocates a separate device buffer. */
    vector(const vector<T, N> &src) : _cv(0)
    {
        std::copy(src._v, src._v + N, this->_v);
        allocate();
    }

    /** Builds the vector from exactly N listed values (asserted). */
    vector(std::initializer_list<T> src) : _cv(0)
    {
        assert(src.size() == N);
        std::copy(src.begin(), src.end(), this->_v);
        allocate();
    }

    /**
     * Copies only the host elements. The implicit assignment would copy the
     * _cv pointer as well, making two objects free the same device buffer.
     */
    vector &operator=(const vector<T, N> &src)
    {
        if (this != &src)
        {
            std::copy(src._v, src._v + N, this->_v);
        }
        return *this;
    }

    /** Releases the device buffer (cudaFree on a null pointer is a no-op). */
    ~vector()
    {
        cudaFree(this->_cv);
    }

    template <typename S, unsigned int M>
    friend vector<S, M> operator+(vector<S, M> &a, vector<S, M> &b);
    template <typename S, unsigned int M>
    friend std::ostream &operator<<(std::ostream &os, const vector<S, M> &vec);
};
/**
 * Element-wise binary kernel: each thread combines one pair of inputs with `f`.
 *
 * Indexing uses threadIdx.x only (blockIdx is ignored) and there is no bounds
 * check, so the caller must launch a single block with exactly one thread per
 * element.
 *
 * @param a  left-hand operands, read at index threadIdx.x
 * @param b  right-hand operands, read at index threadIdx.x
 * @param c  output; element threadIdx.x receives f(a[k], b[k])
 * @param f  callable invoked from device code with the two operands
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    *(c + slot) = f(*(a + slot), *(b + slot));
}
/**
 * Streams the host elements of `vec` as "[e0,e1,...]".
 *
 * @param os   destination stream
 * @param vec  vector whose host-side elements are printed
 * @return     `os`, to allow chaining
 */
template <typename T, unsigned int N>
std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
{
    char lead = '['; // opening bracket before the first element, comma afterwards
    for (const T &elem : vec._v)
    {
        os << lead << elem;
        lead = ',';
    }
    return os << ']';
}
/**
 * Element-wise sum of two vectors, computed on the device.
 *
 * Uploads the host elements of both operands to their device buffers, launches
 * do_op with one block of N threads writing into the result's device buffer,
 * then downloads the result to its host elements.
 *
 * @param a  left operand (non-const because upload() is a non-const member)
 * @param b  right operand (same remark)
 * @return   vector whose element k is a[k] + b[k]
 */
template <typename T, unsigned int N>
vector<T, N> operator+(vector<T, N> &a, vector<T, N> &b)
{
    vector<T, N> sum;
    a.upload();
    b.upload();

    // Extended lambda executed on the device, once per element.
    auto add = [] __device__(T l, T r)
    { return l + r; };
    do_op<<<1, N>>>(a._cv, b._cv, sum._cv, add);

    sum.download();
    return sum;
}
/** Demo driver: adds two 3-element vectors on the GPU and prints all three. */
int main()
{
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1{5, 10, 15};
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;
    return 0;
}
输出:
$ nvcc ts1.cu --extended-lambda -o ts1
$ ./ts1
[1,4,9]
[5,10,15]
[6,14,24]
英文:
@paleonix provided some suggestions which helped... The following implementation works:
ts1.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
/**
 * Fixed-size vector of N elements with a host copy (_v) and a device buffer
 * (_cv) that this object owns.
 *
 * The device buffer is allocated in every constructor and released in the
 * destructor. Copying duplicates the host data only; each object keeps its own
 * device buffer. operator+ and operator<< are namespace-scope function
 * templates befriended below.
 */
template <typename T, unsigned int N>
class vector
{
private:
    T _v[N]; // host-side elements
    T *_cv;  // device buffer holding N elements, owned by this object

    /** Allocates the device buffer; failure trips an assert in debug builds. */
    void allocate()
    {
        cudaError_t status = cudaMalloc(&(this->_cv), N * sizeof(T));
        assert(status == cudaSuccess);
        (void)status; // silences the unused warning when NDEBUG is defined
    }

    /** Copies the host elements into the device buffer. */
    void upload()
    {
        cudaError_t status = cudaMemcpy(this->_cv, this->_v, N * sizeof(T), cudaMemcpyHostToDevice);
        assert(status == cudaSuccess);
        (void)status;
    }

    /** Copies the device buffer back into the host elements (blocking copy). */
    void download()
    {
        cudaError_t status = cudaMemcpy(this->_v, this->_cv, N * sizeof(T), cudaMemcpyDeviceToHost);
        assert(status == cudaSuccess);
        (void)status;
    }

public:
    /** Zero-initialises the host elements and allocates the device buffer. */
    vector() : _v{0}, _cv(0)
    {
        allocate();
    }

    /** Copies the host elements of `src` and allocates a separate device buffer. */
    vector(const vector<T, N> &src) : _cv(0)
    {
        std::copy(src._v, src._v + N, this->_v);
        allocate();
    }

    /** Builds the vector from exactly N listed values (asserted). */
    vector(std::initializer_list<T> src) : _cv(0)
    {
        // Member size() is used instead of an unqualified size(src) call so the
        // check does not depend on std::size being found through ADL.
        assert(src.size() == N);
        std::copy(src.begin(), src.end(), this->_v);
        allocate();
    }

    /**
     * Copies only the host elements. The implicit assignment would copy the
     * _cv pointer as well, making two objects free the same device buffer.
     */
    vector &operator=(const vector<T, N> &src)
    {
        if (this != &src)
        {
            std::copy(src._v, src._v + N, this->_v);
        }
        return *this;
    }

    /** Releases the device buffer (cudaFree on a null pointer is a no-op). */
    ~vector()
    {
        cudaFree(this->_cv);
    }

    template <typename S, unsigned int M>
    friend vector<S, M> operator+(vector<S, M> &a, vector<S, M> &b);
    template <typename S, unsigned int M>
    friend std::ostream &operator<<(std::ostream &os, const vector<S, M> &vec);
};
/**
 * Element-wise binary kernel: each thread combines one pair of inputs with `f`.
 *
 * Indexing uses threadIdx.x only (blockIdx is ignored) and there is no bounds
 * check, so the caller must launch a single block with exactly one thread per
 * element.
 *
 * @param a  left-hand operands, read at index threadIdx.x
 * @param b  right-hand operands, read at index threadIdx.x
 * @param c  output; element threadIdx.x receives f(a[k], b[k])
 * @param f  callable invoked from device code with the two operands
 */
template <typename T, typename F>
__global__ void do_op(T *a, T *b, T *c, F f)
{
    const int slot = threadIdx.x;
    *(c + slot) = f(*(a + slot), *(b + slot));
}
/**
 * Streams the host elements of `vec` as "[e0,e1,...]".
 *
 * @param os   destination stream
 * @param vec  vector whose host-side elements are printed
 * @return     `os`, to allow chaining
 */
template <typename T, unsigned int N>
std::ostream &operator<<(std::ostream &os, const vector<T, N> &vec)
{
    char lead = '['; // opening bracket before the first element, comma afterwards
    for (const T &elem : vec._v)
    {
        os << lead << elem;
        lead = ',';
    }
    return os << ']';
}
/**
 * Element-wise sum of two vectors, computed on the device.
 *
 * Uploads the host elements of both operands to their device buffers, launches
 * do_op with one block of N threads writing into the result's device buffer,
 * then downloads the result to its host elements.
 *
 * @param a  left operand (non-const because upload() is a non-const member)
 * @param b  right operand (same remark)
 * @return   vector whose element k is a[k] + b[k]
 */
template <typename T, unsigned int N>
vector<T, N> operator+(vector<T, N> &a, vector<T, N> &b)
{
    vector<T, N> sum;
    a.upload();
    b.upload();

    // Extended lambda executed on the device, once per element.
    auto add = [] __device__(T l, T r)
    { return l + r; };
    do_op<<<1, N>>>(a._cv, b._cv, sum._cv, add);

    sum.download();
    return sum;
}
/** Demo driver: adds two 3-element vectors on the GPU and prints all three. */
int main()
{
    vector<int, 3> v0{1, 4, 9};
    vector<int, 3> v1{5, 10, 15};
    vector<int, 3> v2(v0 + v1);

    std::cout << v0 << std::endl
              << v1 << std::endl
              << v2 << std::endl;
    return 0;
}
Output:
$ nvcc ts1.cu --extended-lambda -o ts1
$ ./ts1
[1,4,9]
[5,10,15]
[6,14,24]
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论