英文:
CUDA unified memory how to prefetch from device to host?
问题
Here's the translated code you provided without any additional content:
#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <string>
#include <chrono>
#include <random>
using namespace std;
class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;
public:
void startCounter() {
start = std::chrono::system_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count() / 1000000.0;
}
};
int N = 10000000;
void GenData(int N, float* a) {
for (int i = 0; i < N; i++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}
__global__
void HelloWorld() {
printf("Hello world\n");
}
constexpr int npoints = 6;
const string costnames[] = { "allocate", "H2D", "sort", "D2H", "hostsum", "free" };
double cost[3][npoints];
volatile double dummy = 0;
void Test1() {
MyTimer timer;
timer.startCounter();
float* h_a = new float[N];
float* d_a;
cudaMalloc(&d_a, N * sizeof(float));
cudaDeviceSynchronize();
cost[0][0] += timer.getCounterMsPrecise();
GenData(N, h_a);
dummy = h_a[rand() % N];
timer.startCounter();
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cost[0][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[0][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
dummy = h_a[rand() % N];
cost[0][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += h_a[i];
dummy = sum;
cost[0][4] += timer.getCounterMsPrecise();
timer.startCounter();
delete[] h_a;
cudaFree(d_a);
cudaDeviceSynchronize();
cost[0][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}
void Test2() {
MyTimer timer;
timer.startCounter();
float* a;
cudaMallocManaged(&a, N * sizeof(float);
cost[1][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
cost[1][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[1][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[1][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[1][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[1][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}
void Test3() {
MyTimer timer;
timer.startCounter();
float* a;
cudaMallocManaged(&a, N * sizeof(float));
cost[2][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
//cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
//cudaDeviceSynchronize();
cost[2][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[2][2] += timer.getCounterMsPrecise();
timer.startCounter();
// cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
// cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[2][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[2][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[2][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}
int main() {
srand(time(NULL));
HelloWorld<<<1, 1>>>();
// warmup
Test1();
Test2();
for (int i = 0; i < 3; i++)
for (int j = 0; j < npoints; j++)
cost[i][j] = 0;
int ntest = 10;
for (int t = 1; t <= ntest; t++) {
Test1();
Test2();
Test3();
}
for (int i = 0; i < npoints; i++) {
cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest
<details>
<summary>英文:</summary>
#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <string>
#include <chrono>
#include <random>
using namespace std;
class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;
public:
void startCounter() {
start = std::chrono::system_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
/ 1000000.0;
}
};
int N = 10000000;
void GenData(int N, float* a)
{
for (int i = 0; i < N; i ++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}
global
void HelloWorld()
{
printf("Hello world\n");
}
constexpr int npoints = 6;
const string costnames[] = {"allocate", "H2D", "sort", "D2H", "hostsum", "free"};
double cost[3][npoints];
volatile double dummy = 0;
void Test1()
{
MyTimer timer;
timer.startCounter();
float *h_a = new float[N];
float *d_a;
cudaMalloc(&d_a, N * sizeof(float));
cudaDeviceSynchronize();
cost[0][0] += timer.getCounterMsPrecise();
GenData(N, h_a);
dummy = h_a[rand() % N];
timer.startCounter();
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cost[0][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[0][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
dummy = h_a[rand() % N];
cost[0][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += h_a[i];
dummy = sum;
cost[0][4] += timer.getCounterMsPrecise();
timer.startCounter();
delete[] h_a;
cudaFree(d_a);
cudaDeviceSynchronize();
cost[0][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}
void Test2()
{
MyTimer timer;
timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[1][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
cost[1][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[1][2] += timer.getCounterMsPrecise();
timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[1][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[1][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[1][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}
void Test3()
{
MyTimer timer;
timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[2][0] += timer.getCounterMsPrecise();
GenData(N, a);
dummy = a[rand() % N];
timer.startCounter();
//cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
//cudaDeviceSynchronize();
cost[2][1] += timer.getCounterMsPrecise();
timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[2][2] += timer.getCounterMsPrecise();
timer.startCounter();
// cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
// cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[2][3] += timer.getCounterMsPrecise();
timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[2][4] += timer.getCounterMsPrecise();
timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[2][5] += timer.getCounterMsPrecise();
for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}
int main()
{
srand(time(NULL));
HelloWorld<<<1,1>>>();
// warmup
Test1();
Test2();
for (int i = 0; i < 3; i++)
for (int j = 0; j < npoints; j++) cost[i][j] = 0;
int ntest = 10;
for (int t = 1; t <= ntest; t++) {
Test1();
Test2();
Test3();
}
for (int i = 0; i < npoints; i++) {
cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest) << " , " << (cost[2][i] / ntest) << "\n";
}
return 0;
}
// 2080ti
// Hello world
// cost allocate = 0.245438 , 0.0470603 , 0.029834
// cost H2D = 6.25315 , 6.36215 , 3.71e-05
// cost sort = 2.61625 , 2.6077 , 14.5418
// cost D2H = 8.74573 , 0.0520719 , 0.0759482
// cost hostsum = 6.98815 , 17.9619 , 18.3188
// cost free = 2.82205 , 3.8711 , 4.12887
I'm trying to compare performance of `cudaMalloc` vs `cudaMallocManaged`. The use case is for a matrix library, where the use of GPU is hidden from the user (i.e they can just use it like a normal library, but some operations will automatically use GPU).
If an algo only uses the GPU, then we can `cudaMemPrefetch` the memory to the GPU. You can see that `cost[0][2] == cost[1][2]`, and `cost[0][3]` is much slower. However, prefetch doesn't work in the opposite direction, so `cost[1][4] > cost[0][3] + cost[0][4]`, ~10-15% slower.
So, is there any way to prefetch unified memory from device to host?
</details>
# 答案1
**得分**: 4
Your usage of `cudaMemPrefetchAsync` to prefetch data to host is incorrect. As per API documentation:
> Passing in `cudaCpuDeviceId` for `dstDevice` will prefetch the data to host memory.
Running your code as is, I observe the following output on my machine:
```plaintext
Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327
When I use cudaCpuDeviceId
to prefetch the sorted data to the host, the hostsum
time decreases:
Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554
英文:
Your usage of cudaMemPrefetchAsync to prefetch data to host is incorrect. As per API documentation:
>Passing in cudaCpuDeviceId for dstDevice will prefetch the data to host memory
Running your code as is, I observe the following output on my machine.
Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327
When I use cudaCpuDeviceId to prefetch the sorted data to the host, the hostsum time decreases.
Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论