CUDA统一内存如何从设备预取到主机?

huangapple go评论72阅读模式
英文:

CUDA unified memory how to prefetch from device to host?

问题

Here is the code from the question:

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
using namespace std;

// Simple stopwatch for benchmarking.
//
// Fix vs. the posted code: intervals are now measured with
// std::chrono::steady_clock (monotonic) instead of system_clock.
// system_clock can jump backwards/forwards under NTP or manual clock
// changes, which corrupts (and can even negate) measured durations.
// The public interface is unchanged.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;

public:
    // Record the current instant as the reference point.
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }

    // Elapsed time since startCounter(), truncated to whole nanoseconds.
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed time since startCounter(), truncated to whole milliseconds.
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Elapsed milliseconds as a double, with nanosecond resolution.
    double getCounterMsPrecise() {
        return getCounterNs() / 1000000.0;
    }
};

// Number of floats per benchmark buffer (10M floats = ~40 MB at 4 bytes each).
int N = 10000000;

// Fill a[0..N) with pseudo-random positive floats.
// Numerator is drawn from [0, 1e6), denominator from [1, 100] (never zero),
// so every value is finite and non-negative. N == 0 is a no-op.
//
// Fix vs. the posted code: the two rand() calls were operands of a single
// '/' expression, whose evaluation order is unsequenced in C++, making the
// generated sequence compiler-dependent. They are now sequenced explicitly.
void GenData(int N, float* a) {
    for (int i = 0; i < N; i++) {
        const int numer = rand() % 1000000;  // guaranteed drawn first
        const int denom = rand() % 100 + 1;  // guaranteed drawn second
        a[i] = float(numer) / float(denom);
    }
}

// Trivial kernel used only to warm up / verify the CUDA context before the
// benchmarks run: each launched thread prints one line via device-side printf.
// main() launches it as <<<1, 1>>>, so exactly one line is printed.
__global__
void HelloWorld() {
    printf("Hello world\n");
}

// Number of timed phases per test run.
constexpr int npoints = 6;
// Human-readable label for each phase, indexed to match cost[][].
const string costnames[] = { "allocate", "H2D", "sort", "D2H", "hostsum", "free" };
// cost[t][p]: accumulated milliseconds for phase p of Test(t+1), across runs.
double cost[3][npoints];
// volatile sink for observed values so the compiler cannot elide the work.
volatile double dummy = 0;

// Baseline strategy: explicit cudaMalloc plus cudaMemcpy in both directions.
// Each phase is timed into cost[0][*]; the volatile `dummy` is written after
// every host-side read so the compiler cannot discard the work.
// The statement order inside each timed phase is deliberate — the timers
// bracket exactly what is being measured.
void Test1() {
    MyTimer timer;

    // Phase 0: allocate host heap buffer + device global memory.
    timer.startCounter();
    float* host = new float[N];
    float* dev;
    cudaMalloc(&dev, N * sizeof(float));
    cudaDeviceSynchronize();
    cost[0][0] += timer.getCounterMsPrecise();

    GenData(N, host);
    dummy = host[rand() % N];

    // Phase 1: explicit host -> device copy.
    timer.startCounter();
    cudaMemcpy(dev, host, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    cost[0][1] += timer.getCounterMsPrecise();

    // Phase 2: sort on the GPU via thrust.
    timer.startCounter();
    thrust::device_ptr<float> first = thrust::device_pointer_cast(dev);
    thrust::sort(first, first + N);
    cudaDeviceSynchronize();
    cost[0][2] += timer.getCounterMsPrecise();

    // Phase 3: explicit device -> host copy; one element is read to force use.
    timer.startCounter();
    cudaMemcpy(host, dev, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    dummy = host[rand() % N];
    cost[0][3] += timer.getCounterMsPrecise();

    // Phase 4: full sequential pass over the data on the host.
    timer.startCounter();
    float total = 0;
    for (int i = 0; i < N; i++) total += host[i];
    dummy = total;
    cost[0][4] += timer.getCounterMsPrecise();

    // Phase 5: release both allocations.
    timer.startCounter();
    delete[] host;
    cudaFree(dev);
    cudaDeviceSynchronize();
    cost[0][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}

// Unified-memory strategy with explicit prefetching in both directions.
// Each phase is timed into cost[1][*].
//
// Fixes vs. the posted code:
//  * the cudaMallocManaged call was missing its closing parenthesis
//    (did not compile);
//  * the second ("D2H") prefetch passed dstDevice = 0, which prefetches to
//    GPU 0 again. Per the CUDA runtime API docs, prefetching managed memory
//    to the host requires dstDevice = cudaCpuDeviceId — this is exactly what
//    the accepted answer points out.
void Test2() {
    MyTimer timer;

    // Phase 0: one managed allocation, accessible from host and device.
    timer.startCounter();
    float* a;
    cudaMallocManaged(&a, N * sizeof(float));
    cost[1][0] += timer.getCounterMsPrecise();

    GenData(N, a);
    dummy = a[rand() % N];

    // Phase 1: prefetch managed pages to device 0 (the "H2D" analogue).
    timer.startCounter();
    cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
    cudaDeviceSynchronize();
    cost[1][1] += timer.getCounterMsPrecise();

    // Phase 2: sort on the GPU via thrust.
    timer.startCounter();
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
    thrust::sort(dev_ptr, dev_ptr + N);
    cudaDeviceSynchronize();
    cost[1][2] += timer.getCounterMsPrecise();

    // Phase 3: prefetch back to host memory (the "D2H" analogue).
    // cudaCpuDeviceId is the documented destination for host prefetch.
    timer.startCounter();
    cudaMemPrefetchAsync(a, N * sizeof(float), cudaCpuDeviceId, 0);
    cudaDeviceSynchronize();
    dummy = a[rand() % N];
    cost[1][3] += timer.getCounterMsPrecise();

    // Phase 4: full sequential pass over the data on the host.
    timer.startCounter();
    float sum = 0;
    for (int i = 0; i < N; i++) sum += a[i];
    dummy = sum;
    cost[1][4] += timer.getCounterMsPrecise();

    // Phase 5: release the managed allocation.
    timer.startCounter();
    cudaFree(a);
    cudaDeviceSynchronize();
    cost[1][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}

// Unified-memory strategy with NO prefetching: all host/device accesses rely
// on on-demand page migration. Phases mirror Test1/Test2 so the cost[2][*]
// columns are directly comparable; the "H2D"/"D2H" phases intentionally do
// nothing, so the migration cost shows up inside the sort/read phases instead.
void Test3() {
    MyTimer timer;

    // Phase 0: one managed allocation.
    timer.startCounter();
    float* buf;
    cudaMallocManaged(&buf, N * sizeof(float));
    cost[2][0] += timer.getCounterMsPrecise();

    GenData(N, buf);
    dummy = buf[rand() % N];

    // Phase 1: deliberately empty — no H2D prefetch in this variant.
    timer.startCounter();
    cost[2][1] += timer.getCounterMsPrecise();

    // Phase 2: GPU sort; any required page migration is paid here.
    timer.startCounter();
    thrust::device_ptr<float> begin = thrust::device_pointer_cast(buf);
    thrust::sort(begin, begin + N);
    cudaDeviceSynchronize();
    cost[2][2] += timer.getCounterMsPrecise();

    // Phase 3: no D2H prefetch either; just read one element from the host.
    timer.startCounter();
    dummy = buf[rand() % N];
    cost[2][3] += timer.getCounterMsPrecise();

    // Phase 4: full sequential pass over the data on the host.
    timer.startCounter();
    float acc = 0;
    for (int i = 0; i < N; i++) acc += buf[i];
    dummy = acc;
    cost[2][4] += timer.getCounterMsPrecise();

    // Phase 5: release the managed allocation.
    timer.startCounter();
    cudaFree(buf);
    cudaDeviceSynchronize();
    cost[2][5] += timer.getCounterMsPrecise();

    for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}

// Driver: warm up the CUDA context and both allocation paths, reset the
// accumulators, then run each strategy ntest times and print mean per-phase
// costs (columns: Test1, Test2, Test3).
//
// Fix vs. the posted listing: the output loop was truncated mid-expression;
// it is reconstructed here from the intact original listing quoted below.
int main() {
    srand(time(NULL));
    HelloWorld<<<1, 1>>>();

    // Warm up once so driver init / first-touch costs do not pollute the
    // measured runs, then zero the accumulators.
    Test1();
    Test2();
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < npoints; j++)
            cost[i][j] = 0;

    const int ntest = 10;
    for (int t = 1; t <= ntest; t++) {
        Test1();
        Test2();
        Test3();
    }

    // Report the mean cost per phase for each strategy.
    for (int i = 0; i < npoints; i++) {
        cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest)
             << " , " << (cost[1][i] / ntest)
             << " , " << (cost[2][i] / ntest) << "\n";
    }

    return 0;
}

<details>
<summary>英文:</summary>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <string>
#include <chrono>
#include <random>
using namespace std;

class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;

public:
void startCounter() {
start = std::chrono::system_clock::now();
}

int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
/ 1000000.0;
}
}

};

int N = 10000000;

void GenData(int N, float* a)
{
for (int i = 0; i < N; i ++) a[i] = float(rand() % 1000000) / (rand() % 100 + 1);
}

__global__
void HelloWorld()
{
printf("Hello world\n");
}

constexpr int npoints = 6;
const string costnames[] = {"allocate", "H2D", "sort", "D2H", "hostsum", "free"};
double cost[3][npoints];
volatile double dummy = 0;

void Test1()
{
MyTimer timer;

timer.startCounter();
float *h_a = new float[N];
float *d_a;
cudaMalloc(&d_a, N * sizeof(float));
cudaDeviceSynchronize();
cost[0][0] += timer.getCounterMsPrecise();

GenData(N, h_a);
dummy = h_a[rand() % N];

timer.startCounter();
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cost[0][1] += timer.getCounterMsPrecise();

timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(d_a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[0][2] += timer.getCounterMsPrecise();

timer.startCounter();
cudaMemcpy(h_a, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
dummy = h_a[rand() % N];
cost[0][3] += timer.getCounterMsPrecise();

timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += h_a[i];
dummy = sum;
cost[0][4] += timer.getCounterMsPrecise();

timer.startCounter();
delete[] h_a;
cudaFree(d_a);
cudaDeviceSynchronize();
cost[0][5] += timer.getCounterMsPrecise();

for (int i = 0; i < npoints; i++) dummy += cost[0][i];
}

void Test2()
{
MyTimer timer;

timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[1][0] += timer.getCounterMsPrecise();

GenData(N, a);
dummy = a[rand() % N];

timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
cost[1][1] += timer.getCounterMsPrecise();

timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[1][2] += timer.getCounterMsPrecise();

timer.startCounter();
cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[1][3] += timer.getCounterMsPrecise();

timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[1][4] += timer.getCounterMsPrecise();

timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[1][5] += timer.getCounterMsPrecise();

for (int i = 0; i < npoints; i++) dummy += cost[1][i];
}

void Test3()
{
MyTimer timer;

timer.startCounter();
float *a;
cudaMallocManaged(&a, N * sizeof(float));
cost[2][0] += timer.getCounterMsPrecise();

GenData(N, a);
dummy = a[rand() % N];

timer.startCounter();
//cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
//cudaDeviceSynchronize();
cost[2][1] += timer.getCounterMsPrecise();

timer.startCounter();
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a);
thrust::sort(dev_ptr, dev_ptr + N);
cudaDeviceSynchronize();
cost[2][2] += timer.getCounterMsPrecise();

timer.startCounter();
// cudaMemPrefetchAsync(a, N * sizeof(float), 0, 0);
// cudaDeviceSynchronize();
dummy = a[rand() % N];
cost[2][3] += timer.getCounterMsPrecise();

timer.startCounter();
float sum = 0;
for (int i = 0; i < N; i++) sum += a[i];
dummy = sum;
cost[2][4] += timer.getCounterMsPrecise();

timer.startCounter();
cudaFree(a);
cudaDeviceSynchronize();
cost[2][5] += timer.getCounterMsPrecise();

for (int i = 0; i < npoints; i++) dummy += cost[2][i];
}

int main()
{
srand(time(NULL));
HelloWorld<<<1,1>>>();

// warmup
Test1();
Test2();
for (int i = 0; i < 3; i++)
for (int j = 0; j < npoints; j++) cost[i][j] = 0;

int ntest = 10;
for (int t = 1; t <= ntest; t++) {
Test1();
Test2();
Test3();
}

for (int i = 0; i < npoints; i++) {
cout << "cost " << costnames[i] << " = " << (cost[0][i] / ntest) << " , " << (cost[1][i] / ntest) << " , " << (cost[2][i] / ntest) << "\n";
}

return 0;
}

// 2080ti
// Hello world
// cost allocate = 0.245438 , 0.0470603 , 0.029834
// cost H2D = 6.25315 , 6.36215 , 3.71e-05
// cost sort = 2.61625 , 2.6077 , 14.5418
// cost D2H = 8.74573 , 0.0520719 , 0.0759482
// cost hostsum = 6.98815 , 17.9619 , 18.3188
// cost free = 2.82205 , 3.8711 , 4.12887


I'm trying to compare performance of `cudaMalloc` vs `cudaMallocManaged`. The use case is for a matrix library, where the use of GPU is hidden from the user (i.e. they can just use it like a normal library, but some operations will automatically use GPU).
If an algo only uses the GPU, then we can `cudaMemPrefetch` the memory to the GPU. You can see that `cost[0][2] == cost[1][2]`, and `cost[0][3]` is much slower. However, prefetch doesn't work in the opposite direction, so `cost[1][4] > cost[0][3] + cost[0][4]`, ~10-15% slower.
So, is there any way to prefetch unified memory from device to host?
</details>
# 答案1
**得分**: 4
Your usage of `cudaMemPrefetchAsync` to prefetch data to host is incorrect. As per API documentation:
> Passing in `cudaCpuDeviceId` for `dstDevice` will prefetch the data to host memory.
Running your code as is, I observe the following output on my machine:
```plaintext
Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327

When I use cudaCpuDeviceId to prefetch the sorted data to the host, the hostsum time decreases:

Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554
英文:

Your usage of cudaMemPrefetchAsync to prefetch data to host is incorrect. As per API documentation:
>Passing in cudaCpuDeviceId for dstDevice will prefetch the data to host memory

Running your code as is, I observe the following output on my machine.

Hello world
cost allocate = 0.190719 , 0.0421818 , 0.0278854
cost H2D = 3.29175 , 5.30171 , 4.3e-05
cost sort = 0.619405 , 0.59198 , 11.6026
cost D2H = 3.42561 , 0.730888 , 0.729142
cost hostsum = 7.34508 , 12.7422 , 12.9242
cost free = 2.20156 , 5.1042 , 5.99327

When I use cudaCpuDeviceId to prefetch the sorted data to the host, the hostsum time decreases.

Hello world
cost allocate = 0.192218 , 0.0414427 , 0.0268805
cost H2D = 3.21791 , 5.31319 , 5e-05
cost sort = 0.617812 , 0.594804 , 12.6862
cost D2H = 3.3481 , 2.9555 , 0.730368
cost hostsum = 7.23154 , 7.20661 , 12.737
cost free = 2.101 , 5.22388 , 5.8554

huangapple
  • 本文由 发表于 2023年3月23日 11:30:17
  • 转载请务必保留本文链接:https://go.coder-hub.com/75819026.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定