Does thread creation trigger page faults in Linux? How does it relate to soft-dirty PTEs?


Question



The reason I ask this question is that, while testing the behavior of the Linux soft-dirty bit, I found that if I create a thread without touching any memory, the soft-dirty bit of all pages gets set to 1 (dirty).

For example: `malloc(100MB)` in the main thread, then clear the soft-dirty bits, then create a thread that just sleeps. After the thread is created, the soft-dirty bits of that entire 100 MB chunk are set to 1.

Here is the test program I'm using:

```cpp
#include <thread>
#include <iostream>
#include <vector>
#include <cstdint>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>

#define PAGE_SIZE_4K 0x1000

int GetDirtyBit(uint64_t vaddr) {
  int fd = open("/proc/self/pagemap", O_RDONLY);
  if (fd < 0) {
    perror("Failed open pagemap");
    exit(1);
  }

  // One 8-byte pagemap entry per page: seek to the entry for vaddr.
  off_t offset = vaddr / PAGE_SIZE_4K * 8;
  if (lseek(fd, offset, SEEK_SET) < 0) {
    perror("Failed lseek pagemap");
    exit(1);
  }

  uint64_t entry = 0;
  if (read(fd, &entry, sizeof(entry)) != sizeof(entry)) {
    perror("Failed read pagemap");
    exit(1);
  }
  close(fd);

  // Bit 55 of a pagemap entry is the soft-dirty bit.
  return entry & (1UL << 55) ? 1 : 0;
}

void CleanSoftDirty() {
  int fd = open("/proc/self/clear_refs", O_RDWR);
  if (fd < 0) {
    perror("Failed open clear_refs");
    exit(1);
  }

  // Writing "4" to clear_refs clears the soft-dirty bits of the
  // whole process.
  char cmd[] = "4";
  if (write(fd, cmd, sizeof(cmd)) != sizeof(cmd)) {
    perror("Failed write clear_refs");
    exit(1);
  }

  close(fd);
}

int demo() {
  int x = 1;
  // 100 MB
  uint64_t size = 1024UL * 1024UL * 100;
  char *cptr = static_cast<char *>(malloc(size));
  for (uint64_t s = 0; s < size; s += PAGE_SIZE_4K) {
    // Populate pages so they are actually mapped.
    memset(cptr + s, x, PAGE_SIZE_4K);
  }

  printf("Soft dirty after malloc: %d, (50MB offset)%d\n",
         GetDirtyBit(reinterpret_cast<uint64_t>(cptr)),
         GetDirtyBit(reinterpret_cast<uint64_t>(cptr + 50 * 1024 * 1024)));

  printf("ALLOCATE FINISHED\n");

  std::vector<std::thread> threads;
  while (true) {
    sleep(2);
    // Set the soft-dirty bit of all pages to 0.
    CleanSoftDirty();

    printf("Soft dirty after reset: %d, (50MB offset)%d\n",
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr)),
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr + 50 * 1024 * 1024)));

    // Create a thread that only sleeps.
    threads.push_back(std::thread([]() { while (true) sleep(1); }));

    sleep(2);

    printf("Soft dirty after create thread: %d, (50MB offset)%d\n",
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr)),
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr + 50 * 1024 * 1024)));

    // memset the first 20 MB.
    memset(cptr, x++, 1024UL * 1024UL * 20);
    printf("Soft dirty after memset: %d, (50MB offset)%d\n",
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr)),
           GetDirtyBit(reinterpret_cast<uint64_t>(cptr + 50 * 1024 * 1024)));
  }

  return 0;
}

int main() {
  printf("PID: %d\n", getpid());
  return demo();
}
```

I print the soft-dirty bit of the first page and of the page at offset 50 * 1024 * 1024. Here is what happens:

  1. The soft-dirty bits after malloc() are 1, which is expected.
  2. After clearing the soft-dirty bits, they become 0.
  3. Create a thread that just sleeps.
  4. Check the dirty bits: all pages in the 100 MB region now have the soft-dirty bit set to 1 (I don't print the bits of every page above, but I verified this separately; a sketch of such a scan follows this list).
  5. Restart the loop; from now on the behavior is correct, and the soft-dirty bits remain 0 after creating additional threads.
  6. The soft-dirty bit of the page at offset 0 is 1 because of the memset(), while the bit of the page at 50 MB remains 0.
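
(For reference, here is a minimal sketch of such a scan, reading one 8-byte pagemap entry per page; `CountSoftDirtyPages` is a hypothetical helper written for this post, not part of the test program above.)

```cpp
#include <cstdint>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

// Hypothetical helper: count pages in [vaddr, vaddr + size) whose
// soft-dirty bit (bit 55 of the pagemap entry) is set.
uint64_t CountSoftDirtyPages(uint64_t vaddr, uint64_t size) {
  int fd = open("/proc/self/pagemap", O_RDONLY);
  if (fd < 0) { perror("open pagemap"); return 0; }

  uint64_t dirty = 0;
  for (uint64_t off = 0; off < size; off += 0x1000) {
    uint64_t entry = 0;
    // Each page has one 8-byte entry at index (vaddr / page_size).
    if (pread(fd, &entry, sizeof(entry),
              (vaddr + off) / 0x1000 * 8) != sizeof(entry))
      break;
    if (entry & (1UL << 55)) dirty++;
  }
  close(fd);
  return dirty;
}
```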

Here is the output:

```
Soft dirty after malloc: 1, (50MB offset)1
ALLOCATE FINISHED
Soft dirty after reset: 0, (50MB offset)0
Soft dirty after create thread: 1, (50MB offset)1
Soft dirty after memset: 1, (50MB offset)1
```

(steps 1-4 above; step 5 starts below)

```
Soft dirty after reset: 0, (50MB offset)0
Soft dirty after create thread: 0, (50MB offset)0
Soft dirty after memset: 1, (50MB offset)0
Soft dirty after reset: 0, (50MB offset)0
Soft dirty after create thread: 0, (50MB offset)0
Soft dirty after memset: 1, (50MB offset)0
Soft dirty after reset: 0, (50MB offset)0
Soft dirty after create thread: 0, (50MB offset)0
Soft dirty after memset: 1, (50MB offset)0
```

I thought thread creation would just mark the pages as being in a "shared" state rather than modify them, so the soft-dirty bits should remain unchanged. Apparently, the behavior is different. Therefore I'm wondering: does creating a thread trigger page faults on all of the pages, so that the OS sets each page's soft-dirty bit while handling the fault?

If this is not the case, why does creating a thread make all memory pages of the process "dirty"? And why does only the first thread creation behave this way?

I hope I explained the question well; please let me know if more details are needed, or if anything doesn't make sense.

Answer 1

Score: 6


So, this is kind of funny and interesting. Your specific situation, as well as the behavior of the soft-dirty bits, is quite peculiar. No page faults are happening, and the soft-dirty bit is not being set on all memory pages, but just on some of them (the ones you allocated through malloc).

If you run your program under `strace`, you will notice a couple of things that will help explain what you are observing:

```
[1] mmap(NULL, 104861696, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8669b66000
...
[2] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f8669365000
[2] mprotect(0x7f8669366000, 8388608, PROT_READ|PROT_WRITE) = 0
[2] clone(child_stack=0x7f8669b64fb0, ...) = 97197
...
```

As you can see above:

  1. Your malloc() is pretty large, so you will not get a normal heap chunk, but a dedicated memory area reserved through a mmap syscall.

  2. When you create a thread, library code sets up a stack for the thread through another mmap followed by mprotect.

The normal mmap behavior in Linux is to reserve memory starting from a mmap_base chosen at process creation time, subtracting each time the size of the request (unless a specific address is explicitly requested, in which case mmap_base is not considered). For this reason, the mmap at point 1 will reserve pages right above the last shared library mapped by the dynamic loader, and the mmap at point 2 above will reserve pages right before the pages mapped at point 1. The mprotect will then mark this second area (except for the very first page) as RW.
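
As an illustration of this top-down placement, here is a standalone sketch (my own, not part of the question's program; it assumes the default x86-64 layout, and exact addresses will vary) that requests two anonymous mappings back to back, mimicking the malloc'd chunk and a thread stack:

```cpp
#include <cstddef>
#include <cstdio>
#include <sys/mman.h>

int main() {
  // Two anonymous RW mappings requested back to back.
  size_t size1 = 100UL * 1024 * 1024;  // like the malloc'd chunk
  size_t size2 = 8UL * 1024 * 1024;    // like a thread stack
  void *a = mmap(NULL, size1, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  void *b = mmap(NULL, size2, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (a == MAP_FAILED || b == MAP_FAILED) {
    perror("mmap");
    return 1;
  }
  printf("first : %p-%p\n", a, (void *)((char *)a + size1));
  printf("second: %p-%p\n", b, (void *)((char *)b + size2));
  // With the default top-down layout, the second mapping usually ends
  // exactly where the first begins, so the kernel merges them into one
  // VMA (a single line in /proc/self/maps).
  return 0;
}
```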

Since these mappings are contiguous, both anonymous and both with the same protections (RW), from the kernel's perspective this looks like a single memory region that has grown in size. In fact, the kernel treats this as a single VMA (vm_area_struct).
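
To watch this happen from inside the process, a minimal helper (my own sketch, not part of the question's program) can dump `/proc/self/maps` at each stage:

```cpp
#include <cstdio>

// Print this process's current mappings. After the first thread is
// created, the malloc'd chunk and the thread stack show up merged as
// a single rw-p line.
void DumpMaps() {
  FILE *f = fopen("/proc/self/maps", "r");
  if (!f) {
    perror("fopen /proc/self/maps");
    return;
  }
  char line[512];
  while (fgets(line, sizeof(line), f)) fputs(line, stdout);
  fclose(f);
}
```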

Now, as we can read from the kernel documentation about the soft-dirty bit (notice the part I highlighted in bold):

> While in most cases tracking memory changes by #PF-s is more than
> enough there is still a scenario when we can lose soft dirty bits -- a
> task unmaps a previously mapped memory region and then maps a new one
> at exactly the same place. When unmap is called, the kernel internally
> clears PTE values including soft dirty bits. To notify user space
> application about such memory region renewal the kernel always marks
> **new memory regions (and expanded regions) as soft dirty**.

So the reason why you see the soft-dirty bit re-appear on the initial malloc'd chunk of memory after clearing it is a funny coincidence: a result of the not-so-intuitive "expansion" of the memory region (VMA) containing it caused by the allocation of the thread stack.


To make things clearer, we can inspect the virtual memory layout of the process through /proc/[pid]/maps at different stages. It will look something like this (taken from my machine):

  • Before malloc():

    ...
    5653d8b82000-5653d8b83000 r--p 00005000 00:18 77464613     [your program]
    5653d8b83000-5653d8b84000 rw-p 00006000 00:18 77464613     [your program]
    5653d983f000-5653d9860000 rw-p 00000000 00:00 0            [heap]
    7f866ff6c000-7f866ff79000 r--p 00000000 00:18 77146186     [shared libraries]
    7f866ff79000-7f8670013000 r-xp 0000d000 00:18 77146186     [shared libraries]
    ...
    
  • After malloc():

    ...
    5653d8b82000-5653d8b83000 r--p 00005000 00:18 77464613     [your program]
    5653d8b83000-5653d8b84000 rw-p 00006000 00:18 77464613     [your program]
    5653d983f000-5653d9860000 rw-p 00000000 00:00 0            [heap]
    7f8669b66000-7f866ff6c000 rw-p 00000000 00:00 0        *** MALLOC'D MEMORY
    7f866ff6c000-7f866ff79000 r--p 00000000 00:18 77146186     [shared libraries]
    7f866ff79000-7f8670013000 r-xp 0000d000 00:18 77146186     [shared libraries]
    ...
    
  • After creating the first thread (notice how the start of the VMA changes from 7f8669b66000 to 7f8669366000 since it has grown in size):

    ...
    5653d8b82000-5653d8b83000 r--p 00005000 00:18 77464613     [your program]
    5653d8b83000-5653d8b84000 rw-p 00006000 00:18 77464613     [your program]
    5653d983f000-5653d9860000 rw-p 00000000 00:00 0            [heap]
    7f8669365000-7f8669366000 ---p 00000000 00:00 0        *** GUARD PAGE
    7f8669366000-7f866ff6c000 rw-p 00000000 00:00 0        *** THREAD STACK + MALLOC'D MEMORY
    7f866ff6c000-7f866ff79000 r--p 00000000 00:18 77146186     [shared libraries]
    7f866ff79000-7f8670013000 r-xp 0000d000 00:18 77146186     [shared libraries]
    ...
    

You can clearly see that, after creating the thread, the kernel shows the two memory regions (thread stack + your malloc'd chunk) together as a single VMA, given that they are contiguous, anonymous and have the same protections (rw).

The guard page at the low end of the thread stack is treated as a separate VMA (it has different protections), and subsequent threads will mmap their stacks below it in the address space, each behind its own guard page, so they will not affect the soft-dirty bits of your original memory region:

...
5653d8b82000-5653d8b83000 r--p 00005000 00:18 77464613     [your program]
5653d8b83000-5653d8b84000 rw-p 00006000 00:18 77464613     [your program]
5653d983f000-5653d9860000 rw-p 00000000 00:00 0            [heap]
7f8668363000-7f8668364000 ---p 00000000 00:00 0        *** GUARD PAGE
7f8668364000-7f8668b64000 rw-p 00000000 00:00 0        *** THREAD 3 STACK
7f8668b64000-7f8668b65000 ---p 00000000 00:00 0        *** GUARD PAGE
7f8668b65000-7f8669365000 rw-p 00000000 00:00 0        *** THREAD 2 STACK
7f8669365000-7f8669366000 ---p 00000000 00:00 0        *** GUARD PAGE
7f8669366000-7f866ff6c000 rw-p 00000000 00:00 0        *** THREAD 1 STACK + MALLOC'D MEMORY
7f866ff6c000-7f866ff79000 r--p 00000000 00:18 77146186     [shared libraries]
7f866ff79000-7f8670013000 r-xp 0000d000 00:18 77146186     [shared libraries]
...

This is why from the second thread onward you don't see anything unexpected happening.
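
As an aside (my own addition, not part of the original answer): if this VMA merge is undesirable for soft-dirty tracking, one way to prevent it is to allocate the region with `mmap` directly and keep a `PROT_NONE` guard page at its low end, since differing protections force the kernel to keep the regions as separate VMAs. A minimal sketch:

```cpp
#include <cstddef>
#include <sys/mman.h>

// Sketch: allocate 'size' bytes via mmap with a PROT_NONE guard page
// directly below the usable region. The differing protections force
// the kernel to keep the guard as a separate VMA, so a later
// thread-stack mapping placed below cannot coalesce with (and thereby
// soft-dirty) this region.
void *AllocWithLowerGuard(size_t size) {
  const size_t page = 0x1000;
  void *raw = mmap(NULL, size + page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (raw == MAP_FAILED) return nullptr;
  // Turn the lowest page into the guard.
  if (mprotect(raw, page, PROT_NONE) != 0) {
    munmap(raw, size + page);
    return nullptr;
  }
  return static_cast<char *>(raw) + page;
}
```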
