lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250603-fork-tearing-v1-0-a7f64b7cfc96@google.com>
Date: Tue, 03 Jun 2025 20:21:01 +0200
From: Jann Horn <jannh@...gle.com>
To: Andrew Morton <akpm@...ux-foundation.org>, 
 David Hildenbrand <david@...hat.com>, 
 Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, 
 "Liam R. Howlett" <Liam.Howlett@...cle.com>, 
 Vlastimil Babka <vbabka@...e.cz>, Mike Rapoport <rppt@...nel.org>, 
 Suren Baghdasaryan <surenb@...gle.com>, Michal Hocko <mhocko@...e.com>, 
 linux-mm@...ck.org
Cc: Peter Xu <peterx@...hat.com>, linux-kernel@...r.kernel.org, 
 Jann Horn <jannh@...gle.com>, stable@...r.kernel.org
Subject: [PATCH 0/2] mm/memory: fix memory tearing on threaded fork

The first patch is the fix and includes an explanation of the issue; you
should read that first.
The second patch adds a comment to document the rules because figuring
this out from scratch causes brain pain.

Accidentally hitting this issue and getting negative consequences from
it would require several stars to line up just right; but if someone out
there is using a malloc() implementation that uses lockless data
structures across threads or such, this could actually be a problem.

In case someone wants a testcase, here's a very artificial one:

```
 #include <pthread.h>
 #include <err.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/uio.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <linux/io_uring.h>

 /*
  * SYSCHK(x): evaluate x exactly once and yield its value. If that value
  * equals -1 converted to the type of x (e.g. a failing syscall wrapper, or
  * mmap() returning (void *)-1), call err() with exit status 1 and the
  * stringized expression as the message.
  *
  * Spelled with __typeof__ rather than typeof so the macro also builds in
  * strict ISO modes (e.g. -std=c11), where plain `typeof` is not a keyword.
  * The ({ ... }) statement expression is still a GNU extension.
  * The temporary avoids a double-underscore name, which is reserved.
  */
 #define SYSCHK(x) ({                       \
   __typeof__(x) syschk_res = (x);          \
   if (syschk_res == (__typeof__(x))-1)     \
     err(1, "SYSCHK(" #x ")");              \
   syschk_res;                              \
 })

 #define NUM_SQ_PAGES 4
/*
 * Create an io_uring instance using IORING_SETUP_NO_MMAP and
 * IORING_SETUP_NO_SQARRAY, with caller-allocated memory passed in through
 * sq_off.user_addr and cq_off.user_addr.
 *
 * Two shared anonymous mappings of NUM_SQ_PAGES * 0x1000 bytes each are
 * created for that purpose; they are never unmapped by this function.
 *
 * sqesp, cqesp: optional out-parameters; when non-NULL they receive the
 *               addresses of the two mappings.
 * Returns the file descriptor from io_uring_setup(). Any failing call ends
 * the process through SYSCHK().
 *
 * Note: the text inside each SYSCHK(...) is stringized into the error
 * message, so it is kept exactly as written.
 */
static int uring_init(struct io_uring_sqe **sqesp, void **cqesp) {
  struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0));
  void *cqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0));
  /*
   * Store 64 * NUM_SQ_PAGES at byte offset 4 of the second mapping before
   * setup. Go through char * because arithmetic on void * is a GNU extension.
   * NOTE(review): the purpose of this store is not evident from this file --
   * confirm against the io_uring ring layout.
   */
  *(volatile unsigned int *)((char *)cqes + 4) = 64 * NUM_SQ_PAGES;
  struct io_uring_params params = {
    .flags = IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY,
    .sq_off = { .user_addr = (unsigned long)sqes },
    .cq_off = { .user_addr = (unsigned long)cqes }
  };
  int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, &params));
  if (sqesp)
    *sqesp = sqes;
  if (cqesp)
    *cqesp = cqes;
  return uring_fd;
}

/*
 * Test buffer: 0x3000 bytes, aligned to 0x1000.
 *
 * This is an array of char, not of char *, so that `bufmem + off` advances by
 * exactly `off` bytes. With an array of pointers the same expression would
 * advance by off * sizeof(char *) bytes, and neither the byte offsets used by
 * the writer thread and the child (0x0000 .. 0x2f00) nor the 0x2000-byte
 * range registered in main() would land where their hex values say.
 */
static char bufmem[0x3000] __attribute__((aligned(0x1000)));

/*
 * Writer thread body (pthread start routine; the argument is unused).
 *
 * Forever stores an incrementing counter to six fixed offsets of bufmem, in
 * ascending offset order, using volatile accesses. All six slots receive the
 * same counter value within one iteration, so at any instant two slots
 * differ by at most one step. Never returns.
 */
static void *thread_fn(void *dummy) {
  /* Slot addresses are loop-invariant; compute them once. */
  volatile unsigned long *const slot_0000 = (volatile unsigned long *)(bufmem + 0x0000);
  volatile unsigned long *const slot_0f00 = (volatile unsigned long *)(bufmem + 0x0f00);
  volatile unsigned long *const slot_1000 = (volatile unsigned long *)(bufmem + 0x1000);
  volatile unsigned long *const slot_1f00 = (volatile unsigned long *)(bufmem + 0x1f00);
  volatile unsigned long *const slot_2000 = (volatile unsigned long *)(bufmem + 0x2000);
  volatile unsigned long *const slot_2f00 = (volatile unsigned long *)(bufmem + 0x2f00);

  for (unsigned long counter = 0; ; counter++) {
    *slot_0000 = counter;
    *slot_0f00 = counter;
    *slot_1000 = counter;
    *slot_1f00 = counter;
    *slot_2000 = counter;
    *slot_2f00 = counter;
  }
}

int main(void) {
 #if 1
  int uring_fd = uring_init(NULL, NULL);
  struct iovec reg_iov = { .iov_base = bufmem, .iov_len = 0x2000 };
  SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_BUFFERS, &reg_iov, 1));
 #endif

  pthread_t thread;
  if (pthread_create(&thread, NULL, thread_fn, NULL))
    errx(1, "pthread_create");

  sleep(1);
  int child = SYSCHK(fork());
  if (child == 0) {
    printf("bufmem values:\n");
    printf("  0x0000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0000));
    printf("  0x0f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0f00));
    printf("  0x1000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1000));
    printf("  0x1f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1f00));
    printf("  0x2000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2000));
    printf("  0x2f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2f00));
    return 0;
  }
  int wstatus;
  SYSCHK(wait(&wstatus));
  return 0;
}
```

Without this series, the child will usually print values that differ
from each other by more than 1, which is not a state that ever occurred
in the parent; in my opinion, that counts as a bug.
If you change the "#if 1" to "#if 0", the bug won't manifest.

Signed-off-by: Jann Horn <jannh@...gle.com>
---
Jann Horn (2):
      mm/memory: ensure fork child sees coherent memory snapshot
      mm/memory: Document how we make a coherent memory snapshot

 kernel/fork.c | 34 ++++++++++++++++++++++++++++++++++
 mm/memory.c   | 18 ++++++++++++++++++
 2 files changed, 52 insertions(+)
---
base-commit: 8477ab143069c6b05d6da4a8184ded8b969240f5
change-id: 20250530-fork-tearing-71da211a50cf

-- 
Jann Horn <jannh@...gle.com>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ