[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220329144047.35zsrw24p2t2ggmg@wittgenstein>
Date: Tue, 29 Mar 2022 16:40:47 +0200
From: Christian Brauner <brauner@...nel.org>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Fedor Pchelkin <aissur0002@...il.com>,
Alexey Khoroshilov <khoroshilov@...ras.ru>,
Eric Biggers <ebiggers@...nel.org>,
Alexander Viro <viro@...iv.linux.org.uk>,
linux-fsdevel <linux-fsdevel@...r.kernel.org>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 4/4] file: Fix file descriptor leak in copy_fd_bitmaps()
On Tue, Mar 29, 2022 at 12:23:47PM +0200, Christian Brauner wrote:
> On Sun, Mar 27, 2022 at 03:21:18PM -0700, Linus Torvalds wrote:
> > On Sun, Mar 27, 2022 at 2:53 PM <aissur0002@...il.com> wrote:
> > >
> > > I am sorry, that was my first attempt to contribute to the kernel and
> > > I messed up a little bit with the patch tag: it is actually a single,
> > > standalone patch and there has been nothing posted previously.
> >
> > No problem, thanks for clarifying.
> >
> > But the patch itself in that case is missing some detail, since it
> > clearly doesn't apply to upstream.
> >
> > Anyway:
> >
> > > In few words, an error occurs while executing close_range() call with
> > > CLOSE_RANGE_UNSHARE flag: in __close_range() the value of
> > > max_unshare_fds (later used as max_fds in dup_fd() and copy_fd_bitmaps())
> > > can be an arbitrary number.
> > >
> > > if (max_fd >= last_fd(files_fdtable(cur_fds)))
> > > max_unshare_fds = fd;
> > >
> > > and not be a multiple of BITS_PER_LONG or BITS_PER_BYTE.
> >
> > Very good, that's the piece I was missing. I only looked in fs/file.c,
> > and missed how that max_unshare_fds gets passed from __close_range()
> > into fork.c for unshare_fd() and then back to file.c and dup_fd(). And
> > then affects sane_fdtable_size().
> >
> > I _think_ it should be sufficient to just do something like
> >
> > max_fds = ALIGN(max_fds, BITS_PER_LONG)
> >
> > in sane_fdtable_size(), but honestly, I haven't actually thought about
> > it all that much. That's just a gut feel without really analyzing
> > things - sane_fdtable_size() really should never return a value that
> > isn't BITS_PER_LONG aligned.
> >
> > And that whole close_range() is why I added Christian Brauner to the
> > participant list, though, so let's see if Christian has any comments.
> >
> > Christian?
>
> (Sorry, I was heads-deep in some other fs work and went into airplaine
> mode. I'm back.)
>
> So I investigated a similar report a little while back and I spent quite
> a lot of time trying to track this down but didn't find the cause.
> If you'd call:
>
> close_range(131, -1, CLOSE_RANGE_UNSHARE);
>
> for an fdtable that is smaller than 131 then we'd call:
>
> unshare_fd(..., 131)
> \dup_fd(..., 131)
> \sane_fdtable_size(..., 131)
>
> So sane_fdtable_size() would return 131 which is not aligned. This
> couldn't happen before CLOSE_RANGE_UNSHARE afaict. I'll try to do a
> repro with this with your suggested fix applied.
Ok, I managed to repro this issue on an upstream 5.17 kernel. You'll
need kmemleak enabled:
CONFIG_MEMCG_KMEM=y
CONFIG_HAVE_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE=16000
# CONFIG_DEBUG_KMEMLEAK_TEST is not set
# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y
The reproducer at [1] I'm using for this is super ugly fwiw. Compile with
gcc -pthread <bla>.c -o <bla>
The repro should trigger in about 2-3 iterations once the fdtable has
grown sufficiently for fd_start to become the upper limit instead of the
actual fdtable size when calling close_range(131, -1, CLOSE_RANGE_UNSHARE).
I don't think this actually leads to any fd leaks afaict as they should
all be properly closed after all sane_fdtable_size() does return the
number of open fds. But this is indeed triggered by the missing
BITS_PER_LONG alignment.
Your patch fixes this, Linus. I've compiled a kernel with your patch in
sane_fdtable_size() applied running with the repro for a few hours now
and no leaks. Feel free to take my ack.
I think alloc_fdtable() does everything correctly. The issue imho is
sane_fdtable_size() not aligning and then below when we do:
for (i = open_files; i != 0; i--) {
in dup_fd() it looks to kmemleak like it's leaking (I'm not completely
sure it is actually.).
[1]:
#define _GNU_SOURCE
#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <linux/futex.h>
#ifndef __NR_close_range
#define __NR_close_range 436
#endif
#ifndef SYS_gettid
#error "SYS_gettid unavailable on this system"
#endif
#define gettid() ((pid_t)syscall(SYS_gettid))
static void sleep_ms(uint64_t ms)
{
usleep(ms * 1000);
}
static uint64_t current_time_ms(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
exit(1);
return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}
static void thread_start(void* (*fn)(void*), void* arg)
{
pid_t pid = getpid();
pid_t tid = gettid();
pthread_t th;
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 128 << 10);
int i = 0;
for (; i < 100; i++) {
if (pthread_create(&th, &attr, fn, arg) == 0) {
pthread_attr_destroy(&attr);
printf("%d %d created thread\n", pid, tid);
return;
}
if (errno == EAGAIN) {
usleep(50);
continue;
}
break;
}
exit(1);
}
typedef struct {
int state;
} event_t;
static void event_init(event_t* ev)
{
ev->state = 0;
}
static void event_reset(event_t* ev)
{
ev->state = 0;
}
static void event_set(event_t* ev)
{
if (ev->state)
exit(1);
__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
syscall(SYS_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
}
static void event_wait(event_t* ev)
{
while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}
static int event_isset(event_t* ev)
{
return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
}
static int event_timedwait(event_t* ev, uint64_t timeout)
{
uint64_t start = current_time_ms();
uint64_t now = start;
for (;;) {
uint64_t remain = timeout - (now - start);
struct timespec ts;
ts.tv_sec = remain / 1000;
ts.tv_nsec = (remain % 1000) * 1000 * 1000;
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
return 1;
now = current_time_ms();
if (now - start > timeout)
return 0;
}
}
static bool write_file(const char* file, const char* what, ...)
{
char buf[1024];
va_list args;
va_start(args, what);
vsnprintf(buf, sizeof(buf), what, args);
va_end(args);
buf[sizeof(buf) - 1] = 0;
int len = strlen(buf);
int fd = open(file, O_WRONLY | O_CLOEXEC);
if (fd == -1)
return false;
if (write(fd, buf, len) != len) {
int err = errno;
close(fd);
errno = err;
return false;
}
close(fd);
return true;
}
static void kill_and_wait(int pid, int* status)
{
kill(-pid, SIGKILL);
kill(pid, SIGKILL);
for (int i = 0; i < 100; i++) {
if (waitpid(-1, status, WNOHANG | __WALL) == pid)
return;
usleep(1000);
}
DIR* dir = opendir("/sys/fs/fuse/connections");
if (dir) {
for (;;) {
struct dirent* ent = readdir(dir);
if (!ent)
break;
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
continue;
char abort[300];
snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort", ent->d_name);
int fd = open(abort, O_WRONLY);
if (fd == -1) {
continue;
}
if (write(fd, abort, 1) < 0) {
}
close(fd);
}
closedir(dir);
} else {
}
while (waitpid(-1, status, __WALL) != pid) {
}
}
static void setup_test()
{
prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
setpgrp();
write_file("/proc/self/oom_score_adj", "1000");
}
#define KMEMLEAK_FILE "/sys/kernel/debug/kmemleak"
static void setup_leak()
{
if (!write_file(KMEMLEAK_FILE, "scan"))
exit(1);
sleep(5);
if (!write_file(KMEMLEAK_FILE, "scan"))
exit(1);
if (!write_file(KMEMLEAK_FILE, "clear"))
exit(1);
}
static void check_leaks(void)
{
int fd = open(KMEMLEAK_FILE, O_RDWR);
if (fd == -1)
exit(1);
uint64_t start = current_time_ms();
if (write(fd, "scan", 4) != 4)
exit(1);
sleep(1);
while (current_time_ms() - start < 4 * 1000)
sleep(1);
if (write(fd, "scan", 4) != 4)
exit(1);
static char buf[128 << 10];
ssize_t n = read(fd, buf, sizeof(buf) - 1);
if (n < 0)
exit(1);
int nleaks = 0;
if (n != 0) {
sleep(1);
if (write(fd, "scan", 4) != 4)
exit(1);
if (lseek(fd, 0, SEEK_SET) < 0)
exit(1);
n = read(fd, buf, sizeof(buf) - 1);
if (n < 0)
exit(1);
buf[n] = 0;
char* pos = buf;
char* end = buf + n;
while (pos < end) {
char* next = strstr(pos + 1, "unreferenced object");
if (!next)
next = end;
char prev = *next;
*next = 0;
fprintf(stderr, "BUG: memory leak\n%s\n", pos);
*next = prev;
pos = next;
nleaks++;
}
}
if (write(fd, "clear", 5) != 5)
exit(1);
close(fd);
if (nleaks)
exit(1);
}
struct thread_t {
int created, call;
event_t ready, done;
};
static struct thread_t threads[16];
static void execute_call(int call);
static int running;
static void* thr(void* arg)
{
struct thread_t* th = (struct thread_t*)arg;
for (;;) {
event_wait(&th->ready);
event_reset(&th->ready);
execute_call(th->call);
__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
event_set(&th->done);
}
return 0;
}
static void execute_one(void)
{
sleep_ms(100);
int i, call, thread;
for (call = 0; call < 2; call++) {
for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0])); thread++) {
struct thread_t* th = &threads[thread];
if (!th->created) {
th->created = 1;
event_init(&th->ready);
event_init(&th->done);
event_set(&th->done);
thread_start(thr, th);
}
if (!event_isset(&th->done))
continue;
event_reset(&th->done);
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
event_set(&th->ready);
event_timedwait(&th->done, 50);
break;
}
}
for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
sleep_ms(1);
}
static void execute_one(void);
#define WAIT_FLAGS __WALL
static void loop(void)
{
pid_t parentPID = getpid();
pid_t parentTID = gettid();
int iter = 0;
for (;; iter++) {
printf("%d %d forking\n", parentPID, parentTID);
int pid = fork();
if (pid < 0)
exit(1);
if (pid == 0) {
setup_test();
execute_one();
exit(0);
}
printf("%d %d forked child %d\n", parentPID, parentTID, pid);
int status = 0;
uint64_t start = current_time_ms();
for (;;) {
if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
break;
sleep_ms(1);
if (current_time_ms() - start < 5000)
continue;
kill_and_wait(pid, &status);
break;
}
printf("%d %d checking leak after executor exited\n", parentPID, parentTID);
check_leaks();
}
}
uint64_t r[1] = {0xffffffffffffffff};
void execute_call(int call)
{
pid_t pid = getpid();
pid_t tid = gettid();
intptr_t res = 0;
switch (call) {
case 0:
printf("%d %d pipe2\n", pid, tid);
res = syscall(__NR_pipe2, 0x20000080ul, 0ul);
{
int i;
for(i = 0; i < 64 /* 32 also triggers the bug, but 16 doesn't */; i++) {
syscall(__NR_pipe2, 0x20000080ul, 0ul);
}
}
if (res != -1)
r[0] = *(uint32_t*)0x20000080;
break;
case 1:
printf("%d %d close_range\n", pid, tid);
syscall(__NR_close_range, r[0], -1, 2ul /* CLOSE_RANGE_UNSHARE */);
break;
}
}
int main(void)
{
syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
setup_leak();
printf("%d %d starting loop\n", getpid(), gettid());
loop();
return 0;
}
Powered by blists - more mailing lists