linux-kernel - [BUG Report] The read syscall extremely slow on some type of the AMD CPU

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <30d0e4e5-d4f1-4fb5-8359-8752a77fce42@manjusaka.me>
Date:   Tue, 28 Nov 2023 20:53:41 +0800
From:   Manjusaka <me@...jusaka.me>
To:     linux-kernel@...r.kernel.org
Cc:     ryncsn@...il.com
Subject: [BUG Report] The read syscall extremely slow on some type of the AMD
 CPU

Hello everyone

We have some of the code(Python vs Rust) below 

```python
import pathlib
import timeit

root = pathlib.Path(__file__).parent
filename = "file"

def read_file_with_normal() -> bytes:
    with open("/tmp/demofile2", "rb") as fp:
        result = fp.read()
    return result

if __name__ == "__main__":
    read_file_with_normal()
```

```rust
use std::io::Read;
use std::fs::OpenOptions;

fn main() {
    // let bs = op.read("file").unwrap();
    let mut bs = vec![0; 1024 * 1024 * 64];
    let mut f = OpenOptions::new().read(true).open("/tmp/demofile2").unwrap();
    f.read_exact(&mut bs).unwrap();
    // let bs = std::fs::read("/tmp/test/file").unwrap();
    //assert_eq!(bs.len(), 64 * 1024 * 1024);
}
```

We find out the Rust Code on `read` syscall is slower than Python syscall. we use the eBPF to measure the read operation time on "/tmp/demofile2"

```python
from bcc import BPF
import time
import argparse

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>

BPF_HASH(fd_info, pid_t, int);
BPF_HASH(action_info, pid_t, u64);

struct event_data_t {
    u32 pid;
    u64 delta_ts;
};


BPF_RINGBUF_OUTPUT(events, 65536);

static __always_inline int matchPrefix(const char *cs, const char *ct, int size) {
    int len = 0;
    unsigned char c1, c2;
    for (len=0;len<(size & 0xff);len++) {
        c1 = *cs++;
        c2 = *ct++;
        if (c1 != c2) return c1 < c2 ? -1 : 1;
        if (!c1) break;
     }
     return 0;
}

TRACEPOINT_PROBE(syscalls, sys_enter_openat) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    char *filename = args->filename;
    if (matchPrefix(filename, "/tmp/demofile2", sizeof(filename)) != 0) {
        return 0;
    }
    int fd=0;
    fd_info.update(&tid, &fd);

    return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_openat) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    int *dfd = fd_info.lookup(&tid);
    if (dfd == NULL) {
        return 0;
    }
    int fd = args->ret;
    fd_info.update(&tid, &fd);
    return 0;
}


TRACEPOINT_PROBE(syscalls, sys_enter_open) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    char *filename = args->filename;
    if (matchPrefix(filename, "/tmp/demofile2", sizeof(filename)) != 0) {
        return 0;
    }
    int fd=0;
    fd_info.update(&tid, &fd);

    return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_open) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    int *dfd = fd_info.lookup(&tid);
    if (dfd == NULL) {
        return 0;
    }
    int fd = args->ret;
    fd_info.update(&tid, &fd);
    return 0;
}

TRACEPOINT_PROBE(syscalls, sys_enter_read) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    int *dfd = fd_info.lookup(&tid);
    if (dfd == NULL) {
        return 0;
    }

    if (*dfd != args->fd) {
        return 0;
    }
    u64 ts = bpf_ktime_get_ns();
    action_info.update(&tid, &ts);

    return 0;
}

TRACEPOINT_PROBE(syscalls, sys_exit_read) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    u64 *ts = action_info.lookup(&tid);
    if (ts == NULL) {
        return 0;
    }
    action_info.delete(&tid);
    struct event_data_t *event_data = events.ringbuf_reserve(sizeof(struct event_data_t));
    if (!event_data) {
        return 0;
    }
    event_data->pid = pid;
    event_data->delta_ts = bpf_ktime_get_ns() - *ts;
    events.ringbuf_submit(event_data, sizeof(event_data));
    return 0;
}
"""

bpf = BPF(text=bpf_text)


def process_event_data(cpu, data, size):
    event = bpf["events"].event(data)
    print(f"Process {event.pid} read file {event.delta_ts} ns")


bpf["events"].open_ring_buffer(process_event_data)

while True:
    try:
        bpf.ring_buffer_consume()
    except KeyboardInterrupt:
        exit()
```

The data shows that the Rust code for the read operation time on "/tmp/demofile2" time is almost double times for the Python version.

```text
Process 88010 read file 11094021 ns
Process 92652 read file 29149375 ns
```

We change the Rust code to `C` code 

```c
#include <stdio.h>
#include <stdlib.h>

#define FILE_SIZE 64 * 1024 * 1024  // 64 MiB

int main() {
    FILE *file;
    char *buffer;
    size_t result;

    file = fopen("/tmp/file", "rb");
    if (file == NULL) {
        fputs("Error opening file", stderr);
        return 1;
    }

    buffer = (char *)malloc(sizeof(char) * FILE_SIZE);
    if (buffer == NULL) {
        fputs("Memory error", stderr);
        fclose(file);
        return 2;
    }

    result = fread(buffer, 1, FILE_SIZE, file);
    if (result != FILE_SIZE) {
        fputs("Reading error", stderr);
        fclose(file);
        free(buffer);
        return 3;
    }

    fclose(file);
    free(buffer);

    return 0;
}
```

The results is the same.

And we find a interested issue. We change buffer variable's offset in C

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

#define FILE_SIZE 64 * 1024 * 1024  // 64 MiB

int main() {
    int file;
    char *buffer;
    size_t result;

    file = open("/tmp/file", O_RDONLY);
    if (file == NULL) {
        fputs("Error opening file", stderr);
        return 1;
    }

    buffer = (char *)malloc(sizeof(char) * FILE_SIZE);
    if (buffer == NULL) {
        fputs("Memory error", stderr);
        close(file);
        return 2;
    }

    result = read(file, buffer + 0x20, FILE_SIZE);
    if (result != FILE_SIZE) {
        fputs("Reading error", stderr);
        close(file);
        free(buffer);
        return 3;
    }

    close(file);
    free(buffer);

    return 0;
}
```

The result becomes normal. The behavior is same with we change malloc to aligned_alloc.

And we found that the result is normal when the offset is in range [page*4k-0x10,page*4k+0x10], otherwise the result is slow.

This issue can be produced on AMD Ryzen 9 5950X, AMD R7 5700X, AMD Ryzen 9 5900X

Here's some reference for this issue which is reported in the community

1. https://github.com/apache/incubator-opendal/issues/3665
2. https://users.rust-lang.org/t/std-read-slow/85424/13