[<prev] [next>] [day] [month] [year] [list]
Message-ID: <30d0e4e5-d4f1-4fb5-8359-8752a77fce42@manjusaka.me>
Date: Tue, 28 Nov 2023 20:53:41 +0800
From: Manjusaka <me@...jusaka.me>
To: linux-kernel@...r.kernel.org
Cc: ryncsn@...il.com
Subject: [BUG Report] The read syscall extremely slow on some type of the AMD
CPU
Hello everyone
We have the following code (Python vs. Rust):
```python
import pathlib
import timeit
root = pathlib.Path(__file__).parent
filename = "file"
def read_file_with_normal() -> bytes:
    """Read /tmp/demofile2 entirely into memory and return its bytes.

    Mirrors the Rust read_exact example: one open, one buffered read.
    NOTE(review): indentation was lost when this snippet was pasted into
    the mail; restored here so the example actually runs.
    """
    with open("/tmp/demofile2", "rb") as fp:
        result = fp.read()
    return result
if __name__ == "__main__":
    # Indentation restored (lost in the mail paste); run the benchmark body.
    read_file_with_normal()
```
```rust
use std::io::Read;
use std::fs::OpenOptions;
fn main() {
    // Allocate the 64 MiB destination buffer up front, then fill it with a
    // single read_exact() call against the test file. Any I/O error aborts.
    let mut buf = vec![0u8; 64 * 1024 * 1024];
    let mut file = OpenOptions::new()
        .read(true)
        .open("/tmp/demofile2")
        .unwrap();
    file.read_exact(&mut buf).unwrap();
}
```
We found that the Rust code's `read` syscall is slower than the Python version's. We used eBPF to measure the duration of the read operation on "/tmp/demofile2":
```python
from bcc import BPF
import time
import argparse
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>
// Map: thread id -> fd of the traced file (0 placeholder at enter, real fd at exit).
BPF_HASH(fd_info, pid_t, int);
// Map: thread id -> timestamp (ns) captured at sys_enter_read.
BPF_HASH(action_info, pid_t, u64);
// Sample emitted to user space for each completed read() on the traced fd.
struct event_data_t {
u32 pid; // tgid (process id) of the reader
u64 delta_ts; // read() duration in nanoseconds
};
// Ring buffer carrying event_data_t samples to the Python consumer.
// NOTE(review): BCC's second argument is a page count, not bytes — 65536
// pages is very large; confirm the intended size.
BPF_RINGBUF_OUTPUT(events, 65536);
// strcmp-style bounded compare: examine at most (size & 0xff) bytes of cs
// and ct; return 0 on match (or when both strings end together), else -1/1.
// The "& 0xff" caps the loop so the BPF verifier can prove termination.
// NOTE(review): cs points at user memory (the syscall's filename arg); BCC
// rewrites such loads for tracepoints, but copying via
// bpf_probe_read_user_str() first would be more robust — confirm.
static __always_inline int matchPrefix(const char *cs, const char *ct, int size) {
int len = 0;
unsigned char c1, c2;
for (len=0;len<(size & 0xff);len++) {
c1 = *cs++; // next byte of the probed string
c2 = *ct++; // next byte of the literal pattern
if (c1 != c2) return c1 < c2 ? -1 : 1;
if (!c1) break; // NUL reached on both sides: strings equal
}
return 0;
}
/*
 * Flag threads that are opening the file under test, so the read probes
 * know which fd to time.  A placeholder fd of 0 is stored here and
 * replaced with the real return value in sys_exit_openat.
 */
TRACEPOINT_PROBE(syscalls, sys_enter_openat) {
    u32 tid = bpf_get_current_pid_tgid();
    char *filename = args->filename;
    /*
     * BUG FIX: the original passed sizeof(filename), which is the size of
     * the char* POINTER (8 bytes), so only the first 8 characters
     * ("/tmp/dem") were ever compared — any path with that prefix matched.
     * Use the size of the pattern literal so the whole path is checked.
     */
    if (matchPrefix(filename, "/tmp/demofile2", sizeof("/tmp/demofile2")) != 0) {
        return 0;
    }
    int fd = 0;
    fd_info.update(&tid, &fd);
    return 0;
}
// Patch the placeholder stored at sys_enter_openat with the real fd
// returned by openat(2), so sys_enter_read can match on it.
TRACEPOINT_PROBE(syscalls, sys_exit_openat) {
u32 pid = bpf_get_current_pid_tgid() >> 32; // tgid; unused here
u32 tid = bpf_get_current_pid_tgid(); // thread id, the map key
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) { // entry probe did not flag this thread
return 0;
}
int fd = args->ret; // NOTE(review): negative on open failure; not filtered out
fd_info.update(&tid, &fd);
return 0;
}
/*
 * Same as sys_enter_openat but for the legacy open(2) entry point:
 * flag threads opening the file under test.
 */
TRACEPOINT_PROBE(syscalls, sys_enter_open) {
    u32 tid = bpf_get_current_pid_tgid();
    char *filename = args->filename;
    /*
     * BUG FIX: sizeof(filename) is the size of the char* POINTER (8), so
     * only "/tmp/dem" was compared.  Use the pattern literal's size so the
     * full path is checked (keeps this probe consistent with
     * sys_enter_openat).
     */
    if (matchPrefix(filename, "/tmp/demofile2", sizeof("/tmp/demofile2")) != 0) {
        return 0;
    }
    int fd = 0;
    fd_info.update(&tid, &fd);
    return 0;
}
// Legacy-open counterpart of sys_exit_openat: record the returned fd for
// threads flagged at entry.
TRACEPOINT_PROBE(syscalls, sys_exit_open) {
u32 pid = bpf_get_current_pid_tgid() >> 32; // tgid; unused here
u32 tid = bpf_get_current_pid_tgid(); // thread id, the map key
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) { // entry probe did not flag this thread
return 0;
}
int fd = args->ret; // NOTE(review): negative on open failure; not filtered out
fd_info.update(&tid, &fd);
return 0;
}
// Start timing: if this thread is reading the traced fd, stamp "now" so the
// exit probe can compute the syscall's duration.
TRACEPOINT_PROBE(syscalls, sys_enter_read) {
u32 pid = bpf_get_current_pid_tgid() >> 32; // tgid; unused here
u32 tid = bpf_get_current_pid_tgid();
int *dfd = fd_info.lookup(&tid);
if (dfd == NULL) { // this thread never opened the traced file
return 0;
}
if (*dfd != args->fd) { // reading some other fd; ignore
return 0;
}
u64 ts = bpf_ktime_get_ns(); // monotonic start timestamp
action_info.update(&tid, &ts);
return 0;
}
/*
 * Stop timing: compute the read(2) duration for threads stamped at entry
 * and push a sample to user space via the ring buffer.
 */
TRACEPOINT_PROBE(syscalls, sys_exit_read) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    u64 *ts = action_info.lookup(&tid);
    if (ts == NULL) {
        return 0;
    }
    /* Compute the delta BEFORE deleting the entry: *ts must not be read
     * after action_info.delete() releases the map slot. */
    u64 delta = bpf_ktime_get_ns() - *ts;
    action_info.delete(&tid);
    struct event_data_t *event_data = events.ringbuf_reserve(sizeof(struct event_data_t));
    if (!event_data) {
        return 0; /* ring buffer full; drop this sample */
    }
    event_data->pid = pid;
    event_data->delta_ts = delta;
    /*
     * BUG FIX: ringbuf_submit()'s second argument is a FLAGS word, not a
     * size; the original passed sizeof(event_data) — the size of a
     * pointer — as flags.  Pass 0 (default wakeup behavior).
     */
    events.ringbuf_submit(event_data, 0);
    return 0;
}
"""
bpf = BPF(text=bpf_text)
def process_event_data(cpu, data, size):
    """Ring-buffer callback: decode one sample and print pid + read latency (ns).

    NOTE(review): indentation was lost in the mail paste; restored here.
    """
    event = bpf["events"].event(data)
    print(f"Process {event.pid} read file {event.delta_ts} ns")
# Attach the callback and poll the ring buffer until Ctrl-C.
# NOTE(review): indentation of the try/except body was lost in the mail
# paste; restored here.
bpf["events"].open_ring_buffer(process_event_data)
while True:
    try:
        bpf.ring_buffer_consume()
    except KeyboardInterrupt:
        exit()
```
The data shows that the time the Rust code spends in the read operation on "/tmp/demofile2" is almost double that of the Python version.
```text
Process 88010 read file 11094021 ns
Process 92652 read file 29149375 ns
```
We rewrote the Rust code in C:
```c
#include <stdio.h>
#include <stdlib.h>
#define FILE_SIZE (64 * 1024 * 1024) /* 64 MiB; parenthesized so uses inside larger expressions stay safe */

/*
 * Read /tmp/file (64 MiB) into a heap buffer via stdio, then exit.
 * Exit codes: 0 = ok, 1 = open failed, 2 = malloc failed, 3 = short read.
 */
int main(void) {
    FILE *file;
    char *buffer;
    size_t result;

    file = fopen("/tmp/file", "rb");
    if (file == NULL) {
        fputs("Error opening file", stderr);
        return 1;
    }
    /* No cast on malloc in C, and sizeof(char) is 1 by definition. */
    buffer = malloc(FILE_SIZE);
    if (buffer == NULL) {
        fputs("Memory error", stderr);
        fclose(file);
        return 2;
    }
    result = fread(buffer, 1, FILE_SIZE, file);
    if (result != FILE_SIZE) {
        /* Short count: EOF before 64 MiB, or a stream error. */
        fputs("Reading error", stderr);
        fclose(file);
        free(buffer);
        return 3;
    }
    fclose(file);
    free(buffer);
    return 0;
}
```
The result is the same.
Then we found an interesting issue: we changed the buffer's offset in the C code
```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#define FILE_SIZE (64 * 1024 * 1024) /* 64 MiB; parenthesized for safe expansion */
#define OFFSET 0x20                  /* deliberate shift of the read target's alignment */

/*
 * Same experiment via raw open(2)/read(2), but with the destination shifted
 * OFFSET bytes into the allocation to change the buffer's page alignment.
 * Exit codes: 0 = ok, 1 = open failed, 2 = malloc failed, 3 = short read.
 */
int main(void) {
    int fd;
    char *buffer;
    ssize_t result; /* read(2) returns ssize_t (-1 on error), not size_t */

    fd = open("/tmp/file", O_RDONLY);
    /*
     * BUG FIX: open(2) returns an int file descriptor, -1 on error — it is
     * not a pointer, so the original "file == NULL" check never detected a
     * failed open (and treated fd 0 as failure).
     */
    if (fd < 0) {
        fputs("Error opening file", stderr);
        return 1;
    }
    /*
     * BUG FIX: the original allocated exactly FILE_SIZE bytes but read
     * FILE_SIZE bytes starting at buffer + 0x20, overflowing the heap
     * allocation by 0x20 bytes.  Allocate room for the offset as well.
     */
    buffer = malloc(FILE_SIZE + OFFSET);
    if (buffer == NULL) {
        fputs("Memory error", stderr);
        close(fd);
        return 2;
    }
    result = read(fd, buffer + OFFSET, FILE_SIZE);
    if (result != FILE_SIZE) {
        fputs("Reading error", stderr);
        close(fd);
        free(buffer);
        return 3;
    }
    close(fd);
    free(buffer);
    return 0;
}
```
The result becomes normal. The behavior is the same when we replace malloc with aligned_alloc.
We also found that the result is normal when the offset is in the range [page*4k-0x10, page*4k+0x10]; otherwise the read is slow.
This issue can be reproduced on the AMD Ryzen 9 5950X, AMD Ryzen 7 5700X, and AMD Ryzen 9 5900X.
Here are some references to this issue as reported in the community:
1. https://github.com/apache/incubator-opendal/issues/3665
2. https://users.rust-lang.org/t/std-read-slow/85424/13
Powered by blists - more mailing lists