lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 23 Apr 2022 03:47:52 +0000
From:   Sean Christopherson <seanjc@...gle.com>
To:     Paolo Bonzini <pbonzini@...hat.com>
Cc:     Sean Christopherson <seanjc@...gle.com>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org, Ben Gardon <bgardon@...gle.com>,
        David Matlack <dmatlack@...gle.com>,
        Venkatesh Srinivas <venkateshs@...gle.com>,
        Chao Peng <chao.p.peng@...ux.intel.com>
Subject: [PATCH 12/12] DO NOT MERGE: KVM: selftests: Attempt to detect lost
 dirty bits

A failed attempt to detect improper dropping of Writable and/or Dirty
bits.  Doesn't work because the primary MMU write-protects its PTEs when
file writeback occurs, i.e. KVM's dirty bits are meaningless as far as
file-backed guest memory is concerned.

Not-signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
 tools/testing/selftests/kvm/.gitignore        |   1 +
 tools/testing/selftests/kvm/Makefile          |   4 +
 .../selftests/kvm/volatile_spte_test.c        | 208 ++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/volatile_spte_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 56140068b763..3307444d9fda 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -70,3 +70,4 @@
 /steal_time
 /kvm_binary_stats_test
 /system_counter_offset_test
+/volatile_spte_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index af582d168621..bc0907de6638 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
 TEST_GEN_PROGS_x86_64 += system_counter_offset_test
+TEST_GEN_PROGS_x86_64 += volatile_spte_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
@@ -122,6 +123,7 @@ TEST_GEN_PROGS_aarch64 += rseq_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test
+TEST_GEN_PROGS_aarch64 += volatile_spte_test
 
 TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
@@ -134,6 +136,7 @@ TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += rseq_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 TEST_GEN_PROGS_s390x += kvm_binary_stats_test
+TEST_GEN_PROGS_s390x += volatile_spte_test
 
 TEST_GEN_PROGS_riscv += demand_paging_test
 TEST_GEN_PROGS_riscv += dirty_log_test
@@ -141,6 +144,7 @@ TEST_GEN_PROGS_riscv += kvm_create_max_vcpus
 TEST_GEN_PROGS_riscv += kvm_page_table_test
 TEST_GEN_PROGS_riscv += set_memory_region_test
 TEST_GEN_PROGS_riscv += kvm_binary_stats_test
+TEST_GEN_PROGS_riscv += volatile_spte_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/volatile_spte_test.c b/tools/testing/selftests/kvm/volatile_spte_test.c
new file mode 100644
index 000000000000..a4277216eb3d
--- /dev/null
+++ b/tools/testing/selftests/kvm/volatile_spte_test.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
+#include <asm/barrier.h>
+#include <linux/atomic.h>
+#include <linux/rseq.h>
+#include <linux/unistd.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID 0	/* ID of the single vCPU this test creates and runs */
+
+#define PAGE_SIZE 4096	/* size in bytes of the file-backed test region */
+
+#define NR_ITERATIONS		1000	/* number of passes through main()'s race loop */
+
+#define MEM_FILE_NAME		"volatile_spte_test_mem"	/* backing file opened by main() */
+#define MEM_FILE_MEMSLOT	1	/* memslot that maps the backing file */
+#define MEM_FILE_DATA_PATTERN	0xa5a5a5a5a5a5a5a5ul	/* value written by the guest's second store */
+
+static const uint64_t gpa = (4ull * (1 << 30));	/* 4 GiB; main() maps it 1:1, so it doubles as the GVA */
+
+static uint64_t *hva;	/* host mapping of the backing file, set by main() */
+
+static pthread_t mprotect_thread;	/* thread running mprotect_worker() */
+static atomic_t rendezvous;	/* handshake: 0 = idle, 1 = worker may mprotect(), 2 = worker finished */
+static bool done;	/* set by main() to stop the loops; accessed with READ_ONCE()/WRITE_ONCE() */
+
+/*
+ * Guest entry point: alternately store zero and MEM_FILE_DATA_PATTERN to the
+ * test page, issuing GUEST_SYNC() after each store.  The sync argument (0 or
+ * 1) identifies which store just completed; main() checks it after each run.
+ */
+static void guest_code(void)
+{
+	/* main() maps the page with virt_pg_map(vm, gpa, gpa), so GVA == GPA. */
+	uint64_t *mem = (uint64_t *)gpa;
+
+	for (;;) {
+		if (READ_ONCE(done))
+			break;
+
+		/* Stage 0: clear the page. */
+		WRITE_ONCE(*mem, 0);
+		GUEST_SYNC(0);
+
+		/* Stage 1: store the pattern the host looks for. */
+		WRITE_ONCE(*mem, MEM_FILE_DATA_PATTERN);
+		GUEST_SYNC(1);
+	}
+}
+
+/*
+ * Worker thread: every time main() raises 'rendezvous' to 1, sleep for a
+ * short, varying delay and then revoke all access to the test page with
+ * mprotect(PROT_NONE), racing against the vCPU run in main().  Once the
+ * mprotect() has completed, 'rendezvous' is bumped (to 2) so that main() can
+ * continue.
+ *
+ * @ign: unused thread argument.
+ *
+ * Returns NULL once 'done' is observed.
+ */
+static void *mprotect_worker(void *ign)
+{
+	/* Free-running spin counter; unsigned so that wrapping is well defined. */
+	unsigned int i = 0;
+	int r;
+
+	while (!READ_ONCE(done)) {
+		while (atomic_read(&rendezvous) != 1) {
+			/*
+			 * Bail if main() finished while we were waiting.
+			 * Without this, a worker that completes its last pass
+			 * before 'done' is set would spin here forever and
+			 * hang pthread_join().
+			 */
+			if (READ_ONCE(done))
+				return NULL;
+
+			cpu_relax();
+			i++;
+		}
+
+		/* Use the spin count to jitter the delay between 1 and 10 us. */
+		usleep((i % 10) + 1);
+
+		r = mprotect(hva, PAGE_SIZE, PROT_NONE);
+		TEST_ASSERT(!r, "Failed to mprotect file (hva = %lx), errno = %d (%s)",
+			    (unsigned long)hva, errno, strerror(errno));
+
+		/* Tell main() the protection change is in place. */
+		atomic_inc(&rendezvous);
+	}
+	return NULL;
+}
+
+/*
+ * Race guest writes to a file-backed, dirty-logged page against a host-side
+ * mprotect(PROT_NONE) and file writeback, then verify through the host
+ * mapping that the page holds either zero (the guest write faulted) or the
+ * full data pattern (the guest write completed).
+ *
+ * Returns 0 on success; failures are reported through TEST_ASSERT()/ASSERT_EQ().
+ */
+int main(int argc, char *argv[])
+{
+	uint64_t bitmap = -1ull, val;
+	int i, r, fd, nr_writes, vcpu_ret;
+	struct kvm_regs regs;
+	struct ucall ucall;
+	struct kvm_vm *vm;
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	/* Snapshot the initial registers so the guest can be restarted below. */
+	vcpu_regs_get(vm, VCPU_ID, &regs);
+	ucall_init(vm, NULL);
+
+	/* The worker does not touch 'hva' until 'rendezvous' first reaches 1. */
+	r = pthread_create(&mprotect_thread, NULL, mprotect_worker, NULL);
+	TEST_ASSERT(!r, "Failed to create mprotect thread, error = %d (%s)",
+		    r, strerror(r));
+
+	fd = open(MEM_FILE_NAME, O_RDWR | O_CREAT, 0644);
+	TEST_ASSERT(fd >= 0, "Failed to open '%s', errno = %d (%s)",
+		    MEM_FILE_NAME, errno, strerror(errno));
+
+	r = ftruncate(fd, PAGE_SIZE);
+	TEST_ASSERT(!r, "Failed to ftruncate '%s', errno = %d (%s)",
+		    MEM_FILE_NAME, errno, strerror(errno));
+
+	hva = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	TEST_ASSERT(hva != MAP_FAILED,  "Failed to map file, errno = %d (%s)",
+		    errno, strerror(errno));
+
+	vm_set_user_memory_region(vm, MEM_FILE_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES,
+				  gpa, PAGE_SIZE, hva);
+	virt_pg_map(vm, gpa, gpa);
+
+	for (i = 0, nr_writes = 0; i < NR_ITERATIONS; i++) {
+		fdatasync(fd);
+
+		/* First run: the guest zeroes the page and reports stage 0. */
+		vcpu_run(vm, VCPU_ID);
+		ASSERT_EQ(*hva, 0);
+		ASSERT_EQ(get_ucall(vm, VCPU_ID, &ucall), UCALL_SYNC);
+		ASSERT_EQ(ucall.args[1], 0);
+
+		/*
+		 * The original hope/intent was to detect dropped Dirty bits by
+		 * checking for missed file writeback.  Sadly, the kernel is
+		 * too smart and write-protects the primary MMU's PTEs, which
+		 * zaps KVM's SPTEs and ultimately causes the folio/page to get
+		 * marked dirty by the primary MMU when KVM re-faults on
+		 * the page.
+		 *
+		 * Triggering swap _might_ be a way to detect failure, as swap
+		 * is treated differently than "normal" files.
+		 *
+		 * RIP: 0010:kvm_unmap_gfn_range+0xf1/0x100 [kvm]
+		 * Call Trace:
+		 * <TASK>
+		 *   kvm_mmu_notifier_invalidate_range_start+0x11c/0x2c0 [kvm]
+		 *   __mmu_notifier_invalidate_range_start+0x7e/0x190
+		 *   page_mkclean_one+0x226/0x250
+		 *   rmap_walk_file+0x213/0x430
+		 *   folio_mkclean+0x95/0xb0
+		 *   folio_clear_dirty_for_io+0x5d/0x1c0
+		 *   mpage_submit_page+0x1f/0x70
+		 *   mpage_process_page_bufs+0xf8/0x110
+		 *   mpage_prepare_extent_to_map+0x1e3/0x420
+		 *   ext4_writepages+0x277/0xca0
+		 *   do_writepages+0xd1/0x190
+		 *   filemap_fdatawrite_wbc+0x62/0x90
+		 *   file_write_and_wait_range+0xa3/0xe0
+		 *   ext4_sync_file+0xdb/0x340
+		 *   do_fsync+0x38/0x70
+		 *   __x64_sys_fdatasync+0x13/0x20
+		 *   do_syscall_64+0x31/0x50
+		 *   entry_SYSCALL_64_after_hwframe+0x44/0xae
+		 * </TASK>
+		 *
+		 * RIP: 0010:__folio_mark_dirty+0x266/0x310
+		 * Call Trace:
+		 * <TASK>
+		 *   mark_buffer_dirty+0xe7/0x140
+		 *   __block_commit_write.isra.0+0x59/0xc0
+		 *   block_page_mkwrite+0x15a/0x170
+		 *   ext4_page_mkwrite+0x485/0x620
+		 *   do_page_mkwrite+0x54/0x150
+		 *   __handle_mm_fault+0xe2a/0x1600
+		 *   handle_mm_fault+0xbd/0x280
+		 *   do_user_addr_fault+0x192/0x600
+		 *   exc_page_fault+0x6c/0x140
+		 *   asm_exc_page_fault+0x1e/0x30
+		 * </TASK>
+		 */
+		/* fdatasync(fd); */
+
+		/*
+		 * Clear the dirty log to coerce KVM into write-protecting the
+		 * SPTE (or into clearing dirty bits when using PML).
+		 */
+		kvm_vm_clear_dirty_log(vm, MEM_FILE_MEMSLOT, &bitmap, 0, 1);
+
+		/* Release the worker (rendezvous 0 -> 1) to mprotect() the page. */
+		atomic_inc(&rendezvous);
+
+		usleep(i % 10);
+
+		/* Second run races the guest's pattern store against the worker. */
+		vcpu_ret = _vcpu_run(vm, VCPU_ID);
+
+		/* Wait for the worker to finish its mprotect() (rendezvous == 2). */
+		while (atomic_read(&rendezvous) != 2)
+			cpu_relax();
+
+		atomic_set(&rendezvous, 0);
+
+		fdatasync(fd);
+
+		/* Undo the worker's PROT_NONE so the page can be read back. */
+		r = mprotect(hva, PAGE_SIZE, PROT_READ | PROT_WRITE);
+		TEST_ASSERT(!r, "Failed to restore protections (hva = %lx), errno = %d (%s)",
+			    (unsigned long)hva, errno, strerror(errno));
+
+		val = READ_ONCE(*hva);
+		if (vcpu_ret) {
+			/*
+			 * The run failed, i.e. the guest's store did not
+			 * complete.  Restart the guest from its initial
+			 * register state for the next iteration.
+			 */
+			TEST_ASSERT(!val, "Memory should be zero, write faulted\n");
+			vcpu_regs_set(vm, VCPU_ID, &regs);
+			continue;
+		}
+		nr_writes++;
+		TEST_ASSERT(val == MEM_FILE_DATA_PATTERN,
+			    "Memory doesn't match data pattern, want 0x%lx, got 0x%lx",
+			    MEM_FILE_DATA_PATTERN, val);
+		ASSERT_EQ(get_ucall(vm, VCPU_ID, &ucall), UCALL_SYNC);
+		ASSERT_EQ(ucall.args[1], 1);
+	}
+
+	printf("%d of %d iterations wrote memory\n", nr_writes, NR_ITERATIONS);
+
+	/*
+	 * Set 'done' before the final rendezvous so that the worker is
+	 * guaranteed to observe it after its last pass.  In the opposite
+	 * order the worker could finish that pass before 'done' is set, loop
+	 * back, and spin forever waiting for a rendezvous that never comes.
+	 */
+	WRITE_ONCE(done, true);
+	atomic_inc(&rendezvous);
+
+	pthread_join(mprotect_thread, NULL);
+
+	kvm_vm_free(vm);
+
+	return 0;
+}
+
-- 
2.36.0.rc2.479.g8af0fa9b8e-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ