lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aBApDSHblacSBaFH@google.com>
Date: Mon, 28 Apr 2025 18:19:09 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: James Houghton <jthoughton@...gle.com>
Cc: kvm@...r.kernel.org, Maxim Levitsky <mlevitsk@...hat.com>, 
	Axel Rasmussen <axelrasmussen@...gle.com>, Tejun Heo <tj@...nel.org>, 
	Johannes Weiner <hannes@...xchg.org>, mkoutny@...e.com, Yosry Ahmed <yosry.ahmed@...ux.dev>, 
	Yu Zhao <yuzhao@...gle.com>, David Matlack <dmatlack@...gle.com>, cgroups@...r.kernel.org, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 5/5] KVM: selftests: access_tracking_perf_test: Use
 MGLRU for access tracking

On Mon, Apr 14, 2025, James Houghton wrote:
> By using MGLRU's debugfs for invoking test_young() and clear_young(), we
> avoid page_idle's incompatibility with MGLRU, and we can mark pages as
> idle (clear_young()) much faster.
> 
> The ability to use page_idle is left in, as it is useful for kernels
> that do not have MGLRU built in. If MGLRU is enabled but is not usable
> (e.g. we can't access the debugfs mount), the test will fail, as
> page_idle is not compatible with MGLRU.
> 
> cgroup utility functions have been borrowed so that, when running with
> MGLRU, we can create a memcg in which to run our test.
> 
> Other MGLRU-debugfs-specific parsing code has been added to
> lru_gen_util.{c,h}.

This is not a proper changelog, at least not by upstream KVM standards.  Please
rewrite it describe the changes being made, using imperative mood/tone.  From
Documentation/process/maintainer-kvm-x86.rst:

  Changelog
  ~~~~~~~~~
  Most importantly, write changelogs using imperative mood and avoid pronouns.

> @@ -354,7 +459,12 @@ static int access_tracking_unreliable(void)
>  		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
>  		return 1;
>  	}
> +	return 0;
> +}
>  
> +int run_test_in_cg(const char *cgroup, void *arg)

static

> +{
> +	for_each_guest_mode(run_test, arg);

Having "separate" flows for MGLRU vs. page_idle is unnecessary.  Give the helper
a more common name and use it for both:

static int run_test_for_each_guest_mode(const char *cgroup, void *arg)
{
	for_each_guest_mode(run_test, arg);
	return 0;
}

>  	return 0;
>  }
>  
> @@ -372,7 +482,7 @@ static void help(char *name)
>  	printf(" -v: specify the number of vCPUs to run.\n");
>  	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
>  	       "     them into a separate region of memory for each vCPU.\n");
> -	printf(" -w: Control whether the test warns or fails if more than 10%\n"
> +	printf(" -w: Control whether the test warns or fails if more than 10%%\n"
>  	       "     of pages are still seen as idle/old after accessing guest\n"
>  	       "     memory.  >0 == warn only, 0 == fail, <0 == auto.  For auto\n"
>  	       "     mode, the test fails by default, but switches to warn only\n"
> @@ -383,6 +493,12 @@ static void help(char *name)
>  	exit(0);
>  }
>  
> +void destroy_cgroup(char *cg)

static.  But this is a pointless wrapper, just delete it.

> +{
> +	printf("Destroying cgroup: %s\n", cg);
> +	cg_destroy(cg);
> +}
> +
>  int main(int argc, char *argv[])
>  {
>  	struct test_params params = {
> @@ -390,6 +506,7 @@ int main(int argc, char *argv[])
>  		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
>  		.nr_vcpus = 1,
>  	};
> +	char *new_cg = NULL;
>  	int page_idle_fd;
>  	int opt;
>  
> @@ -424,15 +541,53 @@ int main(int argc, char *argv[])
>  		}
>  	}
>  
> -	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
> -	__TEST_REQUIRE(page_idle_fd >= 0,
> -		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
> -	close(page_idle_fd);
> +	if (lru_gen_usable()) {

Using MGLRU on my home box fails.  It's full cgroup v2, and has both
CONFIG_IDLE_PAGE_TRACKING=y and MGLRU enabled.

==== Test Assertion Failure ====
  access_tracking_perf_test.c:244: false
  pid=114670 tid=114670 errno=17 - File exists
     1	0x00000000004032a9: find_generation at access_tracking_perf_test.c:244
     2	0x00000000004032da: lru_gen_mark_memory_idle at access_tracking_perf_test.c:272
     3	0x00000000004034e4: mark_memory_idle at access_tracking_perf_test.c:391
     4	 (inlined by) run_test at access_tracking_perf_test.c:431
     5	0x0000000000403d84: for_each_guest_mode at guest_modes.c:96
     6	0x0000000000402c61: run_test_for_each_guest_mode at access_tracking_perf_test.c:492
     7	0x000000000041d8e2: cg_run at cgroup_util.c:382
     8	0x00000000004027fa: main at access_tracking_perf_test.c:572
     9	0x00007fa1cb629d8f: ?? ??:0
    10	0x00007fa1cb629e3f: ?? ??:0
    11	0x00000000004029d4: _start at ??:?
  Could not find a generation with 90% of guest memory (235929 pages).

Interestingly, if I force the test to use /sys/kernel/mm/page_idle/bitmap, it
passes.

Please try to reproduce the failure (assuming you haven't already tested that
exact combination of cgroups v2, MGLRU=y, and CONFIG_IDLE_PAGE_TRACKING=y). I
don't have bandwidth to dig any further at this time.

> +		if (cg_find_unified_root(cgroup_root, sizeof(cgroup_root), NULL))
> +			ksft_exit_skip("cgroup v2 isn't mounted\n");
> +
> +		new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME);
> +		printf("Creating cgroup: %s\n", new_cg);
> +		if (cg_create(new_cg) && errno != EEXIST)
> +			ksft_exit_skip("could not create new cgroup: %s\n", new_cg);
> +
> +		use_lru_gen = true;
> +	} else {
> +		page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
> +		__TEST_REQUIRE(page_idle_fd >= 0,
> +			       "Couldn't open /sys/kernel/mm/page_idle/bitmap. "
> +			       "Is CONFIG_IDLE_PAGE_TRACKING enabled?");
> +
> +		close(page_idle_fd);
> +	}

Splitting the "check" and "execute" into separate if-else statements results in
some compilers complaining about new_cg possibly being unused.  The compiler is
probably being a bit stupid, but the code is just as must to blame.  There's zero
reason to split this in two, just do everything after the idle_pages_warn_only
and total_pages processing.  Code at the bottom (note, you'll have to rebase on
my not-yet-posted series, or undo the use of __open_path_or_exit()).

>  
>  	if (idle_pages_warn_only == -1)
>  		idle_pages_warn_only = access_tracking_unreliable();
>  
> -	for_each_guest_mode(run_test, &params);
> +	/*
> +	 * If guest_page_size is larger than the host's page size, the
> +	 * guest (memstress) will only fault in a subset of the host's pages.
> +	 */
> +	total_pages = params.nr_vcpus * params.vcpu_memory_bytes /
> +		      max(memstress_args.guest_page_size,
> +			  (uint64_t)getpagesize());
> +
> +	if (use_lru_gen) {
> +		int ret;
> +
> +		puts("Using lru_gen for aging");
> +		/*
> +		 * This will fork off a new process to run the test within
> +		 * a new memcg, so we need to properly propagate the return
> +		 * value up.
> +		 */
> +		ret = cg_run(new_cg, &run_test_in_cg, &params);
> +		destroy_cgroup(new_cg);
> +		if (ret)
> +			return ret;
> +	} else {
> +		puts("Using page_idle for aging");
> +		for_each_guest_mode(run_test, &params);
> +	}

static int run_test_for_each_guest_mode(const char *cgroup, void *arg)
{
	for_each_guest_mode(run_test, arg);
	return 0;
}

int main(int argc, char *argv[])
{
	struct test_params params = {
		.backing_src = DEFAULT_VM_MEM_SRC,
		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
		.nr_vcpus = 1,
	};
	int page_idle_fd;
	int opt;

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
		switch (opt) {
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'b':
			params.vcpu_memory_bytes = parse_size(optarg);
			break;
		case 'v':
			params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'o':
			overlap_memory_access = true;
			break;
		case 's':
			params.backing_src = parse_backing_src_type(optarg);
			break;
		case 'w':
			idle_pages_warn_only =
				atoi_non_negative("Idle pages warning",
						  optarg);
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	if (idle_pages_warn_only == -1)
		idle_pages_warn_only = access_tracking_unreliable();

	/*
	 * If guest_page_size is larger than the host's page size, the
	 * guest (memstress) will only fault in a subset of the host's pages.
	 */
	total_pages = params.nr_vcpus * params.vcpu_memory_bytes /
		      max_t(uint64_t, memstress_args.guest_page_size, getpagesize());

	if (lru_gen_usable()) {
		bool cg_created = true;
		char *test_cg = NULL;
		int ret;

		puts("Using lru_gen for aging");
		use_lru_gen = true;

		if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory"))
			ksft_exit_skip("Cannot find memory group controller\n");

		test_cg = cg_name(cgroup_root, TEST_MEMCG_NAME);
		printf("Creating cgroup: %s\n", test_cg);
		if (cg_create(test_cg)) {
			if (errno == EEXIST)
				cg_created = false;
			else
				ksft_exit_skip("could not create new cgroup: %s\n", test_cg);
		}

		/*
		 * This will fork off a new process to run the test within
		 * a new memcg, so we need to properly propagate the return
		 * value up.
		 */
		ret = cg_run(test_cg, &run_test_for_each_guest_mode, &params);
		if (cg_created)
			cg_destroy(test_cg);
		return ret;
	}

	puts("Using page_idle for aging");
	page_idle_fd = __open_path_or_exit("/sys/kernel/mm/page_idle/bitmap", O_RDWR,
					   "Is CONFIG_IDLE_PAGE_TRACKING enabled?");
	close(page_idle_fd);
	run_test_for_each_guest_mode(NULL, &params);
	return 0;
}

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ