lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240402014706.3969151-3-harisokn@amazon.com>
Date: Mon, 1 Apr 2024 20:47:06 -0500
From: Haris Okanovic <harisokn@...zon.com>
To: <linux-kernel@...r.kernel.org>, <linux-pm@...r.kernel.org>,
	<linux-assembly@...r.kernel.org>
CC: <peterz@...radead.org>, Haris Okanovic <harisokn@...zon.com>, Ali Saidi
	<alisaidi@...zon.com>, Geoff Blake <blakgeof@...zon.com>, Brian Silver
	<silverbr@...zon.com>
Subject: [PATCH 3/3] arm64: cpuidle: Add arm_poll_idle

An arm64 cpuidle driver with two states: (1) First polls for new runable
tasks up to 100 us (by default) before (2) a wfi idle and awoken by
interrupt (the current arm64 behavior). It allows CPUs to return from
idle more quickly by avoiding the longer interrupt wakeup path, which
may require EL1/EL2 transition in certain VM scenarios.

Poll duration is optionally configured at load time via the poll_limit
module parameter.

The default 100 us duration was experimentally chosen, by measuring QPS
(queries per sec) of the MLPerf bert inference benchmark, which seems
particularly susceptible to this change; see procedure below. 100 us is
the inflection point where QPS stopped growing in a range of tested
values. All results are from AWS m7g.16xlarge instances (Graviton3 SoC)
with dedicated tenancy (dedicated hardware).

| before | 10us  | 25us | 50us | 100us | 125us | 150us | 200us | 300us |
| 5.87   | 5.91  | 5.96 | 6.01 | 6.06  | 6.07  | 6.06  | 6.06  | 6.06  |

Perf's scheduler benchmarks also improve with a range of poll_limit
values >= 10 us. Higher limits produce near identical results within a
3% noise margin. The following tables are `perf bench sched` results,
run times in seconds.

`perf bench sched messaging -l 80000`
| AWS instance  | SoC       | Before | After  | % Change |
| c6g.16xl (VM) | Graviton2 | 18.974 | 18.400 | none     |
| c7g.16xl (VM) | Graviton3 | 13.852 | 13.859 | none     |
| c6g.metal     | Graviton2 | 17.621 | 16.744 | none     |
| c7g.metal     | Graviton3 | 13.430 | 13.404 | none     |

`perf bench sched pipe -l 2500000`
| AWS instance  | SoC       | Before | After  | % Change |
| c6g.16xl (VM) | Graviton2 | 30.158 | 15.181 | -50%     |
| c7g.16xl (VM) | Graviton3 | 18.289 | 12.067 | -34%     |
| c6g.metal     | Graviton2 | 17.609 | 15.170 | -14%     |
| c7g.metal     | Graviton3 | 14.103 | 12.304 | -13%     |

`perf bench sched seccomp-notify -l 2500000`
| AWS instance  | SoC       | Before | After  | % Change |
| c6g.16xl (VM) | Graviton2 | 28.784 | 13.754 | -52%     |
| c7g.16xl (VM) | Graviton3 | 16.964 | 11.430 | -33%     |
| c6g.metal     | Graviton2 | 15.717 | 13.536 | -14%     |
| c7g.metal     | Graviton3 | 13.301 | 11.491 | -14%     |

Steps to run MLPerf bert inference on Ubuntu 22.04:
 sudo apt install build-essential python3 python3-pip
 pip install "pybind11[global]" tensorflow  transformers
 export TF_ENABLE_ONEDNN_OPTS=1
 export DNNL_DEFAULT_FPMATH_MODE=BF16
 git clone https://github.com/mlcommons/inference.git --recursive
 cd inference
 git checkout v2.0
 cd loadgen
 CFLAGS="-std=c++14" python3 setup.py bdist_wheel
 pip install dist/*.whl
 cd ../language/bert
 make setup
 python3 run.py --backend=tf --scenario=SingleStream

Suggested-by: Ali Saidi <alisaidi@...zon.com>
Reviewed-by: Ali Saidi <alisaidi@...zon.com>
Reviewed-by: Geoff Blake <blakgeof@...zon.com>
Cc: Brian Silver <silverbr@...zon.com>
Signed-off-by: Haris Okanovic <harisokn@...zon.com>
---
 drivers/cpuidle/Kconfig.arm           |  13 ++
 drivers/cpuidle/Makefile              |   1 +
 drivers/cpuidle/cpuidle-arm-polling.c | 171 ++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 drivers/cpuidle/cpuidle-arm-polling.c

diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index a1ee475d180d..484666dda38d 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -14,6 +14,19 @@ config ARM_CPUIDLE
 	  initialized by calling the CPU operations init idle hook
 	  provided by architecture code.
 
+config ARM_POLL_CPUIDLE
+	bool "ARM64 CPU idle Driver with polling"
+	depends on ARM64
+	depends on ARM_ARCH_TIMER_EVTSTREAM
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	help
+	  Select this to enable a polling cpuidle driver for ARM64:
+	  The first state polls TIF_NEED_RESCHED for best latency on short
+	  sleep intervals. The second state falls back to arch_cpu_idle() to
+	  wait for interrupt. This is can be helpful in workloads that
+	  frequently block/wake at short intervals or VMs where wakeup IPIs
+	  are more expensive.
+
 config ARM_PSCI_CPUIDLE
 	bool "PSCI CPU idle Driver"
 	depends on ARM_PSCI_FW
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index d103342b7cfc..23c21422792d 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_ARM_U8500_CPUIDLE)         += cpuidle-ux500.o
 obj-$(CONFIG_ARM_AT91_CPUIDLE)          += cpuidle-at91.o
 obj-$(CONFIG_ARM_EXYNOS_CPUIDLE)        += cpuidle-exynos.o
 obj-$(CONFIG_ARM_CPUIDLE)		+= cpuidle-arm.o
+obj-$(CONFIG_ARM_POLL_CPUIDLE)		+= cpuidle-arm-polling.o
 obj-$(CONFIG_ARM_PSCI_CPUIDLE)		+= cpuidle-psci.o
 obj-$(CONFIG_ARM_PSCI_CPUIDLE_DOMAIN)	+= cpuidle-psci-domain.o
 obj-$(CONFIG_ARM_TEGRA_CPUIDLE)		+= cpuidle-tegra.o
diff --git a/drivers/cpuidle/cpuidle-arm-polling.c b/drivers/cpuidle/cpuidle-arm-polling.c
new file mode 100644
index 000000000000..bca128568114
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-arm-polling.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM64 CPU idle driver using wfe polling
+ *
+ * Copyright 2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * Authors:
+ *   Haris Okanovic <harisokn@...zon.com>
+ *   Brian Silver <silverbr@...zon.com>
+ *
+ * Based on cpuidle-arm.c
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@....com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpu_cooling.h>
+#include <linux/cpuidle.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpuidle.h>
+#include <asm/readex.h>
+
+#include "dt_idle_states.h"
+
+/* Max duration of the wfe() poll loop in us, before transitioning to
+ * arch_cpu_idle()/wfi() sleep.
+ */
+#define DEFAULT_POLL_LIMIT_US 100
+static unsigned int poll_limit __read_mostly = DEFAULT_POLL_LIMIT_US;
+
+/*
+ * arm_idle_wfe_poll - Polls state in wfe loop until reschedule is
+ * needed or timeout
+ */
+static int __cpuidle arm_idle_wfe_poll(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv, int idx)
+{
+	u64 time_start, time_limit;
+
+	time_start = local_clock();
+	dev->poll_time_limit = false;
+
+	local_irq_enable();
+
+	if (current_set_polling_and_test())
+		goto end;
+
+	time_limit = cpuidle_poll_time(drv, dev);
+
+	do {
+		// exclusive read arms the monitor for wfe
+		if (__READ_ONCE_EX(current_thread_info()->flags) & _TIF_NEED_RESCHED)
+			goto end;
+
+		// may exit prematurely, see ARM_ARCH_TIMER_EVTSTREAM
+		wfe();
+	} while (local_clock() - time_start < time_limit);
+
+	dev->poll_time_limit = true;
+
+end:
+	current_clr_polling();
+	return idx;
+}
+
+/*
+ * arm_idle_wfi - Places cpu in lower power state until interrupt,
+ * a fallback to polling
+ */
+static int __cpuidle arm_idle_wfi(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv, int idx)
+{
+	if (current_clr_polling_and_test()) {
+		local_irq_enable();
+		return idx;
+	}
+	arch_cpu_idle();
+	return idx;
+}
+
+static struct cpuidle_driver arm_poll_idle_driver __initdata = {
+	.name = "arm_poll_idle",
+	.owner = THIS_MODULE,
+	.states = {
+		{
+			.enter			= arm_idle_wfe_poll,
+			.exit_latency		= 0,
+			.target_residency	= 0,
+			.exit_latency_ns	= 0,
+			.power_usage		= UINT_MAX,
+			.flags			= CPUIDLE_FLAG_POLLING,
+			.name			= "WFE",
+			.desc			= "ARM WFE",
+		},
+		{
+			.enter			= arm_idle_wfi,
+			.exit_latency		= DEFAULT_POLL_LIMIT_US,
+			.target_residency	= DEFAULT_POLL_LIMIT_US,
+			.power_usage		= UINT_MAX,
+			.name			= "WFI",
+			.desc			= "ARM WFI",
+		},
+	},
+	.state_count = 2,
+};
+
+/*
+ * arm_poll_init_cpu - Initializes arm cpuidle polling driver for one cpu
+ */
+static int __init arm_poll_init_cpu(int cpu)
+{
+	int ret;
+	struct cpuidle_driver *drv;
+
+	drv = kmemdup(&arm_poll_idle_driver, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+	drv->states[1].exit_latency = poll_limit;
+	drv->states[1].target_residency = poll_limit;
+
+	ret = cpuidle_register(drv, NULL);
+	if (ret) {
+		pr_err("failed to register driver: %d, cpu %d\n", ret, cpu);
+		goto out_kfree_drv;
+	}
+
+	pr_info("registered driver cpu %d\n", cpu);
+
+	cpuidle_cooling_register(drv);
+
+	return 0;
+
+out_kfree_drv:
+	kfree(drv);
+	return ret;
+}
+
+/*
+ * arm_poll_init - Initializes arm cpuidle polling driver
+ */
+static int __init arm_poll_init(void)
+{
+	int cpu, ret;
+	struct cpuidle_driver *drv;
+	struct cpuidle_device *dev;
+
+	for_each_possible_cpu(cpu) {
+		ret = arm_poll_init_cpu(cpu);
+		if (ret)
+			goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	pr_info("de-register all");
+	while (--cpu >= 0) {
+		dev = per_cpu(cpuidle_devices, cpu);
+		drv = cpuidle_get_cpu_driver(dev);
+		cpuidle_unregister(drv);
+		kfree(drv);
+	}
+
+	return ret;
+}
+
+module_param(poll_limit, uint, 0444);
+device_initcall(arm_poll_init);
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ