[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <555fca66-81b6-3406-eac1-140c00669477@arm.com>
Date: Thu, 26 Jan 2023 19:15:26 +0000
From: Robin Murphy <robin.murphy@....com>
To: Georgi Djakov <quic_c_gdjako@...cinc.com>, catalin.marinas@....com,
will@...nel.org
Cc: dave.hansen@...ux.intel.com, luto@...nel.org, peterz@...radead.org,
tglx@...utronix.de, mingo@...hat.com, bp@...en8.de, hpa@...or.com,
hch@....de, m.szyprowski@...sung.com,
linux-arm-kernel@...ts.infradead.org, iommu@...ts.linux.dev,
linux-kernel@...r.kernel.org, djakov@...nel.org
Subject: Re: [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line
On 2023-01-26 16:43, Georgi Djakov wrote:
> From: Chris Goldsworthy <quic_cgoldswo@...cinc.com>
>
> It's useful to have an option to disable ZONE_DMA32 during boot, as
> CONFIG_ZONE_DMA32 is enabled by default (on multiplatform kernels, for
> example). There are platforms that do not use this zone, and in some high
> memory pressure scenarios disabling it would help ease the pressure on
> kswapd (leaving file-backed memory intact / unreclaimed). When ZONE_DMA32
> is enabled on these platforms, kswapd is woken up more easily and drains
> the file cache, which leads to some performance issues.
>
> Signed-off-by: Chris Goldsworthy <quic_cgoldswo@...cinc.com>
> [georgi: updated commit text]
> Signed-off-by: Georgi Djakov <quic_c_gdjako@...cinc.com>
> ---
> The main question here is whether we can have a kernel command line
> option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> I can imagine this being useful also for Linux distros.
FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed
around in a few places" are very different notions...
However, I'm just going to take a step back and read the commit message
a few more times... Given what it claims, I can't help but ask why
wouldn't we want a parameter to control kswapd's behaviour and address
that issue directly, rather than a massive hammer that breaks everyone
allocating explicitly or implicitly with __GFP_DMA32 (especially on
systems where it doesn't normally matter because all memory is below 4GB
anyway), just to achieve one rather niche side-effect?
Thanks,
Robin.
> .../admin-guide/kernel-parameters.txt | 5 +++++
> arch/arm64/mm/init.c | 20 ++++++++++++++++-
> arch/x86/mm/init.c | 20 ++++++++++++++++-
> include/linux/dma-direct.h | 22 +++++++++++++++++++
> kernel/dma/direct.c | 5 +++--
> kernel/dma/pool.c | 8 +++----
> 6 files changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index cb12df402161..854ff65ac6b0 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1070,6 +1070,11 @@
> Disable Dynamic DMA Window support. Use this
> to workaround buggy firmware.
>
> + disable_dma32= [KNL]
> + Dynamically disable ZONE_DMA32 on kernels compiled with
> + CONFIG_ZONE_DMA32=y.
> +
> +
> disable_ipv6= [IPV6]
> See Documentation/networking/ipv6.rst.
>
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 58a0bb2c17f1..1a56098c0e19 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
> return 0;
> }
>
> +/*
> + * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
> /*
> * reserve_crashkernel() - reserves memory for crash kernel
> *
> @@ -244,7 +250,7 @@ static void __init zone_sizes_init(void)
> max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
> #endif
> #ifdef CONFIG_ZONE_DMA32
> - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit);
> if (!arm64_dma_phys_limit)
> arm64_dma_phys_limit = dma32_phys_limit;
> #endif
> @@ -253,6 +259,18 @@ static void __init zone_sizes_init(void)
> free_area_init(max_zone_pfns);
> }
>
> +static int __init early_disable_dma32(char *buf)
> +{
> + if (!buf)
> + return -EINVAL;
> +
> + if (!strcmp(buf, "on"))
> + disable_dma32 = true;
> +
> + return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
> int pfn_is_map_memory(unsigned long pfn)
> {
> phys_addr_t addr = PFN_PHYS(pfn);
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index cb258f58fdc8..b8af7e2f21f5 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>
> static bool __initdata can_use_brk_pgt = true;
>
> +/*
> + * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
> /*
> * Pages returned are already directly mapped.
> *
> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
> max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
> #endif
> #ifdef CONFIG_ZONE_DMA32
> - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
> + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
> #endif
> max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> #ifdef CONFIG_HIGHMEM
> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
> free_area_init(max_zone_pfns);
> }
>
> +static int __init early_disable_dma32(char *buf)
> +{
> + if (!buf)
> + return -EINVAL;
> +
> + if (!strcmp(buf, "on"))
> + disable_dma32 = true;
> +
> + return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
> __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
> .loaded_mm = &init_mm,
> .next_asid = 1,
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 18aade195884..ed69618cf1fc 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -24,6 +24,28 @@ struct bus_dma_region {
> u64 offset;
> };
>
> +static inline bool zone_dma32_is_empty(int node)
> +{
> +#ifdef CONFIG_ZONE_DMA32
> + pg_data_t *pgdat = NODE_DATA(node);
> +
> + return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
> +#else
> + return true;
> +#endif
> +}
> +
> +static inline bool zone_dma32_are_empty(void)
> +{
> + int node;
> +
> + for_each_node(node)
> + if (!zone_dma32_is_empty(node))
> + return false;
> +
> + return true;
> +}
> +
> static inline dma_addr_t translate_phys_to_dma(struct device *dev,
> phys_addr_t paddr)
> {
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 63859a101ed8..754210c65658 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
> *phys_limit = dma_to_phys(dev, dma_limit);
> if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> return GFP_DMA;
> - if (*phys_limit <= DMA_BIT_MASK(32))
> + if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
> return GFP_DMA32;
> return 0;
> }
> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>
> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
> phys_limit < DMA_BIT_MASK(64) &&
> - !(gfp & (GFP_DMA32 | GFP_DMA))) {
> + !(gfp & (GFP_DMA32 | GFP_DMA)) &&
> + !zone_dma32_is_empty(node)) {
> gfp |= GFP_DMA32;
> goto again;
> }
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 4d40dcce7604..8e79903fbda8 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
> end = cma_get_base(cma) + size - 1;
> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> return end <= DMA_BIT_MASK(zone_dma_bits);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return end <= DMA_BIT_MASK(32);
> return true;
> }
> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
> if (IS_ENABLED(CONFIG_ZONE_DMA))
> atomic_pool_resize(atomic_pool_dma,
> GFP_KERNEL | GFP_DMA);
> - if (IS_ENABLED(CONFIG_ZONE_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
> atomic_pool_resize(atomic_pool_dma32,
> GFP_KERNEL | GFP_DMA32);
> atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
> if (!atomic_pool_dma)
> ret = -ENOMEM;
> }
> - if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
> atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
> GFP_KERNEL | GFP_DMA32);
> if (!atomic_pool_dma32)
> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
> static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
> {
> if (prev == NULL) {
> - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
> return atomic_pool_dma32;
> if (atomic_pool_dma && (gfp & GFP_DMA))
> return atomic_pool_dma;
Powered by blists - more mailing lists