Message-ID: <20241222113601.GX11133@noisy.programming.kicks-ass.net>
Date: Sun, 22 Dec 2024 12:36:01 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Rik van Riel <riel@...riel.com>
Cc: x86@...nel.org, linux-kernel@...r.kernel.org, kernel-team@...a.com,
	dave.hansen@...ux.intel.com, luto@...nel.org, tglx@...utronix.de,
	mingo@...hat.com, bp@...en8.de, hpa@...or.com,
	akpm@...ux-foundation.org
Subject: Re: [PATCH 07/10] x86,mm: enable broadcast TLB invalidation for
 multi-threaded processes

On Sat, Dec 21, 2024 at 11:06:39PM -0500, Rik van Riel wrote:

> +#ifdef CONFIG_CPU_SUP_AMD
> +/*
> + * Logic for AMD INVLPGB support.
> + */
> +static DEFINE_SPINLOCK(broadcast_asid_lock);

RAW_SPINLOCK ?

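That is, assuming this lock ends up being taken in contexts where a
sleeping lock (which spinlock_t becomes on PREEMPT_RT) is not allowed,
something like (untested sketch):

	static DEFINE_RAW_SPINLOCK(broadcast_asid_lock);
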
> +static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS;
> +static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 };
> +static LIST_HEAD(broadcast_asid_list);
> +static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
> +
> +static void reset_broadcast_asid_space(void)
> +{
> +	mm_context_t *context;
> +
> +	assert_spin_locked(&broadcast_asid_lock);

	lockdep_assert_held(&broadcast_asid_lock);

> +
> +	/*
> +	 * Flush once when we wrap around the ASID space, so we won't need
> +	 * to flush every time we allocate an ASID for broadcast flushing.
> +	 */
> +	invlpgb_flush_all_nonglobals();
> +	tlbsync();
> +
> +	/*
> +	 * Leave the currently used broadcast ASIDs set in the bitmap, since
> +	 * those cannot be reused before the next wraparound and flush.
> +	 */
> +	bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE);
> +	list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list)
> +		__set_bit(context->broadcast_asid, broadcast_asid_used);
> +
> +	last_broadcast_asid = TLB_NR_DYN_ASIDS;
> +}
> +
> +static u16 get_broadcast_asid(void)
> +{
> +	assert_spin_locked(&broadcast_asid_lock);

	lockdep_assert_held()

> +
> +	do {
> +		u16 start = last_broadcast_asid;
> +		u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start);
> +
> +		if (asid >= MAX_ASID_AVAILABLE) {
> +			reset_broadcast_asid_space();
> +			continue;
> +		}
> +
> +		/* Try claiming this broadcast ASID. */
> +		if (!test_and_set_bit(asid, broadcast_asid_used)) {
> +			last_broadcast_asid = asid;
> +			return asid;
> +		}
> +	} while (1);
> +}
> +
> +/*
> + * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast
> + * (INVLPGB) ASID, or the other way around.
> + */
> +static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
> +{
> +	u16 broadcast_asid = next->context.broadcast_asid;
> +
> +	if (broadcast_asid && prev_asid != broadcast_asid) {
> +		return true;
> +	}
> +
> +	if (!broadcast_asid && is_broadcast_asid(prev_asid)) {
> +		return true;
> +	}
> +
> +	return false;
> +}

Those return statements don't really need {} around them.

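I.e., something like this (untested, logically the same thing):

	return (broadcast_asid && prev_asid != broadcast_asid) ||
	       (!broadcast_asid && is_broadcast_asid(prev_asid));
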
> +
> +void destroy_context_free_broadcast_asid(struct mm_struct *mm) {

{ goes on a new line.

> +	unsigned long flags;
> +
> +	if (!mm->context.broadcast_asid)
> +		return;
> +
> +	spin_lock_irqsave(&broadcast_asid_lock, flags);

	guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

> +	mm->context.broadcast_asid = 0;
> +	list_del(&mm->context.broadcast_asid_list);
> +	broadcast_asid_available++;
> +	spin_unlock_irqrestore(&broadcast_asid_lock, flags);
> +}
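
FWIW, with the raw lock and guard() suggested above (and the brace
moved), the whole thing ends up looking something like this (untested
sketch):

	void destroy_context_free_broadcast_asid(struct mm_struct *mm)
	{
		if (!mm->context.broadcast_asid)
			return;

		guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

		mm->context.broadcast_asid = 0;
		list_del(&mm->context.broadcast_asid_list);
		broadcast_asid_available++;
	}

No 'flags' and no explicit unlock needed.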
> +
> +static int mm_active_cpus(struct mm_struct *mm)
> +{
> +	int count = 0;
> +	int cpu;
> +
> +	for_each_cpu(cpu, mm_cpumask(mm)) {
> +		/* Skip the CPUs that aren't really running this process. */
> +		if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
> +			continue;
> +
> +		if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
> +			continue;
> +
> +		count++;
> +	}
> +	return count;
> +}
> +
> +/*
> + * Assign a broadcast ASID to the current process, protecting against
> + * races between multiple threads in the process.
> + */
> +static void use_broadcast_asid(struct mm_struct *mm)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&broadcast_asid_lock, flags);

	guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

> +
> +	/* This process is already using broadcast TLB invalidation. */
> +	if (mm->context.broadcast_asid)
> +		goto out_unlock;

		return;

> +	mm->context.broadcast_asid = get_broadcast_asid();
> +	mm->context.asid_transition = true;
> +	list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
> +	broadcast_asid_available--;
> +
> +out_unlock:

Notably we're really wanting to get away from the whole goto unlock
pattern.

> +	spin_unlock_irqrestore(&broadcast_asid_lock, flags);
> +}
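
And similarly here, guard() plus an early return gets rid of the goto,
the label and the flags entirely; something like (untested):

	static void use_broadcast_asid(struct mm_struct *mm)
	{
		guard(raw_spinlock_irqsave)(&broadcast_asid_lock);

		/* This process is already using broadcast TLB invalidation. */
		if (mm->context.broadcast_asid)
			return;

		mm->context.broadcast_asid = get_broadcast_asid();
		mm->context.asid_transition = true;
		list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
		broadcast_asid_available--;
	}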
> +
> +/*
> + * Figure out whether to assign a broadcast (global) ASID to a process.
> + * We vary the threshold by how empty or full the broadcast ASID space is.
> + * 1/4 full: >= 4 active threads
> + * 1/2 full: >= 8 active threads
> + * 3/4 full: >= 16 active threads
> + * 7/8 full: >= 32 active threads
> + * etc
> + *
> + * This way we should never exhaust the broadcast ASID space, even on very
> + * large systems, and the processes with the largest number of active
> + * threads should be able to use broadcast TLB invalidation.

I'm a little confused: at most we need one ASID per CPU. IIRC we have
something like 4k ASIDs (the PCID lives in the page-offset bits of the
CR3 physical address, so 2^12 = 4096), so for anything with fewer than
4K CPUs we're good, but with anything having more CPUs we're up a creek
irrespective of the above scheme, no?

> + */
> +#define HALFFULL_THRESHOLD 8
> +static bool meets_broadcast_asid_threshold(struct mm_struct *mm)
> +{
> +	int avail = broadcast_asid_available;
> +	int threshold = HALFFULL_THRESHOLD;
> +	int mm_active_threads;
> +
> +	if (!avail)
> +		return false;
> +
> +	mm_active_threads = mm_active_cpus(mm);
> +
> +	/* Small processes can just use IPI TLB flushing. */
> +	if (mm_active_threads < 3)
> +		return false;
> +
> +	if (avail > MAX_ASID_AVAILABLE * 3 / 4) {
> +		threshold = HALFFULL_THRESHOLD / 4;
> +	} else if (avail > MAX_ASID_AVAILABLE / 2) {
> +		threshold = HALFFULL_THRESHOLD / 2;
> +	} else if (avail < MAX_ASID_AVAILABLE / 3) {
> +		do {
> +			avail *= 2;
> +			threshold *= 2;
> +		} while ((avail + threshold) < MAX_ASID_AVAILABLE / 2);
> +	}
> +
> +	return mm_active_threads > threshold;
> +}
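
To make the scaling concrete (just working through the loop above, not
part of the patch): with the space 7/8 used, avail is roughly
MAX_ASID_AVAILABLE/8 and the do/while doubles (avail, threshold) as

	(M/8, 8) -> (M/4, 16) -> (M/2, 32)

at which point avail + threshold is no longer below M/2, so the
threshold ends up at 32 active threads, matching the table in the
comment.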
