[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240130180010.GA2011608@dev-arch.thelio-3990X>
Date: Tue, 30 Jan 2024 11:00:10 -0700
From: Nathan Chancellor <nathan@...nel.org>
To: Tejun Heo <tj@...nel.org>
Cc: Lai Jiangshan <jiangshanlai@...il.com>, linux-kernel@...r.kernel.org,
Naohiro.Aota@....com, kernel-team@...a.com
Subject: Re: [PATCH v4 08/10] workqueue: Introduce struct wq_node_nr_active
Hi Tejun,
On Mon, Jan 29, 2024 at 08:14:09AM -1000, Tejun Heo wrote:
> >From 91ccc6e7233bb10a9c176aa4cc70d6f432a441a5 Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@...nel.org>
> Date: Mon, 29 Jan 2024 08:11:24 -1000
>
> Currently, for both percpu and unbound workqueues, max_active applies
> per-cpu, which is a recent change for unbound workqueues. The change for
> unbound workqueues was a significant departure from the previous behavior of
> per-node application. It made some use cases create an undesirable number of
> concurrent work items and left no good way of fixing them. To address the
> problem, workqueue is implementing a NUMA node segmented global nr_active
> mechanism, which will be explained further in the next patch.
>
> As a preparation, this patch introduces struct wq_node_nr_active. It's a
> data structure allocated for each workqueue and NUMA node pair and
> currently only tracks the workqueue's number of active work items on the
> node. This is split out from the next patch to make it easier to understand
> and review.
>
> Note that there is an extra wq_node_nr_active allocated for the invalid node
> nr_node_ids which is used to track nr_active for pools which don't have a NUMA
> node associated, such as the default fallback system-wide pool.
>
> This doesn't cause any behavior changes visible to userland yet. The next
> patch will expand to implement the control mechanism on top.
>
> v4: - Fixed out-of-bound access when freeing per-cpu workqueues.
>
> v3: - Use flexible array for wq->node_nr_active as suggested by Lai.
>
> v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai.
>
> - Lai pointed out that pwq_tryinc_nr_active() incorrectly dropped
> pwq->max_active check. Restored. As the next patch replaces the
> max_active enforcement mechanism, this doesn't change the end result.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Reviewed-by: Lai Jiangshan <jiangshanlai@...il.com>
> ---
> kernel/workqueue.c | 142 ++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 135 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index b5aba0e5a699..8d465478adb9 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -284,6 +284,16 @@ struct wq_flusher {
>
> struct wq_device;
>
> +/*
> + * Unlike in a per-cpu workqueue where max_active limits its concurrency level
> + * on each CPU, in an unbound workqueue, max_active applies to the whole system.
> + * As sharing a single nr_active across multiple sockets can be very expensive,
> + * the counting and enforcement is per NUMA node.
> + */
> +struct wq_node_nr_active {
> + atomic_t nr; /* per-node nr_active count */
> +};
> +
> /*
> * The externally visible workqueue. It relays the issued work items to
> * the appropriate worker_pool through its pool_workqueues.
> @@ -330,6 +340,7 @@ struct workqueue_struct {
> /* hot fields used during command issue, aligned to cacheline */
> unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
> struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
> + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
> };
>
> static struct kmem_cache *pwq_cache;
> @@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
> return worker->last_func;
> }
>
> +/**
> + * wq_node_nr_active - Determine wq_node_nr_active to use
> + * @wq: workqueue of interest
> + * @node: NUMA node, can be %NUMA_NO_NODE
> + *
> + * Determine wq_node_nr_active to use for @wq on @node. Returns:
> + *
> + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
> + *
> + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
> + *
> + * - Otherwise, node_nr_active[@node].
> + */
> +static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
> + int node)
> +{
> + if (!(wq->flags & WQ_UNBOUND))
> + return NULL;
> +
> + if (node == NUMA_NO_NODE)
> + node = nr_node_ids;
> +
> + return wq->node_nr_active[node];
> +}
> +
> /**
> * get_pwq - get an extra reference on the specified pool_workqueue
> * @pwq: pool_workqueue to get
> @@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
> struct work_struct *work)
> {
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna;
>
> lockdep_assert_held(&pool->lock);
>
> if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
> return false;
>
> + nna = wq_node_nr_active(pwq->wq, pool->node);
> + if (nna)
> + atomic_inc(&nna->nr);
> +
> pwq->nr_active++;
> __pwq_activate_work(pwq, work);
> return true;
> @@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
> {
> struct workqueue_struct *wq = pwq->wq;
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
> bool obtained;
>
> lockdep_assert_held(&pool->lock);
>
> obtained = pwq->nr_active < READ_ONCE(wq->max_active);
>
> - if (obtained)
> + if (obtained) {
> pwq->nr_active++;
> + if (nna)
> + atomic_inc(&nna->nr);
> + }
> return obtained;
> }
>
> @@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
> static void pwq_dec_nr_active(struct pool_workqueue *pwq)
> {
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
>
> lockdep_assert_held(&pool->lock);
>
> + /*
> + * @pwq->nr_active should be decremented for both percpu and unbound
> + * workqueues.
> + */
> pwq->nr_active--;
> +
> + /*
> + * For a percpu workqueue, it's simple. Just need to kick the first
> + * inactive work item on @pwq itself.
> + */
> + if (!nna) {
> + pwq_activate_first_inactive(pwq);
> + return;
> + }
> +
> + atomic_dec(&nna->nr);
> pwq_activate_first_inactive(pwq);
> }
>
> @@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
> }
> #endif
>
> +static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
> +{
> + int node;
> +
> + for_each_node(node) {
> + kfree(nna_ar[node]);
> + nna_ar[node] = NULL;
> + }
> +
> + kfree(nna_ar[nr_node_ids]);
> + nna_ar[nr_node_ids] = NULL;
> +}
> +
> +static void init_node_nr_active(struct wq_node_nr_active *nna)
> +{
> + atomic_set(&nna->nr, 0);
> +}
> +
> +/*
> + * Each node's nr_active counter will be accessed mostly from its own node and
> + * should be allocated in the node.
> + */
> +static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
> +{
> + struct wq_node_nr_active *nna;
> + int node;
> +
> + for_each_node(node) {
> + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
> + if (!nna)
> + goto err_free;
> + init_node_nr_active(nna);
> + nna_ar[node] = nna;
> + }
> +
> + /* [nr_node_ids] is used as the fallback */
> + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
> + if (!nna)
> + goto err_free;
> + init_node_nr_active(nna);
> + nna_ar[nr_node_ids] = nna;
> +
> + return 0;
> +
> +err_free:
> + free_node_nr_active(nna_ar);
> + return -ENOMEM;
> +}
> +
> static void rcu_free_wq(struct rcu_head *rcu)
> {
> struct workqueue_struct *wq =
> container_of(rcu, struct workqueue_struct, rcu);
>
> + if (wq->flags & WQ_UNBOUND)
> + free_node_nr_active(wq->node_nr_active);
> +
> wq_free_lockdep(wq);
> free_percpu(wq->cpu_pwq);
> free_workqueue_attrs(wq->unbound_attrs);
> @@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> {
> va_list args;
> struct workqueue_struct *wq;
> - int len;
> + size_t wq_size;
> + int name_len;
>
> /*
> * Unbound && max_active == 1 used to imply ordered, which is no longer
> @@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> flags |= WQ_UNBOUND;
>
> /* allocate wq and format name */
> - wq = kzalloc(sizeof(*wq), GFP_KERNEL);
> + if (flags & WQ_UNBOUND)
> + wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
> + else
> + wq_size = sizeof(*wq);
> +
> + wq = kzalloc(wq_size, GFP_KERNEL);
> if (!wq)
> return NULL;
>
> @@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> }
>
> va_start(args, max_active);
> - len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
> + name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
> va_end(args);
>
> - if (len >= WQ_NAME_LEN)
> - pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name);
> + if (name_len >= WQ_NAME_LEN)
> + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
> + wq->name);
>
> max_active = max_active ?: WQ_DFL_ACTIVE;
> max_active = wq_clamp_max_active(max_active, flags, wq->name);
> @@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> wq_init_lockdep(wq);
> INIT_LIST_HEAD(&wq->list);
>
> + if (flags & WQ_UNBOUND) {
> + if (alloc_node_nr_active(wq->node_nr_active) < 0)
> + goto err_unreg_lockdep;
> + }
> +
> if (alloc_and_link_pwqs(wq) < 0)
> - goto err_unreg_lockdep;
> + goto err_free_node_nr_active;
>
> if (wq_online && init_rescuer(wq) < 0)
> goto err_destroy;
> @@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
>
> return wq;
>
> +err_free_node_nr_active:
> + if (wq->flags & WQ_UNBOUND)
> + free_node_nr_active(wq->node_nr_active);
> err_unreg_lockdep:
> wq_unregister_lockdep(wq);
> wq_free_lockdep(wq);
> --
> 2.43.0
>
I just bisected a crash that I see when booting several different architectures
in QEMU in -next to this change as commit 5797b1c18919 ("workqueue: Implement
system-wide nr_active enforcement for unbound workqueues"). For example, with
arm64 virtconfig (I also see it with ARCH=arm multi_v7_defconfig and
ARCH=riscv defconfig):
$ make -skj"$(nproc)" ARCH=arm64 CROSS_COMPILE=aarch64-linux- virtconfig Image.gz
$ qemu-system-aarch64 \
-display none \
-nodefaults \
-cpu max,pauth-impdef=true \
-machine virt,gic-version=max,virtualization=true \
-append 'console=ttyAMA0 earlycon' \
-kernel arch/arm64/boot/Image.gz \
-initrd rootfs.cpio \
-m 512m \
-serial mon:stdio
[ 0.000000] Booting Linux on physical CPU 0x0000000000 [0x000f0510]
[ 0.000000] Linux version 6.7.0-09946-g5797b1c18919 (nathan@...-arch.thelio-3990X) (aarch64-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41) #1 SMP PREEMPT Tue Jan 30 10:10:57 MST 2024
..
[ 0.000000] Unable to handle kernel paging request at virtual address ffff000021c0b380
[ 0.000000] Mem abort info:
[ 0.000000] ESR = 0x0000000096000006
[ 0.000000] EC = 0x25: DABT (current EL), IL = 32 bits
[ 0.000000] SET = 0, FnV = 0
[ 0.000000] EA = 0, S1PTW = 0
[ 0.000000] FSC = 0x06: level 2 translation fault
[ 0.000000] Data abort info:
[ 0.000000] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000
[ 0.000000] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[ 0.000000] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 0.000000] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000413b1000
[ 0.000000] [ffff000021c0b380] pgd=180000005fff7003, p4d=180000005fff7003, pud=180000005fff6003, pmd=0000000000000000
[ 0.000000] Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-09946-g5797b1c18919 #1
[ 0.000000] Hardware name: linux,dummy-virt (DT)
[ 0.000000] pstate: 600000c9 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 0.000000] pc : wq_update_node_max_active+0x48/0x1d8
[ 0.000000] lr : apply_wqattrs_commit+0x160/0x180
[ 0.000000] sp : ffffda8d476b3be0
[ 0.000000] x29: ffffda8d476b3be0 x28: ffff000001c0d600 x27: 0000000000000000
[ 0.000000] x26: ffff000001c0d6c0 x25: 0000000000000001 x24: 0000000000000200
[ 0.000000] x23: 00000000ffffffff x22: ffffda8d476b9c40 x21: 0000000000000008
[ 0.000000] x20: ffffda8d476b9a40 x19: ffff000001c0b360 x18: ffff00001feebed0
[ 0.000000] x17: 0000000000c65c70 x16: ffff00001feebb28 x15: fffffc0000070488
[ 0.000000] x14: 0000000000000000 x13: 0000000000000000 x12: ffff00001feebb28
[ 0.000000] x11: 0000000000000001 x10: ffff000001c0b388 x9 : 0000000000000000
[ 0.000000] x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000001c0d600
[ 0.000000] x5 : ffff000001c0d600 x4 : ffff000001c0e880 x3 : ffff000001c0d600
[ 0.000000] x2 : ffff000001c0b388 x1 : ffffda8d476b9000 x0 : 0000000003ffffff
[ 0.000000] Call trace:
[ 0.000000] wq_update_node_max_active+0x48/0x1d8
[ 0.000000] apply_wqattrs_commit+0x160/0x180
[ 0.000000] apply_workqueue_attrs_locked+0x50/0x84
[ 0.000000] alloc_workqueue+0x588/0x6c8
[ 0.000000] workqueue_init_early+0x480/0x554
[ 0.000000] start_kernel+0x240/0x5e8
[ 0.000000] __primary_switched+0xb8/0xc0
[ 0.000000] Code: f9418033 d000a081 9100a262 f90037e2 (f8607840)
[ 0.000000] ---[ end trace 0000000000000000 ]---
[ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
If there is any more information I can provide or patches I can test, I
am more than happy to do so.
Cheers,
Nathan
# bad: [41d66f96d0f15a0a2ad6fa2208f6bac1a66cbd52] Add linux-next specific files for 20240130
# good: [41bccc98fb7931d63d03f326a746ac4d429c1dd3] Linux 6.8-rc2
git bisect start '41d66f96d0f15a0a2ad6fa2208f6bac1a66cbd52' '41bccc98fb7931d63d03f326a746ac4d429c1dd3'
# good: [f3f89885646036e16b325aea597fc9f375f1a56a] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
git bisect good f3f89885646036e16b325aea597fc9f375f1a56a
# good: [8042d32dd6c3730b0b4c8c9c811e204ed9f5f829] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
git bisect good 8042d32dd6c3730b0b4c8c9c811e204ed9f5f829
# bad: [f0edba72fe5ae932877f49796aaef8adf1a2eb8c] Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git
git bisect bad f0edba72fe5ae932877f49796aaef8adf1a2eb8c
# bad: [549632942a27cc3987473f8a8629200bc2ab0734] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
git bisect bad 549632942a27cc3987473f8a8629200bc2ab0734
# good: [9eb48e8d465d67362dac65963979e65cb2ad7634] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git
git bisect good 9eb48e8d465d67362dac65963979e65cb2ad7634
# good: [bc83a87759cabbf6f3366568e44bda088b315204] dt-bindings: usb: dwc3: Add snps,host-vbus-glitches-quirk avoid vbus glitch
git bisect good bc83a87759cabbf6f3366568e44bda088b315204
# bad: [2c0ea2f8eb6140767fae8d01a30ce45dcf4ead85] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git
git bisect bad 2c0ea2f8eb6140767fae8d01a30ce45dcf4ead85
# good: [a045a272d887575da17ad86d6573e82871b50c27] workqueue: Move pwq->max_active to wq->max_active
git bisect good a045a272d887575da17ad86d6573e82871b50c27
# good: [9f66cff212bb3c1cd25996aaa0dfd0c9e9d8baab] workqueue: RCU protect wq->dfl_pwq and implement accessors for it
git bisect good 9f66cff212bb3c1cd25996aaa0dfd0c9e9d8baab
# good: [91ccc6e7233bb10a9c176aa4cc70d6f432a441a5] workqueue: Introduce struct wq_node_nr_active
git bisect good 91ccc6e7233bb10a9c176aa4cc70d6f432a441a5
# bad: [07daa99b7fd7adfffa22180184e39ec124e73013] tools/workqueue/wq_dump.py: Add node_nr/max_active dump
git bisect bad 07daa99b7fd7adfffa22180184e39ec124e73013
# bad: [5797b1c18919cd9c289ded7954383e499f729ce0] workqueue: Implement system-wide nr_active enforcement for unbound workqueues
git bisect bad 5797b1c18919cd9c289ded7954383e499f729ce0
# first bad commit: [5797b1c18919cd9c289ded7954383e499f729ce0] workqueue: Implement system-wide nr_active enforcement for unbound workqueues
Powered by blists - more mailing lists