Date:   Thu, 23 Mar 2023 10:22:21 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Anna-Maria Behnsen <anna-maria@...utronix.de>
Cc:     linux-kernel@...r.kernel.org, John Stultz <jstultz@...gle.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Eric Dumazet <edumazet@...gle.com>,
        "Rafael J . Wysocki" <rafael.j.wysocki@...el.com>,
        Arjan van de Ven <arjan@...radead.org>,
        "Paul E . McKenney" <paulmck@...nel.org>,
        Frederic Weisbecker <fweisbec@...il.com>,
        Rik van Riel <riel@...riel.com>
Subject: Re: [PATCH v5 16/18] timer: Implement the hierarchical pull model


(since the code is structured bottom-up, I also wrote the comments bottom-up)

On Wed, Mar 01, 2023 at 03:17:42PM +0100, Anna-Maria Behnsen wrote:

> +static struct tmigr_group *tmigr_get_group(unsigned int cpu, unsigned int node,
> +					   unsigned int lvl)
> +{
> +	struct tmigr_group *tmp, *group = NULL;
> +	bool first_loop = true;
> +	atomic_t *migr_state;

When first reading this I was wondering what was protecting the lists,
since this is at least two calls deep from where you take the lock. Perhaps add:

	lockdep_assert_held(&tmigr_mutex);

to clarify the locking context.
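
For instance, right at the top of the function (a sketch against the
quoted hunk; tmigr_mutex is the mutex taken in tmigr_add_cpu() around
this whole setup path):

	static struct tmigr_group *tmigr_get_group(unsigned int cpu, unsigned int node,
						   unsigned int lvl)
	{
		struct tmigr_group *tmp, *group = NULL;
		bool first_loop = true;
		atomic_t *migr_state;

		/* The level lists are only walked/modified with tmigr_mutex held */
		lockdep_assert_held(&tmigr_mutex);
		...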

> +
> +reloop:
> +	/* Try to attach to an existing group first */
> +	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
> +		/*
> +		 * If @lvl is below the cross numa node level, check whether
> +		 * this group belongs to the same numa node.
> +		 */
> +		if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
> +			continue;
> +
> +		/* Capacity left? */
> +		if (tmp->num_childs >= TMIGR_CHILDS_PER_GROUP)
> +			continue;
> +
> +		/*
> +		 * If this is the lowest level of the hierarchy, make sure
> +		 * that thread siblings share a group. This only applies
> +		 * when siblings exist. ALL groups of the lowest level need
> +		 * to be checked for a thread sibling before the CPU is
> +		 * added to a random group with capacity. When all groups
> +		 * were checked and no thread sibling was found, a second
> +		 * loop over the level zero groups is required to get a
> +		 * group with capacity.
> +		 */
> +		if (!lvl && (tmigr_cores_per_group != TMIGR_CHILDS_PER_GROUP)) {
> +			if (first_loop == true && !sibling_in_group(cpu, tmp)) {
> +				continue;
> +			} else if (first_loop == false) {
> +				if (tmp->num_cores >= tmigr_cores_per_group)
> +					continue;
> +				else
> +					tmp->num_cores++;
> +			}
> +		}
> +
> +		group = tmp;
> +		break;
> +	}
> +
> +	if (group) {
> +		return group;
> +	} else if (first_loop == true) {
> +		first_loop = false;
> +		goto reloop;
> +	}
> +
> +	/* Allocate and set up a new group with corresponding migr_state */
> +	group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
> +	if (!group)
> +		return ERR_PTR(-ENOMEM);
> +
> +	migr_state = kzalloc_node(sizeof(atomic_t), GFP_KERNEL, node);
> +	if (!migr_state) {
> +		kfree(group);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	tmigr_init_group(group, lvl, node, migr_state);
> +	/* Setup successful. Add it to the hierarchy */
> +	list_add(&group->list, &tmigr_level_list[lvl]);
> +	return group;
> +}
> +
> +static void tmigr_connect_child_parent(struct tmigr_group *child,
> +				       struct tmigr_group *parent)
> +{
> +	union tmigr_state childstate;
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&child->lock, flags);

Both callsites are in tmigr_setup_groups(), which must run in
preemptible context because of its GFP_KERNEL allocations. Please use
raw_spin_lock_irq().
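
I.e. something like this (a sketch of the suggested change; the flags
variable and the irqsave/irqrestore pair go away):

	static void tmigr_connect_child_parent(struct tmigr_group *child,
					       struct tmigr_group *parent)
	{
		union tmigr_state childstate;

		/* Preemptible context, IRQs are on; no need to save flags */
		raw_spin_lock_irq(&child->lock);
		raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);

		child->parent = parent;
		child->childmask = BIT(parent->num_childs++);

		raw_spin_unlock(&parent->lock);
		raw_spin_unlock_irq(&child->lock);

		/* ... rest of the function unchanged ... */
	}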

> +	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
> +
> +	child->parent = parent;
> +	child->childmask = BIT(parent->num_childs++);
> +
> +	raw_spin_unlock(&parent->lock);
> +	raw_spin_unlock_irqrestore(&child->lock, flags);
> +
> +	/*
> +	 * To prevent inconsistent states, active children need to be active
> +	 * in the new parent as well. Inactive children are already marked
> +	 * inactive in the parent group.
> +	 */
> +	childstate.state = atomic_read(child->migr_state);
> +	if (childstate.migrator != TMIGR_NONE) {
> +		struct tmigr_walk data;
> +
> +		data.childmask = child->childmask;
> +		data.groupstate.state = atomic_read(parent->migr_state);
> +
> +		/*
> +		 * Only one new level is added at a time. When connecting
> +		 * child and parent and the child is active while the
> +		 * parent is inactive, the parent needs to be the uppermost
> +		 * level. Otherwise something went wrong!
> +		 */
> +		WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
> +	}
> +}
> +
> +static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
> +{
> +	struct tmigr_group *group, *child, **stack;
> +	int top = 0, err = 0, i = 0;
> +	struct list_head *lvllist;
> +	size_t sz;
> +
> +	sz = sizeof(struct tmigr_group *) * tmigr_hierarchy_levels;
> +	stack = kzalloc(sz, GFP_KERNEL);
> +	if (!stack)
> +		return -ENOMEM;
> +
> +	do {
> +		group = tmigr_get_group(cpu, node, i);
> +		if (IS_ERR(group)) {
> +			err = PTR_ERR(group);
> +			break;
> +		}
> +
> +		top = i;
> +		stack[i++] = group;
> +
> +		/*
> +		 * When fewer CPUs than are available in the system are
> +		 * brought up at boot, not all calculated hierarchy levels
> +		 * are required.
> +		 *
> +		 * The loop is aborted as soon as the highest level, which might
> +		 * be different from tmigr_hierarchy_levels, contains only a
> +		 * single group.
> +		 */
> +		if (group->parent || i == tmigr_hierarchy_levels ||
> +		    (list_empty(&tmigr_level_list[i]) &&
> +		     list_is_singular(&tmigr_level_list[i - 1])))
> +			break;
> +
> +	} while (i < tmigr_hierarchy_levels);
> +
> +	do {
> +		group = stack[--i];
> +
> +		if (err < 0) {
> +			list_del(&group->list);
> +			kfree(group);
> +			continue;
> +		}
> +
> +		DBG_BUG_ON(i != group->level);
> +
> +		/*
> +		 * Update tmc -> group / child -> group connection
> +		 */
> +		if (i == 0) {
> +			struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> +			unsigned long flags;
> +
> +			raw_spin_lock_irqsave(&group->lock, flags);

You're holding a mutex and you just did a GFP_KERNEL allocation; there
is no way IRQs are already disabled here. Please use raw_spin_lock_irq()
to reduce confusion.
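
I.e. the branch would become (sketch, again without flags):

		if (i == 0) {
			struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);

			raw_spin_lock_irq(&group->lock);

			tmc->tmgroup = group;
			tmc->childmask = BIT(group->num_childs);

			group->cpus[group->num_childs++] = cpu;

			raw_spin_unlock_irq(&group->lock);

			/* rest of the branch unchanged */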

> +
> +			tmc->tmgroup = group;
> +			tmc->childmask = BIT(group->num_childs);
> +
> +			group->cpus[group->num_childs++] = cpu;
> +
> +			raw_spin_unlock_irqrestore(&group->lock, flags);
> +
> +			/* There are no children that need to be connected */
> +			continue;
> +		} else {
> +			child = stack[i - 1];
> +			tmigr_connect_child_parent(child, group);
> +		}
> +
> +		/* Check whether the uppermost level was newly created */
> +		if (top != i)
> +			continue;
> +
> +		DBG_BUG_ON(top == 0);
> +
> +		lvllist = &tmigr_level_list[top];
> +		if (group->num_childs == 1 && list_is_singular(lvllist)) {
> +			lvllist = &tmigr_level_list[top - 1];
> +			list_for_each_entry(child, lvllist, list) {
> +				if (child->parent)
> +					continue;
> +
> +				tmigr_connect_child_parent(child, group);
> +			}
> +		}
> +	} while (i > 0);
> +
> +	kfree(stack);
> +
> +	return err;
> +}
> +
> +static int tmigr_add_cpu(unsigned int cpu)
> +{
> +	unsigned int node = cpu_to_node(cpu);
> +	int ret;
> +	mutex_lock(&tmigr_mutex);
> +	ret = tmigr_setup_groups(cpu, node);
> +	mutex_unlock(&tmigr_mutex);
> +
> +	return ret;
> +}
> +
> +static int tmigr_cpu_online(unsigned int cpu)
> +{
> +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> +	unsigned long flags;
> +	int ret;
> +
> +	/* First online attempt? Initialize CPU data */
> +	if (!tmc->tmgroup) {
> +		raw_spin_lock_init(&tmc->lock);
> +
> +		ret = tmigr_add_cpu(cpu);
> +		if (ret < 0)
> +			return ret;
> +
> +		if (tmc->childmask == 0)
> +			return -EINVAL;
> +
> +		timerqueue_init(&tmc->cpuevt.nextevt);
> +		tmc->cpuevt.nextevt.expires = KTIME_MAX;
> +		tmc->cpuevt.ignore = 1;
> +		tmc->cpuevt.cpu = cpu;
> +
> +		tmc->remote = 0;
> +		tmc->idle = 0;
> +		tmc->wakeup = KTIME_MAX;
> +	}
> +	raw_spin_lock_irqsave(&tmc->lock, flags);

You just called tmigr_add_cpu(), which takes a mutex, so there is no
possible way this can be called with IRQs already disabled. Please use
raw_spin_lock_irq() to reduce confusion.

Also, tmigr_cpu_offline() below already does that, so this is inconsistent.
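
I.e. the tail of tmigr_cpu_online() would become (sketch):

	raw_spin_lock_irq(&tmc->lock);
	__tmigr_cpu_activate(tmc);
	tmc->online = 1;
	raw_spin_unlock_irq(&tmc->lock);
	return 0;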

> +	__tmigr_cpu_activate(tmc);
> +	tmc->online = 1;
> +	raw_spin_unlock_irqrestore(&tmc->lock, flags);
> +	return 0;
> +}
> +
> +static int tmigr_cpu_offline(unsigned int cpu)
> +{
> +	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
> +
> +	raw_spin_lock_irq(&tmc->lock);
> +	tmc->online = 0;
> +	__tmigr_cpu_deactivate(tmc, KTIME_MAX);
> +	raw_spin_unlock_irq(&tmc->lock);
> +
> +	return 0;
> +}
