linux-kernel - Re: [PATCH v6 19/21] timer: Implement the hierarchical pull model

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230515110646.zQCEzB8f@linutronix.de>
Date:   Mon, 15 May 2023 13:06:46 +0200
From:   Sebastian Siewior <bigeasy@...utronix.de>
To:     Anna-Maria Behnsen <anna-maria@...utronix.de>
Cc:     linux-kernel@...r.kernel.org,
        Peter Zijlstra <peterz@...radead.org>,
        John Stultz <jstultz@...gle.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Eric Dumazet <edumazet@...gle.com>,
        "Rafael J . Wysocki" <rafael.j.wysocki@...el.com>,
        Arjan van de Ven <arjan@...radead.org>,
        "Paul E . McKenney" <paulmck@...nel.org>,
        Frederic Weisbecker <fweisbec@...il.com>,
        Rik van Riel <riel@...riel.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Giovanni Gherdovich <ggherdovich@...e.cz>,
        Lukasz Luba <lukasz.luba@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>
Subject: Re: [PATCH v6 19/21] timer: Implement the hierarchical pull model

On 2023-05-10 09:28:15 [+0200], Anna-Maria Behnsen wrote:
> diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
> new file mode 100644
> index 000000000000..accce2b368f4
> --- /dev/null
> +++ b/kernel/time/timer_migration.c
> @@ -0,0 +1,1322 @@
…
> + * 4. Propagating the changes of step 1 through the hierarchy to GRP1:0
> + *
> + *    LVL 1            [GRP1:0]
> + *                  -> migrator = GRP0:1
> + *                  -> active   = GRP0:1
> + *                   /                \
> + *    LVL 0  [GRP0:0]                  [GRP0:1]
> + *           migrator = CPU1           migrator = CPU2
> + *           active   = CPU1           active   = CPU2
> + *              /         \                /         \
> + *    CPUs     0           1              2           3
> + *             idle        active         active      idle
> + *
> + * Now there is a inconsistent overall state because GRP0:0 is active, but
                   an

> + * it is marked as idle in the GRP1:0. This is prevented by incrementing
> + * sequence counter whenever changing the state.
> + */
> +
> +#ifdef DEBUG
> +# define DBG_BUG_ON(x) BUG_ON(x)
> +#else
> +# define DBG_BUG_ON(x)
> +#endif

I would make them WARN_ON_ONCE() in !DEBUG case or remove it. There is
nothing that sets DEBUG here, right?

.
> +static bool tmigr_update_events(struct tmigr_group *group,
…
> +	if (nextexp == KTIME_MAX) {
> +		evt->ignore = 1;
> +
> +		/*
> +		 * When next event could be ignored (nextexp is KTIME MAX)
> +		 * and there was no remote timer handling before or the
> +		 * group is already active, there is no need to walk the
> +		 * hierarchy even if there is a parent group.
> +		 *
> +		 * The other way round: even if the event could be ignored,
> +		 * but if a remote timer handling was executed before and
> +		 * the group is not active, walking the hierarchy is
> +		 * required to not miss a enqueued timer in the non active
                                        an

> +		 * group. The enqueued timer needs to be propagated to a
> +		 * higher level to ensure it is handeld.
…
> +static struct tmigr_group *tmigr_get_group(unsigned int cpu, unsigned int node,
> +					   unsigned int lvl)
> +{
> +	struct tmigr_group *tmp, *group = NULL;
> +	bool first_loop = true;
> +
> +	lockdep_assert_held(&tmigr_mutex);
> +
> +reloop:
> +	/* Try to attach to an exisiting group first */
> +	list_for_each_entry(tmp, &tmigr_level_list[lvl], list) {
> +		/*
> +		 * If @lvl is below the cross numa node level, check whether
> +		 * this group belongs to the same numa node.
> +		 */
> +		if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
> +			continue;
> +
> +		/* Capacity left? */
> +		if (tmp->num_children >= TMIGR_CHILDS_PER_GROUP)
> +			continue;
> +
> +		/*
> +		 * TODO: A possible further improvement: Make sure that all
> +		 * CPU siblings end up in the same group of lowest level of
> +		 * the hierarchy. Rely on topology sibling mask would be a
> +		 * reasonable solution.
> +		 */
> +
> +		group = tmp;
> +		break;
> +	}
> +
> +	if (group) {
> +		return group;
> +	} else if (first_loop == true) {
> +		first_loop = false;

Why do have this? There is no lock drop or anything that could alter the
outcome of the loop iteration or do I miss something that obvious?

> +		goto reloop;
> +	}
> +
> +	/* Allocate and	set up a new group */
> +	group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
> +	if (!group)
> +		return ERR_PTR(-ENOMEM);
> +
> +	tmigr_init_group(group, lvl, node);
> +
> +	/* Setup successful. Add it to the hierarchy */
> +	list_add(&group->list, &tmigr_level_list[lvl]);

Not sure if it makes any difference if the item is added to the front of
the list or the end but be aware that this adds it to the front.

> +	return group;
> +}
…
> +static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
> +{
> +	struct tmigr_group *group, *child, **stack;
> +	int top = 0, err = 0, i = 0;
> +	struct list_head *lvllist;
> +	size_t sz;
> +
> +	sz = sizeof(struct tmigr_group *) * tmigr_hierarchy_levels;
> +	stack = kzalloc(sz, GFP_KERNEL);

kcalloc().

> +	if (!stack)
> +		return -ENOMEM;
> +
> +	do {
> +		group = tmigr_get_group(cpu, node, i);
> +		if (IS_ERR(group)) {
> +			err = IS_ERR(group);

This needs to be
	err = PTR_ERR(group);

since now group is either 1 or 0.
_OR_ you err a bool an replace the check later from
	if (err < 0)
with
	if (err)

since you don't actually care about the error code but need to know if
it failed.

> +			break;
> +		}
> +
> +		top = i;
> +		stack[i++] = group;
> +
> +		/*
> +		 * When booting only less CPUs of a system than CPUs are
> +		 * available, not all calculated hierarchy levels are required.
> +		 *
> +		 * The loop is aborted as soon as the highest level, which might
> +		 * be different from tmigr_hierarchy_levels, contains only a
> +		 * single group.
> +		 */
> +		if (group->parent || i == tmigr_hierarchy_levels ||
> +		    (list_empty(&tmigr_level_list[i]) &&
> +		     list_is_singular(&tmigr_level_list[i - 1])))
> +			break;
> +
> +	} while (i < tmigr_hierarchy_levels);
> +
> +	do {
> +		group = stack[--i];
> +
> +		if (err < 0) {
> +			list_del(&group->list);
> +			kfree(group);
> +			continue;
> +		}
…
> +static int __init tmigr_init(void)
> +{
> +	unsigned int cpulvl, nodelvl, cpus_per_node, i;
> +	unsigned int nnodes = num_possible_nodes();
> +	unsigned int ncpus = num_possible_cpus();
> +	int ret = -ENOMEM;
> +	size_t sz;
…
> +	sz = sizeof(struct list_head) * tmigr_hierarchy_levels;
> +	tmigr_level_list = kzalloc(sz, GFP_KERNEL);
kcalloc().

…
> --- /dev/null
> +++ b/kernel/time/timer_migration.h
> @@ -0,0 +1,138 @@
…
> +struct tmigr_group {
> +	raw_spinlock_t		lock;
> +	atomic_t		migr_state;
> +	struct tmigr_group	*parent;
> +	struct tmigr_event	groupevt;
> +	u64			next_expiry;
> +	struct timerqueue_head	events;
> +	u8			childmask;
> +	unsigned int		level;
> +	struct list_head	list;
> +	unsigned int		numa_node;

numa_node is usually an int (not unsigned).

> +	unsigned int		num_children;
> +};

Sebastian