linux-kernel - Re: [PATCH sched_ext/for-6.11 1/2] sched

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAADnVQ+h2W88nWnj_frPa24vYmE+yebHYaT6mronRnDYvC+JLQ@mail.gmail.com>
Date: Thu, 27 Jun 2024 18:11:48 -0700
From: Alexei Starovoitov <alexei.starovoitov@...il.com>
To: Tejun Heo <tj@...nel.org>
Cc: Alexei Starovoitov <ast@...nel.org>, LKML <linux-kernel@...r.kernel.org>, 
	bpf <bpf@...r.kernel.org>, David Vernet <void@...ifault.com>
Subject: Re: [PATCH sched_ext/for-6.11 1/2] sched_ext: Implement DSQ iterator

On Thu, Jun 27, 2024 at 5:20 PM Tejun Heo <tj@...nel.org> wrote:
>
> DSQs are very opaque in the consumption path. The BPF scheduler has no way
> of knowing which tasks are being considered and which is picked. This patch
> adds BPF DSQ iterator.
>
> - Allows iterating tasks queued on a DSQ in the dispatch order or reverse
>   from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs
>   directly.
>
> - Has ordering guarantee where only tasks which were already queued when the
>   iteration started are visible and consumable during the iteration.
>
> scx_qmap is updated to implement periodic dumping of the shared DSQ.
>
> v2: - scx_bpf_consume_task() is separated out into a separate patch.
>
>     - DSQ seq and iter flags don't need to be u64. Use u32.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Reviewed-by: David Vernet <dvernet@...a.com>
> Cc: Alexei Starovoitov <ast@...nel.org>
> Cc: bpf@...r.kernel.org
> ---
> Hello, Alexei.
>
> These two patches implement inline iterator for a task queue data structure
> that's used by sched_ext. The first one implements the iterator itself. It's
> pretty straightforward and seems to work fine. The second one implements a
> kfunc which consumes a task while iterating. This one is a bit nasty
> unfortunately. I'll continue on the second patch.
>
> Thanks.
>
>  include/linux/sched/ext.h                |    4
>  kernel/sched/ext.c                       |  182 ++++++++++++++++++++++++++++++-
>  tools/sched_ext/include/scx/common.bpf.h |    3
>  tools/sched_ext/scx_qmap.bpf.c           |   25 ++++
>  tools/sched_ext/scx_qmap.c               |    8 +
>  5 files changed, 218 insertions(+), 4 deletions(-)
>
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -61,6 +61,7 @@ struct scx_dispatch_q {
>         struct list_head        list;   /* tasks in dispatch order */
>         struct rb_root          priq;   /* used to order by p->scx.dsq_vtime */
>         u32                     nr;
> +       u32                     seq;    /* used by BPF iter */
>         u64                     id;
>         struct rhash_head       hash_node;
>         struct llist_node       free_node;
> @@ -94,6 +95,8 @@ enum scx_task_state {
>  /* scx_entity.dsq_flags */
>  enum scx_ent_dsq_flags {
>         SCX_TASK_DSQ_ON_PRIQ    = 1 << 0, /* task is queued on the priority queue of a dsq */
> +
> +       SCX_TASK_DSQ_CURSOR     = 1 << 31, /* iteration cursor, not a task */
>  };
>
>  /*
> @@ -134,6 +137,7 @@ struct scx_dsq_node {
>  struct sched_ext_entity {
>         struct scx_dispatch_q   *dsq;
>         struct scx_dsq_node     dsq_node;       /* protected by dsq lock */
> +       u32                     dsq_seq;
>         u32                     flags;          /* protected by rq lock */
>         u32                     weight;
>         s32                     sticky_cpu;
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1066,6 +1066,72 @@ static __always_inline bool scx_kf_allow
>         return true;
>  }
>
> +/**
> + * nldsq_next_task - Iterate to the next task in a non-local DSQ
> + * @dsq: user dsq being interated
> + * @cur: current position, %NULL to start iteration
> + * @rev: walk backwards
> + *
> + * Returns %NULL when iteration is finished.
> + */
> +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
> +                                          struct task_struct *cur, bool rev)
> +{
> +       struct list_head *list_node;
> +       struct scx_dsq_node *dsq_node;
> +
> +       lockdep_assert_held(&dsq->lock);
> +
> +       if (cur)
> +               list_node = &cur->scx.dsq_node.list;
> +       else
> +               list_node = &dsq->list;
> +
> +       /* find the next task, need to skip BPF iteration cursors */
> +       do {
> +               if (rev)
> +                       list_node = list_node->prev;
> +               else
> +                       list_node = list_node->next;
> +
> +               if (list_node == &dsq->list)
> +                       return NULL;
> +
> +               dsq_node = container_of(list_node, struct scx_dsq_node, list);
> +       } while (dsq_node->flags & SCX_TASK_DSQ_CURSOR);
> +
> +       return container_of(dsq_node, struct task_struct, scx.dsq_node);
> +}
> +
> +#define nldsq_for_each_task(p, dsq)                                            \
> +       for ((p) = nldsq_next_task((dsq), NULL, false); (p);                    \
> +            (p) = nldsq_next_task((dsq), (p), false))
> +
> +
> +/*
> + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
> + * dispatch order. BPF-visible iterator is opaque and larger to allow future
> + * changes without breaking backward compatibility. Can be used with
> + * bpf_for_each(). See bpf_iter_scx_dsq_*().
> + */
> +enum scx_dsq_iter_flags {
> +       /* iterate in the reverse dispatch order */
> +       SCX_DSQ_ITER_REV                = 1U << 0,
> +
> +       __SCX_DSQ_ITER_ALL_FLAGS        = SCX_DSQ_ITER_REV,
> +};
> +
> +struct bpf_iter_scx_dsq_kern {
> +       struct scx_dsq_node             cursor;
> +       struct scx_dispatch_q           *dsq;
> +       u32                             dsq_seq;
> +       u32                             flags;
> +} __attribute__((aligned(8)));
> +
> +struct bpf_iter_scx_dsq {
> +       u64                             __opaque[12];
> +} __attribute__((aligned(8)));

I think this is a bit too much to put on the prog stack.
Folks are working on increasing this limit and moving
the stack into "divided stack", so it won't be an issue eventually,
but let's find a way to reduce it.
It seems to me scx_dsq_node has a bunch of fields,
but if I'm reading the code correctly this patch is
only using cursor.list part of it ?

Another alternative is to use bpf_mem_alloc() like we do
in bpf_iter_css_task and others?

> +
>
>  /*
>   * SCX task iterator.
> @@ -1415,7 +1481,7 @@ static void dispatch_enqueue(struct scx_
>                  * tested easily when adding the first task.
>                  */
>                 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
> -                            !list_empty(&dsq->list)))
> +                            nldsq_next_task(dsq, NULL, false)))
>                         scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
>                                       dsq->id);
>
> @@ -1447,6 +1513,10 @@ static void dispatch_enqueue(struct scx_
>                         list_add_tail(&p->scx.dsq_node.list, &dsq->list);
>         }
>
> +       /* seq records the order tasks are queued, used by BPF DSQ iterator */
> +       dsq->seq++;
> +       p->scx.dsq_seq = dsq->seq;
> +
>         dsq_mod_nr(dsq, 1);
>         p->scx.dsq = dsq;
>
> @@ -2109,7 +2179,7 @@ retry:
>
>         raw_spin_lock(&dsq->lock);
>
> -       list_for_each_entry(p, &dsq->list, scx.dsq_node.list) {
> +       nldsq_for_each_task(p, dsq) {
>                 struct rq *task_rq = task_rq(p);
>
>                 if (rq == task_rq) {
> @@ -5697,6 +5767,111 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64
>         destroy_dsq(dsq_id);
>  }
>
> +/**
> + * bpf_iter_scx_dsq_new - Create a DSQ iterator
> + * @it: iterator to initialize
> + * @dsq_id: DSQ to iterate
> + * @flags: %SCX_DSQ_ITER_*
> + *
> + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
> + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
> + * tasks which are already queued when this function is invoked.
> + */
> +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
> +                                    u64 flags)
> +{
> +       struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
> +                    sizeof(struct bpf_iter_scx_dsq));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
> +                    __alignof__(struct bpf_iter_scx_dsq));
> +
> +       if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS)
> +               return -EINVAL;
> +
> +       kit->dsq = find_non_local_dsq(dsq_id);
> +       if (!kit->dsq)
> +               return -ENOENT;
> +
> +       INIT_LIST_HEAD(&kit->cursor.list);
> +       RB_CLEAR_NODE(&kit->cursor.priq);
> +       kit->cursor.flags = SCX_TASK_DSQ_CURSOR;

Are these two assignments really necessary?
Something inside nldsq_next_task() is using that?


> +       kit->dsq_seq = READ_ONCE(kit->dsq->seq);
> +       kit->flags = flags;
> +
> +       return 0;
> +}