Date:   Fri, 29 Jul 2022 15:33:22 +0800
From:   Xuewen Yan <xuewen.yan94@...il.com>
To:     Tejun Heo <tj@...nel.org>
Cc:     Waiman Long <longman@...hat.com>,
        Qais Yousef <qais.yousef@....com>,
        Xuewen Yan <xuewen.yan@...soc.com>, rafael@...nel.org,
        viresh.kumar@...aro.org, mingo@...hat.com, peterz@...radead.org,
        juri.lelli@...hat.com, vincent.guittot@...aro.org,
        dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
        mgorman@...e.de, bristot@...hat.com, linux-kernel@...r.kernel.org,
        ke.wang@...soc.com, xuewyan@...mail.com, linux-pm@...r.kernel.org,
        Lukasz Luba <Lukasz.Luba@....com>, pengcheng.lai@...soc.com
Subject: Re: [PATCH] sched/schedutil: Fix deadlock between cpuset and cpu
 hotplug when using schedutil

Hi Tejun

On Thu, Jul 28, 2022 at 4:09 AM Tejun Heo <tj@...nel.org> wrote:
>
> Hello,
>
>
> Can you please see whether the following patch fixes the problem?
>

I have tested the patch. Sadly, although the original deadlock call
stack is gone, there seems to be a new problem. I have only done a
preliminary analysis so far and have not found the root cause yet,
but I think I should let you know the test results first. Once I
find the root cause, I'll report back immediately.
Thread A (OomAdjuster, PID 1106) is waiting for the cpu_hotplug_lock
writer, while the writer (test.sh, PID 15760, which matches the
writer.task pointer in the lock dump below) is in turn waiting for
the readers:

PID: 1106   TASK: ffffff8097e90000  CPU: 7   COMMAND: "OomAdjuster"
 #0 [ffffffc011a23850] __switch_to at ffffffc0081229b4
 #1 [ffffffc011a238b0] __schedule at ffffffc009c824f8
 #2 [ffffffc011a23910] schedule at ffffffc009c82b50
 #3 [ffffffc011a23970] percpu_rwsem_wait at ffffffc00828fbf4
 #4 [ffffffc011a239b0] __percpu_down_read at ffffffc0082901a8
 #5 [ffffffc011a239e0] cpus_read_lock at ffffffc0081dadc8
 #6 [ffffffc011a23a20] cpuset_attach at ffffffc008366f10
 #7 [ffffffc011a23a90] cgroup_migrate_execute at ffffffc00834f808
 #8 [ffffffc011a23b60] cgroup_attach_task at ffffffc0083539f0
 #9 [ffffffc011a23bd0] __cgroup1_procs_write at ffffffc00835f6e8
#10 [ffffffc011a23c30] cgroup1_tasks_write at ffffffc00835f054
#11 [ffffffc011a23c60] cgroup_file_write at ffffffc008348b2c
#12 [ffffffc011a23c90] kernfs_fop_write_iter at ffffffc008754514
#13 [ffffffc011a23d50] vfs_write at ffffffc008607ac8
#14 [ffffffc011a23db0] ksys_write at ffffffc008607714
#15 [ffffffc011a23df0] __arm64_sys_write at ffffffc00860767c
#16 [ffffffc011a23e10] invoke_syscall at ffffffc00813dcac
#17 [ffffffc011a23e30] el0_svc_common at ffffffc00813dbc4
#18 [ffffffc011a23e70] do_el0_svc at ffffffc00813daa8
#19 [ffffffc011a23e80] el0_svc at ffffffc0098828d8
#20 [ffffffc011a23ea0] el0t_64_sync_handler at ffffffc00988284c
#21 [ffffffc011a23fe0] el0t_64_sync at ffffffc008091e44

cpu_hotplug_lock = $3 = {
  rss = {
    gp_state = 2,
    gp_count = 1,
    gp_wait = {
      lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 0
                },
                {
                  locked = 0 '\000',
                  pending = 0 '\000'
                },
                {
                  locked_pending = 0,
                  tail = 0
                }
              }
            }
          }
        }
      },
      head = {
        next = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>,
        prev = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>
      }
    },
    cb_head = {
      next = 0x0,
      func = 0x0
    }
  },
  read_count = 0xffffffc00b0a0808 <__percpu_rwsem_rc_cpu_hotplug_lock>,
  writer = {
    task = 0xffffff80f303ca00
  },
  {
    waiters = {
      lock = {
        {
          rlock = {
            raw_lock = {
              {
                val = {
                  counter = 0
                },
                {
                  locked = 0 '\000',
                  pending = 0 '\000'
                },
                {
                  locked_pending = 0,
                  tail = 0
                }
              }
            }
          }
        }
      },
      head = {
        next = 0xffffffc011a23958,
        prev = 0xffffffc01430bcb8
      }
    },
    destroy_list_entry = {
      next = 0x0,
      prev = 0xffffffc011a23958
    }
  },
  block = {
    counter = 1
  }
}

PID: 15760  TASK: ffffff80f303ca00  CPU: 5   COMMAND: "test.sh"
 #0 [ffffffc0129639a0] __switch_to at ffffffc0081229b4
 #1 [ffffffc012963a00] __schedule at ffffffc009c824f8
 #2 [ffffffc012963a60] schedule at ffffffc009c82b50
 #3 [ffffffc012963a90] percpu_down_write at ffffffc00828f9d8
 #4 [ffffffc012963ae0] _cpu_down at ffffffc009884550
 #5 [ffffffc012963b40] cpu_device_down at ffffffc0081df814
 #6 [ffffffc012963b60] cpu_subsys_offline at ffffffc008d8dd8c
 #7 [ffffffc012963b90] device_offline at ffffffc008d77124
 #8 [ffffffc012963bd0] online_store at ffffffc008d76d44
 #9 [ffffffc012963c30] dev_attr_store at ffffffc008d7ba30
#10 [ffffffc012963c60] sysfs_kf_write at ffffffc0087578d0
#11 [ffffffc012963c90] kernfs_fop_write_iter at ffffffc008754514
#12 [ffffffc012963d50] vfs_write at ffffffc008607ac8
#13 [ffffffc012963db0] ksys_write at ffffffc008607714
#14 [ffffffc012963df0] __arm64_sys_write at ffffffc00860767c
#15 [ffffffc012963e10] invoke_syscall at ffffffc00813dcac
#16 [ffffffc012963e30] el0_svc_common at ffffffc00813dbf4
#17 [ffffffc012963e70] do_el0_svc at ffffffc00813daa8
#18 [ffffffc012963e80] el0_svc at ffffffc0098828d8
#19 [ffffffc012963ea0] el0t_64_sync_handler at ffffffc00988284c
#20 [ffffffc012963fe0] el0t_64_sync at ffffffc008091e44
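
For reference, here is how the two tasks relate, as far as I can tell
from the stacks above together with your patch (a simplified sketch,
not a root-cause analysis):

    OomAdjuster (PID 1106):
        lock_threadgroup()
            cpus_read_lock()           /* takes a hotplug read lock */
            percpu_down_write(&cgroup_threadgroup_rwsem)
        cgroup_migrate_execute()
            cpuset_attach()
                cpus_read_lock()       /* blocks: hotplug writer pending */

    test.sh (PID 15760):
        _cpu_down()
            percpu_down_write(&cpu_hotplug_lock)
                                       /* blocks: waits for readers */

If I read it correctly, the attach path now takes cpus_read_lock()
twice, once in lock_threadgroup() and again in cpuset_attach(), and
the second read blocks behind the pending writer, which in turn waits
for the outstanding readers to drain. I still need to confirm whether
this is really what is happening.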

Thanks!
BR
-- 
xuewen.yan

> Thanks.
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 13c8e91d78620..7caefc8af9127 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2345,6 +2345,38 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
>  }
>  EXPORT_SYMBOL_GPL(task_cgroup_path);
>
> +/**
> + * lock_threadgroup - Stabilize threadgroups
> + *
> + * cgroup migration operations need the threadgroups stabilized against forks
> + * and exits, which can be achieved by write-locking cgroup_threadgroup_rwsem.
> + *
> + * Note that write-locking threadgroup_rwsem is nested inside CPU hotplug
> + * disable. This is because cpuset needs CPU hotplug disabled during ->attach()
> + * and bringing up a CPU may involve creating new tasks which can require
> + * read-locking threadgroup_rwsem. If we call cpuset's ->attach() with
> + * threadgroup_rwsem write-locked with hotplug enabled, we can deadlock by
> + * cpuset waiting for an on-going CPU hotplug operation which in turn is waiting
> + * for the threadgroup_rwsem to be released to create new tasks. See the
> + * following for more details:
> + *
> + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
> + */
> +static void lock_threadgroup(void)
> +{
> +       cpus_read_lock();
> +       percpu_down_write(&cgroup_threadgroup_rwsem);
> +}
> +
> +/**
> + * unlock_threadgroup - Undo lock_threadgroup
> + */
> +static void unlock_threadgroup(void)
> +{
> +       percpu_up_write(&cgroup_threadgroup_rwsem);
> +       cpus_read_unlock();
> +}
> +
>  /**
>   * cgroup_migrate_add_task - add a migration target task to a migration context
>   * @task: target task
> @@ -2822,7 +2854,6 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
>
>  struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>                                              bool *locked)
> -       __acquires(&cgroup_threadgroup_rwsem)
>  {
>         struct task_struct *tsk;
>         pid_t pid;
> @@ -2840,7 +2871,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>          */
>         lockdep_assert_held(&cgroup_mutex);
>         if (pid || threadgroup) {
> -               percpu_down_write(&cgroup_threadgroup_rwsem);
> +               lock_threadgroup();
>                 *locked = true;
>         } else {
>                 *locked = false;
> @@ -2876,7 +2907,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>
>  out_unlock_threadgroup:
>         if (*locked) {
> -               percpu_up_write(&cgroup_threadgroup_rwsem);
> +               unlock_threadgroup();
>                 *locked = false;
>         }
>  out_unlock_rcu:
> @@ -2885,7 +2916,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>  }
>
>  void cgroup_procs_write_finish(struct task_struct *task, bool locked)
> -       __releases(&cgroup_threadgroup_rwsem)
>  {
>         struct cgroup_subsys *ss;
>         int ssid;
> @@ -2894,7 +2924,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
>         put_task_struct(task);
>
>         if (locked)
> -               percpu_up_write(&cgroup_threadgroup_rwsem);
> +               unlock_threadgroup();
> +
>         for_each_subsys(ss, ssid)
>                 if (ss->post_attach)
>                         ss->post_attach();
> @@ -2953,7 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
>
>         lockdep_assert_held(&cgroup_mutex);
>
> -       percpu_down_write(&cgroup_threadgroup_rwsem);
> +       lock_threadgroup();
>
>         /* look up all csses currently attached to @cgrp's subtree */
>         spin_lock_irq(&css_set_lock);
> @@ -2984,7 +3015,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
>         ret = cgroup_migrate_execute(&mgctx);
>  out_finish:
>         cgroup_migrate_finish(&mgctx);
> -       percpu_up_write(&cgroup_threadgroup_rwsem);
> +       unlock_threadgroup();
>         return ret;
>  }
>
