[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJqdLrry4HaHpsWGH_YUUr_mWetCCogOxnw1SMRW-XcWE_FXcg@mail.gmail.com>
Date: Tue, 25 Feb 2025 18:36:37 +0100
From: Alexander Mikhalitsyn <alexander@...alicyn.com>
To: Michal Koutný <mkoutny@...e.com>
Cc: Christian Brauner <brauner@...nel.org>, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>,
Kees Cook <kees@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>,
"Eric W . Biederman" <ebiederm@...ssion.com>, Oleg Nesterov <oleg@...hat.com>
Subject: Re: [PATCH 1/2] Revert "pid: allow pid_max to be set per pid namespace"
Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@...e.com>:
>
> This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80.
If we revert this one, then we should also revert a corresponding kselftest:
https://github.com/torvalds/linux/commit/615ab43b838bb982dc234feff75ee9ad35447c5d
>
> It is already difficult for users to troubleshoot which of multiple pid
> limits restricts their workload. I'm afraid making pid_max
> per-(hierarchical-)NS will contribute to confusion.
> Also, the implementation copies the limit upon creation from
> parent, this pattern showed cumbersome with some attributes in legacy
> cgroup controllers -- it's subject to race condition between parent's
> limit modification and children creation and once copied it must be
> changed in the descendant.
>
> This is very similar to what pids.max of a cgroup (already) does that
> can be used as an alternative.
>
> Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/
> Signed-off-by: Michal Koutný <mkoutny@...e.com>
> ---
> include/linux/pid.h | 3 +
> include/linux/pid_namespace.h | 10 +--
> kernel/pid.c | 125 ++----------------------------
> kernel/pid_namespace.c | 43 +++-------
> kernel/sysctl.c | 9 +++
> kernel/trace/pid_list.c | 2 +-
> kernel/trace/trace.h | 2 +
> kernel/trace/trace_sched_switch.c | 2 +-
> 8 files changed, 35 insertions(+), 161 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f33..fe575fcdb4afa 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
> extern void transfer_pid(struct task_struct *old, struct task_struct *new,
> enum pid_type);
>
> +extern int pid_max;
> +extern int pid_max_min, pid_max_max;
> +
> /*
> * look up a PID in the hash table. Must be called with the tasklist_lock
> * or rcu_read_lock() held.
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 7c67a58111998..f9f9931e02d6a 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -30,7 +30,6 @@ struct pid_namespace {
> struct task_struct *child_reaper;
> struct kmem_cache *pid_cachep;
> unsigned int level;
> - int pid_max;
> struct pid_namespace *parent;
> #ifdef CONFIG_BSD_PROCESS_ACCT
> struct fs_pin *bacct;
> @@ -39,14 +38,9 @@ struct pid_namespace {
> struct ucounts *ucounts;
> int reboot; /* group exit code if this pidns was rebooted */
> struct ns_common ns;
> - struct work_struct work;
> -#ifdef CONFIG_SYSCTL
> - struct ctl_table_set set;
> - struct ctl_table_header *sysctls;
> -#if defined(CONFIG_MEMFD_CREATE)
> +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> int memfd_noexec_scope;
> #endif
> -#endif
> } __randomize_layout;
>
> extern struct pid_namespace init_pid_ns;
> @@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
> void pidhash_init(void);
> void pid_idr_init(void);
> -int register_pidns_sysctls(struct pid_namespace *pidns);
> -void unregister_pidns_sysctls(struct pid_namespace *pidns);
>
> static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
> {
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 924084713be8b..aa2a7d4da4555 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -61,8 +61,10 @@ struct pid init_struct_pid = {
> }, }
> };
>
> -static int pid_max_min = RESERVED_PIDS + 1;
> -static int pid_max_max = PID_MAX_LIMIT;
> +int pid_max = PID_MAX_DEFAULT;
> +
> +int pid_max_min = RESERVED_PIDS + 1;
> +int pid_max_max = PID_MAX_LIMIT;
>
> /*
> * PID-map pages start out as NULL, they get allocated upon
> @@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = {
> #ifdef CONFIG_PID_NS
> .ns.ops = &pidns_operations,
> #endif
> - .pid_max = PID_MAX_DEFAULT,
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
> #endif
> @@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>
> for (i = ns->level; i >= 0; i--) {
> int tid = 0;
> - int pid_max = READ_ONCE(tmp->pid_max);
>
> if (set_tid_size) {
> tid = set_tid[ns->level - i];
> @@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> return fd;
> }
>
> -#ifdef CONFIG_SYSCTL
> -static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
> -{
> - return &task_active_pid_ns(current)->set;
> -}
> -
> -static int set_is_seen(struct ctl_table_set *set)
> -{
> - return &task_active_pid_ns(current)->set == set;
> -}
> -
> -static int pid_table_root_permissions(struct ctl_table_header *head,
> - const struct ctl_table *table)
> -{
> - struct pid_namespace *pidns =
> - container_of(head->set, struct pid_namespace, set);
> - int mode = table->mode;
> -
> - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
> - uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
> - mode = (mode & S_IRWXU) >> 6;
> - else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
> - mode = (mode & S_IRWXG) >> 3;
> - else
> - mode = mode & S_IROTH;
> - return (mode << 6) | (mode << 3) | mode;
> -}
> -
> -static void pid_table_root_set_ownership(struct ctl_table_header *head,
> - kuid_t *uid, kgid_t *gid)
> -{
> - struct pid_namespace *pidns =
> - container_of(head->set, struct pid_namespace, set);
> - kuid_t ns_root_uid;
> - kgid_t ns_root_gid;
> -
> - ns_root_uid = make_kuid(pidns->user_ns, 0);
> - if (uid_valid(ns_root_uid))
> - *uid = ns_root_uid;
> -
> - ns_root_gid = make_kgid(pidns->user_ns, 0);
> - if (gid_valid(ns_root_gid))
> - *gid = ns_root_gid;
> -}
> -
> -static struct ctl_table_root pid_table_root = {
> - .lookup = pid_table_root_lookup,
> - .permissions = pid_table_root_permissions,
> - .set_ownership = pid_table_root_set_ownership,
> -};
> -
> -static const struct ctl_table pid_table[] = {
> - {
> - .procname = "pid_max",
> - .data = &init_pid_ns.pid_max,
> - .maxlen = sizeof(int),
> - .mode = 0644,
> - .proc_handler = proc_dointvec_minmax,
> - .extra1 = &pid_max_min,
> - .extra2 = &pid_max_max,
> - },
> -};
> -#endif
> -
> -int register_pidns_sysctls(struct pid_namespace *pidns)
> -{
> -#ifdef CONFIG_SYSCTL
> - struct ctl_table *tbl;
> -
> - setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
> -
> - tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
> - if (!tbl)
> - return -ENOMEM;
> - tbl->data = &pidns->pid_max;
> - pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
> - PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> -
> - pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
> - ARRAY_SIZE(pid_table));
> - if (!pidns->sysctls) {
> - kfree(tbl);
> - retire_sysctl_set(&pidns->set);
> - return -ENOMEM;
> - }
> -#endif
> - return 0;
> -}
> -
> -void unregister_pidns_sysctls(struct pid_namespace *pidns)
> -{
> -#ifdef CONFIG_SYSCTL
> - const struct ctl_table *tbl;
> -
> - tbl = pidns->sysctls->ctl_table_arg;
> - unregister_sysctl_table(pidns->sysctls);
> - retire_sysctl_set(&pidns->set);
> - kfree(tbl);
> -#endif
> -}
> -
> void __init pid_idr_init(void)
> {
> /* Verify no one has done anything silly: */
> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
>
> /* bump default and minimum pid_max based on number of cpus */
> - init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
> - PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> + pid_max = min(pid_max_max, max_t(int, pid_max,
> + PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> pid_max_min = max_t(int, pid_max_min,
> PIDS_PER_CPU_MIN * num_possible_cpus());
> - pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
> + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
>
> idr_init(&init_pid_ns.idr);
>
> @@ -766,16 +665,6 @@ void __init pid_idr_init(void)
> NULL);
> }
>
> -static __init int pid_namespace_sysctl_init(void)
> -{
> -#ifdef CONFIG_SYSCTL
> - /* "kernel" directory will have already been initialized. */
> - BUG_ON(register_pidns_sysctls(&init_pid_ns));
> -#endif
> - return 0;
> -}
> -subsys_initcall(pid_namespace_sysctl_init);
> -
> static struct file *__pidfd_fget(struct task_struct *task, int fd)
> {
> struct file *file;
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 8f6cfec87555a..0f23285be4f92 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
> dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
> }
>
> -static void destroy_pid_namespace_work(struct work_struct *work);
> -
> static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
> struct pid_namespace *parent_pid_ns)
> {
> @@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> goto out_free_idr;
> ns->ns.ops = &pidns_operations;
>
> - ns->pid_max = parent_pid_ns->pid_max;
> - err = register_pidns_sysctls(ns);
> - if (err)
> - goto out_free_inum;
> -
> refcount_set(&ns->ns.count, 1);
> ns->level = level;
> ns->parent = get_pid_ns(parent_pid_ns);
> ns->user_ns = get_user_ns(user_ns);
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
> - INIT_WORK(&ns->work, destroy_pid_namespace_work);
> -
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> #endif
> -
> return ns;
>
> -out_free_inum:
> - ns_free_inum(&ns->ns);
> out_free_idr:
> idr_destroy(&ns->idr);
> kmem_cache_free(pid_ns_cachep, ns);
> @@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p)
>
> static void destroy_pid_namespace(struct pid_namespace *ns)
> {
> - unregister_pidns_sysctls(ns);
> -
> ns_free_inum(&ns->ns);
>
> idr_destroy(&ns->idr);
> call_rcu(&ns->rcu, delayed_free_pidns);
> }
>
> -static void destroy_pid_namespace_work(struct work_struct *work)
> -{
> - struct pid_namespace *ns =
> - container_of(work, struct pid_namespace, work);
> -
> - do {
> - struct pid_namespace *parent;
> -
> - parent = ns->parent;
> - destroy_pid_namespace(ns);
> - ns = parent;
> - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
> -}
> -
> struct pid_namespace *copy_pid_ns(unsigned long flags,
> struct user_namespace *user_ns, struct pid_namespace *old_ns)
> {
> @@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
>
> void put_pid_ns(struct pid_namespace *ns)
> {
> - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
> - schedule_work(&ns->work);
> + struct pid_namespace *parent;
> +
> + while (ns != &init_pid_ns) {
> + parent = ns->parent;
> + if (!refcount_dec_and_test(&ns->ns.count))
> + break;
> + destroy_pid_namespace(ns);
> + ns = parent;
> + }
> }
> EXPORT_SYMBOL_GPL(put_pid_ns);
>
> @@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> next = idr_get_cursor(&pid_ns->idr) - 1;
>
> tmp.data = &next;
> - tmp.extra2 = &pid_ns->pid_max;
> ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> if (!ret && write)
> idr_set_cursor(&pid_ns->idr, next + 1);
> @@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> return ret;
> }
>
> +extern int pid_max;
> static const struct ctl_table pid_ns_ctl_table[] = {
> {
> .procname = "ns_last_pid",
> @@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = {
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> .extra1 = SYSCTL_ZERO,
> - .extra2 = &init_pid_ns.pid_max,
> + .extra2 = &pid_max,
> },
> };
> #endif /* CONFIG_CHECKPOINT_RESTORE */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index cb57da499ebb1..bb739608680f2 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = {
> .proc_handler = proc_dointvec,
> },
> #endif
> + {
> + .procname = "pid_max",
> + .data = &pid_max,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec_minmax,
> + .extra1 = &pid_max_min,
> + .extra2 = &pid_max_max,
> + },
> {
> .procname = "panic_on_oops",
> .data = &panic_on_oops,
> diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> index c62b9b3cfb3d8..4966e6bbdf6f3 100644
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
> int i;
>
> /* According to linux/thread.h, pids can be no bigger that 30 bits */
> - WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
> + WARN_ON_ONCE(pid_max > (1 << 30));
>
> pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
> if (!pid_list)
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 9c21ba45b7af6..46c65402ad7e5 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -732,6 +732,8 @@ extern unsigned long tracing_thresh;
>
> /* PID filtering */
>
> +extern int pid_max;
> +
> bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
> pid_t search_pid);
> bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
> diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
> index cb49f7279dc80..573b5d8e8a28e 100644
> --- a/kernel/trace/trace_sched_switch.c
> +++ b/kernel/trace/trace_sched_switch.c
> @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
> if (tgid_map)
> return 0;
>
> - tgid_map_max = init_pid_ns.pid_max;
> + tgid_map_max = pid_max;
> map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
> GFP_KERNEL);
> if (!map)
> --
> 2.48.1
>
Powered by blists - more mailing lists