lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20171019073050.GC29091@outlook.office365.com>
Date:   Thu, 19 Oct 2017 00:30:52 -0700
From:   Andrei Vagin <avagin@...tuozzo.com>
To:     Gargi Sharma <gs051095@...il.com>
Cc:     linux-kernel@...r.kernel.org, riel@...riel.com,
        julia.lawall@...6.fr, akpm@...ux-foundation.org, mingo@...nel.org,
        pasha.tatashin@...cle.com, ktkhai@...tuozzo.com, oleg@...hat.com,
        ebiederm@...ssion.com, hch@...radead.org, lkp@...el.com,
        tony.luck@...el.com
Subject: Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API

Hi Gargi,

This patch breaks CRIU, because it changes the meaning of ns_last_pid.

========================== Run zdtm/static/env00 in h ==========================
 DEP       env00.d
 CC        env00.o
 LINK      env00
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/env00/52/1/restore.log
------------------------ grep Error ------------------------
(00.000587) No mountpoints-6.img image
(00.000593) mnt: Reading mountpoint images (id 6 pid 52)
(00.000653) Forking task with 52 pid (flags 0x0)
(00.007568) PID: real 51 virt 52
(00.010363)     52: Error (criu/cr-restore.c:1787): Pid 51 do not match expected 52 (task 52)
(00.010474) Error (criu/cr-restore.c:2449): Restoring FAILED.
------------------------ ERROR OVER ------------------------

Before this patch, ns_last_pid contained the pid of the last process. With
this patch, it contains the pid of the "next" process.

In CRIU we use ns_last_pid to restore a process with a specified pid,
and now this logic is broken:

$ uname -a
Linux laptop 4.11.11-200.fc25.x86_64 #1 SMP Mon Jul 17 17:41:12 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
$ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
20000

$ uname -a
Linux fc24 4.14.0-rc5-next-20171018 #1 SMP Wed Oct 18 23:52:43 PDT 2017 x86_64 x86_64 x86_64 GNU/Linux
$ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
19999

Thanks,
Andrei

On Wed, Oct 11, 2017 at 06:19:38PM -0400, Gargi Sharma wrote:
> This patch replaces the current bitmap implementation for
> Process ID allocation. Functions that are no longer required,
> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
> The rest of the functions are modified to use the IDR API.
> The change was made to make the PID allocation less complex by
> replacing custom code with calls to generic API.
> 
> Signed-off-by: Gargi Sharma <gs051095@...il.com>
> Reviewed-by: Rik van Riel <riel@...hat.com>
> ---
>  arch/powerpc/platforms/cell/spufs/sched.c |   2 +-
>  fs/proc/loadavg.c                         |   2 +-
>  include/linux/pid_namespace.h             |  14 +--
>  init/main.c                               |   2 +-
>  kernel/pid.c                              | 201 ++++++------------------------
>  kernel/pid_namespace.c                    |  44 +++----
>  6 files changed, 57 insertions(+), 208 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
> index 1fbb5da..e47761c 100644
> --- a/arch/powerpc/platforms/cell/spufs/sched.c
> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
>  		LOAD_INT(c), LOAD_FRAC(c),
>  		count_active_contexts(),
>  		atomic_read(&nr_spu_contexts),
> -		task_active_pid_ns(current)->last_pid);
> +		idr_get_cursor(&task_active_pid_ns(current)->idr));
>  	return 0;
>  }
>  
> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
> index 983fce5..ba3d0e2 100644
> --- a/fs/proc/loadavg.c
> +++ b/fs/proc/loadavg.c
> @@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
>  		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
>  		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
>  		nr_running(), nr_threads,
> -		task_active_pid_ns(current)->last_pid);
> +		idr_get_cursor(&task_active_pid_ns(current)->idr));
>  	return 0;
>  }
>  
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index b09136f..f4db4a7 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -9,15 +9,8 @@
>  #include <linux/nsproxy.h>
>  #include <linux/kref.h>
>  #include <linux/ns_common.h>
> +#include <linux/idr.h>
>  
> -struct pidmap {
> -       atomic_t nr_free;
> -       void *page;
> -};
> -
> -#define BITS_PER_PAGE		(PAGE_SIZE * 8)
> -#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
> -#define PIDMAP_ENTRIES		((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
>  
>  struct fs_pin;
>  
> @@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
>  
>  struct pid_namespace {
>  	struct kref kref;
> -	struct pidmap pidmap[PIDMAP_ENTRIES];
> +	struct idr idr;
>  	struct rcu_head rcu;
> -	int last_pid;
>  	unsigned int nr_hashed;
>  	struct task_struct *child_reaper;
>  	struct kmem_cache *pid_cachep;
> @@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>  
>  extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>  void pidhash_init(void);
> -void pidmap_init(void);
> +void pid_idr_init(void);
>  
>  #endif /* _LINUX_PID_NS_H */
> diff --git a/init/main.c b/init/main.c
> index 0ee9c686..9f4db20 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
>  	if (late_time_init)
>  		late_time_init();
>  	calibrate_delay();
> -	pidmap_init();
> +	pid_idr_init();
>  	anon_vma_init();
>  	acpi_early_init();
>  #ifdef CONFIG_X86
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 020dedb..0ce5936 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -39,6 +39,7 @@
>  #include <linux/proc_ns.h>
>  #include <linux/proc_fs.h>
>  #include <linux/sched/task.h>
> +#include <linux/idr.h>
>  
>  #define pid_hashfn(nr, ns)	\
>  	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
> @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
>  int pid_max_min = RESERVED_PIDS + 1;
>  int pid_max_max = PID_MAX_LIMIT;
>  
> -static inline int mk_pid(struct pid_namespace *pid_ns,
> -		struct pidmap *map, int off)
> -{
> -	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
> -}
> -
> -#define find_next_offset(map, off)					\
> -		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
>  
>  /*
>   * PID-map pages start out as NULL, they get allocated upon
> @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
>   */
>  struct pid_namespace init_pid_ns = {
>  	.kref = KREF_INIT(2),
> -	.pidmap = {
> -		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
> -	},
> -	.last_pid = 0,
> +	.idr = IDR_INIT,
>  	.nr_hashed = PIDNS_HASH_ADDING,
>  	.level = 0,
>  	.child_reaper = &init_task,
> @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
>  
>  static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
>  
> -static void free_pidmap(struct upid *upid)
> -{
> -	int nr = upid->nr;
> -	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
> -	int offset = nr & BITS_PER_PAGE_MASK;
> -
> -	clear_bit(offset, map->page);
> -	atomic_inc(&map->nr_free);
> -}
> -
> -/*
> - * If we started walking pids at 'base', is 'a' seen before 'b'?
> - */
> -static int pid_before(int base, int a, int b)
> -{
> -	/*
> -	 * This is the same as saying
> -	 *
> -	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
> -	 * and that mapping orders 'a' and 'b' with respect to 'base'.
> -	 */
> -	return (unsigned)(a - base) < (unsigned)(b - base);
> -}
> -
> -/*
> - * We might be racing with someone else trying to set pid_ns->last_pid
> - * at the pid allocation time (there's also a sysctl for this, but racing
> - * with this one is OK, see comment in kernel/pid_namespace.c about it).
> - * We want the winner to have the "later" value, because if the
> - * "earlier" value prevails, then a pid may get reused immediately.
> - *
> - * Since pids rollover, it is not sufficient to just pick the bigger
> - * value.  We have to consider where we started counting from.
> - *
> - * 'base' is the value of pid_ns->last_pid that we observed when
> - * we started looking for a pid.
> - *
> - * 'pid' is the pid that we eventually found.
> - */
> -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
> -{
> -	int prev;
> -	int last_write = base;
> -	do {
> -		prev = last_write;
> -		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
> -	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
> -}
> -
> -static int alloc_pidmap(struct pid_namespace *pid_ns)
> -{
> -	int i, offset, max_scan, pid, last = pid_ns->last_pid;
> -	struct pidmap *map;
> -
> -	pid = last + 1;
> -	if (pid >= pid_max)
> -		pid = RESERVED_PIDS;
> -	offset = pid & BITS_PER_PAGE_MASK;
> -	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
> -	/*
> -	 * If last_pid points into the middle of the map->page we
> -	 * want to scan this bitmap block twice, the second time
> -	 * we start with offset == 0 (or RESERVED_PIDS).
> -	 */
> -	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
> -	for (i = 0; i <= max_scan; ++i) {
> -		if (unlikely(!map->page)) {
> -			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> -			/*
> -			 * Free the page if someone raced with us
> -			 * installing it:
> -			 */
> -			spin_lock_irq(&pidmap_lock);
> -			if (!map->page) {
> -				map->page = page;
> -				page = NULL;
> -			}
> -			spin_unlock_irq(&pidmap_lock);
> -			kfree(page);
> -			if (unlikely(!map->page))
> -				return -ENOMEM;
> -		}
> -		if (likely(atomic_read(&map->nr_free))) {
> -			for ( ; ; ) {
> -				if (!test_and_set_bit(offset, map->page)) {
> -					atomic_dec(&map->nr_free);
> -					set_last_pid(pid_ns, last, pid);
> -					return pid;
> -				}
> -				offset = find_next_offset(map, offset);
> -				if (offset >= BITS_PER_PAGE)
> -					break;
> -				pid = mk_pid(pid_ns, map, offset);
> -				if (pid >= pid_max)
> -					break;
> -			}
> -		}
> -		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
> -			++map;
> -			offset = 0;
> -		} else {
> -			map = &pid_ns->pidmap[0];
> -			offset = RESERVED_PIDS;
> -			if (unlikely(last == offset))
> -				break;
> -		}
> -		pid = mk_pid(pid_ns, map, offset);
> -	}
> -	return -EAGAIN;
> -}
> -
> -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
> -{
> -	int offset;
> -	struct pidmap *map, *end;
> -
> -	if (last >= PID_MAX_LIMIT)
> -		return -1;
> -
> -	offset = (last + 1) & BITS_PER_PAGE_MASK;
> -	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
> -	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
> -	for (; map < end; map++, offset = 0) {
> -		if (unlikely(!map->page))
> -			continue;
> -		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
> -		if (offset < BITS_PER_PAGE)
> -			return mk_pid(pid_ns, map, offset);
> -	}
> -	return -1;
> -}
> -
>  void put_pid(struct pid *pid)
>  {
>  	struct pid_namespace *ns;
> @@ -266,7 +124,7 @@ void free_pid(struct pid *pid)
>  		struct upid *upid = pid->numbers + i;
>  		struct pid_namespace *ns = upid->ns;
>  		hlist_del_rcu(&upid->pid_chain);
> -		switch(--ns->nr_hashed) {
> +		switch (--ns->nr_hashed) {
>  		case 2:
>  		case 1:
>  			/* When all that is left in the pid namespace
> @@ -284,12 +142,11 @@ void free_pid(struct pid *pid)
>  			schedule_work(&ns->proc_work);
>  			break;
>  		}
> +
> +		idr_remove(&ns->idr, upid->nr);
>  	}
>  	spin_unlock_irqrestore(&pidmap_lock, flags);
>  
> -	for (i = 0; i <= pid->level; i++)
> -		free_pidmap(pid->numbers + i);
> -
>  	call_rcu(&pid->rcu, delayed_put_pid);
>  }
>  
> @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>  
>  	tmp = ns;
>  	pid->level = ns->level;
> +
>  	for (i = ns->level; i >= 0; i--) {
> -		nr = alloc_pidmap(tmp);
> +		int pid_min = 1;
> +
> +		idr_preload(GFP_KERNEL);
> +		spin_lock_irq(&pidmap_lock);
> +
> +		/*
> +		 * init really needs pid 1, but after reaching the maximum
> +		 * wrap back to RESERVED_PIDS
> +		 */
> +		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> +			pid_min = RESERVED_PIDS;
> +
> +		/*
> +		 * Store a null pointer so find_pid_ns does not find
> +		 * a partially initialized PID (see below).
> +		 */
> +		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> +				      pid_max, GFP_ATOMIC);
> +		spin_unlock_irq(&pidmap_lock);
> +		idr_preload_end();
> +
>  		if (nr < 0) {
>  			retval = nr;
>  			goto out_free;
> @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>  	for ( ; upid >= pid->numbers; --upid) {
>  		hlist_add_head_rcu(&upid->pid_chain,
>  				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
> +		/* Make the PID visible to find_pid_ns. */
> +		idr_replace(&upid->ns->idr, pid, upid->nr);
>  		upid->ns->nr_hashed++;
>  	}
>  	spin_unlock_irq(&pidmap_lock);
> @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>  	put_pid_ns(ns);
>  
>  out_free:
> +	spin_lock_irq(&pidmap_lock);
>  	while (++i <= ns->level)
> -		free_pidmap(pid->numbers + i);
> +		idr_remove(&ns->idr, (pid->numbers + i)->nr);
> +
> +	spin_unlock_irq(&pidmap_lock);
>  
>  	kmem_cache_free(ns->pid_cachep, pid);
>  	return ERR_PTR(retval);
> @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
>   */
>  struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
>  {
> -	struct pid *pid;
> -
> -	do {
> -		pid = find_pid_ns(nr, ns);
> -		if (pid)
> -			break;
> -		nr = next_pidmap(ns, nr);
> -	} while (nr > 0);
> -
> -	return pid;
> +	return idr_get_next(&ns->idr, &nr);
>  }
>  
>  /*
> @@ -578,7 +452,7 @@ void __init pidhash_init(void)
>  					   0, 4096);
>  }
>  
> -void __init pidmap_init(void)
> +void __init pid_idr_init(void)
>  {
>  	/* Verify no one has done anything silly: */
>  	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
> @@ -590,10 +464,7 @@ void __init pidmap_init(void)
>  				PIDS_PER_CPU_MIN * num_possible_cpus());
>  	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
>  
> -	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> -	/* Reserve PID 0. We never call free_pidmap(0) */
> -	set_bit(0, init_pid_ns.pidmap[0].page);
> -	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
> +	idr_init(&init_pid_ns.idr);
>  
>  	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
>  			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 4918314..5009dbe 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -21,6 +21,7 @@
>  #include <linux/export.h>
>  #include <linux/sched/task.h>
>  #include <linux/sched/signal.h>
> +#include <linux/idr.h>
>  
>  struct pid_cache {
>  	int nr_ids;
> @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>  	struct pid_namespace *ns;
>  	unsigned int level = parent_pid_ns->level + 1;
>  	struct ucounts *ucounts;
> -	int i;
>  	int err;
>  
>  	err = -EINVAL;
> @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>  	if (ns == NULL)
>  		goto out_dec;
>  
> -	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> -	if (!ns->pidmap[0].page)
> -		goto out_free;
> +	idr_init(&ns->idr);
>  
>  	ns->pid_cachep = create_pid_cachep(level + 1);
>  	if (ns->pid_cachep == NULL)
> -		goto out_free_map;
> +		goto out_free_idr;
>  
>  	err = ns_alloc_inum(&ns->ns);
>  	if (err)
> -		goto out_free_map;
> +		goto out_free_idr;
>  	ns->ns.ops = &pidns_operations;
>  
>  	kref_init(&ns->kref);
> @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>  	ns->nr_hashed = PIDNS_HASH_ADDING;
>  	INIT_WORK(&ns->proc_work, proc_cleanup_work);
>  
> -	set_bit(0, ns->pidmap[0].page);
> -	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
> -
> -	for (i = 1; i < PIDMAP_ENTRIES; i++)
> -		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
> -
>  	return ns;
>  
> -out_free_map:
> -	kfree(ns->pidmap[0].page);
> -out_free:
> +out_free_idr:
> +	idr_destroy(&ns->idr);
>  	kmem_cache_free(pid_ns_cachep, ns);
>  out_dec:
>  	dec_pid_namespaces(ucounts);
> @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
>  
>  static void destroy_pid_namespace(struct pid_namespace *ns)
>  {
> -	int i;
> -
>  	ns_free_inum(&ns->ns);
> -	for (i = 0; i < PIDMAP_ENTRIES; i++)
> -		kfree(ns->pidmap[i].page);
> +
> +	idr_destroy(&ns->idr);
>  	call_rcu(&ns->rcu, delayed_free_pidns);
>  }
>  
> @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>  	int rc;
>  	struct task_struct *task, *me = current;
>  	int init_pids = thread_group_leader(me) ? 1 : 2;
> +	struct pid *pid;
>  
>  	/* Don't allow any more processes into the pid namespace */
>  	disable_pid_allocation(pid_ns);
> @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>  	 * 	  maintain a tasklist for each pid namespace.
>  	 *
>  	 */
> +	rcu_read_lock();
>  	read_lock(&tasklist_lock);
> -	nr = next_pidmap(pid_ns, 1);
> -	while (nr > 0) {
> -		rcu_read_lock();
> -
> -		task = pid_task(find_vpid(nr), PIDTYPE_PID);
> +	nr = 2;
> +	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
> +		task = pid_task(pid, PIDTYPE_PID);
>  		if (task && !__fatal_signal_pending(task))
>  			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
> -
> -		rcu_read_unlock();
> -
> -		nr = next_pidmap(pid_ns, nr);
>  	}
>  	read_unlock(&tasklist_lock);
> +	rcu_read_unlock();
>  
>  	/*
>  	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
> @@ -311,7 +297,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>  	 * it should synchronize its usage with external means.
>  	 */
>  
> -	tmp.data = &pid_ns->last_pid;
> +	tmp.data = &pid_ns->idr.idr_next;
>  	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>  }
>  

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ