lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5cf9866c-28bc-8654-07c2-269a95219ada@huawei.com>
Date: Thu, 1 Aug 2024 20:23:18 +0800
From: "Liao, Chang" <liaochang1@...wei.com>
To: Andrii Nakryiko <andrii@...nel.org>, <linux-trace-kernel@...r.kernel.org>,
	<peterz@...radead.org>, <oleg@...hat.com>, <rostedt@...dmis.org>,
	<mhiramat@...nel.org>
CC: <bpf@...r.kernel.org>, <linux-kernel@...r.kernel.org>, <jolsa@...nel.org>,
	<paulmck@...nel.org>
Subject: Re: [PATCH 3/8] uprobes: protected uprobe lifetime with SRCU



在 2024/8/1 5:42, Andrii Nakryiko 写道:
> To avoid unnecessarily taking a (brief) refcount on uprobe during
> breakpoint handling in handle_swbp for entry uprobes, make find_uprobe()
> not take refcount, but protect the lifetime of a uprobe instance with
> RCU. This improves scalability, as refcount gets quite expensive due to
> cache line bouncing between multiple CPUs.
> 
> Specifically, we utilize our own uprobe-specific SRCU instance for this
> RCU protection. put_uprobe() will delay actual kfree() using call_srcu().
> 
> For now, uretprobe and single-stepping handling will still acquire
> refcount as necessary. We'll address these issues in follow up patches
> by making them use SRCU with timeout.
> 
> Signed-off-by: Andrii Nakryiko <andrii@...nel.org>
> ---
>  kernel/events/uprobes.c | 93 ++++++++++++++++++++++++-----------------
>  1 file changed, 55 insertions(+), 38 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 23dde3ec5b09..6d5c3f4b210f 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  
>  static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
>  
> +DEFINE_STATIC_SRCU(uprobes_srcu);
> +
>  #define UPROBES_HASH_SZ	13
>  /* serialize uprobe->pending_list */
>  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
> @@ -59,6 +61,7 @@ struct uprobe {
>  	struct list_head	pending_list;
>  	struct uprobe_consumer	*consumers;
>  	struct inode		*inode;		/* Also hold a ref to inode */
> +	struct rcu_head		rcu;
>  	loff_t			offset;
>  	loff_t			ref_ctr_offset;
>  	unsigned long		flags;
> @@ -612,6 +615,13 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
>  	return !RB_EMPTY_NODE(&uprobe->rb_node);
>  }
>  
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> +	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> +
> +	kfree(uprobe);
> +}
> +
>  static void put_uprobe(struct uprobe *uprobe)
>  {
>  	if (!refcount_dec_and_test(&uprobe->ref))
> @@ -632,6 +642,8 @@ static void put_uprobe(struct uprobe *uprobe)
>  	mutex_lock(&delayed_uprobe_lock);
>  	delayed_uprobe_remove(uprobe, NULL);
>  	mutex_unlock(&delayed_uprobe_lock);
> +
> +	call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
>  }
>  
>  static __always_inline
> @@ -673,33 +685,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
>  	return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
>  }
>  
> -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
> +/*
> + * Assumes being inside RCU protected region.
> + * No refcount is taken on returned uprobe.
> + */
> +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
>  {
>  	struct __uprobe_key key = {
>  		.inode = inode,
>  		.offset = offset,
>  	};
> -	struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
> -
> -	if (node)
> -		return try_get_uprobe(__node_2_uprobe(node));
> +	struct rb_node *node;
>  
> -	return NULL;
> -}
> -
> -/*
> - * Find a uprobe corresponding to a given inode:offset
> - * Acquires uprobes_treelock
> - */
> -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> -{
> -	struct uprobe *uprobe;
> +	lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
>  
>  	read_lock(&uprobes_treelock);
> -	uprobe = __find_uprobe(inode, offset);
> +	node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
>  	read_unlock(&uprobes_treelock);
>  
> -	return uprobe;
> +	return node ? __node_2_uprobe(node) : NULL;
>  }
>  
>  /*
> @@ -1073,10 +1077,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
>  			goto free;
>  		/*
>  		 * We take mmap_lock for writing to avoid the race with
> -		 * find_active_uprobe() which takes mmap_lock for reading.
> +		 * find_active_uprobe_rcu() which takes mmap_lock for reading.
>  		 * Thus this install_breakpoint() can not make
> -		 * is_trap_at_addr() true right after find_uprobe()
> -		 * returns NULL in find_active_uprobe().
> +		 * is_trap_at_addr() true right after find_uprobe_rcu()
> +		 * returns NULL in find_active_uprobe_rcu().
>  		 */
>  		mmap_write_lock(mm);
>  		vma = find_vma(mm, info->vaddr);
> @@ -1885,9 +1889,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
>  		return;
>  	}
>  
> +	/* we need to bump refcount to store uprobe in utask */
> +	if (!try_get_uprobe(uprobe))
> +		return;
> +
>  	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
>  	if (!ri)
> -		return;
> +		goto fail;
>  
>  	trampoline_vaddr = uprobe_get_trampoline_vaddr();
>  	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
> @@ -1914,11 +1922,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
>  		}
>  		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
>  	}
> -	 /*
> -	  * uprobe's refcnt is positive, held by caller, so it's safe to
> -	  * unconditionally bump it one more time here
> -	  */
> -	ri->uprobe = get_uprobe(uprobe);
> +	ri->uprobe = uprobe;
>  	ri->func = instruction_pointer(regs);
>  	ri->stack = user_stack_pointer(regs);
>  	ri->orig_ret_vaddr = orig_ret_vaddr;
> @@ -1929,8 +1933,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
>  	utask->return_instances = ri;
>  
>  	return;
> - fail:
> +fail:
>  	kfree(ri);
> +	put_uprobe(uprobe);
>  }
>  
>  /* Prepare to single-step probed instruction out of line. */
> @@ -1945,9 +1950,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
>  	if (!utask)
>  		return -ENOMEM;
>  
> +	if (!try_get_uprobe(uprobe))
> +		return -EINVAL;
> +
>  	xol_vaddr = xol_get_insn_slot(uprobe);
> -	if (!xol_vaddr)
> -		return -ENOMEM;
> +	if (!xol_vaddr) {
> +		err = -ENOMEM;
> +		goto err_out;
> +	}
>  
>  	utask->xol_vaddr = xol_vaddr;
>  	utask->vaddr = bp_vaddr;
> @@ -1955,12 +1965,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
>  	err = arch_uprobe_pre_xol(&uprobe->arch, regs);
>  	if (unlikely(err)) {
>  		xol_free_insn_slot(current);
> -		return err;
> +		goto err_out;
>  	}
>  
>  	utask->active_uprobe = uprobe;
>  	utask->state = UTASK_SSTEP;
>  	return 0;
> +err_out:
> +	put_uprobe(uprobe);
> +	return err;
>  }
>  
>  /*
> @@ -2044,7 +2057,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
>  	return is_trap_insn(&opcode);
>  }
>  
> -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
> +/* assumes being inside RCU protected region */
> +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
>  {
>  	struct mm_struct *mm = current->mm;
>  	struct uprobe *uprobe = NULL;
> @@ -2057,7 +2071,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
>  			struct inode *inode = file_inode(vma->vm_file);
>  			loff_t offset = vaddr_to_offset(vma, bp_vaddr);
>  
> -			uprobe = find_uprobe(inode, offset);
> +			uprobe = find_uprobe_rcu(inode, offset);
>  		}
>  
>  		if (!uprobe)
> @@ -2201,13 +2215,15 @@ static void handle_swbp(struct pt_regs *regs)
>  {
>  	struct uprobe *uprobe;
>  	unsigned long bp_vaddr;
> -	int is_swbp;
> +	int is_swbp, srcu_idx;
>  
>  	bp_vaddr = uprobe_get_swbp_addr(regs);
>  	if (bp_vaddr == uprobe_get_trampoline_vaddr())
>  		return uprobe_handle_trampoline(regs);
>  
> -	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
> +	srcu_idx = srcu_read_lock(&uprobes_srcu);
> +
> +	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
>  	if (!uprobe) {
>  		if (is_swbp > 0) {
>  			/* No matching uprobe; signal SIGTRAP. */
> @@ -2223,6 +2239,7 @@ static void handle_swbp(struct pt_regs *regs)
>  			 */
>  			instruction_pointer_set(regs, bp_vaddr);
>  		}
> +		srcu_read_unlock(&uprobes_srcu, srcu_idx);
>  		return;
>  	}
>  
> @@ -2258,12 +2275,12 @@ static void handle_swbp(struct pt_regs *regs)
>  	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
>  		goto out;
>  
> -	if (!pre_ssout(uprobe, regs, bp_vaddr))
> -		return;
> +	if (pre_ssout(uprobe, regs, bp_vaddr))
> +		goto out;
>  

Regardless what pre_ssout() returns, it always reach the label 'out', so the
if block is unnecessary.


> -	/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
>  out:
> -	put_uprobe(uprobe);
> +	/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
> +	srcu_read_unlock(&uprobes_srcu, srcu_idx);
>  }
>  
>  /*

-- 
BR
Liao, Chang

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ