linux-kernel - Re: [PATCHv11 2.6.36-rc2-tip 5/15] 5: uprobes: Uprobes (un)registration and exception handling.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1283852003.1930.1133.camel@laptop>
Date:	Tue, 07 Sep 2010 11:33:23 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Cc:	Ingo Molnar <mingo@...e.hu>, Steven Rostedt <rostedt@...dmis.org>,
	Arnaldo Carvalho de Melo <acme@...radead.org>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Christoph Hellwig <hch@...radead.org>,
	Masami Hiramatsu <masami.hiramatsu.pt@...achi.com>,
	Oleg Nesterov <oleg@...hat.com>,
	Mark Wielaard <mjw@...hat.com>,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Naren A Devaiah <naren.devaiah@...ibm.com>,
	Jim Keniston <jkenisto@...ux.vnet.ibm.com>,
	Frederic Weisbecker <fweisbec@...il.com>,
	"Frank Ch. Eigler" <fche@...hat.com>,
	Ananth N Mavinakayanahalli <ananth@...ibm.com>,
	LKML <linux-kernel@...r.kernel.org>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
Subject: Re: [PATCHv11 2.6.36-rc2-tip 5/15]  5: uprobes: Uprobes
 (un)registration and exception handling.

On Tue, 2010-09-07 at 12:18 +0530, Srikar Dronamraju wrote:
> > You're really not getting it, are you? No, it would result in the exact
> > same amount of actual breakpoints hit.
> 
> If there is just one instance of traced process for the inode then yes the
> number of breakpoints when traced with pid or based on inode would be the
> same. However if there are multiple instances of the traced process [example
> bash/zsh] (or the inode corresponds to a library that gets mapped into
> multiple processes example libc), and the user is interested in tracing
> just one instance of the process, then won't the inode-based tracing
> amount to a far greater number of breakpoint hits?

Not if your filter function works.

So let me try this again, (assumes boosted probes):

/*
 * One user-space probe, keyed by the (inode, offset) pair it is placed at.
 * Reference counted via @ref and freed through RCU (see put_uprobe()).
 */
struct uprobe {
	struct inode	*inode;	/* we hold a ref */
	unsigned long	offset;	/* probe position; uprobe_hit() computes it
				   from the vma start and vm_pgoff */

	int (*handler)(void); /* called on a hit that passes ->filter();
				 arguments still to be decided */
	int (*filter)(struct task_struct *); /* non-zero: this task is
						 to be probed */

	int		insn_size;		/* size of the original insn;
						   added to the trap address to
						   form the return address */
	char		insn[MAX_INSN_SIZE];	/* the original insn */

	int		ret_addr_offset;	/* return addr offset
						   in the slot */
	char		replacement[SLOT_SIZE]; /* replacement
						   instructions, copied into
						   the slot on every hit */
	
	atomic_t	ref; /* lifetime: set to 1 on register, dropped by
				put_uprobe() */
	struct rcu_head	rcu; /* used by put_uprobe() to defer the kfree() */
};

static struct {
	raw_spinlock_t	tree_lock;
	rb_root		tree;
} uprobes;

/*
 * Insert @uprobe into the global uprobes.tree.
 *
 * Not implemented in this sketch; the body is only a placeholder.
 * NOTE(review): presumably must be done under uprobes.tree_lock, as the
 * lookup side takes that lock - confirm when implementing.
 */
static void uprobes_add(struct uprobe *uprobe)
{
	/* add to uprobes.tree, sorted on inode:offset */
}

/*
 * Remove @uprobe from the global uprobes.tree.
 *
 * Not implemented in this sketch; the body is only a placeholder.
 * NOTE(review): presumably must be done under uprobes.tree_lock, as the
 * lookup side takes that lock - confirm when implementing.
 */
static void uprobes_del(struct uprobe *uprobe)
{
	/* delete from uprobes.tree */
}

static struct uprobe *
uprobes_find_get(struct address_space *mapping, unsigned long offset)
{
	unsigned long flags;
	struct uprobe *uprobe;

	raw_spin_lock_irqsave(&uprobes.treelock, flags);
	uprobe = find_in_tree(&uprobes.tree);
	if (!atomic_inc_not_zero(&uprobe->ref))
		uprobe = NULL;
	raw_spin_unlock_irqrestore(&uprobes.treelock, flags);

	return uprobe;
}

/*
 * RCU callback queued by put_uprobe(): @head is the rcu member embedded
 * in a struct uprobe; free the containing object.
 */
static void __uprobe_free(struct rcu_head *head)
{
	kfree(container_of(head, struct uprobe, rcu));
}

/*
 * Drop one reference on @uprobe. When the count reaches zero the object
 * is handed to call_rcu() so that __uprobe_free() releases it later.
 */
static void put_uprobe(struct uprobe *uprobe)
{
	/* Not the last reference: nothing more to do. */
	if (!atomic_dec_and_test(&uprobe->ref))
		return;

	call_rcu(&uprobe->rcu, __uprobe_free);
}

/*
 * Decide whether probes may be installed in @vma.
 *
 * A vma qualifies when it is file backed and, of the READ/WRITE/EXEC/SHARED
 * flag bits, has exactly READ and EXEC set (so neither writable nor shared).
 *
 * Returns 1 if the vma qualifies, 0 otherwise.
 */
static inline int valid_vma(struct vm_area_struct *vma)
{
	const unsigned long mask = VM_READ | VM_WRITE | VM_EXEC | VM_SHARED;

	if (!vma->vm_file)
		return 0;

	/*
	 * '==' binds tighter than '&', so the masking must be parenthesised;
	 * otherwise vm_flags would be ANDed with the 0/1 result of comparing
	 * the two constants.
	 */
	return (vma->vm_flags & mask) == (VM_READ | VM_EXEC);
}

/*
 * Register @uprobe and install it into every existing mapping of its inode
 * that contains at least one task accepted by ->filter().
 *
 * The caller provides a filled-in struct uprobe (inode, offset, handler,
 * filter, instruction data). The initial reference set here is the one
 * dropped by unregister_uprobe().
 *
 * Returns 0.
 */
int register_uprobe(struct uprobe *uprobe)
{
	struct vm_area_struct *vma;
	struct task_struct *p;

	/*
	 * NOTE(review): inode_get() is not defined in this sketch; presumably
	 * it takes the inode reference that struct uprobe says we hold - confirm.
	 */
	inode_get(uprobe->inode);
	atomic_set(&uprobe->ref, 1);	/* atomic_set() takes (atomic_t *, value) */

	uprobes_add(uprobe); /* add before the rmap walk, so that
				new mmap()s will find it too */

	for_each_rmap_vma(vma, uprobe->inode->i_mapping) {
		struct mm_struct *mm = vma->vm_mm;
		int install_probe = 0;

		if (!valid_vma(vma))
			continue;

		/* Tag every task sharing this mm that the filter accepts. */
		for_each_task_in_process(p, mm->owner) {
			if (uprobe->filter(p)) {
				p->has_uprobe = 1;
				install_probe = 1;
			}
		}

		/* Only touch the text if at least one task wants the probe. */
		if (install_probe) {
			mm->has_uprobes = 1;
			frob_text(uprobe, mm);
		}
	}

	return 0;
}

/*
 * Unregister @uprobe and drop the reference taken at registration.
 *
 * Only the final put_uprobe() is spelled out in this sketch; the walk that
 * undoes register_uprobe() is left as a placeholder.
 * NOTE(review): removal from uprobes.tree and release of the inode
 * reference are not shown here - confirm they belong in this function.
 */
void unregister_uprobe(struct uprobe *uprobe)
{
	/* pretty much the same, except restore the original text */
	put_uprobe(uprobe);
}

/*
 * Fork hook: decide whether the new task @child should be flagged as
 * having uprobes.
 *
 * Nothing is done unless the child's mm is marked has_uprobes. Otherwise
 * every valid vma is scanned, and as soon as one probe registered on the
 * vma's file accepts @child through ->filter(), child->has_uprobe is set
 * and the scan stops.
 */
void uprobe_fork(struct task_struct *child)
{
	struct vm_area_struct *area;
	struct uprobe *probe;

	if (!child->mm->has_uprobes)
		return;

	for_each_vma(area, child->mm) {
		if (valid_vma(area)) {
			for_each_probe_in_mapping(probe, area->vm_file->f_mapping) {
				if (!probe->filter(child))
					continue;

				/* One match is enough; no need to look further. */
				child->has_uprobe = 1;
				return;
			}
		}
	}
}

/*
 * mmap hook: install already-registered probes into a newly created @vma.
 *
 * For each probe registered on the vma's file, every task of the owning
 * process is run through ->filter(); accepted tasks are flagged
 * has_uprobe, and if any task was accepted the probe is written into the
 * mm's text. Does nothing for vmas rejected by valid_vma().
 */
void uprobe_mmap(struct vm_area_struct *vma)
{
	struct mm_struct *mm;
	struct task_struct *p;
	struct uprobe *uprobe;

	if (!valid_vma(vma))
		return;

	/* The original used 'mm' without declaring it; it is the vma's mm. */
	mm = vma->vm_mm;

	for_each_probe_in_mapping(uprobe, vma->vm_file->f_mapping) {
		int install_probe = 0;

		for_each_task_in_process(p, mm->owner) {
			if (uprobe->filter(p)) {
				p->has_uprobe = 1;
				install_probe = 1;
			}
		}

		/* Only touch the text if at least one task wants the probe. */
		if (install_probe) {
			mm->has_uprobes = 1;
			frob_text(uprobe, mm);
		}
	}
}

void uprobe_hit(struct pt_regs *regs)
{
	unsigned long addr = instruction_pointer(regs);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long offset;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);

	if (!valid_vma)
		goto fail;

	offset = addr - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT);

	uprobe = uprobes_find_get(vma->vm_file->f_mapping, offset);
	up_read(&mm->mmap_sem);

	if (!uprobe)
		goto fail;

	if (current->has_uprobe && uprobe->filter(current))
		uprobe->handle();

	ret_addr = addr + uprobe->insn_size;

	cpu = get_cpu()
	slot = get_slot(cpu);
	memcpy(slot, uprobe->replacement, SLOT_SIZE);
	memcpy(slot + uprobe->ret_addr_offset, &ret_addr, sizeof(unsigned
long));
	set_instruction_pointer(regs, uaddr_addr_of(slot));
	put_cpu(); /* preemption notifiers would take it from here */

	put_uprobe(uprobe);
	return;

fail:
	SIGTRAP
}

See, no extra traps, no funny intermediate data structures to manage,
and you get the power of ->filter() to implement whatever policy you
want, including simple process wide things.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/