linux-kernel - [RFC PATCH] perf: Container-aware tracing support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <148422306852.26030.9442609517501845274.stgit@aravinda>
Date:   Thu, 12 Jan 2017 17:41:08 +0530
From:   Aravinda Prasad <aravinda@...ux.vnet.ibm.com>
To:     a.p.zijlstra@...llo.nl, linux-kernel@...r.kernel.org,
        rostedt@...dmis.org, mingo@...hat.com, paulus@...ba.org,
        acme@...nel.org, ebiederm@...ssion.com
Cc:     hbathini@...ux.vnet.ibm.com, ananth@...ibm.com
Subject: [RFC PATCH] perf: Container-aware tracing support

This RFC patch supports filtering container-specific events
when the perf tool is executed inside a container.

Unlike previous approaches, this approach lets the user
decide what is a container through a set of kernel configs.
The main reason for such an approach is the lack of a
container-unique identifier in the kernel and of a clear
definition of what constitutes a container; any combination
of the namespaces can be considered a container.

Previous approaches mandated at least a PID namespace, a
cgroup namespace, or a perf-namespace (which was newly introduced
to support container-aware tracing) to be part of a container.
However, based on the discussions on LKML, mandating a
namespace to be part of a container is not acceptable.
Hence, this patch lets the user define a container
through a set of kernel configs.

This patch restricts the filtering of events to perf hardware
events with sample type set to PERF_SAMPLE_IDENTIFIER.
Further, this patch piggybacks on the cgroups support, i.e.,
the patch expects processes inside a container to be grouped
into a single perf_event cgroup.

However, if the approach of letting the user decide what is a
container is acceptable, then the filtering will be extended to
other events and will be decoupled from grouping the processes
into a perf_event cgroup.

Limitation:
  - Two different definitions of a container cannot co-exist.

Links to earlier approaches:
  - https://lwn.net/Articles/695601/
  - https://lwn.net/Articles/691298/
  - https://lkml.org/lkml/2015/7/15/192

Patch is based on 4.8 kernel

Signed-off-by: Aravinda Prasad <aravinda@...ux.vnet.ibm.com>
---
 init/Kconfig         |   64 ++++++++++++++++++++++++++++++++
 kernel/events/core.c |   99 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..48568f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1720,6 +1720,70 @@ config DEBUG_PERF_USE_VMALLOC
 
 	 Say N if unsure.
 
+config PERF_NS_TRACE
+	default n
+	bool "Container-aware tracing support"
+	depends on CGROUPS && NAMESPACES
+	help
+	 Enable tracing support inside a container.
+
+	 This allows filtering of container-specific events, without
+	 any change in the user interface, when perf is invoked
+	 within a container.
+
+	 As the kernel has no concept of a container, the user should
+	 select from the choices below to let the kernel identify a container.
+
+	 Say N if unsure.
+
+if PERF_NS_TRACE
+
+menu "Select the namespaces with which containers are created"
+
+config UTS_NS_TRACE
+	bool "UTS namespace"
+	depends on UTS_NS
+	default n
+	help
+	 Select if containers are created with UTS namespace.
+
+config IPC_NS_TRACE
+	bool "IPC namespace"
+	depends on IPC_NS
+	default n
+	help
+	 Select if containers are created with IPC namespace.
+
+config MNT_NS_TRACE
+	bool "Mount namespace"
+	default n
+	help
+	 Select if containers are created with mount namespace.
+
+config PID_NS_TRACE
+	bool "PID Namespaces"
+	default y
+	depends on PID_NS
+	help
+	 Select if containers are created with PID namespace.
+
+config NET_NS_TRACE
+	bool "Network namespace"
+	depends on NET_NS
+	default n
+	help
+	 Select if containers are created with NET namespace.
+
+config CGROUPS_NS_TRACE
+	bool "Cgroup namespace"
+	default y
+	help
+	 Select if containers are created with cgroup namespace.
+
+endmenu
+
+endif #PERF_NS_TRACE
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc9bb22..5920c9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -802,23 +802,86 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_PERF_NS_TRACE
+static inline bool is_container(void)
+{
+	bool flag = 0;
+#ifdef CONFIG_PID_NS_TRACE
+	if (task_active_pid_ns(current) == &init_pid_ns)
+		return 0;
+	else
+		flag = 1;
+#endif
+#ifdef CONFIG_UTS_NS_TRACE
+	if (current->nsproxy->uts_ns == &init_uts_ns)
+		return 0;
+	else
+		flag = 1;
+#endif
+#ifdef CONFIG_IPC_NS_TRACE
+	if (current->nsproxy->ipc_ns == &init_ipc_ns)
+		return 0;
+	else
+		flag = 1;
+#endif
+#ifdef CONFIG_MNT_NS_TRACE
+	if (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+		return 0;
+	else
+		flag = 1;
+#endif
+#ifdef CONFIG_NET_NS_TRACE
+	if (current->nsproxy->net_ns == &init_net)
+		return 0;
+	else
+		flag = 1;
+#endif
+#ifdef CONFIG_CGROUPS_NS_TRACE
+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns)
+		return 0;
+	else
+		flag = 1;
+#endif
+	return flag;
+}
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
+
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
 {
 	struct perf_cgroup *cgrp;
 	struct cgroup_subsys_state *css;
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = 0;
 
-	if (!f.file)
-		return -EBADF;
+	if (fd != -1) {
+		f = fdget(fd);
+		if (!f.file)
+			return -EBADF;
 
-	css = css_tryget_online_from_dir(f.file->f_path.dentry,
-					 &perf_event_cgrp_subsys);
-	if (IS_ERR(css)) {
-		ret = PTR_ERR(css);
-		goto out;
+		css = css_tryget_online_from_dir(f.file->f_path.dentry,
+						 &perf_event_cgrp_subsys);
+		if (IS_ERR(css)) {
+			ret = PTR_ERR(css);
+			fdput(f);
+			return ret;
+		}
+#ifdef CONFIG_PERF_NS_TRACE
+	} else if (event->attach_state == PERF_ATTACH_TASK) {
+		/* Tracing on a PID. No need to set event->cgrp */
+		return ret;
+	} else if (is_container()) {
+		css = task_css(current, perf_event_cgrp_id);
+		if (!css || !css_tryget_online(css))
+			return -ENOENT;
+	} else {
+		/*
+		 * perf invoked from global context and hence don't set
+		 * event->cgrp as all the events should be included
+		 */
+		return ret;
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
 	}
 
 	cgrp = container_of(css, struct perf_cgroup, css);
@@ -833,8 +896,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		perf_detach_cgroup(event);
 		ret = -EINVAL;
 	}
-out:
-	fdput(f);
+	if (fd != -1)
+		fdput(f);
+
 	return ret;
 }
 
@@ -9059,11 +9123,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (!has_branch_stack(event))
 		event->attr.branch_sample_type = 0;
 
-	if (cgroup_fd != -1) {
-		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
-		if (err)
-			goto err_ns;
-	}
+	err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+	if (err)
+		goto err_ns;
 
 	pmu = perf_init_event(event);
 	if (!pmu)
@@ -9404,6 +9466,13 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EACCES;
 	}
 
+#ifdef CONFIG_PERF_NS_TRACE
+	if (is_container() && !(attr.type == PERF_TYPE_HARDWARE &&
+			attr.sample_type == PERF_SAMPLE_IDENTIFIER)) {
+		return -EACCES;
+	}
+#endif
+
 	if (attr.freq) {
 		if (attr.sample_freq > sysctl_perf_event_sample_rate)
 			return -EINVAL;