[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 8 Aug 2019 05:09:51 +0000
From: Yonghong Song <yhs@...com>
To: Carlos Antonio Neira Bustos <cneirabustos@...il.com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC: "ebiederm@...ssion.com" <ebiederm@...ssion.com>,
"brouer@...hat.com" <brouer@...hat.com>,
"quentin.monnet@...ronome.com" <quentin.monnet@...ronome.com>
Subject: Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain namespace
data from current task
On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote:
> The code has been modified to avoid syscalls that could sleep.
> Please let me know if any other modification is needed.
>
> From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 00:00:00 2001
> From: Carlos <cneirabustos@...il.com>
> Date: Wed, 7 Aug 2019 20:04:30 -0400
> Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain namespace data
> from current task
>
> This helper obtains the active namespace from current and returns pid, tgid,
> device and namespace id as seen from that namespace, allowing to instrument
> a process inside a container.
> Device is read from /proc/self/ns/pid, as in the future it's possible that
> different pid_ns files may belong to different devices, according
> to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
> conference.
> Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's
> scripts but this helper returns the pid as seen by the root namespace which is
> fine when a bcc script is not executed inside a container.
> When the process of interest is inside a container, pid filtering will not work
> if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
> returning the pid as it's seen by the current namespace where the script is
> executing.
>
> This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
> used to do pid filtering even inside a container.
>
> For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):
>
> u32 pid = bpf_get_current_pid_tgid() >> 32;
> if (pid != <pid_arg_passed_in>)
> return 0;
> Could be modified to use bpf_get_current_pidns_info() as follows:
>
> struct bpf_pidns pidns;
> bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> u32 pid = pidns.tgid;
> u32 nsid = pidns.nsid;
> if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
> return 0;
>
> To find out the name PID namespace id of a process, you could use this command:
>
> $ ps -h -o pidns -p <pid_of_interest>
>
> Or this other command:
>
> $ ls -Li /proc/<pid_of_interest>/ns/pid
>
> Signed-off-by: Carlos Neira <cneirabustos@...il.com>
> ---
> fs/namei.c | 2 +-
> include/linux/bpf.h | 1 +
> include/linux/namei.h | 4 +
> include/uapi/linux/bpf.h | 29 ++++-
> kernel/bpf/core.c | 1 +
> kernel/bpf/helpers.c | 78 ++++++++++++
> kernel/trace/bpf_trace.c | 2 +
> samples/bpf/Makefile | 3 +
> samples/bpf/trace_ns_info_user.c | 35 ++++++
> samples/bpf/trace_ns_info_user_kern.c | 44 +++++++
> tools/include/uapi/linux/bpf.h | 29 ++++-
> tools/testing/selftests/bpf/Makefile | 2 +-
> tools/testing/selftests/bpf/bpf_helpers.h | 3 +
> .../testing/selftests/bpf/progs/test_pidns_kern.c | 51 ++++++++
> tools/testing/selftests/bpf/test_pidns.c | 138 +++++++++++++++++++++
> 15 files changed, 418 insertions(+), 4 deletions(-)
> create mode 100644 samples/bpf/trace_ns_info_user.c
> create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> create mode 100644 tools/testing/selftests/bpf/test_pidns.c
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 209c51a5226c..d1eca36972d2 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -19,7 +19,6 @@
> #include <linux/export.h>
> #include <linux/kernel.h>
> #include <linux/slab.h>
> -#include <linux/fs.h>
> #include <linux/namei.h>
> #include <linux/pagemap.h>
> #include <linux/fsnotify.h>
> @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
> putname(name);
> return retval;
> }
> +EXPORT_SYMBOL(filename_lookup);
No need to export symbols. bpf uses it and bpf is in the core, not in
modules.
>
> /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
> static int path_parentat(struct nameidata *nd, unsigned flags,
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f9a506147c8a..e4adf5e05afd 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
> extern const struct bpf_func_proto bpf_strtol_proto;
> extern const struct bpf_func_proto bpf_strtoul_proto;
> extern const struct bpf_func_proto bpf_tcp_sock_proto;
> +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
>
> /* Shared helpers among cBPF and eBPF. */
> void bpf_user_rnd_init_once(void);
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index 9138b4471dbf..2c24e8c71d46 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -6,6 +6,7 @@
> #include <linux/path.h>
> #include <linux/fcntl.h>
> #include <linux/errno.h>
> +#include <linux/fs.h>
>
> enum { MAX_NESTED_LINKS = 8 };
>
> @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct dentry *);
>
> extern void nd_jump_link(struct path *path);
>
> +extern int filename_lookup(int dfd, struct filename *name, unsigned int flags,
> + struct path *path, struct path *root);
The previous definition in fs/internal.h should be removed.
> +
> static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
> {
> ((char *) name)[min(len, maxlen)] = '\0';
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4393bd4b2419..6f601f7106e2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2741,6 +2741,26 @@ union bpf_attr {
> * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
> *
> * **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> + *
> + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
> + * Description
> + * Copies into *pidns* pid, namespace id and tgid as seen by the
> + * current namespace and also device from /proc/self/ns/pid.
> + * *size_of_pidns* must be the size of *pidns*
> + *
> + * This helper is used when pid filtering is needed inside a
> + * container as bpf_get_current_tgid() helper returns always the
> + * pid id as seen by the root namespace.
> + * Return
> + * 0 on success
> + *
> + * **-EINVAL** if unable to get ns, pid or tgid of current task.
> + * Or if size_of_pidns is not valid.
Maybe reword by following the code sequence.
if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
the current task.
> + *
> + * **-ENOMEM** if allocation fails.
Maybe some other error codes in filename_lookup() function?
> + *
> + * If unable to get the inode from /proc/self/ns/pid an error code
> + * will be returned.
You do not need this. The description of error code cases should cover this.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -2853,7 +2873,8 @@ union bpf_attr {
> FN(sk_storage_get), \
> FN(sk_storage_delete), \
> FN(send_signal), \
> - FN(tcp_gen_syncookie),
> + FN(tcp_gen_syncookie), \
> + FN(get_current_pidns_info),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> @@ -3604,4 +3625,10 @@ struct bpf_sockopt {
> __s32 retval;
> };
>
> +struct bpf_pidns_info {
> + __u32 dev;
> + __u32 nsid;
> + __u32 tgid;
> + __u32 pid;
> +};
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 8191a7db2777..3159f2a0188c 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2038,6 +2038,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
> const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
>
> const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 5e28718928ca..571f24077db2 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -11,6 +11,12 @@
> #include <linux/uidgid.h>
> #include <linux/filter.h>
> #include <linux/ctype.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/major.h>
> +#include <linux/stat.h>
> +#include <linux/namei.h>
> +#include <linux/version.h>
> +
>
> #include "../../lib/kstrtox.h"
>
> @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
> preempt_enable();
> }
>
> +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
> + size)
> +{
> + const char *name = "/proc/self/ns/pid";
maybe rename this variable to pidns_path?
> + struct pid_namespace *pidns = NULL;
> + struct filename *tmp = NULL;
Maybe rename this variable to name?
> + int len = strlen(name) + 1;
We can delay this assignment later until it is needed.
> + struct inode *inode;
> + struct path kp;
> + pid_t tgid = 0;
> + pid_t pid = 0;
> + int ret;
> +
> + if (unlikely(size != sizeof(struct bpf_pidns_info)))
> + return -EINVAL;
> +
> + pidns = task_active_pid_ns(current);
> +
we can save an empty line here.
> + if (unlikely(!pidns))
> + goto clear;
> +
> + pidns_info->nsid = pidns->ns.inum;
> + pid = task_pid_nr_ns(current, pidns);
> +
We can save an empty line here.
> + if (unlikely(!pid))
> + goto clear;
> +
> + tgid = task_tgid_nr_ns(current, pidns);
> +
ditto. save an empty line.
> + if (unlikely(!tgid))
> + goto clear;
> +
> + pidns_info->tgid = (u32) tgid;
> + pidns_info->pid = (u32) pid;
> +
> + tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> + if (unlikely(!tmp)) {
> + memset((void *)pidns_info, 0, (size_t) size);
> + return -ENOMEM;
> + }
> +
> + memcpy((char *)tmp->name, name, len);
> + tmp->uptr = NULL;
> + tmp->aname = NULL;
> + tmp->refcnt = 1;
> +
ditto. save an empty line.
> + ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> +
ditto. save an empty line.
> + if (ret) {
> + memset((void *)pidns_info, 0, (size_t) size);
> + return ret;
> + }
> +
> + inode = d_backing_inode(kp.dentry);
> + pidns_info->dev = inode->i_sb->s_dev;
> +
> + return 0;
> +
> +clear:
> + memset((void *)pidns_info, 0, (size_t) size);
> +
save an empty line.
> + return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> + .func = bpf_get_current_pidns_info,
make the "= " aligned with others?
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_UNINIT_MEM,
> + .arg2_type = ARG_CONST_SIZE,
> +};
> +
> #ifdef CONFIG_CGROUPS
> BPF_CALL_0(bpf_get_current_cgroup_id)
> {
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ca1255d14576..5e1dc22765a5 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> #endif
> case BPF_FUNC_send_signal:
> return &bpf_send_signal_proto;
> + case BPF_FUNC_get_current_pidns_info:
> + return &bpf_get_current_pidns_info_proto;
> default:
> return NULL;
> }
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 1d9be26b4edd..238453ff27d2 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> hostprogs-y += xdp_sample_pkts
> hostprogs-y += ibumad
> hostprogs-y += hbm
> +hostprogs-y += trace_ns_info
[...]
Powered by blists - more mailing lists