[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1460417755-18201-5-git-send-email-avagin@openvz.org>
Date: Mon, 11 Apr 2016 16:35:44 -0700
From: Andrey Vagin <avagin@...nvz.org>
To: linux-kernel@...r.kernel.org
Cc: Andrey Vagin <avagin@...nvz.org>, Oleg Nesterov <oleg@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Cyrill Gorcunov <gorcunov@...nvz.org>,
Pavel Emelyanov <xemul@...allels.com>,
Roger Luethi <rl@...lgate.ch>, Arnd Bergmann <arnd@...db.de>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
David Ahern <dsahern@...il.com>,
Andy Lutomirski <luto@...capital.net>,
Pavel Odintsov <pavel.odintsov@...il.com>
Subject: [PATCH 04/15] task_diag: add a new interface to get information about tasks (v4)
The task-diag interface provides information about running
processes (roughly the same information that is now available from
/proc/PID/* files). Compared to /proc/PID/*, it is faster, more flexible,
and provides data in a binary format.
Task-diag was created using the basic idea of socket_diag.
Here is the /proc/task-diag file, which operates based on the following
principles:
* Transactional: write request, read response
* Netlink message format (same as used by sock_diag; binary and extendable)
A request message is described by the task_diag_pid structure:
struct task_diag_pid {
__u64 show_flags;
__u64 dump_strategy;
__u32 pid;
};
A response is a set of netlink messages. Each message describes one task.
All task properties are divided into groups. A message contains the
TASK_DIAG_PID group, and other groups if they have been requested in
show_flags. For example, if show_flags contains TASK_DIAG_SHOW_BASE, a
response will contain the TASK_DIAG_BASE group, which is described by the
task_diag_base structure:
struct task_diag_base {
__u32 tgid;
__u32 pid;
__u32 ppid;
__u32 tpid;
__u32 sid;
__u32 pgid;
__u8 state;
char comm[TASK_DIAG_COMM_LEN];
};
The dump_strategy field will be used in following patches to request
information for a group of processes.
v2: A few changes from David Ahern
Use a consistent name
Add max attr enum
task diag: Send pid as u32
Change _MSG/msg references to base
Fix 8-byte alignment
v3: take the pid namespace from scm credentials, which carry the pid of the
process that sent the request. To get information from another namespace,
set the pid in scm to that of a process from that namespace.
v4: use a transaction file instead of netlink
Cc: David Ahern <dsahern@...il.com>
Signed-off-by: Andrey Vagin <avagin@...nvz.org>
---
fs/proc/Kconfig | 13 ++
fs/proc/Makefile | 3 +
fs/proc/task_diag.c | 424 +++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/task_diag.h | 66 +++++++
4 files changed, 506 insertions(+)
create mode 100644 fs/proc/task_diag.c
create mode 100644 include/uapi/linux/task_diag.h
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade120..ca223f5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -81,3 +81,16 @@ config PROC_CHILDREN
Say Y if you are running any user-space software which takes benefit from
this interface. For example, rkt is such a piece of software.
+
+config TASK_DIAG
+ bool "Task-diag support (/proc/task-diag)"
+ depends on NET
+ default n
+ help
+ Export selected properties for tasks/processes through the /proc/task-diag
+ transaction file. Unlike the proc file system, task_diag returns
+ information in a binary format (netlink) and allows to specify which
+ properties are required.
+
+ Say N if unsure.
+
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea4..94965b9 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -30,3 +30,6 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+
+obj-$(CONFIG_TASK_DIAG) += task_diag.o
+
diff --git a/fs/proc/task_diag.c b/fs/proc/task_diag.c
new file mode 100644
index 0000000..3c2127e
--- /dev/null
+++ b/fs/proc/task_diag.c
@@ -0,0 +1,424 @@
+#include <linux/kernel.h>
+#include <linux/task_diag.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/taskstats.h>
+#include <net/sock.h>
+
+/*
+ * Per-open-file state of one task-diag transaction; allocated in
+ * task_diag_open() and stored in file->private_data.
+ */
+struct task_diag_cb {
+ /* Request copied from user space by task_diag_write(); NULL when idle. */
+ struct sk_buff *req;
+ /* Response buffer, allocated by task_diag_read() when none exists yet. */
+ struct sk_buff *resp;
+ /* Netlink header at the start of req. */
+ const struct nlmsghdr *nlh;
+ /* TASK_DIAG_DUMP_ALL: tgid at which the next read resumes. */
+ pid_t pid;
+ /* TASK_DIAG_DUMP_ONE: resume position; non-zero once the task was visited. */
+ int pos;
+ /* Attribute-group index at which task_diag_fill() resumes for a task. */
+ int attr;
+};
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ *
+ * get_task_state() indexes this table with fls() of the masked state
+ * bits, so entry 0 corresponds to "no bit set" and entry k to the state
+ * whose highest set bit is bit k-1.
+ */
+static const __u8 task_state_array[] = {
+ TASK_DIAG_RUNNING,
+ TASK_DIAG_INTERRUPTIBLE,
+ TASK_DIAG_UNINTERRUPTIBLE,
+ TASK_DIAG_STOPPED,
+ TASK_DIAG_TRACE_STOP,
+ TASK_DIAG_DEAD,
+ TASK_DIAG_ZOMBIE,
+};
+
+/*
+ * Translate the combined run state and exit state of @tsk into one of
+ * the TASK_DIAG_* state codes from task_state_array.
+ */
+static inline const __u8 get_task_state(struct task_struct *tsk)
+{
+	unsigned int bits;
+
+	/* The table must have one entry per reportable bit, plus "running". */
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+
+	bits = tsk->state | tsk->exit_state;
+	bits &= TASK_REPORT;
+
+	/* fls() is 0 when no bit is set, otherwise highest set bit + 1. */
+	return task_state_array[fls(bits)];
+}
+
+/*
+ * Append a TASK_DIAG_BASE attribute describing @p to @skb.
+ *
+ * All ids are translated into the pid namespace @ns.
+ *
+ * Returns 0 on success or -EMSGSIZE if the attribute does not fit in @skb.
+ */
+static int fill_task_base(struct task_struct *p,
+			  struct sk_buff *skb, struct pid_namespace *ns)
+{
+	struct task_diag_base *base;
+	struct nlattr *attr;
+	char tcomm[sizeof(p->comm)];
+	struct task_struct *tracer;
+
+	attr = nla_reserve(skb, TASK_DIAG_BASE, sizeof(struct task_diag_base));
+	if (!attr)
+		return -EMSGSIZE;
+
+	base = nla_data(attr);
+
+	/*
+	 * The reserved payload is not zeroed for us. Clear the whole
+	 * structure, including compiler padding, so that no stale kernel
+	 * memory is copied to user space with the response.
+	 */
+	memset(base, 0, sizeof(*base));
+
+	rcu_read_lock();
+	/* A task that is no longer alive is reported with ppid 0. */
+	base->ppid = pid_alive(p) ?
+		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+	/* tpid stays 0 unless the task currently has a tracer. */
+	tracer = ptrace_parent(p);
+	if (tracer)
+		base->tpid = task_pid_nr_ns(tracer, ns);
+
+	base->tgid = task_tgid_nr_ns(p, ns);
+	base->pid = task_pid_nr_ns(p, ns);
+	base->sid = task_session_nr_ns(p, ns);
+	base->pgid = task_pgrp_nr_ns(p, ns);
+
+	rcu_read_unlock();
+
+	/* comm was zeroed above, so a short name is padded with NULs. */
+	get_task_comm(tcomm, p);
+	strncpy(base->comm, tcomm, TASK_DIAG_COMM_LEN);
+
+	base->state = get_task_state(p);
+
+	return 0;
+}
+
+/*
+ * Append one netlink message describing @tsk to @skb.
+ *
+ * @req:    request; req->show_flags selects the attribute groups to emit
+ * @cb:     transaction state, or NULL; when set, the message is flagged
+ *          NLM_F_MULTI, carries the request's sequence number, and
+ *          cb->attr is used to resume a partially emitted task
+ * @pidns:  pid namespace used to translate ids
+ * @userns: not used by this function in this patch
+ *
+ * Returns 0 on success, -EMSGSIZE if @skb is full, or another negative
+ * error reported while filling an attribute group.
+ */
+static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
+			  struct task_diag_pid *req,
+			  struct task_diag_cb *cb, struct pid_namespace *pidns,
+			  struct user_namespace *userns)
+{
+	u64 show_flags = req->show_flags;
+	struct nlmsghdr *nlh;
+	struct task_diag_msg *msg;
+	/* i: groups considered so far; n: first group still to be emitted. */
+	int err = 0, i = 0, n = 0;
+	int flags = 0;
+	u32 seq = 0;
+
+	if (cb) {
+		n = cb->attr;
+		seq = cb->nlh->nlmsg_seq;
+		flags |= NLM_F_MULTI;
+	}
+
+	nlh = nlmsg_put(skb, 0, seq, TASK_DIAG_CMD_GET, sizeof(*msg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(nlh);
+	/*
+	 * Clear the header first: msg->flags is not otherwise written here
+	 * and must not expose uninitialised skb memory to user space.
+	 */
+	memset(msg, 0, sizeof(*msg));
+	msg->pid = task_pid_nr_ns(tsk, pidns);
+	msg->tgid = task_tgid_nr_ns(tsk, pidns);
+
+	if (show_flags & TASK_DIAG_SHOW_BASE) {
+		/* Skip groups already delivered by an earlier partial message. */
+		if (i >= n)
+			err = fill_task_base(tsk, skb, pidns);
+		if (err)
+			goto err;
+		i++;
+	}
+
+	nlmsg_end(skb, nlh);
+	if (cb)
+		cb->attr = 0;
+
+	return 0;
+err:
+	if (err == -EMSGSIZE && (i > n)) {
+		/*
+		 * Some groups made it into the message before space ran
+		 * out: keep them and remember where to resume.
+		 */
+		if (cb)
+			cb->attr = i;
+		nlmsg_end(skb, nlh);
+	} else
+		nlmsg_cancel(skb, nlh);
+
+	return err;
+}
+
+/*
+ * Cursor over the tasks selected by one request; lives on the stack of
+ * taskdiag_dumpit() for the duration of a single read.
+ */
+struct task_iter {
+ /* Copy of the request payload. */
+ struct task_diag_pid req;
+ /* Pid namespace used for lookups and id translation. */
+ struct pid_namespace *ns;
+ /* Task found for req.pid, with a reference held; NULL if none. */
+ struct task_struct *parent;
+
+ /* Transaction state holding the resume position across reads. */
+ struct task_diag_cb *cb;
+
+ /* TASK_DIAG_DUMP_ALL: current position as returned by next_tgid(). */
+ struct tgid_iter tgid;
+ /* TASK_DIAG_DUMP_ONE: position counter mirrored into cb->pos. */
+ unsigned int pos;
+ /* TASK_DIAG_DUMP_ONE: current task, with a reference held. */
+ struct task_struct *task;
+};
+
+/*
+ * Drop the task references still held by @iter: the parent, if any, and
+ * the current task of the active dump strategy.
+ */
+static void iter_stop(struct task_iter *iter)
+{
+	struct task_struct *cur;
+
+	if (iter->parent)
+		put_task_struct(iter->parent);
+
+	/* Only TASK_DIAG_DUMP_ALL tracks its current task in iter->tgid. */
+	if (iter->req.dump_strategy == TASK_DIAG_DUMP_ALL)
+		cur = iter->tgid.task;
+	else
+		cur = iter->task;
+
+	if (cur)
+		put_task_struct(cur);
+}
+
+/*
+ * Position @iter on the first task to report for this read.
+ *
+ * Returns the first task (with a reference held through @iter), NULL if
+ * there is nothing left to report, or an ERR_PTR():
+ *   -ESRCH   TASK_DIAG_DUMP_ONE and req.pid does not name a task in iter->ns
+ *   -EINVAL  unknown dump strategy
+ *
+ * On an ERR_PTR() return no task reference is left held in @iter, so the
+ * caller does not need to call iter_stop().
+ */
+static struct task_struct *iter_start(struct task_iter *iter)
+{
+	if (iter->req.pid > 0) {
+		rcu_read_lock();
+		iter->parent = find_task_by_pid_ns(iter->req.pid, iter->ns);
+		if (iter->parent)
+			get_task_struct(iter->parent);
+		rcu_read_unlock();
+	}
+
+	switch (iter->req.dump_strategy) {
+	case TASK_DIAG_DUMP_ONE:
+		if (iter->parent == NULL)
+			return ERR_PTR(-ESRCH);
+		iter->pos = iter->cb->pos;
+		if (iter->pos == 0) {
+			/* Hand the reference over from parent to task. */
+			iter->task = iter->parent;
+			iter->parent = NULL;
+		} else
+			iter->task = NULL;
+		return iter->task;
+
+	case TASK_DIAG_DUMP_ALL:
+		/* Resume at the tgid recorded by the previous read. */
+		iter->tgid.tgid = iter->cb->pid;
+		iter->tgid.task = NULL;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		return iter->tgid.task;
+	}
+
+	/*
+	 * Unknown strategy. The caller bails out on error without calling
+	 * iter_stop(), so release the parent reference taken above here.
+	 */
+	if (iter->parent) {
+		put_task_struct(iter->parent);
+		iter->parent = NULL;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Advance @iter to the next task and record the new resume position in
+ * the transaction state.
+ *
+ * Returns the next task, or NULL when the iteration is finished.
+ */
+static struct task_struct *iter_next(struct task_iter *iter)
+{
+	if (iter->req.dump_strategy == TASK_DIAG_DUMP_ALL) {
+		iter->tgid.tgid += 1;
+		iter->tgid = next_tgid(iter->ns, iter->tgid);
+		iter->cb->pid = iter->tgid.tgid;
+		return iter->tgid.task;
+	}
+
+	if (iter->req.dump_strategy == TASK_DIAG_DUMP_ONE) {
+		/* A single task was requested, so there is no successor. */
+		iter->cb->pos = ++iter->pos;
+		if (iter->task)
+			put_task_struct(iter->task);
+		iter->task = NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fill cb->resp with messages for consecutive tasks, starting at *@start,
+ * until the iteration ends or the buffer is full.
+ *
+ * Tasks the caller may not inspect (ptrace_may_access() fails) are
+ * skipped. On a 0 return *@start is updated to the task at which the
+ * next call should continue (NULL when the iteration is finished).
+ *
+ * Returns 0, or a negative error other than -EMSGSIZE from
+ * task_diag_fill(); in the error case *@start is left untouched.
+ */
+static int __taskdiag_dumpit(struct task_iter *iter,
+			     struct task_diag_cb *cb, struct task_struct **start)
+{
+	struct user_namespace *userns = current_user_ns();
+	struct task_struct *task;
+
+	for (task = *start; task != NULL; task = iter_next(iter)) {
+		int rc;
+
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+			continue;
+
+		rc = task_diag_fill(task, cb->resp, &iter->req,
+				    cb, iter->ns, userns);
+		if (rc >= 0)
+			continue;
+
+		if (rc != -EMSGSIZE)
+			return rc;
+
+		/* Buffer full: stop here and retry this task next time. */
+		break;
+	}
+
+	*start = task;
+	return 0;
+}
+
+/*
+ * Produce the next part of the response for the pending request and copy
+ * it to the user buffer described by @msg.
+ *
+ * @cb:    transaction state (request, response buffer, resume position)
+ * @pidns: pid namespace used for lookups and id translation
+ * @msg:   destination iterator over the user buffer
+ * @len:   size of the user buffer in bytes
+ *
+ * Returns the number of bytes copied (0 when the dump is complete) or a
+ * negative error.
+ */
+static int taskdiag_dumpit(struct task_diag_cb *cb,
+			   struct pid_namespace *pidns,
+			   struct msghdr *msg, size_t len)
+{
+	struct sk_buff *skb = cb->resp;
+	struct task_struct *task;
+	struct task_iter iter;
+	size_t copied = 0;
+	int err;
+
+	/*
+	 * The request payload is a struct task_diag_pid supplied by user
+	 * space. nlmsg_len() is signed and derived from the user-controlled
+	 * nlmsg_len field, so compare as int (a too-small value must not
+	 * wrap around), and also make sure the bytes were really written.
+	 */
+	if (nlmsg_len(cb->nlh) < (int)sizeof(iter.req) ||
+	    cb->req->len < nlmsg_msg_size(sizeof(iter.req)))
+		return -EINVAL;
+
+	memcpy(&iter.req, nlmsg_data(cb->nlh), sizeof(iter.req));
+
+	iter.ns = pidns;
+	iter.cb = cb;
+	iter.parent = NULL;
+	iter.pos = 0;
+	iter.task = NULL;
+
+	task = iter_start(&iter);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	while (1) {
+		err = __taskdiag_dumpit(&iter, cb, &task);
+		if (err < 0)
+			goto err;
+		/* Nothing was produced: the dump is complete. */
+		if (skb->len == 0)
+			break;
+
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err < 0)
+			goto err;
+
+		copied += skb->len;
+
+		/* Reuse the buffer; stop if another full one might not fit. */
+		skb_trim(skb, 0);
+		if (skb_tailroom(skb) + copied > len)
+			break;
+
+		if (signal_pending(current))
+			break;
+	}
+
+	iter_stop(&iter);
+	return copied;
+err:
+	iter_stop(&iter);
+	return err;
+}
+
+/*
+ * write() handler: start a transaction by storing the request.
+ *
+ * The user buffer must hold at least a netlink message header. The
+ * request is copied into a new skb and kept in the per-file state until
+ * task_diag_read() has delivered the whole response.
+ *
+ * Returns @len on success, -EBUSY if a request is still pending, -EINVAL
+ * if @len is smaller than a netlink header, -ENOMEM or -EFAULT on
+ * allocation or copy failure, or the error from import_single_range().
+ */
+static ssize_t task_diag_write(struct file *f, const char __user *buf,
+			       size_t len, loff_t *off)
+{
+	struct task_diag_cb *cb = f->private_data;
+	struct sk_buff *skb;
+	struct msghdr msg;
+	struct iovec iov;
+	int err;
+
+	if (cb->req)
+		return -EBUSY;
+	if (len < nlmsg_total_size(0))
+		return -EINVAL;
+
+	err = import_single_range(WRITE, (void __user *) buf, len,
+				  &iov, &msg.msg_iter);
+	if (unlikely(err))
+		return err;
+
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = 0;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (memcpy_from_msg(skb_put(skb, len), &msg, len)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+
+	/*
+	 * A finished transaction leaves its response buffer in cb->resp.
+	 * Free it before the state is reset, otherwise the memset() below
+	 * drops the only pointer to it and the skb is leaked.
+	 */
+	kfree_skb(cb->resp);
+
+	memset(cb, 0, sizeof(*cb));
+	cb->req = skb;
+	cb->nlh = nlmsg_hdr(skb);
+
+	return len;
+}
+
+/*
+ * read() handler: deliver the next part of the response.
+ *
+ * Returns the number of bytes copied to @ubuf, 0 when no request is
+ * pending or the dump is complete, or a negative error. A return value
+ * <= 0 ends the transaction and releases the stored request.
+ *
+ * NOTE(review): the response buffer is sized from @len of the first read
+ * of a transaction; a later read with a smaller buffer may not be able
+ * to hold a full response skb -- confirm whether that needs handling.
+ */
+static ssize_t task_diag_read(struct file *file, char __user *ubuf,
+			      size_t len, loff_t *off)
+{
+	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+	struct task_diag_cb *cb = file->private_data;
+	struct iovec iov;
+	struct msghdr msg;
+	int size, err;
+
+	if (cb->req == NULL)
+		return 0;
+
+	err = import_single_range(READ, ubuf, len, &iov, &msg.msg_iter);
+	if (unlikely(err))
+		goto err;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_flags = 0;
+
+	if (!cb->resp) {
+		size = min_t(size_t, len, 16384);
+		cb->resp = alloc_skb(size, GFP_KERNEL);
+		if (cb->resp == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		/* Trim skb to allocated size. */
+		skb_reserve(cb->resp, skb_tailroom(cb->resp) - size);
+	}
+
+	err = taskdiag_dumpit(cb, ns, &msg, len);
+
+err:
+	/*
+	 * cb->resp is still NULL when we get here because the user buffer
+	 * was rejected or the allocation failed on the first read.
+	 */
+	if (cb->resp)
+		skb_trim(cb->resp, 0);
+	if (err <= 0) {
+		kfree_skb(cb->req);
+		cb->req = NULL;
+		/* nlh pointed into the request that was just freed. */
+		cb->nlh = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * open() handler: allocate zeroed per-file transaction state.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int task_diag_open (struct inode *inode, struct file *f)
+{
+	struct task_diag_cb *cb;
+
+	cb = kzalloc(sizeof(struct task_diag_cb), GFP_KERNEL);
+	f->private_data = cb;
+
+	return cb ? 0 : -ENOMEM;
+}
+
+/*
+ * release() handler: free the pending request, the response buffer and
+ * the per-file transaction state. Always returns 0.
+ */
+static int task_diag_release(struct inode *inode, struct file *f)
+{
+	struct task_diag_cb *cb = f->private_data;
+	struct sk_buff *req = cb->req;
+	struct sk_buff *resp = cb->resp;
+
+	kfree_skb(req);
+	kfree_skb(resp);
+	kfree(cb);
+
+	return 0;
+}
+
+/*
+ * File operations of the task-diag proc entry: a request is submitted
+ * with write() and the response is fetched with read().
+ */
+static const struct file_operations task_diag_fops = {
+ .owner = THIS_MODULE,
+ .open = task_diag_open,
+ .release = task_diag_release,
+ .write = task_diag_write,
+ .read = task_diag_read,
+};
+
+/*
+ * Register the "task-diag" proc entry, readable and writable by everyone.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be created.
+ */
+static __init int task_diag_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("task-diag", S_IRUGO | S_IWUGO, NULL,
+			    &task_diag_fops);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Remove the "task-diag" proc entry created by task_diag_init(). */
+static __exit void task_diag_exit(void)
+{
+ remove_proc_entry("task-diag", NULL);
+}
+
+module_init(task_diag_init);
+module_exit(task_diag_exit);
diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h
new file mode 100644
index 0000000..ba0f71a
--- /dev/null
+++ b/include/uapi/linux/task_diag.h
@@ -0,0 +1,66 @@
+#ifndef _LINUX_TASK_DIAG_H
+#define _LINUX_TASK_DIAG_H
+
+#include <linux/types.h>
+#include <linux/netlink.h>
+#include <linux/capability.h>
+
+#define TASK_DIAG_CMD_GET 0xd101U
+
+/*
+ * Fixed header at the start of every response message; the requested
+ * attribute groups follow it as netlink attributes.
+ */
+struct task_diag_msg {
+ /* Thread id of the described task, in the reader's pid namespace. */
+ __u32 pid;
+ /* Thread-group id of the described task, in the same namespace. */
+ __u32 tgid;
+ /* No flag values are defined in this header. */
+ __u32 flags;
+};
+
+/*
+ * Attribute types used in response messages. Each value is also the bit
+ * position of the matching TASK_DIAG_SHOW_* request flag.
+ */
+enum {
+ /* Payload is a struct task_diag_base. */
+ TASK_DIAG_BASE = 0,
+
+ __TASK_DIAG_ATTR_MAX
+#define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1)
+};
+
+#define TASK_DIAG_SHOW_BASE (1ULL << TASK_DIAG_BASE)
+
+/* Task state codes reported in task_diag_base.state. */
+enum {
+ TASK_DIAG_RUNNING,
+ TASK_DIAG_INTERRUPTIBLE,
+ TASK_DIAG_UNINTERRUPTIBLE,
+ TASK_DIAG_STOPPED,
+ TASK_DIAG_TRACE_STOP,
+ TASK_DIAG_DEAD,
+ TASK_DIAG_ZOMBIE,
+};
+
+#define TASK_DIAG_COMM_LEN 16
+
+/*
+ * Payload of the TASK_DIAG_BASE attribute. All ids are expressed in the
+ * pid namespace of the reader.
+ */
+struct task_diag_base {
+ /* Thread-group id. */
+ __u32 tgid;
+ /* Thread id. */
+ __u32 pid;
+ /* Thread-group id of the real parent; 0 if the task is not alive. */
+ __u32 ppid;
+ /* Thread id of the tracer; 0 if the task is not traced. */
+ __u32 tpid;
+ /* Session id. */
+ __u32 sid;
+ /* Process-group id. */
+ __u32 pgid;
+ /* One of the TASK_DIAG_* state codes. */
+ __u8 state;
+ /* Command name, padded with NUL bytes. */
+ char comm[TASK_DIAG_COMM_LEN];
+};
+
+#define TASK_DIAG_DUMP_ALL 0
+#define TASK_DIAG_DUMP_ONE 1
+
+/*
+ * Request payload, placed directly after the netlink message header.
+ *
+ * NOTE(review): the trailing __u32 leaves sizeof() dependent on the
+ * alignment of __u64 (ABIs aligning it to 4 vs. 8 bytes) -- confirm that
+ * 32-bit user space on a 64-bit kernel is handled.
+ */
+struct task_diag_pid {
+ /* Bitmask of TASK_DIAG_SHOW_* selecting the attribute groups. */
+ __u64 show_flags;
+ /* One of TASK_DIAG_DUMP_*. */
+ __u64 dump_strategy;
+
+ /* Task to look up; required to name an existing task for DUMP_ONE. */
+ __u32 pid;
+};
+
+/*
+ * Command attribute ids. No users of these values are visible in this
+ * patch.
+ */
+enum {
+ TASK_DIAG_CMD_ATTR_UNSPEC = 0,
+ TASK_DIAG_CMD_ATTR_GET,
+ __TASK_DIAG_CMD_ATTR_MAX,
+};
+
+#define TASK_DIAG_CMD_ATTR_MAX (__TASK_DIAG_CMD_ATTR_MAX - 1)
+
+#endif /* _LINUX_TASK_DIAG_H */
--
2.5.5
Powered by blists - more mailing lists